From a8b2fef09b7f4666885924303f93958d1b3fbd32 Mon Sep 17 00:00:00 2001 From: jsing Date: Fri, 17 Feb 2023 05:13:34 +0000 Subject: [PATCH] Reimplement bn_sqr_comba{4,8}(). Use bignum primitives rather than the current mess of macros.The sqr_add_c macro gets replaced with bn_mulw_addtw(), while the sqr_add_c2 macro gets replaced with bn_mul2_mulw_addtw(). The variables in the comba functions have also been reordered, so that the patterns are easier to understand - the compiler can take care of optimising the inputs and outputs to avoid register moves. ok tb@ --- lib/libcrypto/bn/bn_internal.h | 30 +++++- lib/libcrypto/bn/bn_sqr.c | 182 +++++++++++++++------------------ 2 files changed, 110 insertions(+), 102 deletions(-) diff --git a/lib/libcrypto/bn/bn_internal.h b/lib/libcrypto/bn/bn_internal.h index acee2b4020d..859f00d05d8 100644 --- a/lib/libcrypto/bn/bn_internal.h +++ b/lib/libcrypto/bn/bn_internal.h @@ -1,4 +1,4 @@ -/* $OpenBSD: bn_internal.h,v 1.8 2023/02/16 10:58:06 jsing Exp $ */ +/* $OpenBSD: bn_internal.h,v 1.9 2023/02/17 05:13:34 jsing Exp $ */ /* * Copyright (c) 2023 Joel Sing * @@ -361,4 +361,32 @@ bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, } #endif +/* + * bn_mulw_addtw() computes (r2:r1:r0) = 2 * a * b + (c2:c1:c0), where a and b + * are single words and (c2:c1:c0) is a triple word, producing a triple word + * result. The caller must ensure that the inputs provided do not result in c2 + * overflowing. + */ +#ifndef HAVE_BN_MUL2_MULW_ADDTW +static inline void +bn_mul2_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, + BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0) +{ + BN_ULONG r2, r1, r0, x1, x0; + BN_ULONG carry; + + bn_mulw(a, b, &x1, &x0); + bn_addw(c0, x0, &carry, &r0); + bn_addw(c1, x1 + carry, &r2, &r1); + bn_addw(c2, r2, &carry, &r2); + bn_addw(r0, x0, &carry, &r0); + bn_addw(r1, x1 + carry, &carry, &r1); + r2 += carry; + + *out_r2 = r2; + *out_r1 = r1; + *out_r0 = r0; +} +#endif + #endif diff --git a/lib/libcrypto/bn/bn_sqr.c b/lib/libcrypto/bn/bn_sqr.c index f649b9bce87..6e784541bde 100644 --- a/lib/libcrypto/bn/bn_sqr.c +++ b/lib/libcrypto/bn/bn_sqr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: bn_sqr.c,v 1.26 2023/02/16 10:41:03 jsing Exp $ */ +/* $OpenBSD: bn_sqr.c,v 1.27 2023/02/17 05:13:34 jsing Exp $ */ /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) * All rights reserved. * @@ -66,117 +66,97 @@ int bn_sqr(BIGNUM *r, const BIGNUM *a, int max, BN_CTX *ctx); +/* + * bn_sqr_comba4() computes r[] = a[] * a[] using Comba multiplication + * (https://everything2.com/title/Comba+multiplication), where a is a + * four word array, producing an eight word array result. + */ #ifndef HAVE_BN_SQR_COMBA4 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - sqr_add_c(a, 0, c1, c2, c3); - r[0] = c1; - c1 = 0; - sqr_add_c2(a, 1, 0, c2, c3, c1); - r[1] = c2; - c2 = 0; - sqr_add_c(a, 1, c3, c1, c2); - sqr_add_c2(a, 2, 0, c3, c1, c2); - r[2] = c3; - c3 = 0; - sqr_add_c2(a, 3, 0, c1, c2, c3); - sqr_add_c2(a, 2, 1, c1, c2, c3); - r[3] = c1; - c1 = 0; - sqr_add_c(a, 2, c2, c3, c1); - sqr_add_c2(a, 3, 1, c2, c3, c1); - r[4] = c2; - c2 = 0; - sqr_add_c2(a, 3, 2, c3, c1, c2); - r[5] = c3; - c3 = 0; - sqr_add_c(a, 3, c1, c2, c3); - r[6] = c1; - r[7] = c2; + BN_ULONG c2, c1, c0; + + bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]); + + bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]); + + bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]); + + bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]); + + bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &r[4]); + + bn_mul2_mulw_addtw(a[3], a[2], 0, c2, c1, &c2, &c1, &r[5]); + + bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &r[7], &r[6]); } #endif +/* + * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication + * (https://everything2.com/title/Comba+multiplication), where a is an + * eight word array, producing an 16 word array result. + */ #ifndef HAVE_BN_SQR_COMBA8 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - sqr_add_c(a, 0, c1, c2, c3); - r[0] = c1; - c1 = 0; - sqr_add_c2(a, 1, 0, c2, c3, c1); - r[1] = c2; - c2 = 0; - sqr_add_c(a, 1, c3, c1, c2); - sqr_add_c2(a, 2, 0, c3, c1, c2); - r[2] = c3; - c3 = 0; - sqr_add_c2(a, 3, 0, c1, c2, c3); - sqr_add_c2(a, 2, 1, c1, c2, c3); - r[3] = c1; - c1 = 0; - sqr_add_c(a, 2, c2, c3, c1); - sqr_add_c2(a, 3, 1, c2, c3, c1); - sqr_add_c2(a, 4, 0, c2, c3, c1); - r[4] = c2; - c2 = 0; - sqr_add_c2(a, 5, 0, c3, c1, c2); - sqr_add_c2(a, 4, 1, c3, c1, c2); - sqr_add_c2(a, 3, 2, c3, c1, c2); - r[5] = c3; - c3 = 0; - sqr_add_c(a, 3, c1, c2, c3); - sqr_add_c2(a, 4, 2, c1, c2, c3); - sqr_add_c2(a, 5, 1, c1, c2, c3); - sqr_add_c2(a, 6, 0, c1, c2, c3); - r[6] = c1; - c1 = 0; - sqr_add_c2(a, 7, 0, c2, c3, c1); - sqr_add_c2(a, 6, 1, c2, c3, c1); - sqr_add_c2(a, 5, 2, c2, c3, c1); - sqr_add_c2(a, 4, 3, c2, c3, c1); - r[7] = c2; - c2 = 0; - sqr_add_c(a, 4, c3, c1, c2); - sqr_add_c2(a, 5, 3, c3, c1, c2); - sqr_add_c2(a, 6, 2, c3, c1, c2); - sqr_add_c2(a, 7, 1, c3, c1, c2); - r[8] = c3; - c3 = 0; - sqr_add_c2(a, 7, 2, c1, c2, c3); - sqr_add_c2(a, 6, 3, c1, c2, c3); - sqr_add_c2(a, 5, 4, c1, c2, c3); - r[9] = c1; - c1 = 0; - sqr_add_c(a, 5, c2, c3, c1); - sqr_add_c2(a, 6, 4, c2, c3, c1); - sqr_add_c2(a, 7, 3, c2, c3, c1); - r[10] = c2; - c2 = 0; - sqr_add_c2(a, 7, 4, c3, c1, c2); - sqr_add_c2(a, 6, 5, c3, c1, c2); - r[11] = c3; - c3 = 0; - sqr_add_c(a, 6, c1, c2, c3); - sqr_add_c2(a, 7, 5, c1, c2, c3); - r[12] = c1; - c1 = 0; - sqr_add_c2(a, 7, 6, c2, c3, c1); - r[13] = c2; - c2 = 0; - sqr_add_c(a, 7, c3, c1, c2); - r[14] = c3; - r[15] = c1; + BN_ULONG c2, c1, c0; + + bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]); + + bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]); + + bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]); + + bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]); + + bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[4], a[0], c2, c1, c0, &c2, &c1, &r[4]); + + bn_mul2_mulw_addtw(a[5], a[0], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[4], a[1], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[3], a[2], c2, c1, c0, &c2, &c1, &r[5]); + + bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[4], a[2], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[5], a[1], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[6], a[0], c2, c1, c0, &c2, &c1, &r[6]); + + bn_mul2_mulw_addtw(a[7], a[0], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[6], a[1], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[5], a[2], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[4], a[3], c2, c1, c0, &c2, &c1, &r[7]); + + bn_mulw_addtw(a[4], a[4], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[5], a[3], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[6], a[2], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[7], a[1], c2, c1, c0, &c2, &c1, &r[8]); + + bn_mul2_mulw_addtw(a[7], a[2], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[6], a[3], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[5], a[4], c2, c1, c0, &c2, &c1, &r[9]); + + bn_mulw_addtw(a[5], a[5], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[6], a[4], c2, c1, c0, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[7], a[3], c2, c1, c0, &c2, &c1, &r[10]); + + bn_mul2_mulw_addtw(a[7], a[4], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[6], a[5], c2, c1, c0, &c2, &c1, &r[11]); + + bn_mulw_addtw(a[6], a[6], 0, c2, c1, &c2, &c1, &c0); + bn_mul2_mulw_addtw(a[7], a[5], c2, c1, c0, &c2, &c1, &r[12]); + + bn_mul2_mulw_addtw(a[7], a[6], 0, c2, c1, &c2, &c1, &r[13]); + + bn_mulw_addtw(a[7], a[7], 0, c2, c1, &c2, &r[15], &r[14]); } #endif -- 2.20.1