From df1d0ed5bb4253fbd3e57fbba5daf089ed1fc61f Mon Sep 17 00:00:00 2001 From: jsing Date: Tue, 14 Feb 2023 18:37:15 +0000 Subject: [PATCH] Reimplement bn_mul_words(), bn_mul_add_words() and bn_mul_comba{4,8}(). Use bignum primitives rather than the current mess of macros, which also allows us to remove the essentially duplicate versions of bn_mul_words() and bn_mul_add_words() for BN_LLONG. The "mul" macro gets replaced by bn_mulw_addw(), "mul_add" with bn_mulw_addw_addw() and "mul_add_c" with bn_mulw_addtw() (where 'w' indicates single word input and 'tw' indicates triple word input). The variables in the comba functions have also been reordered, so that the patterns are easier to understand - the compiler can take care of optimising the inputs and outputs to avoid register moves. ok tb@ --- lib/libcrypto/bn/bn_mul.c | 387 +++++++++++++++----------------------- 1 file changed, 152 insertions(+), 235 deletions(-) diff --git a/lib/libcrypto/bn/bn_mul.c b/lib/libcrypto/bn/bn_mul.c index 38c01dad185..965c1ad0365 100644 --- a/lib/libcrypto/bn/bn_mul.c +++ b/lib/libcrypto/bn/bn_mul.c @@ -1,4 +1,4 @@ -/* $OpenBSD: bn_mul.c,v 1.31 2023/02/13 04:25:37 jsing Exp $ */ +/* $OpenBSD: bn_mul.c,v 1.32 2023/02/14 18:37:15 jsing Exp $ */ /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) * All rights reserved. * @@ -63,293 +63,210 @@ #include #include "bn_arch.h" +#include "bn_internal.h" #include "bn_local.h" +/* + * bn_mul_add_words() computes (carry:r[i]) = a[i] * w + r[i] + carry, where + * a is an array of words and w is a single word. This should really be called + * bn_mulw_add_words() since only one input is an array. This is used as a step + * in the multiplication of word arrays. + */ #ifndef HAVE_BN_MUL_ADD_WORDS -#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) - BN_ULONG -bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) +bn_mul_add_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w) { - BN_ULONG c1 = 0; + BN_ULONG carry = 0; assert(num >= 0); if (num <= 0) - return (c1); + return 0; #ifndef OPENSSL_SMALL_FOOTPRINT while (num & ~3) { - mul_add(rp[0], ap[0], w, c1); - mul_add(rp[1], ap[1], w, c1); - mul_add(rp[2], ap[2], w, c1); - mul_add(rp[3], ap[3], w, c1); - ap += 4; - rp += 4; + bn_mulw_addw_addw(a[0], w, r[0], carry, &carry, &r[0]); + bn_mulw_addw_addw(a[1], w, r[1], carry, &carry, &r[1]); + bn_mulw_addw_addw(a[2], w, r[2], carry, &carry, &r[2]); + bn_mulw_addw_addw(a[3], w, r[3], carry, &carry, &r[3]); + a += 4; + r += 4; num -= 4; } #endif while (num) { - mul_add(rp[0], ap[0], w, c1); - ap++; - rp++; + bn_mulw_addw_addw(a[0], w, r[0], carry, &carry, &r[0]); + a++; + r++; num--; } - return (c1); + return carry; } +#endif -#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ - -BN_ULONG -bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) +/* + * bn_mul_comba4() computes r[] = a[] * b[] using Comba multiplication + * (https://everything2.com/title/Comba+multiplication), where a and b are both + * four word arrays, producing an eight word array result. + */ +#ifndef HAVE_BN_MUL_COMBA4 +void +bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { - BN_ULONG c = 0; - BN_ULONG bl, bh; + BN_ULONG c0, c1, c2; - assert(num >= 0); - if (num <= 0) - return ((BN_ULONG)0); + bn_mulw_addtw(a[0], b[0], 0, 0, 0, &c2, &c1, &r[0]); - bl = LBITS(w); - bh = HBITS(w); + bn_mulw_addtw(a[0], b[1], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[0], c2, c1, c0, &c2, &c1, &r[1]); -#ifndef OPENSSL_SMALL_FOOTPRINT - while (num & ~3) { - mul_add(rp[0], ap[0], bl, bh, c); - mul_add(rp[1], ap[1], bl, bh, c); - mul_add(rp[2], ap[2], bl, bh, c); - mul_add(rp[3], ap[3], bl, bh, c); - ap += 4; - rp += 4; - num -= 4; - } -#endif - while (num) { - mul_add(rp[0], ap[0], bl, bh, c); - ap++; - rp++; - num--; - } - return (c); -} + bn_mulw_addtw(a[2], b[0], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[0], b[2], c2, c1, c0, &c2, &c1, &r[2]); -#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ -#endif + bn_mulw_addtw(a[0], b[3], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[0], c2, c1, c0, &c2, &c1, &r[3]); -#ifndef HAVE_BN_MUL_COMBA4 -void -bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -{ - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - mul_add_c(a[0], b[0], c1, c2, c3); - r[0] = c1; - c1 = 0; - mul_add_c(a[0], b[1], c2, c3, c1); - mul_add_c(a[1], b[0], c2, c3, c1); - r[1] = c2; - c2 = 0; - mul_add_c(a[2], b[0], c3, c1, c2); - mul_add_c(a[1], b[1], c3, c1, c2); - mul_add_c(a[0], b[2], c3, c1, c2); - r[2] = c3; - c3 = 0; - mul_add_c(a[0], b[3], c1, c2, c3); - mul_add_c(a[1], b[2], c1, c2, c3); - mul_add_c(a[2], b[1], c1, c2, c3); - mul_add_c(a[3], b[0], c1, c2, c3); - r[3] = c1; - c1 = 0; - mul_add_c(a[3], b[1], c2, c3, c1); - mul_add_c(a[2], b[2], c2, c3, c1); - mul_add_c(a[1], b[3], c2, c3, c1); - r[4] = c2; - c2 = 0; - mul_add_c(a[2], b[3], c3, c1, c2); - mul_add_c(a[3], b[2], c3, c1, c2); - r[5] = c3; - c3 = 0; - mul_add_c(a[3], b[3], c1, c2, c3); - r[6] = c1; - r[7] = c2; + bn_mulw_addtw(a[3], b[1], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[3], c2, c1, c0, &c2, &c1, &r[4]); + + bn_mulw_addtw(a[2], b[3], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[2], c2, c1, c0, &c2, &c1, &r[5]); + + bn_mulw_addtw(a[3], b[3], 0, c2, c1, &c2, &r[7], &r[6]); } #endif +/* + * bn_mul_comba8() computes r[] = a[] * b[] using Comba multiplication + * (https://everything2.com/title/Comba+multiplication), where a and b are both + * eight word arrays, producing a 16 word array result. + */ #ifndef HAVE_BN_MUL_COMBA8 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - mul_add_c(a[0], b[0], c1, c2, c3); - r[0] = c1; - c1 = 0; - mul_add_c(a[0], b[1], c2, c3, c1); - mul_add_c(a[1], b[0], c2, c3, c1); - r[1] = c2; - c2 = 0; - mul_add_c(a[2], b[0], c3, c1, c2); - mul_add_c(a[1], b[1], c3, c1, c2); - mul_add_c(a[0], b[2], c3, c1, c2); - r[2] = c3; - c3 = 0; - mul_add_c(a[0], b[3], c1, c2, c3); - mul_add_c(a[1], b[2], c1, c2, c3); - mul_add_c(a[2], b[1], c1, c2, c3); - mul_add_c(a[3], b[0], c1, c2, c3); - r[3] = c1; - c1 = 0; - mul_add_c(a[4], b[0], c2, c3, c1); - mul_add_c(a[3], b[1], c2, c3, c1); - mul_add_c(a[2], b[2], c2, c3, c1); - mul_add_c(a[1], b[3], c2, c3, c1); - mul_add_c(a[0], b[4], c2, c3, c1); - r[4] = c2; - c2 = 0; - mul_add_c(a[0], b[5], c3, c1, c2); - mul_add_c(a[1], b[4], c3, c1, c2); - mul_add_c(a[2], b[3], c3, c1, c2); - mul_add_c(a[3], b[2], c3, c1, c2); - mul_add_c(a[4], b[1], c3, c1, c2); - mul_add_c(a[5], b[0], c3, c1, c2); - r[5] = c3; - c3 = 0; - mul_add_c(a[6], b[0], c1, c2, c3); - mul_add_c(a[5], b[1], c1, c2, c3); - mul_add_c(a[4], b[2], c1, c2, c3); - mul_add_c(a[3], b[3], c1, c2, c3); - mul_add_c(a[2], b[4], c1, c2, c3); - mul_add_c(a[1], b[5], c1, c2, c3); - mul_add_c(a[0], b[6], c1, c2, c3); - r[6] = c1; - c1 = 0; - mul_add_c(a[0], b[7], c2, c3, c1); - mul_add_c(a[1], b[6], c2, c3, c1); - mul_add_c(a[2], b[5], c2, c3, c1); - mul_add_c(a[3], b[4], c2, c3, c1); - mul_add_c(a[4], b[3], c2, c3, c1); - mul_add_c(a[5], b[2], c2, c3, c1); - mul_add_c(a[6], b[1], c2, c3, c1); - mul_add_c(a[7], b[0], c2, c3, c1); - r[7] = c2; - c2 = 0; - mul_add_c(a[7], b[1], c3, c1, c2); - mul_add_c(a[6], b[2], c3, c1, c2); - mul_add_c(a[5], b[3], c3, c1, c2); - mul_add_c(a[4], b[4], c3, c1, c2); - mul_add_c(a[3], b[5], c3, c1, c2); - mul_add_c(a[2], b[6], c3, c1, c2); - mul_add_c(a[1], b[7], c3, c1, c2); - r[8] = c3; - c3 = 0; - mul_add_c(a[2], b[7], c1, c2, c3); - mul_add_c(a[3], b[6], c1, c2, c3); - mul_add_c(a[4], b[5], c1, c2, c3); - mul_add_c(a[5], b[4], c1, c2, c3); - mul_add_c(a[6], b[3], c1, c2, c3); - mul_add_c(a[7], b[2], c1, c2, c3); - r[9] = c1; - c1 = 0; - mul_add_c(a[7], b[3], c2, c3, c1); - mul_add_c(a[6], b[4], c2, c3, c1); - mul_add_c(a[5], b[5], c2, c3, c1); - mul_add_c(a[4], b[6], c2, c3, c1); - mul_add_c(a[3], b[7], c2, c3, c1); - r[10] = c2; - c2 = 0; - mul_add_c(a[4], b[7], c3, c1, c2); - mul_add_c(a[5], b[6], c3, c1, c2); - mul_add_c(a[6], b[5], c3, c1, c2); - mul_add_c(a[7], b[4], c3, c1, c2); - r[11] = c3; - c3 = 0; - mul_add_c(a[7], b[5], c1, c2, c3); - mul_add_c(a[6], b[6], c1, c2, c3); - mul_add_c(a[5], b[7], c1, c2, c3); - r[12] = c1; - c1 = 0; - mul_add_c(a[6], b[7], c2, c3, c1); - mul_add_c(a[7], b[6], c2, c3, c1); - r[13] = c2; - c2 = 0; - mul_add_c(a[7], b[7], c3, c1, c2); - r[14] = c3; - r[15] = c1; + BN_ULONG c0, c1, c2; + + bn_mulw_addtw(a[0], b[0], 0, 0, 0, &c2, &c1, &r[0]); + + bn_mulw_addtw(a[0], b[1], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[0], c2, c1, c0, &c2, &c1, &r[1]); + + bn_mulw_addtw(a[2], b[0], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[0], b[2], c2, c1, c0, &c2, &c1, &r[2]); + + bn_mulw_addtw(a[0], b[3], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[0], c2, c1, c0, &c2, &c1, &r[3]); + + bn_mulw_addtw(a[4], b[0], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[3], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[0], b[4], c2, c1, c0, &c2, &c1, &r[4]); + + bn_mulw_addtw(a[0], b[5], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[4], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[3], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[4], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[0], c2, c1, c0, &c2, &c1, &r[5]); + + bn_mulw_addtw(a[6], b[0], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[4], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[3], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[4], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[5], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[0], b[6], c2, c1, c0, &c2, &c1, &r[6]); + + bn_mulw_addtw(a[0], b[7], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[6], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[5], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[4], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[4], b[3], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[6], b[1], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[7], b[0], c2, c1, c0, &c2, &c1, &r[7]); + + bn_mulw_addtw(a[7], b[1], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[6], b[2], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[3], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[4], b[4], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[5], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[2], b[6], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[1], b[7], c2, c1, c0, &c2, &c1, &r[8]); + + bn_mulw_addtw(a[2], b[7], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[6], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[4], b[5], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[4], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[6], b[3], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[7], b[2], c2, c1, c0, &c2, &c1, &r[9]); + + bn_mulw_addtw(a[7], b[3], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[6], b[4], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[5], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[4], b[6], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[3], b[7], c2, c1, c0, &c2, &c1, &r[10]); + + bn_mulw_addtw(a[4], b[7], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[6], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[6], b[5], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[7], b[4], c2, c1, c0, &c2, &c1, &r[11]); + + bn_mulw_addtw(a[7], b[5], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[6], b[6], c2, c1, c0, &c2, &c1, &c0); + bn_mulw_addtw(a[5], b[7], c2, c1, c0, &c2, &c1, &r[12]); + + bn_mulw_addtw(a[6], b[7], 0, c2, c1, &c2, &c1, &c0); + bn_mulw_addtw(a[7], b[6], c2, c1, c0, &c2, &c1, &r[13]); + + bn_mulw_addtw(a[7], b[7], 0, c2, c1, &c2, &r[15], &r[14]); } #endif +/* + * bn_mul_words() computes (carry:r[i]) = a[i] * w + carry, where a is an array + * of words and w is a single word. This should really be called bn_mulw_words() + * since only one input is an array. This is used as a step in the multiplication + * of word arrays. + */ #ifndef HAVE_BN_MUL_WORDS -#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) - BN_ULONG -bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) -{ - BN_ULONG c1 = 0; - - assert(num >= 0); - if (num <= 0) - return (c1); - -#ifndef OPENSSL_SMALL_FOOTPRINT - while (num & ~3) { - mul(rp[0], ap[0], w, c1); - mul(rp[1], ap[1], w, c1); - mul(rp[2], ap[2], w, c1); - mul(rp[3], ap[3], w, c1); - ap += 4; - rp += 4; - num -= 4; - } -#endif - while (num) { - mul(rp[0], ap[0], w, c1); - ap++; - rp++; - num--; - } - return (c1); -} -#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ - -BN_ULONG -bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) +bn_mul_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w) { BN_ULONG carry = 0; - BN_ULONG bl, bh; assert(num >= 0); if (num <= 0) - return ((BN_ULONG)0); - - bl = LBITS(w); - bh = HBITS(w); + return 0; #ifndef OPENSSL_SMALL_FOOTPRINT while (num & ~3) { - mul(rp[0], ap[0], bl, bh, carry); - mul(rp[1], ap[1], bl, bh, carry); - mul(rp[2], ap[2], bl, bh, carry); - mul(rp[3], ap[3], bl, bh, carry); - ap += 4; - rp += 4; + bn_mulw_addw(a[0], w, carry, &carry, &r[0]); + bn_mulw_addw(a[1], w, carry, &carry, &r[1]); + bn_mulw_addw(a[2], w, carry, &carry, &r[2]); + bn_mulw_addw(a[3], w, carry, &carry, &r[3]); + a += 4; + r += 4; num -= 4; } #endif while (num) { - mul(rp[0], ap[0], bl, bh, carry); - ap++; - rp++; + bn_mulw_addw(a[0], w, carry, &carry, &r[0]); + a++; + r++; num--; } - return (carry); + return carry; } -#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ #endif #if defined(OPENSSL_NO_ASM) || !defined(OPENSSL_BN_ASM_PART_WORDS) -- 2.20.1