From 5d702902492229c5da84ebeefd7524929d358611 Mon Sep 17 00:00:00 2001 From: jsing Date: Wed, 27 Mar 2024 12:42:30 +0000 Subject: [PATCH] Remove assembly for stitched modes. The stitched modes have been removed, so having assembly for them is of little use. --- lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl | 1237 -------------------- lib/libcrypto/arch/amd64/Makefile.inc | 5 +- lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | 515 -------- 3 files changed, 1 insertion(+), 1756 deletions(-) delete mode 100644 lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl delete mode 100644 lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl diff --git a/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl deleted file mode 100644 index 5eb5b7bf65f..00000000000 --- a/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl +++ /dev/null @@ -1,1237 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# June 2011 -# -# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled -# in http://download.intel.com/design/intarch/papers/323686.pdf, is -# that since AESNI-CBC encrypt exhibit *very* low instruction-level -# parallelism, interleaving it with another algorithm would allow to -# utilize processor resources better and achieve better performance. -# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and -# AESNI code is weaved into it. Below are performance numbers in -# cycles per processed byte, less is better, for standalone AESNI-CBC -# encrypt, sum of the latter and standalone SHA1, and "stitched" -# subroutine: -# -# AES-128-CBC +SHA1 stitch gain -# Westmere 3.77[+5.6] 9.37 6.65 +41% -# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) -# -# AES-192-CBC -# Westmere 4.51 10.11 6.97 +45% -# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) -# -# AES-256-CBC -# Westmere 5.25 10.85 7.25 +50% -# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) -# -# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for -# background information. Above numbers in parentheses are SSSE3 -# results collected on AVX-capable CPU, i.e. apply on OSes that -# don't support AVX. -# -# Needless to mention that it makes no sense to implement "stitched" -# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 -# fully utilize parallelism, so stitching would not give any gain -# anyway. Well, there might be some, e.g. because of better cache -# locality... For reference, here are performance results for -# standalone AESNI-CBC decrypt: -# -# AES-128-CBC AES-192-CBC AES-256-CBC -# Westmere 1.31 1.55 1.80 -# Sandy Bridge 0.93 1.06 1.22 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/ && - $1>=2.19); -$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && - $1>=2.09); -$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./ && - $1>=10); - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -# void aesni_cbc_sha1_enc(const void *inp, -# void *out, -# size_t length, -# const AES_KEY *key, -# unsigned char *iv, -# SHA_CTX *ctx, -# const void *in0); - -$code.=<<___; -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl aesni_cbc_sha1_enc -.type aesni_cbc_sha1_enc,\@abi-omnipotent -.align 16 -aesni_cbc_sha1_enc: - _CET_ENDBR - # caller should check for SSSE3 and AES-NI bits - mov OPENSSL_ia32cap_P+0(%rip),%r10d - mov OPENSSL_ia32cap_P+4(%rip),%r11d -___ -$code.=<<___ if ($avx); - and \$IA32CAP_MASK1_AVX,%r11d # mask AVX bit - and \$IA32CAP_MASK0_INTEL,%r10d # mask "Intel CPU" bit - or %r11d,%r10d - cmp \$(IA32CAP_MASK1_AVX|IA32CAP_MASK0_INTEL),%r10d - je aesni_cbc_sha1_enc_avx -___ -$code.=<<___; - jmp aesni_cbc_sha1_enc_ssse3 - ret -.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc -___ - -my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); - -my $Xi=4; -my @X=map("%xmm$_",(4..7,0..3)); -my @Tx=map("%xmm$_",(8..10)); -my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization -my @T=("%esi","%edi"); -my $j=0; my $jj=0; my $r=0; my $sn=0; -my $K_XX_XX="%r11"; -my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); -my @rndkey=("%xmm14","%xmm15"); - -sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - my $arg = pop; - $arg = "\$$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; -} - -my $_rol=sub { &rol(@_) }; -my $_ror=sub { &ror(@_) }; - -$code.=<<___; -.type aesni_cbc_sha1_enc_ssse3,\@function,6 -.align 16 -aesni_cbc_sha1_enc_ssse3: - _CET_ENDBR - mov `($win64?56:8)`(%rsp),$inp # load 7th argument - #shr \$6,$len # debugging artefact - #jz .Lepilogue_ssse3 # debugging artefact - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - lea `-104-($win64?10*16:0)`(%rsp),%rsp - #mov $in0,$inp # debugging artefact - #lea 64(%rsp),$ctx # debugging artefact -___ -$code.=<<___ if ($win64); - movaps %xmm6,96+0(%rsp) - movaps %xmm7,96+16(%rsp) - movaps %xmm8,96+32(%rsp) - movaps %xmm9,96+48(%rsp) - movaps %xmm10,96+64(%rsp) - movaps %xmm11,96+80(%rsp) - movaps %xmm12,96+96(%rsp) - movaps %xmm13,96+112(%rsp) - movaps %xmm14,96+128(%rsp) - movaps %xmm15,96+144(%rsp) -.Lprologue_ssse3: -___ -$code.=<<___; - mov $in0,%r12 # reassign arguments - mov $out,%r13 - mov $len,%r14 - mov $key,%r15 - movdqu ($ivp),$iv # load IV - mov $ivp,88(%rsp) # save $ivp -___ -my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments -my $rounds="${ivp}d"; -$code.=<<___; - shl \$6,$len - sub $in0,$out - mov 240($key),$rounds - add $inp,$len # end of input - - lea K_XX_XX(%rip),$K_XX_XX - mov 0($ctx),$A # load context - mov 4($ctx),$B - mov 8($ctx),$C - mov 12($ctx),$D - mov $B,@T[0] # magic seed - mov 16($ctx),$E - - movdqa 64($K_XX_XX),@X[2] # pbswap mask - movdqa 0($K_XX_XX),@Tx[1] # K_00_19 - movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] - movdqu 16($inp),@X[-3&7] - movdqu 32($inp),@X[-2&7] - movdqu 48($inp),@X[-1&7] - pshufb @X[2],@X[-4&7] # byte swap - add \$64,$inp - pshufb @X[2],@X[-3&7] - pshufb @X[2],@X[-2&7] - pshufb @X[2],@X[-1&7] - paddd @Tx[1],@X[-4&7] # add K_00_19 - paddd @Tx[1],@X[-3&7] - paddd @Tx[1],@X[-2&7] - movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU - psubd @Tx[1],@X[-4&7] # restore X[] - movdqa @X[-3&7],16(%rsp) - psubd @Tx[1],@X[-3&7] - movdqa @X[-2&7],32(%rsp) - psubd @Tx[1],@X[-2&7] - movups ($key),$rndkey0 # $key[0] - movups 16($key),$rndkey[0] # forward reference - jmp .Loop_ssse3 -___ - -my $aesenc=sub { - use integer; - my ($n,$k)=($r/10,$r%10); - if ($k==0) { - $code.=<<___; - movups `16*$n`($in0),$in # load input - xorps $rndkey0,$in -___ - $code.=<<___ if ($n); - movups $iv,`16*($n-1)`($out,$in0) # write output -___ - $code.=<<___; - xorps $in,$iv - aesenc $rndkey[0],$iv - movups `32+16*$k`($key),$rndkey[1] -___ - } elsif ($k==9) { - $sn++; - $code.=<<___; - cmp \$11,$rounds - jb .Laesenclast$sn - movups `32+16*($k+0)`($key),$rndkey[1] - aesenc $rndkey[0],$iv - movups `32+16*($k+1)`($key),$rndkey[0] - aesenc $rndkey[1],$iv - je .Laesenclast$sn - movups `32+16*($k+2)`($key),$rndkey[1] - aesenc $rndkey[0],$iv - movups `32+16*($k+3)`($key),$rndkey[0] - aesenc $rndkey[1],$iv -.Laesenclast$sn: - aesenclast $rndkey[0],$iv - movups 16($key),$rndkey[1] # forward reference -___ - } else { - $code.=<<___; - aesenc $rndkey[0],$iv - movups `32+16*$k`($key),$rndkey[1] -___ - } - $r++; unshift(@rndkey,pop(@rndkey)); -}; - -sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 40 instructions - my ($a,$b,$c,$d,$e); - - &movdqa (@X[0],@X[-3&7]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@Tx[0],@X[-1&7]); - &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" - eval(shift(@insns)); - eval(shift(@insns)); - - &paddd (@Tx[1],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - &psrldq (@Tx[0],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - &movdqa (@Tx[2],@X[0]); - &movdqa (@Tx[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword - &paddd (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &psrld (@Tx[0],31); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@Tx[1],@Tx[2]); - eval(shift(@insns)); - eval(shift(@insns)); - - &psrld (@Tx[2],30); - &por (@X[0],@Tx[0]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pslld (@Tx[1],2); - &pxor (@X[0],@Tx[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); -} - -sub Xupdate_ssse3_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions - my ($a,$b,$c,$d,$e); - - &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); - eval(shift(@insns)); # body_20_39 - &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - eval(shift(@insns)); - eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); - if ($Xi%5) { - &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... - } else { # ... or load next one - &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); - } - &paddd (@Tx[1],@X[-1&7]); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &movdqa (@Tx[0],@X[0]); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &pslld (@X[0],2); - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &psrld (@Tx[0],30); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &por (@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &movdqa (@Tx[1],@X[0]) if ($Xi<19); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); -} - -sub Xuplast_ssse3_80() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU - - foreach (@insns) { eval; } # remaining instructions - - &cmp ($inp,$len); - &je (".Ldone_ssse3"); - - unshift(@Tx,pop(@Tx)); - - &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask - &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 - &movdqu (@X[-4&7],"0($inp)"); # load input - &movdqu (@X[-3&7],"16($inp)"); - &movdqu (@X[-2&7],"32($inp)"); - &movdqu (@X[-1&7],"48($inp)"); - &pshufb (@X[-4&7],@X[2]); # byte swap - &add ($inp,64); - - $Xi=0; -} - -sub Xloop_ssse3() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &pshufb (@X[($Xi-3)&7],@X[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[($Xi-4)&7],@Tx[1]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - &psubd (@X[($Xi-4)&7],@Tx[1]); - - foreach (@insns) { eval; } - $Xi++; -} - -sub Xtail_ssse3() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - -sub body_00_19 () { - use integer; - my ($k,$n); - my @r=( - '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer - '&xor ($c,$d);', - '&mov (@T[1],$a);', # $b in next round - '&$_rol ($a,5);', - '&and (@T[0],$c);', # ($b&($c^$d)) - '&xor ($c,$d);', # restore $c - '&xor (@T[0],$d);', - '&add ($e,$a);', - '&$_ror ($b,$j?7:2);', # $b>>>2 - '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); - $n = scalar(@r); - $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds - @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); - $jj++; - return @r; -} - -sub body_20_39 () { - use integer; - my ($k,$n); - my @r=( - '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer - '&xor (@T[0],$d);', # ($b^$d) - '&mov (@T[1],$a);', # $b in next round - '&$_rol ($a,5);', - '&xor (@T[0],$c);', # ($b^$d^$c) - '&add ($e,$a);', - '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); - $n = scalar(@r); - $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds - @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); - $jj++; - return @r; -} - -sub body_40_59 () { - use integer; - my ($k,$n); - my @r=( - '($a,$b,$c,$d,$e)=@V;'. - '&mov (@T[1],$c);', - '&xor ($c,$d);', - '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer - '&and (@T[1],$d);', - '&and (@T[0],$c);', # ($b&($c^$d)) - '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[1]);', - '&mov (@T[1],$a);', # $b in next round - '&$_rol ($a,5);', - '&add ($e,@T[0]);', - '&xor ($c,$d);', # restore $c - '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); - $n = scalar(@r); - $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds - @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); - $jj++; - return @r; -} -$code.=<<___; -.align 16 -.Loop_ssse3: -___ - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_32_79(\&body_00_19); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" - - $saved_j=$j; @saved_V=@V; - $saved_r=$r; @saved_rndkey=@rndkey; - - &Xloop_ssse3(\&body_20_39); - &Xloop_ssse3(\&body_20_39); - &Xloop_ssse3(\&body_20_39); - -$code.=<<___; - movups $iv,48($out,$in0) # write output - lea 64($in0),$in0 - - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - add 12($ctx),$D - mov $A,0($ctx) - add 16($ctx),$E - mov @T[0],4($ctx) - mov @T[0],$B # magic seed - mov $C,8($ctx) - mov $D,12($ctx) - mov $E,16($ctx) - jmp .Loop_ssse3 - -.align 16 -.Ldone_ssse3: -___ - $jj=$j=$saved_j; @V=@saved_V; - $r=$saved_r; @rndkey=@saved_rndkey; - - &Xtail_ssse3(\&body_20_39); - &Xtail_ssse3(\&body_20_39); - &Xtail_ssse3(\&body_20_39); - -$code.=<<___; - movups $iv,48($out,$in0) # write output - mov 88(%rsp),$ivp # restore $ivp - - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - mov $A,0($ctx) - add 12($ctx),$D - mov @T[0],4($ctx) - add 16($ctx),$E - mov $C,8($ctx) - mov $D,12($ctx) - mov $E,16($ctx) - movups $iv,($ivp) # write IV -___ -$code.=<<___ if ($win64); - movaps 96+0(%rsp),%xmm6 - movaps 96+16(%rsp),%xmm7 - movaps 96+32(%rsp),%xmm8 - movaps 96+48(%rsp),%xmm9 - movaps 96+64(%rsp),%xmm10 - movaps 96+80(%rsp),%xmm11 - movaps 96+96(%rsp),%xmm12 - movaps 96+112(%rsp),%xmm13 - movaps 96+128(%rsp),%xmm14 - movaps 96+144(%rsp),%xmm15 -___ -$code.=<<___; - lea `104+($win64?10*16:0)`(%rsp),%rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lepilogue_ssse3: - ret -.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 -___ - -$j=$jj=$r=$sn=0; - -if ($avx) { -my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); - -my $Xi=4; -my @X=map("%xmm$_",(4..7,0..3)); -my @Tx=map("%xmm$_",(8..10)); -my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization -my @T=("%esi","%edi"); - -my $_rol=sub { &shld(@_[0],@_) }; -my $_ror=sub { &shrd(@_[0],@_) }; - -$code.=<<___; -.type aesni_cbc_sha1_enc_avx,\@function,6 -.align 16 -aesni_cbc_sha1_enc_avx: - _CET_ENDBR - mov `($win64?56:8)`(%rsp),$inp # load 7th argument - #shr \$6,$len # debugging artefact - #jz .Lepilogue_avx # debugging artefact - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - lea `-104-($win64?10*16:0)`(%rsp),%rsp - #mov $in0,$inp # debugging artefact - #lea 64(%rsp),$ctx # debugging artefact -___ -$code.=<<___ if ($win64); - movaps %xmm6,96+0(%rsp) - movaps %xmm7,96+16(%rsp) - movaps %xmm8,96+32(%rsp) - movaps %xmm9,96+48(%rsp) - movaps %xmm10,96+64(%rsp) - movaps %xmm11,96+80(%rsp) - movaps %xmm12,96+96(%rsp) - movaps %xmm13,96+112(%rsp) - movaps %xmm14,96+128(%rsp) - movaps %xmm15,96+144(%rsp) -.Lprologue_avx: -___ -$code.=<<___; - vzeroall - mov $in0,%r12 # reassign arguments - mov $out,%r13 - mov $len,%r14 - mov $key,%r15 - vmovdqu ($ivp),$iv # load IV - mov $ivp,88(%rsp) # save $ivp -___ -my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments -my $rounds="${ivp}d"; -$code.=<<___; - shl \$6,$len - sub $in0,$out - mov 240($key),$rounds - add \$112,$key # size optimization - add $inp,$len # end of input - - lea K_XX_XX(%rip),$K_XX_XX - mov 0($ctx),$A # load context - mov 4($ctx),$B - mov 8($ctx),$C - mov 12($ctx),$D - mov $B,@T[0] # magic seed - mov 16($ctx),$E - - vmovdqa 64($K_XX_XX),@X[2] # pbswap mask - vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 - vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] - vmovdqu 16($inp),@X[-3&7] - vmovdqu 32($inp),@X[-2&7] - vmovdqu 48($inp),@X[-1&7] - vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap - add \$64,$inp - vpshufb @X[2],@X[-3&7],@X[-3&7] - vpshufb @X[2],@X[-2&7],@X[-2&7] - vpshufb @X[2],@X[-1&7],@X[-1&7] - vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 - vpaddd @Tx[1],@X[-3&7],@X[1] - vpaddd @Tx[1],@X[-2&7],@X[2] - vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU - vmovdqa @X[1],16(%rsp) - vmovdqa @X[2],32(%rsp) - vmovups -112($key),$rndkey0 # $key[0] - vmovups 16-112($key),$rndkey[0] # forward reference - jmp .Loop_avx -___ - -my $aesenc=sub { - use integer; - my ($n,$k)=($r/10,$r%10); - if ($k==0) { - $code.=<<___; - vmovups `16*$n`($in0),$in # load input - vxorps $rndkey0,$in,$in -___ - $code.=<<___ if ($n); - vmovups $iv,`16*($n-1)`($out,$in0) # write output -___ - $code.=<<___; - vxorps $in,$iv,$iv - vaesenc $rndkey[0],$iv,$iv - vmovups `32+16*$k-112`($key),$rndkey[1] -___ - } elsif ($k==9) { - $sn++; - $code.=<<___; - cmp \$11,$rounds - jb .Lvaesenclast$sn - vaesenc $rndkey[0],$iv,$iv - vmovups `32+16*($k+0)-112`($key),$rndkey[1] - vaesenc $rndkey[1],$iv,$iv - vmovups `32+16*($k+1)-112`($key),$rndkey[0] - je .Lvaesenclast$sn - vaesenc $rndkey[0],$iv,$iv - vmovups `32+16*($k+2)-112`($key),$rndkey[1] - vaesenc $rndkey[1],$iv,$iv - vmovups `32+16*($k+3)-112`($key),$rndkey[0] -.Lvaesenclast$sn: - vaesenclast $rndkey[0],$iv,$iv - vmovups 16-112($key),$rndkey[1] # forward reference -___ - } else { - $code.=<<___; - vaesenc $rndkey[0],$iv,$iv - vmovups `32+16*$k-112`($key),$rndkey[1] -___ - } - $r++; unshift(@rndkey,pop(@rndkey)); -}; - -sub Xupdate_avx_16_31() # recall that $Xi starts with 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 40 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[0],@X[0],31); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword - &vpaddd (@X[0],@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[1],@Tx[2],30); - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); -} - -sub Xupdate_avx_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions - my ($a,$b,$c,$d,$e); - - &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - eval(shift(@insns)); - eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); - if ($Xi%5) { - &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... - } else { # ... or load next one - &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); - } - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &vpsrld (@Tx[0],@X[0],30); - &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpslld (@X[0],@X[0],2); - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); -} - -sub Xuplast_avx_80() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU - - foreach (@insns) { eval; } # remaining instructions - - &cmp ($inp,$len); - &je (".Ldone_avx"); - - unshift(@Tx,pop(@Tx)); - - &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask - &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 - &vmovdqu(@X[-4&7],"0($inp)"); # load input - &vmovdqu(@X[-3&7],"16($inp)"); - &vmovdqu(@X[-2&7],"32($inp)"); - &vmovdqu(@X[-1&7],"48($inp)"); - &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap - &add ($inp,64); - - $Xi=0; -} - -sub Xloop_avx() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - foreach (@insns) { eval; } - $Xi++; -} - -sub Xtail_avx() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - -$code.=<<___; -.align 16 -.Loop_avx: -___ - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_32_79(\&body_00_19); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_20_39); - &Xuplast_avx_80(\&body_20_39); # can jump to "done" - - $saved_j=$j; @saved_V=@V; - $saved_r=$r; @saved_rndkey=@rndkey; - - &Xloop_avx(\&body_20_39); - &Xloop_avx(\&body_20_39); - &Xloop_avx(\&body_20_39); - -$code.=<<___; - vmovups $iv,48($out,$in0) # write output - lea 64($in0),$in0 - - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - add 12($ctx),$D - mov $A,0($ctx) - add 16($ctx),$E - mov @T[0],4($ctx) - mov @T[0],$B # magic seed - mov $C,8($ctx) - mov $D,12($ctx) - mov $E,16($ctx) - jmp .Loop_avx - -.align 16 -.Ldone_avx: -___ - $jj=$j=$saved_j; @V=@saved_V; - $r=$saved_r; @rndkey=@saved_rndkey; - - &Xtail_avx(\&body_20_39); - &Xtail_avx(\&body_20_39); - &Xtail_avx(\&body_20_39); - -$code.=<<___; - vmovups $iv,48($out,$in0) # write output - mov 88(%rsp),$ivp # restore $ivp - - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - mov $A,0($ctx) - add 12($ctx),$D - mov @T[0],4($ctx) - add 16($ctx),$E - mov $C,8($ctx) - mov $D,12($ctx) - mov $E,16($ctx) - vmovups $iv,($ivp) # write IV - vzeroall -___ -$code.=<<___ if ($win64); - movaps 96+0(%rsp),%xmm6 - movaps 96+16(%rsp),%xmm7 - movaps 96+32(%rsp),%xmm8 - movaps 96+48(%rsp),%xmm9 - movaps 96+64(%rsp),%xmm10 - movaps 96+80(%rsp),%xmm11 - movaps 96+96(%rsp),%xmm12 - movaps 96+112(%rsp),%xmm13 - movaps 96+128(%rsp),%xmm14 - movaps 96+144(%rsp),%xmm15 -___ -$code.=<<___; - lea `104+($win64?10*16:0)`(%rsp),%rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lepilogue_avx: - ret -.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx -___ -} -$code.=<<___; -.section .rodata -.align 64 -K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask -.align 64 -.text -___ - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type ssse3_handler,\@abi-omnipotent -.align 16 -ssse3_handler: - _CET_ENDBR - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->RipRsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - lea 96(%rax),%rsi - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - lea `104+10*16`(%rax),%rax # adjust stack pointer - - mov 0(%rax),%r15 - mov 8(%rax),%r14 - mov 16(%rax),%r13 - mov 24(%rax),%r12 - mov 32(%rax),%rbp - mov 40(%rax),%rbx - lea 48(%rax),%rax - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size ssse3_handler,.-ssse3_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3 - .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3 - .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3 -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_aesni_cbc_sha1_enc_avx - .rva .LSEH_end_aesni_cbc_sha1_enc_avx - .rva .LSEH_info_aesni_cbc_sha1_enc_avx -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_aesni_cbc_sha1_enc_ssse3: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] -___ -$code.=<<___ if ($avx); -.LSEH_info_aesni_cbc_sha1_enc_avx: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] -___ -} - -#################################################################### -sub rex { - local *opcode=shift; - my ($dst,$src)=@_; - my $rex=0; - - $rex|=0x04 if($dst>=8); - $rex|=0x01 if($src>=8); - push @opcode,$rex|0x40 if($rex); -} - -$code =~ s/\`([^\`]*)\`/eval($1)/gem; - -print $code; -close STDOUT; diff --git a/lib/libcrypto/arch/amd64/Makefile.inc b/lib/libcrypto/arch/amd64/Makefile.inc index 878965f5665..ab10e43a143 100644 --- a/lib/libcrypto/arch/amd64/Makefile.inc +++ b/lib/libcrypto/arch/amd64/Makefile.inc @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile.inc,v 1.14 2024/03/27 11:12:08 jsing Exp $ +# $OpenBSD: Makefile.inc,v 1.15 2024/03/27 12:42:30 jsing Exp $ # amd64-specific libcrypto build rules @@ -13,7 +13,6 @@ SSLASM+= aes bsaes-x86_64 CFLAGS+= -DVPAES_ASM SSLASM+= aes vpaes-x86_64 SSLASM+= aes aesni-x86_64 -SSLASM+= aes aesni-sha1-x86_64 # bn CFLAGS+= -DOPENSSL_IA32_SSE2 CFLAGS+= -DRSA_ASM @@ -49,9 +48,7 @@ SSLASM+= md5 md5-x86_64 CFLAGS+= -DGHASH_ASM SSLASM+= modes ghash-x86_64 # rc4 -CFLAGS+= -DRC4_MD5_ASM SSLASM+= rc4 rc4-x86_64 -SSLASM+= rc4 rc4-md5-x86_64 # ripemd # sha CFLAGS+= -DSHA1_ASM diff --git a/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl deleted file mode 100644 index e5e8aa08a19..00000000000 --- a/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl +++ /dev/null @@ -1,515 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# June 2011 -# -# This is RC4+MD5 "stitch" implementation. The idea, as spelled in -# http://download.intel.com/design/intarch/papers/323686.pdf, is that -# since both algorithms exhibit instruction-level parallelism, ILP, -# below theoretical maximum, interleaving them would allow to utilize -# processor resources better and achieve better performance. RC4 -# instruction sequence is virtually identical to rc4-x86_64.pl, which -# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin -# and Jim Guilford of Intel. MD5 is fresh implementation aiming to -# minimize register usage, which was used as "main thread" with RC4 -# weaved into it, one RC4 round per one MD5 round. In addition to the -# stiched subroutine the script can generate standalone replacement -# md5_block_asm_data_order and RC4. Below are performance numbers in -# cycles per processed byte, less is better, for these the standalone -# subroutines, sum of them, and stitched one: -# -# RC4 MD5 RC4+MD5 stitch gain -# Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) -# Core2 6.5 5.8 12.3 7.7 +60% -# Westmere 4.3 5.2 9.5 7.0 +36% -# Sandy Bridge 4.2 5.5 9.7 6.8 +43% -# Atom 9.3 6.5 15.8 11.1 +42% -# -# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement -# is +53%... - -my ($rc4,$md5)=(1,1); # what to generate? -my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(), - # but its result is discarded. Idea here is - # to be able to use 'openssl speed rc4' for - # benchmarking the stitched subroutine... - -my $flavour = shift; -my $output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); - -if ($rc4 && !$md5) { - ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); - $func="RC4"; $nargs=4; -} elsif ($md5 && !$rc4) { - ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); - $func="md5_block_asm_data_order"; $nargs=3; -} else { - ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); - $func="rc4_md5_enc"; $nargs=6; - # void rc4_md5_enc( - # RC4_KEY *key, # - # const void *in0, # RC4 input - # void *out, # RC4 output - # MD5_CTX *ctx, # - # const void *inp, # MD5 input - # size_t len); # number of 64-byte blocks -} - -my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, - 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, - 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, - 0x6b901122,0xfd987193,0xa679438e,0x49b40821, - - 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, - 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, - 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, - 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, - - 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, - 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, - 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, - 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, - - 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, - 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, - 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, - 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); - -my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers -my $tmp="%r12d"; - -my @XX=("%rbp","%rsi"); # RC4 registers -my @TX=("%rax","%rbx"); -my $YY="%rcx"; -my $TY="%rdx"; - -my $MOD=32; # 16, 32 or 64 - -$code.=<<___; -.text -.align 16 - -.globl $func -.type $func,\@function,$nargs -$func: - _CET_ENDBR - cmp \$0,$len - je .Labort - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - sub \$40,%rsp -.Lbody: -___ -if ($rc4) { -$code.=<<___; -$D#md5# mov $ctx,%r11 # reassign arguments - mov $len,%r12 - mov $in0,%r13 - mov $out,%r14 -$D#md5# mov $inp,%r15 -___ - $ctx="%r11" if ($md5); # reassign arguments - $len="%r12"; - $in0="%r13"; - $out="%r14"; - $inp="%r15" if ($md5); - $inp=$in0 if (!$md5); -$code.=<<___; - xor $XX[0],$XX[0] - xor $YY,$YY - - lea 8($dat),$dat - mov -8($dat),$XX[0]#b - mov -4($dat),$YY#b - - inc $XX[0]#b - sub $in0,$out - movl ($dat,$XX[0],4),$TX[0]#d -___ -$code.=<<___ if (!$md5); - xor $TX[1],$TX[1] - test \$-128,$len - jz .Loop1 - sub $XX[0],$TX[1] - and \$`$MOD-1`,$TX[1] - jz .Loop${MOD}_is_hot - sub $TX[1],$len -.Loop${MOD}_warmup: - add $TX[0]#b,$YY#b - movl ($dat,$YY,4),$TY#d - movl $TX[0]#d,($dat,$YY,4) - movl $TY#d,($dat,$XX[0],4) - add $TY#b,$TX[0]#b - inc $XX[0]#b - movl ($dat,$TX[0],4),$TY#d - movl ($dat,$XX[0],4),$TX[0]#d - xorb ($in0),$TY#b - movb $TY#b,($out,$in0) - lea 1($in0),$in0 - dec $TX[1] - jnz .Loop${MOD}_warmup - - mov $YY,$TX[1] - xor $YY,$YY - mov $TX[1]#b,$YY#b - -.Loop${MOD}_is_hot: - mov $len,32(%rsp) # save original $len - shr \$6,$len # number of 64-byte blocks -___ - if ($D && !$md5) { # stitch in dummy MD5 - $md5=1; - $ctx="%r11"; - $inp="%r15"; - $code.=<<___; - mov %rsp,$ctx - mov $in0,$inp -___ - } -} -$code.=<<___; -#rc4# add $TX[0]#b,$YY#b -#rc4# lea ($dat,$XX[0],4),$XX[1] - shl \$6,$len - add $inp,$len # pointer to the end of input - mov $len,16(%rsp) - -#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX -#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX -#md5# mov 1*4($ctx),$V[1] -#md5# mov 2*4($ctx),$V[2] -#md5# mov 3*4($ctx),$V[3] - jmp .Loop - -.align 16 -.Loop: -#md5# mov $V[0],0*4(%rsp) # put aside current hash value -#md5# mov $V[1],1*4(%rsp) -#md5# mov $V[2],2*4(%rsp) -#md5# mov $V[3],$tmp # forward reference -#md5# mov $V[3],3*4(%rsp) -___ - -sub R0 { - my ($i,$a,$b,$c,$d)=@_; - my @rot0=(7,12,17,22); - my $j=$i%16; - my $k=$i%$MOD; - my $xmm="%xmm".($j&1); - $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); - $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); - $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); - $code.=<<___; -#rc4# movl ($dat,$YY,4),$TY#d -#md5# xor $c,$tmp -#rc4# movl $TX[0]#d,($dat,$YY,4) -#md5# and $b,$tmp -#md5# add 4*`$j`($inp),$a -#rc4# add $TY#b,$TX[0]#b -#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d -#md5# add \$$K[$i],$a -#md5# xor $d,$tmp -#rc4# movz $TX[0]#b,$TX[0]#d -#rc4# movl $TY#d,4*$k($XX[1]) -#md5# add $tmp,$a -#rc4# add $TX[1]#b,$YY#b -#md5# rol \$$rot0[$j%4],$a -#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference -#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n -#md5# add $b,$a -___ - $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); - mov $YY,$XX[1] - xor $YY,$YY # keyword to partial register - mov $XX[1]#b,$YY#b - lea ($dat,$XX[0],4),$XX[1] -___ - $code.=<<___ if ($rc4 && $j==15); - psllq \$8,%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm1,%xmm2 -___ -} -sub R1 { - my ($i,$a,$b,$c,$d)=@_; - my @rot1=(5,9,14,20); - my $j=$i%16; - my $k=$i%$MOD; - my $xmm="%xmm".($j&1); - $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); - $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); - $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); - $code.=<<___; -#rc4# movl ($dat,$YY,4),$TY#d -#md5# xor $b,$tmp -#rc4# movl $TX[0]#d,($dat,$YY,4) -#md5# and $d,$tmp -#md5# add 4*`((1+5*$j)%16)`($inp),$a -#rc4# add $TY#b,$TX[0]#b -#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d -#md5# add \$$K[$i],$a -#md5# xor $c,$tmp -#rc4# movz $TX[0]#b,$TX[0]#d -#rc4# movl $TY#d,4*$k($XX[1]) -#md5# add $tmp,$a -#rc4# add $TX[1]#b,$YY#b -#md5# rol \$$rot1[$j%4],$a -#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference -#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n -#md5# add $b,$a -___ - $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); - mov $YY,$XX[1] - xor $YY,$YY # keyword to partial register - mov $XX[1]#b,$YY#b - lea ($dat,$XX[0],4),$XX[1] -___ - $code.=<<___ if ($rc4 && $j==15); - psllq \$8,%xmm1 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 -___ -} -sub R2 { - my ($i,$a,$b,$c,$d)=@_; - my @rot2=(4,11,16,23); - my $j=$i%16; - my $k=$i%$MOD; - my $xmm="%xmm".($j&1); - $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); - $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); - $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); - $code.=<<___; -#rc4# movl ($dat,$YY,4),$TY#d -#md5# xor $c,$tmp -#rc4# movl $TX[0]#d,($dat,$YY,4) -#md5# xor $b,$tmp -#md5# add 4*`((5+3*$j)%16)`($inp),$a -#rc4# add $TY#b,$TX[0]#b -#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d -#md5# add \$$K[$i],$a -#rc4# movz $TX[0]#b,$TX[0]#d -#md5# add $tmp,$a -#rc4# movl $TY#d,4*$k($XX[1]) -#rc4# add $TX[1]#b,$YY#b -#md5# rol \$$rot2[$j%4],$a -#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference -#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n -#md5# add $b,$a -___ - $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); - mov $YY,$XX[1] - xor $YY,$YY # keyword to partial register - mov $XX[1]#b,$YY#b - lea ($dat,$XX[0],4),$XX[1] -___ - $code.=<<___ if ($rc4 && $j==15); - psllq \$8,%xmm1 - pxor %xmm0,%xmm4 - pxor %xmm1,%xmm4 -___ -} -sub R3 { - my ($i,$a,$b,$c,$d)=@_; - my @rot3=(6,10,15,21); - my $j=$i%16; - my $k=$i%$MOD; - my $xmm="%xmm".($j&1); - $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); - $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); - $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); - $code.=<<___; -#rc4# movl ($dat,$YY,4),$TY#d -#md5# xor $d,$tmp -#rc4# movl $TX[0]#d,($dat,$YY,4) -#md5# or $b,$tmp -#md5# add 4*`((7*$j)%16)`($inp),$a -#rc4# add $TY#b,$TX[0]#b -#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d -#md5# add \$$K[$i],$a -#rc4# movz $TX[0]#b,$TX[0]#d -#md5# xor $c,$tmp -#rc4# movl $TY#d,4*$k($XX[1]) -#md5# add $tmp,$a -#rc4# add $TX[1]#b,$YY#b -#md5# rol \$$rot3[$j%4],$a -#md5# mov \$-1,$tmp # forward reference -#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n -#md5# add $b,$a -___ - $code.=<<___ if ($rc4 && $j==15); - mov $XX[0],$XX[1] - xor $XX[0],$XX[0] # keyword to partial register - mov $XX[1]#b,$XX[0]#b - mov $YY,$XX[1] - xor $YY,$YY # keyword to partial register - mov $XX[1]#b,$YY#b - lea ($dat,$XX[0],4),$XX[1] - psllq \$8,%xmm1 - pxor %xmm0,%xmm5 - pxor %xmm1,%xmm5 -___ -} - -my $i=0; -for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } -for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } -for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } -for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } - -$code.=<<___; -#md5# add 0*4(%rsp),$V[0] # accumulate hash value -#md5# add 1*4(%rsp),$V[1] -#md5# add 2*4(%rsp),$V[2] -#md5# add 3*4(%rsp),$V[3] - -#rc4# movdqu %xmm2,($out,$in0) # write RC4 output -#rc4# movdqu %xmm3,16($out,$in0) -#rc4# movdqu %xmm4,32($out,$in0) -#rc4# movdqu %xmm5,48($out,$in0) -#md5# lea 64($inp),$inp -#rc4# lea 64($in0),$in0 - cmp 16(%rsp),$inp # are we done? - jb .Loop - -#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX -#rc4# sub $TX[0]#b,$YY#b # correct $YY -#md5# mov $V[0],0*4($len) # write MD5_CTX -#md5# mov $V[1],1*4($len) -#md5# mov $V[2],2*4($len) -#md5# mov $V[3],3*4($len) -___ -$code.=<<___ if ($rc4 && (!$md5 || $D)); - mov 32(%rsp),$len # restore original $len - and \$63,$len # remaining bytes - jnz .Loop1 - jmp .Ldone - -.align 16 -.Loop1: - add $TX[0]#b,$YY#b - movl ($dat,$YY,4),$TY#d - movl $TX[0]#d,($dat,$YY,4) - movl $TY#d,($dat,$XX[0],4) - add $TY#b,$TX[0]#b - inc $XX[0]#b - movl ($dat,$TX[0],4),$TY#d - movl ($dat,$XX[0],4),$TX[0]#d - xorb ($in0),$TY#b - movb $TY#b,($out,$in0) - lea 1($in0),$in0 - dec $len - jnz .Loop1 - -.Ldone: -___ -$code.=<<___; -#rc4# sub \$1,$XX[0]#b -#rc4# movl $XX[0]#d,-8($dat) -#rc4# movl $YY#d,-4($dat) - - mov 40(%rsp),%r15 - mov 48(%rsp),%r14 - mov 56(%rsp),%r13 - mov 64(%rsp),%r12 - mov 72(%rsp),%rbp - mov 80(%rsp),%rbx - lea 88(%rsp),%rsp -.Lepilogue: -.Labort: - ret -.size $func,.-$func -___ - -if ($rc4 && $D) { # sole purpose of this section is to provide - # option to use the generated module as drop-in - # replacement for rc4-x86_64.pl for debugging - # and testing purposes... -my ($idx,$ido)=("%r8","%r9"); -my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); - -$code.=<<___; -.globl RC4_set_key -.type RC4_set_key,\@function,3 -.align 16 -RC4_set_key: - _CET_ENDBR - lea 8($dat),$dat - lea ($inp,$len),$inp - neg $len - mov $len,%rcx - xor %eax,%eax - xor $ido,$ido - xor %r10,%r10 - xor %r11,%r11 - jmp .Lw1stloop - -.align 16 -.Lw1stloop: - mov %eax,($dat,%rax,4) - add \$1,%al - jnc .Lw1stloop - - xor $ido,$ido - xor $idx,$idx -.align 16 -.Lw2ndloop: - mov ($dat,$ido,4),%r10d - add ($inp,$len,1),$idx#b - add %r10b,$idx#b - add \$1,$len - mov ($dat,$idx,4),%r11d - cmovz %rcx,$len - mov %r10d,($dat,$idx,4) - mov %r11d,($dat,$ido,4) - add \$1,$ido#b - jnc .Lw2ndloop - - xor %eax,%eax - mov %eax,-8($dat) - mov %eax,-4($dat) - ret -.size RC4_set_key,.-RC4_set_key -___ -} - -sub reg_part { -my ($reg,$conv)=@_; - if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } - elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } - elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } - elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } - return $reg; -} - -$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/pinsrw\s+\$0,/movd /gm; - -$code =~ s/#md5#//gm if ($md5); -$code =~ s/#rc4#//gm if ($rc4); - -print $code; - -close STDOUT; -- 2.20.1