These just create clutter and cause grep noise.
ok miod@
+++ /dev/null
-;
-; PA-RISC 2.0 implementation of bn_asm code, based on the
-; 64-bit version of the code. This code is effectively the
-; same as the 64-bit version except the register model is
-; slightly different given all values must be 32-bit between
-; function calls. Thus the 64-bit return values are returned
-; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
-;
-;
-; This code is approximately 2x faster than the C version
-; for RSA/DSA.
-;
-; See http://devresource.hp.com/ for more details on the PA-RISC
-; architecture. Also see the book "PA-RISC 2.0 Architecture"
-; by Gerry Kane for information on the instruction set architecture.
-;
-; Code written by Chris Ruemmler (with some help from the HP C
-; compiler).
-;
-; The code compiles with HP's assembler
-;
-
- .level 2.0N
- .space $TEXT$
- .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
-
-;
-; Global Register definitions used for the routines.
-;
-; Some information about HP's runtime architecture for 32-bits.
-;
-; "Caller save" means the calling function must save the register
-; if it wants the register to be preserved.
-; "Callee save" means if a function uses the register, it must save
-; the value before using it.
-;
-; For the floating point registers
-;
-; "caller save" registers: fr4-fr11, fr22-fr31
-; "callee save" registers: fr12-fr21
-; "special" registers: fr0-fr3 (status and exception registers)
-;
-; For the integer registers
-; value zero : r0
-; "caller save" registers: r1,r19-r26
-; "callee save" registers: r3-r18
-; return register : r2 (rp)
-; return values ; r28,r29 (ret0,ret1)
-; Stack pointer ; r30 (sp)
-; millicode return ptr ; r31 (also a caller save register)
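For reference, a minimal illustrative C sketch of how a 64-bit result is handed back through that %ret0/%ret1 pair under the 32-bit runtime (editor illustration, not part of this file; the helper name is made up):

	#include <stdint.h>

	/* High 32 bits go to %ret0, low 32 bits to %ret1, matching the
	 * "EXTRD,U %ret1,31,32,%ret0" performed on each exit path below. */
	static void
	split_ret64(uint64_t result, uint32_t *ret0, uint32_t *ret1)
	{
		*ret0 = (uint32_t)(result >> 32);
		*ret1 = (uint32_t)result;
	}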
-
-
-;
-; Arguments to the routines
-;
-r_ptr .reg %r26
-a_ptr .reg %r25
-b_ptr .reg %r24
-num .reg %r24
-n .reg %r23
-
-;
-; Note that the "w" argument for bn_mul_add_words and bn_mul_words
-; is passed on the stack at a delta of -56 from the top of stack
-; as the routine is entered.
-;
-
-;
-; Globals used in some routines
-;
-
-top_overflow .reg %r23
-high_mask .reg %r22 ; value 0xffffffff80000000L
-
-
-;------------------------------------------------------------------------------
-;
-; bn_mul_add_words
-;
-;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
-; int num, BN_ULONG w)
-;
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = num
-; -56(sp) = w
-;
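For orientation, what this routine computes, as a minimal C sketch (editor illustration, not from this file; the function name is made up and a compiler supporting unsigned __int128 is assumed, whereas the assembly below builds the 64x64 product from four 32x32 XMPYU multiplies):

	#include <stdint.h>

	typedef uint64_t BN_ULONG;

	/* rp[i] = rp[i] + ap[i]*w + carry for i in [0, num), returning the
	 * final carry word. */
	static BN_ULONG
	bn_mul_add_words_sketch(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
	{
		BN_ULONG carry = 0;

		while (num-- > 0) {
			unsigned __int128 t = (unsigned __int128)*ap++ * w;

			t += *rp;			/* add existing rp word */
			t += carry;			/* add carry from previous word */
			*rp++ = (BN_ULONG)t;		/* low 64 bits back to rp */
			carry = (BN_ULONG)(t >> 64);	/* high 64 bits carry on */
		}
		return carry;
	}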
-; Local register definitions
-;
-
-fm1 .reg %fr22
-fm .reg %fr23
-ht_temp .reg %fr24
-ht_temp_1 .reg %fr25
-lt_temp .reg %fr26
-lt_temp_1 .reg %fr27
-fm1_1 .reg %fr28
-fm_1 .reg %fr29
-
-fw_h .reg %fr7L
-fw_l .reg %fr7R
-fw .reg %fr7
-
-fht_0 .reg %fr8L
-flt_0 .reg %fr8R
-t_float_0 .reg %fr8
-
-fht_1 .reg %fr9L
-flt_1 .reg %fr9R
-t_float_1 .reg %fr9
-
-tmp_0 .reg %r31
-tmp_1 .reg %r21
-m_0 .reg %r20
-m_1 .reg %r19
-ht_0 .reg %r1
-ht_1 .reg %r3
-lt_0 .reg %r4
-lt_1 .reg %r5
-m1_0 .reg %r6
-m1_1 .reg %r7
-rp_val .reg %r8
-rp_val_1 .reg %r9
-
-bn_mul_add_words
- .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
- .proc
- .callinfo frame=128
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP ; Needed to make the loop 16-byte aligned
- NOP ; needed to make the loop 16-byte aligned
-
- STD %r5,16(%sp) ; save r5
- NOP
- STD %r6,24(%sp) ; save r6
- STD %r7,32(%sp) ; save r7
-
- STD %r8,40(%sp) ; save r8
- STD %r9,48(%sp) ; save r9
- COPY %r0,%ret1 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
-
- CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; The loop is unrolled twice, so if there is only 1 number
- ; then go straight to the cleanup code.
- ;
- CMPIB,= 1,num,bn_mul_add_words_single_top
- FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
-	; two 32-bit multiplies can be issued per cycle.
- ;
-bn_mul_add_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDD 8(r_ptr),rp_val_1 ; rp[1]
-
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
-
- XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m[0]
- FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
-
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
-
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
-
- LDD -8(%sp),m_0 ; m[0]
- LDD -40(%sp),m_1 ; m[1]
- LDD -16(%sp),m1_0 ; m1[0]
- LDD -48(%sp),m1_1 ; m1[1]
-
- LDD -24(%sp),ht_0 ; ht[0]
- LDD -56(%sp),ht_1 ; ht[1]
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
-
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
- ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
-
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
- ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
- EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
-
- EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
- ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
- ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
-
- ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
-
- ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
-
- LDO -2(num),num ; num = num - 2;
- ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
-
- ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
- ADD,DC ht_1,%r0,%ret1 ; ht[1]++
- LDO 16(a_ptr),a_ptr ; a_ptr += 2
-
- STD lt_1,8(r_ptr) ; rp[1] = lt[1]
- CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
- LDO 16(r_ptr),r_ptr ; r_ptr += 2
-
- CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_add_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDO 8(a_ptr),a_ptr ; a_ptr++
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0 ; m1 = temp1
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
- ADD,DC ht_0,%r0,%ret1 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_add_words_exit
- .EXIT
-
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- LDD -80(%sp),%r9 ; restore r9
- LDD -88(%sp),%r8 ; restore r8
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = num
-; w on stack at -56(sp)
-
-bn_mul_words
- .proc
- .callinfo frame=128
- .entry
- .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP
- STD %r5,16(%sp) ; save r5
-
- STD %r6,24(%sp) ; save r6
- STD %r7,32(%sp) ; save r7
- COPY %r0,%ret1 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
-
- CMPIB,>= 0,num,bn_mul_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; See if only 1 word to do, thus just do cleanup
- ;
- CMPIB,= 1,num,bn_mul_words_single_top
- FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
-	; two 32-bit multiplies can be issued per cycle.
- ;
-bn_mul_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
-
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
-
- FSTD fm,-8(%sp) ; -8(sp) = m
- FSTD fm_1,-40(%sp) ; -40(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
-
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
-
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
- LDD -8(%sp),m_0
- LDD -40(%sp),m_1
-
- LDD -16(%sp),m1_0
- LDD -48(%sp),m1_1
- LDD -24(%sp),ht_0
- LDD -56(%sp),ht_1
-
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
-
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
- ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
- EXTRD,U tmp_1,31,32,m_1 ; m>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
- ADD,DC ht_1,%r0,ht_1 ; ht++
- ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
- ADD,DC ht_1,%r0,ht_1 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD lt_1,8(r_ptr) ; rp[1] = lt
-
- COPY ht_1,%ret1 ; carry = ht
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_mul_words_unroll2
- LDO 16(r_ptr),r_ptr ; rp++
-
- CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD %ret1,lt_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- COPY ht_0,%ret1 ; copy carry
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_words_exit
- .EXIT
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND
-
-;----------------------------------------------------------------------------
-;
-;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = num
-;
-
-bn_sqr_words
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP
- STD %r5,16(%sp) ; save r5
-
- CMPIB,>= 0,num,bn_sqr_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
-	; If only 1, then go straight to cleanup
- ;
- CMPIB,= 1,num,bn_sqr_words_single_top
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-
-bn_sqr_words_unroll2
- FLDD 0(a_ptr),t_float_0 ; a[0]
- FLDD 8(a_ptr),t_float_1 ; a[1]
- XMPYU fht_0,flt_0,fm ; m[0]
- XMPYU fht_1,flt_1,fm_1 ; m[1]
-
- FSTD fm,-24(%sp) ; store m[0]
- FSTD fm_1,-56(%sp) ; store m[1]
- XMPYU flt_0,flt_0,lt_temp ; lt[0]
- XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
-
- FSTD lt_temp,-16(%sp) ; store lt[0]
- FSTD lt_temp_1,-48(%sp) ; store lt[1]
- XMPYU fht_0,fht_0,ht_temp ; ht[0]
- XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
-
- FSTD ht_temp,-8(%sp) ; store ht[0]
- FSTD ht_temp_1,-40(%sp) ; store ht[1]
- LDD -24(%sp),m_0
- LDD -56(%sp),m_1
-
- AND m_0,high_mask,tmp_0 ; m[0] & Mask
- AND m_1,high_mask,tmp_1 ; m[1] & Mask
- DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
- DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
-
- LDD -16(%sp),lt_0
- LDD -48(%sp),lt_1
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
- EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
-
- LDD -8(%sp),ht_0
- LDD -40(%sp),ht_1
- ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
- ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
-
- ADD lt_0,m_0,lt_0 ; lt = lt+m
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
-	STD	ht_0,8(r_ptr)         ; rp[1] = ht[0]
-
- ADD lt_1,m_1,lt_1 ; lt = lt+m
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_1,16(r_ptr) ; rp[2] = lt[1]
- STD ht_1,24(r_ptr) ; rp[3] = ht[1]
-
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_sqr_words_unroll2
- LDO 32(r_ptr),r_ptr ; rp += 4
-
- CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_sqr_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,flt_0,fm ; m
- FSTD fm,-24(%sp) ; store m
-
- XMPYU flt_0,flt_0,lt_temp ; lt
- FSTD lt_temp,-16(%sp) ; store lt
-
- XMPYU fht_0,fht_0,ht_temp ; ht
- FSTD ht_temp,-8(%sp) ; store ht
-
- LDD -24(%sp),m_0 ; load m
- AND m_0,high_mask,tmp_0 ; m & Mask
- DEPD,Z m_0,30,31,m_0 ; m << 32+1
- LDD -16(%sp),lt_0 ; lt
-
- LDD -8(%sp),ht_0 ; ht
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
- ADD m_0,lt_0,lt_0 ; lt = lt+m
- ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD ht_0,8(r_ptr) ; rp[1] = ht
-
-bn_sqr_words_exit
- .EXIT
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t .reg %r22
-b .reg %r21
-l .reg %r20
-
-bn_add_words
- .proc
- .entry
- .callinfo
- .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- CMPIB,>= 0,n,bn_add_words_exit
- COPY %r0,%ret1 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_add_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_add_words_unroll2
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
- ADD t,%ret1,t ; t = t+c;
- ADD,DC %r0,%r0,%ret1 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret1,%r0,%ret1 ; c+= carry
- STD l,0(r_ptr)
-
- LDD 8(a_ptr),t
- LDD 8(b_ptr),b
- ADD t,%ret1,t ; t = t+c;
- ADD,DC %r0,%r0,%ret1 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret1,%r0,%ret1 ; c+= carry
- STD l,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_add_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
-
-bn_add_words_single_top
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
-
- ADD t,%ret1,t ; t = t+c;
- ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret1,%r0,%ret1 ; c+= carry
- STD l,0(r_ptr)
-
-bn_add_words_exit
- .EXIT
- BVE (%rp)
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t1 .reg %r22
-t2 .reg %r21
-sub_tmp1 .reg %r20
-sub_tmp2 .reg %r19
-
-
-bn_sub_words
- .proc
- .callinfo
- .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- CMPIB,>= 0,n,bn_sub_words_exit
- COPY %r0,%ret1 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_sub_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_sub_words_unroll2
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
-
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret1
- STD sub_tmp1,0(r_ptr)
-
- LDD 8(a_ptr),t1
- LDD 8(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret1
- STD sub_tmp1,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_sub_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
-
-bn_sub_words_single_top
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret1
-
- STD sub_tmp1,0(r_ptr)
-
-bn_sub_words_exit
- .EXIT
- BVE (%rp)
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;------------------------------------------------------------------------------
-;
-; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
-;
-; arg0 = h
-; arg1 = l
-; arg2 = d
-;
-; This is mainly just output from the HP C compiler.
-;
-;------------------------------------------------------------------------------
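For orientation, the quotient this routine produces, as a minimal C sketch (editor illustration, not from this file; assumes unsigned __int128 and that h < d so the quotient fits in one word):

	#include <stdint.h>

	typedef uint64_t BN_ULONG;

	/* Divide the 128-bit value (h:l) by d and return the 64-bit quotient. */
	static BN_ULONG
	bn_div_words_sketch(BN_ULONG h, BN_ULONG l, BN_ULONG d)
	{
		unsigned __int128 n = ((unsigned __int128)h << 64) | l;

		return (BN_ULONG)(n / d);
	}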
-bn_div_words
- .PROC
- .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
- .IMPORT BN_num_bits_word,CODE
- ;--- not PIC .IMPORT __iob,DATA
- ;--- not PIC .IMPORT fprintf,CODE
- .IMPORT abort,CODE
- .IMPORT $$div2U,MILLICODE
- .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
- .ENTRY
- STW %r2,-20(%r30) ;offset 0x8ec
- STW,MA %r3,192(%r30) ;offset 0x8f0
- STW %r4,-188(%r30) ;offset 0x8f4
- DEPD %r5,31,32,%r6 ;offset 0x8f8
- STD %r6,-184(%r30) ;offset 0x8fc
- DEPD %r7,31,32,%r8 ;offset 0x900
- STD %r8,-176(%r30) ;offset 0x904
- STW %r9,-168(%r30) ;offset 0x908
- LDD -248(%r30),%r3 ;offset 0x90c
- COPY %r26,%r4 ;offset 0x910
- COPY %r24,%r5 ;offset 0x914
- DEPD %r25,31,32,%r4 ;offset 0x918
- CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
- DEPD %r23,31,32,%r5 ;offset 0x920
- MOVIB,TR -1,%r29,$00060002 ;offset 0x924
- EXTRD,U %r29,31,32,%r28 ;offset 0x928
-$0006002A
- LDO -1(%r29),%r29 ;offset 0x92c
- SUB %r23,%r7,%r23 ;offset 0x930
-$00060024
- SUB %r4,%r31,%r25 ;offset 0x934
- AND %r25,%r19,%r26 ;offset 0x938
- CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
- DEPD,Z %r25,31,32,%r20 ;offset 0x940
- OR %r20,%r24,%r21 ;offset 0x944
- CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
- SUB %r31,%r2,%r31 ;offset 0x94c
-$00060046
-$0006002E
- DEPD,Z %r23,31,32,%r25 ;offset 0x950
- EXTRD,U %r23,31,32,%r26 ;offset 0x954
- AND %r25,%r19,%r24 ;offset 0x958
- ADD,L %r31,%r26,%r31 ;offset 0x95c
- CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
- LDO 1(%r31),%r31 ;offset 0x964
-$00060032
- CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
- LDO -1(%r29),%r29 ;offset 0x96c
- ADD,L %r4,%r3,%r4 ;offset 0x970
-$00060036
- ADDIB,=,N -1,%r8,$D0 ;offset 0x974
- SUB %r5,%r24,%r28 ;offset 0x978
-$0006003A
- SUB %r4,%r31,%r24 ;offset 0x97c
- SHRPD %r24,%r28,32,%r4 ;offset 0x980
- DEPD,Z %r29,31,32,%r9 ;offset 0x984
- DEPD,Z %r28,31,32,%r5 ;offset 0x988
-$0006001C
- EXTRD,U %r4,31,32,%r31 ;offset 0x98c
- CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
- MOVB,TR %r6,%r29,$D1 ;offset 0x994
- STD %r29,-152(%r30) ;offset 0x998
-$0006000C
- EXTRD,U %r3,31,32,%r25 ;offset 0x99c
- COPY %r3,%r26 ;offset 0x9a0
- EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
- EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
- .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
- B,L BN_num_bits_word,%r2 ;offset 0x9ac
- EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
- LDI 64,%r20 ;offset 0x9b4
- DEPD %r7,31,32,%r5 ;offset 0x9b8
- DEPD %r8,31,32,%r4 ;offset 0x9bc
- DEPD %r9,31,32,%r3 ;offset 0x9c0
- CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
- COPY %r28,%r24 ;offset 0x9c8
- MTSARCM %r24 ;offset 0x9cc
- DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
- CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
-$00060012
- SUBI 64,%r24,%r31 ;offset 0x9d8
- CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
- SUB %r4,%r3,%r4 ;offset 0x9e0
-$00060016
- CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
- COPY %r0,%r9 ;offset 0x9e8
- MTSARCM %r31 ;offset 0x9ec
- DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
- SUBI 64,%r31,%r26 ;offset 0x9f4
- MTSAR %r26 ;offset 0x9f8
- SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
- MTSARCM %r31 ;offset 0xa00
- DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
-$0006001A
- DEPDI,Z -1,31,32,%r19 ;offset 0xa08
- AND %r3,%r19,%r29 ;offset 0xa0c
- EXTRD,U %r29,31,32,%r2 ;offset 0xa10
- DEPDI,Z -1,63,32,%r6 ;offset 0xa14
- MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
- EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
-$D2
- ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
- ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
- ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
- ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
- ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
- ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
- .CALL ;
- B,L abort,%r2 ;offset 0xa34
- NOP ;offset 0xa38
- B $D3 ;offset 0xa3c
- LDW -212(%r30),%r2 ;offset 0xa40
-$00060020
- COPY %r4,%r26 ;offset 0xa44
- EXTRD,U %r4,31,32,%r25 ;offset 0xa48
- COPY %r2,%r24 ;offset 0xa4c
- .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
- B,L $$div2U,%r31 ;offset 0xa50
- EXTRD,U %r2,31,32,%r23 ;offset 0xa54
- DEPD %r28,31,32,%r29 ;offset 0xa58
-$00060022
- STD %r29,-152(%r30) ;offset 0xa5c
-$D1
- AND %r5,%r19,%r24 ;offset 0xa60
- EXTRD,U %r24,31,32,%r24 ;offset 0xa64
- STW %r2,-160(%r30) ;offset 0xa68
- STW %r7,-128(%r30) ;offset 0xa6c
- FLDD -152(%r30),%fr4 ;offset 0xa70
- FLDD -152(%r30),%fr7 ;offset 0xa74
- FLDW -160(%r30),%fr8L ;offset 0xa78
- FLDW -128(%r30),%fr5L ;offset 0xa7c
- XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
- FSTD %fr10,-136(%r30) ;offset 0xa84
- XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
- FSTD %fr22,-144(%r30) ;offset 0xa8c
- XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
- XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
- FSTD %fr11,-112(%r30) ;offset 0xa98
- FSTD %fr23,-120(%r30) ;offset 0xa9c
- LDD -136(%r30),%r28 ;offset 0xaa0
- DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
- LDD -144(%r30),%r20 ;offset 0xaa8
- ADD,L %r20,%r31,%r31 ;offset 0xaac
- LDD -112(%r30),%r22 ;offset 0xab0
- DEPD,Z %r22,31,32,%r22 ;offset 0xab4
- LDD -120(%r30),%r21 ;offset 0xab8
- B $00060024 ;offset 0xabc
- ADD,L %r21,%r22,%r23 ;offset 0xac0
-$D0
- OR %r9,%r29,%r29 ;offset 0xac4
-$00060040
- EXTRD,U %r29,31,32,%r28 ;offset 0xac8
-$00060002
-$L2
- LDW -212(%r30),%r2 ;offset 0xacc
-$D3
- LDW -168(%r30),%r9 ;offset 0xad0
- LDD -176(%r30),%r8 ;offset 0xad4
- EXTRD,U %r8,31,32,%r7 ;offset 0xad8
- LDD -184(%r30),%r6 ;offset 0xadc
- EXTRD,U %r6,31,32,%r5 ;offset 0xae0
- LDW -188(%r30),%r4 ;offset 0xae4
- BVE (%r2) ;offset 0xae8
- .EXIT
- LDW,MB -192(%r30),%r3 ;offset 0xaec
- .PROCEND ;in=23,25;out=28,29;fpin=105,107;
-
-
-
-
-;----------------------------------------------------------------------------
-;
-; Registers to hold 64-bit values to manipulate. The "L" part
-; of the register corresponds to the upper 32-bits, while the "R"
-; part corresponds to the lower 32-bits
-;
-; Note that when using b6 and b7, the code must save these before
-; using them because they are callee save registers
-;
-;
-; Floating point registers to use to save values that
-; are manipulated. These don't collide with ftemp1-6 and
-; are all caller save registers
-;
-a0 .reg %fr22
-a0L .reg %fr22L
-a0R .reg %fr22R
-
-a1 .reg %fr23
-a1L .reg %fr23L
-a1R .reg %fr23R
-
-a2 .reg %fr24
-a2L .reg %fr24L
-a2R .reg %fr24R
-
-a3 .reg %fr25
-a3L .reg %fr25L
-a3R .reg %fr25R
-
-a4 .reg %fr26
-a4L .reg %fr26L
-a4R .reg %fr26R
-
-a5 .reg %fr27
-a5L .reg %fr27L
-a5R .reg %fr27R
-
-a6 .reg %fr28
-a6L .reg %fr28L
-a6R .reg %fr28R
-
-a7 .reg %fr29
-a7L .reg %fr29L
-a7R .reg %fr29R
-
-b0 .reg %fr30
-b0L .reg %fr30L
-b0R .reg %fr30R
-
-b1 .reg %fr31
-b1L .reg %fr31L
-b1R .reg %fr31R
-
-;
-; Temporary floating point variables, these are all caller save
-; registers
-;
-ftemp1 .reg %fr4
-ftemp2 .reg %fr5
-ftemp3 .reg %fr6
-ftemp4 .reg %fr7
-
-;
-; The B set of registers when used.
-;
-
-b2 .reg %fr8
-b2L .reg %fr8L
-b2R .reg %fr8R
-
-b3 .reg %fr9
-b3L .reg %fr9L
-b3R .reg %fr9R
-
-b4 .reg %fr10
-b4L .reg %fr10L
-b4R .reg %fr10R
-
-b5 .reg %fr11
-b5L .reg %fr11L
-b5R .reg %fr11R
-
-b6 .reg %fr12
-b6L .reg %fr12L
-b6R .reg %fr12R
-
-b7 .reg %fr13
-b7L .reg %fr13L
-b7R .reg %fr13R
-
-c1 .reg %r21 ; only reg
-temp1 .reg %r20 ; only reg
-temp2 .reg %r19 ; only reg
-temp3 .reg %r31 ; only reg
-
-m1 .reg %r28
-c2 .reg %r23
-high_one .reg %r1
-ht .reg %r6
-lt .reg %r5
-m .reg %r4
-c3 .reg %r3
-
-SQR_ADD_C .macro A0L,A0R,C1,C2,C3
- XMPYU A0L,A0R,ftemp1 ; m
- FSTD ftemp1,-24(%sp) ; store m
-
- XMPYU A0R,A0R,ftemp2 ; lt
- FSTD ftemp2,-16(%sp) ; store lt
-
- XMPYU A0L,A0L,ftemp3 ; ht
- FSTD ftemp3,-8(%sp) ; store ht
-
- LDD -24(%sp),m ; load m
- AND m,high_mask,temp2 ; m & Mask
- DEPD,Z m,30,31,temp3 ; m << 32+1
- LDD -16(%sp),lt ; lt
-
- LDD -8(%sp),ht ; ht
- EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
- ADD temp3,lt,lt ; lt = lt+m
- ADD,L ht,temp1,ht ; ht += temp1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C2,ht,C2 ; c2=c2+ht
- ADD,DC C3,%r0,C3 ; c3++
-.endm
-
-SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
- XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,A1L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD ht,ht,ht ; ht=ht+ht;
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-
- ADD lt,lt,lt ; lt=lt+lt;
- ADD,DC ht,%r0,ht ; add in carry (ht++)
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
- LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
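For orientation, what the two macros above accumulate: c1:c2:c3 is a three-word running sum, SQR_ADD_C adds the 128-bit square a*a into it, and SQR_ADD_C2 adds the doubled cross product 2*a*b (whose doubling can carry into c3 once more). A minimal C sketch of the first, as an editor illustration (not from this file, assuming unsigned __int128):

	#include <stdint.h>

	typedef uint64_t BN_ULONG;

	/* (c1:c2:c3) += a*a, where c1 is the least significant word. */
	static void
	sqr_add_c_sketch(BN_ULONG a, BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
	{
		unsigned __int128 t = (unsigned __int128)a * a;
		unsigned __int128 lo = (unsigned __int128)*c1 + (BN_ULONG)t;
		unsigned __int128 hi = (unsigned __int128)*c2 + (BN_ULONG)(t >> 64)
		    + (BN_ULONG)(lo >> 64);

		*c1 = (BN_ULONG)lo;
		*c2 = (BN_ULONG)hi;
		*c3 += (BN_ULONG)(hi >> 64);
	}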
-
-;
-;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba8
- .PROC
- .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .ENTRY
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
- SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
- STD c2,56(r_ptr) ; r[7] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
- SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
- STD c3,64(r_ptr) ; r[8] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
- STD c1,72(r_ptr) ; r[9] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a5L,a5R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
- SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
- STD c2,80(r_ptr) ; r[10] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
- STD c3,88(r_ptr) ; r[11] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a6L,a6R,c1,c2,c3
- SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
- STD c1,96(r_ptr) ; r[12] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
- STD c2,104(r_ptr) ; r[13] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a7L,a7R,c3,c1,c2
- STD c3, 112(r_ptr) ; r[14] = c3
- STD c1, 120(r_ptr) ; r[15] = c1
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
-
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
-
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
-
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
-
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
-
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- STD c2,56(r_ptr) ; r[7] = c2;
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
-;---------------------------------------------------------------------------
-
-MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
- XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,B0L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
-	ADD,DC	ht,%r0,ht          ; ht++
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
-
-
-;
-;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba8
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-	FSTD	%fr12,32(%sp)	; save fr12
-	FSTD	%fr13,40(%sp)	; save fr13
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
- FLDD 32(b_ptr),b4
- FLDD 40(b_ptr),b5
- FLDD 48(b_ptr),b6
- FLDD 56(b_ptr),b7
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
- STD c1,48(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
- STD c2,56(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
- STD c3,64(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
- STD c1,72(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
- STD c2,80(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
- STD c3,88(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
- STD c1,96(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
- STD c2,104(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
- STD c3,112(r_ptr)
- STD c1,120(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-	FSTD	%fr12,32(%sp)	; save fr12
-	FSTD	%fr13,40(%sp)	; save fr13
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- STD c1,48(r_ptr)
- STD c2,56(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
-;--- not PIC .SPACE $TEXT$
-;--- not PIC .SUBSPA $CODE$
-;--- not PIC .SPACE $PRIVATE$,SORT=16
-;--- not PIC .IMPORT $global$,DATA
-;--- not PIC .SPACE $TEXT$
-;--- not PIC .SUBSPA $CODE$
-;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
-;--- not PIC C$7
-;--- not PIC .ALIGN 8
-;--- not PIC .STRINGZ "Division would overflow (%d)\n"
- .END
+++ /dev/null
-;
-; PA-RISC 64-bit implementation of bn_asm code
-;
-; This code is approximately 2x faster than the C version
-; for RSA/DSA.
-;
-; See http://devresource.hp.com/ for more details on the PA-RISC
-; architecture. Also see the book "PA-RISC 2.0 Architecture"
-; by Gerry Kane for information on the instruction set architecture.
-;
-; Code written by Chris Ruemmler (with some help from the HP C
-; compiler).
-;
-; The code compiles with HP's assembler
-;
-
- .level 2.0W
- .space $TEXT$
- .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
-
-;
-; Global Register definitions used for the routines.
-;
-; Some information about HP's runtime architecture for 64-bits.
-;
-; "Caller save" means the calling function must save the register
-; if it wants the register to be preserved.
-; "Callee save" means if a function uses the register, it must save
-; the value before using it.
-;
-; For the floating point registers
-;
-; "caller save" registers: fr4-fr11, fr22-fr31
-; "callee save" registers: fr12-fr21
-; "special" registers: fr0-fr3 (status and exception registers)
-;
-; For the integer registers
-; value zero : r0
-; "caller save" registers: r1,r19-r26
-; "callee save" registers: r3-r18
-; return register : r2 (rp)
-; return values ; r28 (ret0)
-; Stack pointer ; r30 (sp)
-; global data pointer ; r27 (dp)
-; argument pointer ; r29 (ap)
-; millicode return ptr ; r31 (also a caller save register)
-
-
-;
-; Arguments to the routines
-;
-r_ptr .reg %r26
-a_ptr .reg %r25
-b_ptr .reg %r24
-num .reg %r24
-w .reg %r23
-n .reg %r23
-
-
-;
-; Globals used in some routines
-;
-
-top_overflow .reg %r29
-high_mask .reg %r22 ; value 0xffffffff80000000L
-
-
-;------------------------------------------------------------------------------
-;
-; bn_mul_add_words
-;
-;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
-; int num, BN_ULONG w)
-;
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = num
-; arg3 = w
-;
-; Local register definitions
-;
-
-fm1 .reg %fr22
-fm .reg %fr23
-ht_temp .reg %fr24
-ht_temp_1 .reg %fr25
-lt_temp .reg %fr26
-lt_temp_1 .reg %fr27
-fm1_1 .reg %fr28
-fm_1 .reg %fr29
-
-fw_h .reg %fr7L
-fw_l .reg %fr7R
-fw .reg %fr7
-
-fht_0 .reg %fr8L
-flt_0 .reg %fr8R
-t_float_0 .reg %fr8
-
-fht_1 .reg %fr9L
-flt_1 .reg %fr9R
-t_float_1 .reg %fr9
-
-tmp_0 .reg %r31
-tmp_1 .reg %r21
-m_0 .reg %r20
-m_1 .reg %r19
-ht_0 .reg %r1
-ht_1 .reg %r3
-lt_0 .reg %r4
-lt_1 .reg %r5
-m1_0 .reg %r6
-m1_1 .reg %r7
-rp_val .reg %r8
-rp_val_1 .reg %r9
-
-bn_mul_add_words
- .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
- .proc
- .callinfo frame=128
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP ; Needed to make the loop 16-byte aligned
- NOP ; Needed to make the loop 16-byte aligned
-
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
- STD %r7,32(%sp) ; save r7
- STD %r8,40(%sp) ; save r8
-
- STD %r9,48(%sp) ; save r9
- COPY %r0,%ret0 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
- STD w,56(%sp) ; store w on stack
-
- CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; The loop is unrolled twice, so if there is only 1 number
- ; then go straight to the cleanup code.
- ;
- CMPIB,= 1,num,bn_mul_add_words_single_top
- FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
-	; two 32-bit multiplies can be issued per cycle.
- ;
-bn_mul_add_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDD 8(r_ptr),rp_val_1 ; rp[1]
-
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
-
- XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m[0]
- FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
-
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
-
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
-
- LDD -8(%sp),m_0 ; m[0]
- LDD -40(%sp),m_1 ; m[1]
- LDD -16(%sp),m1_0 ; m1[0]
- LDD -48(%sp),m1_1 ; m1[1]
-
- LDD -24(%sp),ht_0 ; ht[0]
- LDD -56(%sp),ht_1 ; ht[1]
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
-
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
- ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
-
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
- ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
- EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
-
- EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
- ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
- ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
-
- ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
-
- ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
-
- LDO -2(num),num ; num = num - 2;
- ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
-
- ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
- ADD,DC ht_1,%r0,%ret0 ; ht[1]++
- LDO 16(a_ptr),a_ptr ; a_ptr += 2
-
- STD lt_1,8(r_ptr) ; rp[1] = lt[1]
- CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
- LDO 16(r_ptr),r_ptr ; r_ptr += 2
-
- CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_add_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDO 8(a_ptr),a_ptr ; a_ptr++
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0 ; m1 = temp1
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
- ADD,DC ht_0,%r0,%ret0 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_add_words_exit
- .EXIT
- LDD -80(%sp),%r9 ; restore r9
- LDD -88(%sp),%r8 ; restore r8
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = num
-; arg3 = w
-
-bn_mul_words
- .proc
- .callinfo frame=128
- .entry
- .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- STD %r7,32(%sp) ; save r7
- COPY %r0,%ret0 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
- STD w,56(%sp) ; w on stack
-
- CMPIB,>= 0,num,bn_mul_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; See if only 1 word to do, thus just do cleanup
- ;
- CMPIB,= 1,num,bn_mul_words_single_top
- FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
-	; two 32-bit multiplies can be issued per cycle.
- ;
-bn_mul_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
-
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
-
- FSTD fm,-8(%sp) ; -8(sp) = m
- FSTD fm_1,-40(%sp) ; -40(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
-
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
-
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
- LDD -8(%sp),m_0
- LDD -40(%sp),m_1
-
- LDD -16(%sp),m1_0
- LDD -48(%sp),m1_1
- LDD -24(%sp),ht_0
- LDD -56(%sp),ht_1
-
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
-
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
- ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
- EXTRD,U tmp_1,31,32,m_1 ; m>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
- ADD,DC ht_1,%r0,ht_1 ; ht++
- ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
- ADD,DC ht_1,%r0,ht_1 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD lt_1,8(r_ptr) ; rp[1] = lt
-
- COPY ht_1,%ret0 ; carry = ht
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_mul_words_unroll2
- LDO 16(r_ptr),r_ptr ; rp++
-
- CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD %ret0,lt_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- COPY ht_0,%ret0 ; copy carry
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_words_exit
- .EXIT
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = num
-;
-
-bn_sqr_words
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP
- STD %r5,16(%sp) ; save r5
-
- CMPIB,>= 0,num,bn_sqr_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
-	; If only 1, then go straight to cleanup
- ;
- CMPIB,= 1,num,bn_sqr_words_single_top
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-
-bn_sqr_words_unroll2
- FLDD 0(a_ptr),t_float_0 ; a[0]
- FLDD 8(a_ptr),t_float_1 ; a[1]
- XMPYU fht_0,flt_0,fm ; m[0]
- XMPYU fht_1,flt_1,fm_1 ; m[1]
-
- FSTD fm,-24(%sp) ; store m[0]
- FSTD fm_1,-56(%sp) ; store m[1]
- XMPYU flt_0,flt_0,lt_temp ; lt[0]
- XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
-
- FSTD lt_temp,-16(%sp) ; store lt[0]
- FSTD lt_temp_1,-48(%sp) ; store lt[1]
- XMPYU fht_0,fht_0,ht_temp ; ht[0]
- XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
-
- FSTD ht_temp,-8(%sp) ; store ht[0]
- FSTD ht_temp_1,-40(%sp) ; store ht[1]
- LDD -24(%sp),m_0
- LDD -56(%sp),m_1
-
- AND m_0,high_mask,tmp_0 ; m[0] & Mask
- AND m_1,high_mask,tmp_1 ; m[1] & Mask
- DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
- DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
-
- LDD -16(%sp),lt_0
- LDD -48(%sp),lt_1
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
- EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
-
- LDD -8(%sp),ht_0
- LDD -40(%sp),ht_1
- ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
- ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
-
- ADD lt_0,m_0,lt_0 ; lt = lt+m
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
-	STD	ht_0,8(r_ptr)         ; rp[1] = ht[0]
-
- ADD lt_1,m_1,lt_1 ; lt = lt+m
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_1,16(r_ptr) ; rp[2] = lt[1]
- STD ht_1,24(r_ptr) ; rp[3] = ht[1]
-
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_sqr_words_unroll2
- LDO 32(r_ptr),r_ptr ; rp += 4
-
- CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_sqr_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,flt_0,fm ; m
- FSTD fm,-24(%sp) ; store m
-
- XMPYU flt_0,flt_0,lt_temp ; lt
- FSTD lt_temp,-16(%sp) ; store lt
-
- XMPYU fht_0,fht_0,ht_temp ; ht
- FSTD ht_temp,-8(%sp) ; store ht
-
- LDD -24(%sp),m_0 ; load m
- AND m_0,high_mask,tmp_0 ; m & Mask
- DEPD,Z m_0,30,31,m_0 ; m << 32+1
- LDD -16(%sp),lt_0 ; lt
-
- LDD -8(%sp),ht_0 ; ht
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
- ADD m_0,lt_0,lt_0 ; lt = lt+m
- ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD ht_0,8(r_ptr) ; rp[1] = ht
-
-bn_sqr_words_exit
- .EXIT
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t .reg %r22
-b .reg %r21
-l .reg %r20
-
-bn_add_words
- .proc
- .entry
- .callinfo
- .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- CMPIB,>= 0,n,bn_add_words_exit
- COPY %r0,%ret0 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_add_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_add_words_unroll2
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
- ADD t,%ret0,t ; t = t+c;
- ADD,DC %r0,%r0,%ret0 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret0,%r0,%ret0 ; c+= carry
- STD l,0(r_ptr)
-
- LDD 8(a_ptr),t
- LDD 8(b_ptr),b
- ADD t,%ret0,t ; t = t+c;
- ADD,DC %r0,%r0,%ret0 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret0,%r0,%ret0 ; c+= carry
- STD l,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_add_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
-
-bn_add_words_single_top
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
-
- ADD t,%ret0,t ; t = t+c;
- ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret0,%r0,%ret0 ; c+= carry
- STD l,0(r_ptr)
-
-bn_add_words_exit
- .EXIT
- BVE (%rp)
- NOP
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t1 .reg %r22
-t2 .reg %r21
-sub_tmp1 .reg %r20
-sub_tmp2 .reg %r19
-
-
-bn_sub_words
- .proc
- .callinfo
- .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- CMPIB,>= 0,n,bn_sub_words_exit
- COPY %r0,%ret0 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_sub_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_sub_words_unroll2
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
-
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret0
- STD sub_tmp1,0(r_ptr)
-
- LDD 8(a_ptr),t1
- LDD 8(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret0
- STD sub_tmp1,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_sub_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
-
-bn_sub_words_single_top
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret0
-
- STD sub_tmp1,0(r_ptr)
-
-bn_sub_words_exit
- .EXIT
- BVE (%rp)
- NOP
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;------------------------------------------------------------------------------
-;
-; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
-;
-; arg0 = h
-; arg1 = l
-; arg2 = d
-;
-; This is mainly just modified assembly from the compiler, thus the
-; lack of variable names.
-;
-;------------------------------------------------------------------------------
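
What the routine computes is a double-word by single-word division. With a
compiler that offers a 128-bit integer type (used here only to state the
semantics, an assumption) the whole function reduces to the sketch below; the
assembly performs the same long division by hand on 64-bit words, which is the
word size the LDD/STD accesses in this file imply for BN_ULONG.

    typedef unsigned long long BN_ULONG;	/* assumption: 64-bit words */

    /* bn_div_words(h, l, d) == floor(((h << 64) | l) / d); the caller is
     * expected to ensure h < d so the quotient fits in one word. */
    BN_ULONG
    bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
    {
        unsigned __int128 n = ((unsigned __int128)h << 64) | l;

        return (BN_ULONG)(n / d);
    }
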
-bn_div_words
- .proc
- .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
- .IMPORT __iob,DATA
- .IMPORT fprintf,CODE,NO_RELOCATION
- .IMPORT abort,CODE,NO_RELOCATION
- .IMPORT $$div2U,MILLICODE
- .entry
- STD %r2,-16(%r30)
- STD,MA %r3,352(%r30)
- STD %r4,-344(%r30)
- STD %r5,-336(%r30)
- STD %r6,-328(%r30)
- STD %r7,-320(%r30)
- STD %r8,-312(%r30)
- STD %r9,-304(%r30)
- STD %r10,-296(%r30)
-
- STD %r27,-288(%r30) ; save gp
-
- COPY %r24,%r3 ; save d
- COPY %r26,%r4 ; save h (high 64-bits)
- LDO -1(%r0),%ret0 ; return -1 by default
-
- CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
- COPY %r25,%r5 ; save l (low 64-bits)
-
- LDO -48(%r30),%r29 ; create ap
- .CALL ;in=26,29;out=28;
- B,L BN_num_bits_word,%r2
- COPY %r3,%r26
- LDD -288(%r30),%r27 ; restore gp
- LDI 64,%r21
-
- CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
- COPY %ret0,%r24 ; i
- MTSARCM %r24
- DEPDI,Z -1,%sar,1,%r29
- CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
-
-$00000012
- SUBI 64,%r24,%r31 ; i = 64 - i;
- CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
- SUB %r4,%r3,%r4 ; h -= d
- CMPB,= %r31,%r0,$0000001A ; if (i)
- COPY %r0,%r10 ; ret = 0
- MTSARCM %r31 ; i to shift
- DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
- SUBI 64,%r31,%r19 ; 64 - i; redundant
- MTSAR %r19 ; (64 -i) to shift
- SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i)
- MTSARCM %r31 ; i to shift
- DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
-
-$0000001A
- DEPDI,Z -1,31,32,%r19
-	EXTRD,U	%r3,31,32,%r6	; dh = d >> 32
-	EXTRD,U	%r3,63,32,%r8	; dl = d & 0xffffffff
- LDO 2(%r0),%r9
- STD %r3,-280(%r30) ; "d" to stack
-
-$0000001C
- DEPDI,Z -1,63,32,%r29 ;
- EXTRD,U %r4,31,32,%r31 ; h >> 32
- CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
- COPY %r4,%r26
- EXTRD,U %r4,31,32,%r25
- COPY %r6,%r24
- .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
- B,L $$div2U,%r2
- EXTRD,U %r6,31,32,%r23
- DEPD %r28,31,32,%r29
-$D2
- STD %r29,-272(%r30) ; q
- AND %r5,%r19,%r24 ; t & 0xffffffff00000000;
- EXTRD,U %r24,31,32,%r24 ; ???
- FLDD -272(%r30),%fr7 ; q
- FLDD -280(%r30),%fr8 ; d
- XMPYU %fr8L,%fr7L,%fr10
- FSTD %fr10,-256(%r30)
- XMPYU %fr8L,%fr7R,%fr22
- FSTD %fr22,-264(%r30)
- XMPYU %fr8R,%fr7L,%fr11
- XMPYU %fr8R,%fr7R,%fr23
- FSTD %fr11,-232(%r30)
- FSTD %fr23,-240(%r30)
- LDD -256(%r30),%r28
- DEPD,Z %r28,31,32,%r2
- LDD -264(%r30),%r20
- ADD,L %r20,%r2,%r31
- LDD -232(%r30),%r22
- DEPD,Z %r22,31,32,%r22
- LDD -240(%r30),%r21
- B $00000024 ; enter loop
- ADD,L %r21,%r22,%r23
-
-$0000002A
- LDO -1(%r29),%r29
- SUB %r23,%r8,%r23
-$00000024
- SUB %r4,%r31,%r25
- AND %r25,%r19,%r26
- CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
- DEPD,Z %r25,31,32,%r20
- OR %r20,%r24,%r21
- CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
- SUB %r31,%r6,%r31
-;-------------Break path---------------------
-
-$00000046
- DEPD,Z %r23,31,32,%r25 ;tl
- EXTRD,U %r23,31,32,%r26 ;t
-	AND	%r25,%r19,%r24		;tl = (tl<<32)&0xffffffff00000000L
- ADD,L %r31,%r26,%r31 ;th += t;
- CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
- LDO 1(%r31),%r31 ; th++;
- CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward)
- LDO -1(%r29),%r29 ;q--;
- ADD,L %r4,%r3,%r4 ;h += d;
-$00000036
- ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
- SUB %r5,%r24,%r28 ; l -= tl;
- SUB %r4,%r31,%r24 ; h -= th;
- SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
- DEPD,Z %r29,31,32,%r10 ; ret = q<<32
- b $0000001C
- DEPD,Z %r28,31,32,%r5 ; l = l << 32
-
-$D1
- OR %r10,%r29,%r28 ; ret |= q
-$D3
- LDD -368(%r30),%r2
-$D0
- LDD -296(%r30),%r10
- LDD -304(%r30),%r9
- LDD -312(%r30),%r8
- LDD -320(%r30),%r7
- LDD -328(%r30),%r6
- LDD -336(%r30),%r5
- LDD -344(%r30),%r4
- BVE (%r2)
- .EXIT
- LDD,MB -352(%r30),%r3
-
-bn_div_err_case
- MFIA %r6
- ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
- LDO R'bn_div_words-bn_div_err_case(%r1),%r6
- ADDIL LT'__iob,%r27,%r1
- LDD RT'__iob(%r1),%r26
- ADDIL L'C$4-bn_div_words,%r6,%r1
- LDO R'C$4-bn_div_words(%r1),%r25
- LDO 64(%r26),%r26
- .CALL ;in=24,25,26,29;out=28;
- B,L fprintf,%r2
- LDO -48(%r30),%r29
- LDD -288(%r30),%r27
- .CALL ;in=29;
- B,L abort,%r2
- LDO -48(%r30),%r29
- LDD -288(%r30),%r27
- B $D0
- LDD -368(%r30),%r2
- .PROCEND ;in=24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-; Registers to hold 64-bit values to manipulate. The "L" part
-; of the register corresponds to the upper 32-bits, while the "R"
-; part corresponds to the lower 32-bits
-;
-; Note that when using b6 and b7, the code must save these before
-; using them because they are callee save registers
-;
-;
-; Floating point registers to use to save values that
-; are manipulated. These don't collide with ftemp1-6 and
-; are all caller save registers
-;
-a0 .reg %fr22
-a0L .reg %fr22L
-a0R .reg %fr22R
-
-a1 .reg %fr23
-a1L .reg %fr23L
-a1R .reg %fr23R
-
-a2 .reg %fr24
-a2L .reg %fr24L
-a2R .reg %fr24R
-
-a3 .reg %fr25
-a3L .reg %fr25L
-a3R .reg %fr25R
-
-a4 .reg %fr26
-a4L .reg %fr26L
-a4R .reg %fr26R
-
-a5 .reg %fr27
-a5L .reg %fr27L
-a5R .reg %fr27R
-
-a6 .reg %fr28
-a6L .reg %fr28L
-a6R .reg %fr28R
-
-a7 .reg %fr29
-a7L .reg %fr29L
-a7R .reg %fr29R
-
-b0 .reg %fr30
-b0L .reg %fr30L
-b0R .reg %fr30R
-
-b1 .reg %fr31
-b1L .reg %fr31L
-b1R .reg %fr31R
-
-;
-; Temporary floating point variables, these are all caller save
-; registers
-;
-ftemp1 .reg %fr4
-ftemp2 .reg %fr5
-ftemp3 .reg %fr6
-ftemp4 .reg %fr7
-
-;
-; The B set of registers when used.
-;
-
-b2 .reg %fr8
-b2L .reg %fr8L
-b2R .reg %fr8R
-
-b3 .reg %fr9
-b3L .reg %fr9L
-b3R .reg %fr9R
-
-b4 .reg %fr10
-b4L .reg %fr10L
-b4R .reg %fr10R
-
-b5 .reg %fr11
-b5L .reg %fr11L
-b5R .reg %fr11R
-
-b6 .reg %fr12
-b6L .reg %fr12L
-b6R .reg %fr12R
-
-b7 .reg %fr13
-b7L .reg %fr13L
-b7R .reg %fr13R
-
-c1 .reg %r21 ; only reg
-temp1 .reg %r20 ; only reg
-temp2 .reg %r19 ; only reg
-temp3 .reg %r31 ; only reg
-
-m1 .reg %r28
-c2 .reg %r23
-high_one .reg %r1
-ht .reg %r6
-lt .reg %r5
-m .reg %r4
-c3 .reg %r3
-
-SQR_ADD_C .macro A0L,A0R,C1,C2,C3
- XMPYU A0L,A0R,ftemp1 ; m
- FSTD ftemp1,-24(%sp) ; store m
-
- XMPYU A0R,A0R,ftemp2 ; lt
- FSTD ftemp2,-16(%sp) ; store lt
-
- XMPYU A0L,A0L,ftemp3 ; ht
- FSTD ftemp3,-8(%sp) ; store ht
-
- LDD -24(%sp),m ; load m
- AND m,high_mask,temp2 ; m & Mask
- DEPD,Z m,30,31,temp3 ; m << 32+1
- LDD -16(%sp),lt ; lt
-
- LDD -8(%sp),ht ; ht
- EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
- ADD temp3,lt,lt ; lt = lt+m
- ADD,L ht,temp1,ht ; ht += temp1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C2,ht,C2 ; c2=c2+ht
- ADD,DC C3,%r0,C3 ; c3++
-.endm
-
-SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
- XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,A1L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD ht,ht,ht ; ht=ht+ht;
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-
- ADD lt,lt,lt ; lt=lt+lt;
- ADD,DC ht,%r0,ht ; add in carry (ht++)
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
- LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
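
Stripped of the 32x32 XMPYU splitting, the two macros above are the usual comba
building blocks. The C sketch below states their effect on the (c1,c2,c3)
accumulator; the 128-bit temporary is an assumption used only to express the
full product, which the assembly reconstructs from partial products.

    typedef unsigned long long BN_ULONG;	/* assumption: 64-bit words */

    /* SQR_ADD_C: accumulate a*a into (c1, c2, c3). */
    static void
    sqr_add_c(BN_ULONG a, BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
    {
        unsigned __int128 t = (unsigned __int128)a * a;
        BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

        *c1 += lo;
        hi += (*c1 < lo);	/* cannot wrap: hi is at most 2^64 - 2 */
        *c2 += hi;
        *c3 += (*c2 < hi);
    }

    /* SQR_ADD_C2: accumulate 2*a*b (an off-diagonal term) into (c1, c2, c3). */
    static void
    sqr_add_c2(BN_ULONG a, BN_ULONG b,
        BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
    {
        unsigned __int128 t = (unsigned __int128)a * b;
        BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
        BN_ULONG lo2, hi2;

        hi2 = hi + hi;
        *c3 += (hi2 < hi);	/* carry from doubling the high half */
        lo2 = lo + lo;
        hi2 += (lo2 < lo);	/* carry from doubling the low half */
        *c1 += lo2;
        if (*c1 < lo2 && ++hi2 == 0)
            (*c3)++;		/* the ADD,DC,*NUV / LDO 1(C3) case above */
        *c2 += hi2;
        *c3 += (*c2 < hi2);
    }
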
-
-;
-;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba8
- .PROC
- .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .ENTRY
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
- SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
- STD c2,56(r_ptr) ; r[7] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
- SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
- STD c3,64(r_ptr) ; r[8] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
- STD c1,72(r_ptr) ; r[9] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a5L,a5R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
- SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
- STD c2,80(r_ptr) ; r[10] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
- STD c3,88(r_ptr) ; r[11] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a6L,a6R,c1,c2,c3
- SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
- STD c1,96(r_ptr) ; r[12] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
- STD c2,104(r_ptr) ; r[13] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a7L,a7R,c3,c1,c2
- STD c3, 112(r_ptr) ; r[14] = c3
- STD c1, 120(r_ptr) ; r[15] = c1
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
-
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
-
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
-
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
-
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
-
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- STD c2,56(r_ptr) ; r[7] = c2;
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
-;---------------------------------------------------------------------------
-
-MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
- XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,B0L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
-	ADD,DC	ht,%r0,ht	; add in carry (ht++)
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
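
MUL_ADD_C is the non-doubled counterpart; its effect is the same
multiply-and-accumulate step, sketched here under the same 128-bit assumption
as above.

    typedef unsigned long long BN_ULONG;	/* assumption: 64-bit words */

    /* MUL_ADD_C: accumulate a*b into (c1, c2, c3). */
    static void
    mul_add_c(BN_ULONG a, BN_ULONG b,
        BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
    {
        unsigned __int128 t = (unsigned __int128)a * b;
        BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

        *c1 += lo;
        hi += (*c1 < lo);	/* cannot wrap: hi is at most 2^64 - 2 */
        *c2 += hi;
        *c3 += (*c2 < hi);
    }
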
-
-
-;
-;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba8
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-	FSTD	%fr12,32(%sp)	; save fr12
-	FSTD	%fr13,40(%sp)	; save fr13
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
- FLDD 32(b_ptr),b4
- FLDD 40(b_ptr),b5
- FLDD 48(b_ptr),b6
- FLDD 56(b_ptr),b7
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
- STD c1,48(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
- STD c2,56(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
- STD c3,64(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
- STD c1,72(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
- STD c2,80(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
- STD c3,88(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
- STD c1,96(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
- STD c2,104(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
- STD c3,112(r_ptr)
- STD c1,120(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-	FSTD	%fr12,32(%sp)	; save fr12
-	FSTD	%fr13,40(%sp)	; save fr13
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- STD c1,48(r_ptr)
- STD c2,56(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
- .SPACE $TEXT$
- .SUBSPA $CODE$
- .SPACE $PRIVATE$,SORT=16
- .IMPORT $global$,DATA
- .SPACE $TEXT$
- .SUBSPA $CODE$
- .SUBSPA $LIT$,ACCESS=0x2c
-C$4
- .ALIGN 8
- .STRINGZ "Division would overflow (%d)\n"
- .END
+++ /dev/null
-.ident "sparcv8plus.s, Version 1.4"
-.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
-
-/*
- * ====================================================================
- * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
- * project.
- *
- * Rights for redistribution and usage in source and binary forms are
- * granted according to the OpenSSL license. Warranty of any kind is
- * disclaimed.
- * ====================================================================
- */
-
-/*
- * This is my modest contribution to OpenSSL project (see
- * http://www.openssl.org/ for more information about it) and is
- * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
- * module. For updates see http://fy.chalmers.se/~appro/hpe/.
- *
- * Questions-n-answers.
- *
- * Q. How to compile?
- * A. With SC4.x/SC5.x:
- *
- * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
- *
- * and with gcc:
- *
- * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
- *
- * or if above fails (it does if you have gas installed):
- *
- * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
- *
- * Quick-n-dirty way to fuse the module into the library.
- * Provided that the library is already configured and built
- * (in 0.9.2 case with no-asm option):
- *
- * # cd crypto/bn
- * # cp /some/place/bn_asm.sparc.v8plus.S .
- * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
- * # make
- * # cd ../..
- * # make; make test
- *
- * Quick-n-dirty way to get rid of it:
- *
- * # cd crypto/bn
- * # touch bn_asm.c
- * # make
- * # cd ../..
- * # make; make test
- *
- * Q. V8plus architecture? What kind of beast is that?
- * A. Well, it's rather a programming model than an architecture...
- *    It's actually v9-compliant, i.e. *any* UltraSPARC CPU, under
- *    special conditions, namely when the kernel doesn't preserve the
- *    upper 32 bits of otherwise 64-bit registers during a context switch.
- *
- * Q. Why just UltraSPARC? What about SuperSPARC?
- * A. The original release did target UltraSPARC only. Now a SuperSPARC
- *    version is provided as well. Both versions share the bn_*comba[48]
- *    implementations (see comment later in code for explanation).
- * But what's so special about this UltraSPARC implementation?
- *    Why didn't I let the compiler do the job? The trouble is that most
- *    of the available compilers (well, SC5.0 is the only exception) don't
- * attempt to take advantage of UltraSPARC's 64-bitness under
- * 32-bit kernels even though it's perfectly possible (see next
- * question).
- *
- * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
- * doesn't work?
- * A. You can't address *all* registers as 64-bit wide:-( The catch is
- * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
- *    preserved if you're in a leaf function, i.e. one that never calls
- *    any other functions. All functions in this module are leaf and
- *    10 registers is a handful. And as a matter of fact the non-"comba"
- *    routines don't require even that much, and I could even afford not
- *    to allocate a stack frame of their own for 'em:-)
- *
- * Q. What about 64-bit kernels?
- * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
- * under evaluation and development...
- *
- * Q. What about shared libraries?
- * A. What about 'em? Kidding again:-) Code does *not* contain any
- * code position dependencies and it's safe to include it into
- * shared library as is.
- *
- * Q. How much faster does it go?
- * A. Do you have a good benchmark? In either case below is what I
- * experience with crypto/bn/expspeed.c test program:
- *
- * v8plus module on U10/300MHz against bn_asm.c compiled with:
- *
- * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
- * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
- * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
- *
- * v8 module on SS10/60MHz against bn_asm.c compiled with:
- *
- * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
- * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
- * egcs-1.1.2 -mv8 -O3 +35-45%
- *
- * As you can see it's damn hard to beat the new Sun C compiler,
- * and it's GNU C users in the first place who will appreciate this
- * assembler implementation:-)
- */
-
-/*
- * Revision history.
- *
- * 1.0 - initial release;
- * 1.1 - new loop unrolling model(*);
- * - some more fine tuning;
- * 1.2 - made gas friendly;
- * - updates to documentation concerning v9;
- * - new performance comparison matrix;
- * 1.3 - fixed problem with /usr/ccs/lib/cpp;
- * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
- * resulting in slight overall performance kick;
- * - some retunes;
- * - support for GNU as added;
- *
- * (*) Originally unrolled loop looked like this:
- * for (;;) {
- * op(p+0); if (--n==0) break;
- * op(p+1); if (--n==0) break;
- * op(p+2); if (--n==0) break;
- * op(p+3); if (--n==0) break;
- * p+=4;
- * }
- * I unroll according to following:
- * while (n&~3) {
- * op(p+0); op(p+1); op(p+2); op(p+3);
- *		p+=4; n-=4;
- * }
- * if (n) {
- * op(p+0); if (--n==0) return;
- *		op(p+1); if (--n==0) return;
- *		op(p+2); return;
- * }
- */
-
-#if defined(__SUNPRO_C) && defined(__sparcv9)
- /* They've said -xarch=v9 at command line */
- .register %g2,#scratch
- .register %g3,#scratch
-# define FRAME_SIZE -192
-#elif defined(__GNUC__) && defined(__arch64__)
- /* They've said -m64 at command line */
- .register %g2,#scratch
- .register %g3,#scratch
-# define FRAME_SIZE -192
-#else
-# define FRAME_SIZE -96
-#endif
-/*
- * GNU assembler can't stand stuw:-(
- */
-#define stuw st
-
-.section ".text",#alloc,#execinstr
-.file "bn_asm.sparc.v8plus.S"
-
-.align 32
-
-.global bn_mul_add_words
-/*
- * BN_ULONG bn_mul_add_words(rp,ap,num,w)
- * BN_ULONG *rp,*ap;
- * int num;
- * BN_ULONG w;
- */
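
In this v8plus build BN_ULONG is a 32-bit word (lduw/stuw) and mulx produces
the full 64-bit product; the unrolled loop below is the usual
multiply-accumulate sweep, roughly this C sketch:

    typedef unsigned int BN_ULONG;	/* 32-bit words in the v8plus build */

    BN_ULONG
    bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
    {
        unsigned long long acc = 0;

        while (num-- > 0) {
            acc += (unsigned long long)*ap++ * w + *rp;
            *rp++ = (BN_ULONG)acc;	/* low 32 bits (stuw) */
            acc >>= 32;			/* high 32 bits carry over (srlx) */
        }
        return (BN_ULONG)acc;
    }
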
-bn_mul_add_words:
- sra %o2,%g0,%o2 ! signx %o2
- brgz,a %o2,.L_bn_mul_add_words_proceed
- lduw [%o1],%g2
- retl
- clr %o0
- nop
- nop
- nop
-
-.L_bn_mul_add_words_proceed:
- srl %o3,%g0,%o3 ! clruw %o3
- andcc %o2,-4,%g0
- bz,pn %icc,.L_bn_mul_add_words_tail
- clr %o5
-
-.L_bn_mul_add_words_loop: ! wow! 32 aligned!
- lduw [%o0],%g1
- lduw [%o1+4],%g3
- mulx %o3,%g2,%g2
- add %g1,%o5,%o4
- nop
- add %o4,%g2,%o4
- stuw %o4,[%o0]
- srlx %o4,32,%o5
-
- lduw [%o0+4],%g1
- lduw [%o1+8],%g2
- mulx %o3,%g3,%g3
- add %g1,%o5,%o4
- dec 4,%o2
- add %o4,%g3,%o4
- stuw %o4,[%o0+4]
- srlx %o4,32,%o5
-
- lduw [%o0+8],%g1
- lduw [%o1+12],%g3
- mulx %o3,%g2,%g2
- add %g1,%o5,%o4
- inc 16,%o1
- add %o4,%g2,%o4
- stuw %o4,[%o0+8]
- srlx %o4,32,%o5
-
- lduw [%o0+12],%g1
- mulx %o3,%g3,%g3
- add %g1,%o5,%o4
- inc 16,%o0
- add %o4,%g3,%o4
- andcc %o2,-4,%g0
- stuw %o4,[%o0-4]
- srlx %o4,32,%o5
- bnz,a,pt %icc,.L_bn_mul_add_words_loop
- lduw [%o1],%g2
-
- brnz,a,pn %o2,.L_bn_mul_add_words_tail
- lduw [%o1],%g2
-.L_bn_mul_add_words_return:
- retl
- mov %o5,%o0
-
-.L_bn_mul_add_words_tail:
- lduw [%o0],%g1
- mulx %o3,%g2,%g2
- add %g1,%o5,%o4
- dec %o2
- add %o4,%g2,%o4
- srlx %o4,32,%o5
- brz,pt %o2,.L_bn_mul_add_words_return
- stuw %o4,[%o0]
-
- lduw [%o1+4],%g2
- lduw [%o0+4],%g1
- mulx %o3,%g2,%g2
- add %g1,%o5,%o4
- dec %o2
- add %o4,%g2,%o4
- srlx %o4,32,%o5
- brz,pt %o2,.L_bn_mul_add_words_return
- stuw %o4,[%o0+4]
-
- lduw [%o1+8],%g2
- lduw [%o0+8],%g1
- mulx %o3,%g2,%g2
- add %g1,%o5,%o4
- add %o4,%g2,%o4
- stuw %o4,[%o0+8]
- retl
- srlx %o4,32,%o0
-
-.type bn_mul_add_words,#function
-.size bn_mul_add_words,(.-bn_mul_add_words)
-
-.align 32
-
-.global bn_mul_words
-/*
- * BN_ULONG bn_mul_words(rp,ap,num,w)
- * BN_ULONG *rp,*ap;
- * int num;
- * BN_ULONG w;
- */
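
Same loop as bn_mul_add_words above, minus the read-modify-write of rp; as a
sketch:

    typedef unsigned int BN_ULONG;	/* 32-bit words in the v8plus build */

    BN_ULONG
    bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
    {
        unsigned long long acc = 0;

        while (num-- > 0) {
            acc += (unsigned long long)*ap++ * w;
            *rp++ = (BN_ULONG)acc;
            acc >>= 32;
        }
        return (BN_ULONG)acc;
    }
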
-bn_mul_words:
- sra %o2,%g0,%o2 ! signx %o2
- brgz,a %o2,.L_bn_mul_words_proceeed
- lduw [%o1],%g2
- retl
- clr %o0
- nop
- nop
- nop
-
-.L_bn_mul_words_proceeed:
- srl %o3,%g0,%o3 ! clruw %o3
- andcc %o2,-4,%g0
- bz,pn %icc,.L_bn_mul_words_tail
- clr %o5
-
-.L_bn_mul_words_loop: ! wow! 32 aligned!
- lduw [%o1+4],%g3
- mulx %o3,%g2,%g2
- add %g2,%o5,%o4
- nop
- stuw %o4,[%o0]
- srlx %o4,32,%o5
-
- lduw [%o1+8],%g2
- mulx %o3,%g3,%g3
- add %g3,%o5,%o4
- dec 4,%o2
- stuw %o4,[%o0+4]
- srlx %o4,32,%o5
-
- lduw [%o1+12],%g3
- mulx %o3,%g2,%g2
- add %g2,%o5,%o4
- inc 16,%o1
- stuw %o4,[%o0+8]
- srlx %o4,32,%o5
-
- mulx %o3,%g3,%g3
- add %g3,%o5,%o4
- inc 16,%o0
- stuw %o4,[%o0-4]
- srlx %o4,32,%o5
- andcc %o2,-4,%g0
- bnz,a,pt %icc,.L_bn_mul_words_loop
- lduw [%o1],%g2
- nop
- nop
-
- brnz,a,pn %o2,.L_bn_mul_words_tail
- lduw [%o1],%g2
-.L_bn_mul_words_return:
- retl
- mov %o5,%o0
-
-.L_bn_mul_words_tail:
- mulx %o3,%g2,%g2
- add %g2,%o5,%o4
- dec %o2
- srlx %o4,32,%o5
- brz,pt %o2,.L_bn_mul_words_return
- stuw %o4,[%o0]
-
- lduw [%o1+4],%g2
- mulx %o3,%g2,%g2
- add %g2,%o5,%o4
- dec %o2
- srlx %o4,32,%o5
- brz,pt %o2,.L_bn_mul_words_return
- stuw %o4,[%o0+4]
-
- lduw [%o1+8],%g2
- mulx %o3,%g2,%g2
- add %g2,%o5,%o4
- stuw %o4,[%o0+8]
- retl
- srlx %o4,32,%o0
-
-.type bn_mul_words,#function
-.size bn_mul_words,(.-bn_mul_words)
-
-.align 32
-.global bn_sqr_words
-/*
- * void bn_sqr_words(r,a,n)
- * BN_ULONG *r,*a;
- * int n;
- */
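
Each input word is squared independently and spread over two output words; a
sketch:

    typedef unsigned int BN_ULONG;	/* 32-bit words in the v8plus build */

    void
    bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
    {
        while (n-- > 0) {
            unsigned long long t = (unsigned long long)*a * *a;

            a++;
            *r++ = (BN_ULONG)t;		/* low half  */
            *r++ = (BN_ULONG)(t >> 32);	/* high half */
        }
    }
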
-bn_sqr_words:
- sra %o2,%g0,%o2 ! signx %o2
- brgz,a %o2,.L_bn_sqr_words_proceeed
- lduw [%o1],%g2
- retl
- clr %o0
- nop
- nop
- nop
-
-.L_bn_sqr_words_proceeed:
- andcc %o2,-4,%g0
- nop
- bz,pn %icc,.L_bn_sqr_words_tail
- nop
-
-.L_bn_sqr_words_loop: ! wow! 32 aligned!
- lduw [%o1+4],%g3
- mulx %g2,%g2,%o4
- stuw %o4,[%o0]
- srlx %o4,32,%o5
- stuw %o5,[%o0+4]
- nop
-
- lduw [%o1+8],%g2
- mulx %g3,%g3,%o4
- dec 4,%o2
- stuw %o4,[%o0+8]
- srlx %o4,32,%o5
- stuw %o5,[%o0+12]
-
- lduw [%o1+12],%g3
- mulx %g2,%g2,%o4
- srlx %o4,32,%o5
- stuw %o4,[%o0+16]
- inc 16,%o1
- stuw %o5,[%o0+20]
-
- mulx %g3,%g3,%o4
- inc 32,%o0
- stuw %o4,[%o0-8]
- srlx %o4,32,%o5
- andcc %o2,-4,%g2
- stuw %o5,[%o0-4]
- bnz,a,pt %icc,.L_bn_sqr_words_loop
- lduw [%o1],%g2
- nop
-
- brnz,a,pn %o2,.L_bn_sqr_words_tail
- lduw [%o1],%g2
-.L_bn_sqr_words_return:
- retl
- clr %o0
-
-.L_bn_sqr_words_tail:
- mulx %g2,%g2,%o4
- dec %o2
- stuw %o4,[%o0]
- srlx %o4,32,%o5
- brz,pt %o2,.L_bn_sqr_words_return
- stuw %o5,[%o0+4]
-
- lduw [%o1+4],%g2
- mulx %g2,%g2,%o4
- dec %o2
- stuw %o4,[%o0+8]
- srlx %o4,32,%o5
- brz,pt %o2,.L_bn_sqr_words_return
- stuw %o5,[%o0+12]
-
- lduw [%o1+8],%g2
- mulx %g2,%g2,%o4
- srlx %o4,32,%o5
- stuw %o4,[%o0+16]
- stuw %o5,[%o0+20]
- retl
- clr %o0
-
-.type bn_sqr_words,#function
-.size bn_sqr_words,(.-bn_sqr_words)
-
-.align 32
-.global bn_div_words
-/*
- * BN_ULONG bn_div_words(h,l,d)
- * BN_ULONG h,l,d;
- */
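
This one really is a single 64-by-32 divide; the C equivalent of the handful of
instructions below, with the quotient truncated to 32 bits exactly as the final
clruw does:

    typedef unsigned int BN_ULONG;	/* 32-bit words in the v8plus build */

    BN_ULONG
    bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
    {
        unsigned long long n = ((unsigned long long)h << 32) | l;

        return (BN_ULONG)(n / d);	/* sllx/or, udivx, clruw */
    }
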
-bn_div_words:
- sllx %o0,32,%o0
- or %o0,%o1,%o0
- udivx %o0,%o2,%o0
- retl
- srl %o0,%g0,%o0 ! clruw %o0
-
-.type bn_div_words,#function
-.size bn_div_words,(.-bn_div_words)
-
-.align 32
-
-.global bn_add_words
-/*
- * BN_ULONG bn_add_words(rp,ap,bp,n)
- * BN_ULONG *rp,*ap,*bp;
- * int n;
- */
-bn_add_words:
- sra %o3,%g0,%o3 ! signx %o3
- brgz,a %o3,.L_bn_add_words_proceed
- lduw [%o1],%o4
- retl
- clr %o0
-
-.L_bn_add_words_proceed:
- andcc %o3,-4,%g0
- bz,pn %icc,.L_bn_add_words_tail
- addcc %g0,0,%g0 ! clear carry flag
-
-.L_bn_add_words_loop: ! wow! 32 aligned!
- dec 4,%o3
- lduw [%o2],%o5
- lduw [%o1+4],%g1
- lduw [%o2+4],%g2
- lduw [%o1+8],%g3
- lduw [%o2+8],%g4
- addccc %o5,%o4,%o5
- stuw %o5,[%o0]
-
- lduw [%o1+12],%o4
- lduw [%o2+12],%o5
- inc 16,%o1
- addccc %g1,%g2,%g1
- stuw %g1,[%o0+4]
-
- inc 16,%o2
- addccc %g3,%g4,%g3
- stuw %g3,[%o0+8]
-
- inc 16,%o0
- addccc %o5,%o4,%o5
- stuw %o5,[%o0-4]
- and %o3,-4,%g1
- brnz,a,pt %g1,.L_bn_add_words_loop
- lduw [%o1],%o4
-
- brnz,a,pn %o3,.L_bn_add_words_tail
- lduw [%o1],%o4
-.L_bn_add_words_return:
- clr %o0
- retl
- movcs %icc,1,%o0
- nop
-
-.L_bn_add_words_tail:
- lduw [%o2],%o5
- dec %o3
- addccc %o5,%o4,%o5
- brz,pt %o3,.L_bn_add_words_return
- stuw %o5,[%o0]
-
- lduw [%o1+4],%o4
- lduw [%o2+4],%o5
- dec %o3
- addccc %o5,%o4,%o5
- brz,pt %o3,.L_bn_add_words_return
- stuw %o5,[%o0+4]
-
- lduw [%o1+8],%o4
- lduw [%o2+8],%o5
- addccc %o5,%o4,%o5
- stuw %o5,[%o0+8]
- clr %o0
- retl
- movcs %icc,1,%o0
-
-.type bn_add_words,#function
-.size bn_add_words,(.-bn_add_words)
-
-.global bn_sub_words
-/*
- * BN_ULONG bn_sub_words(rp,ap,bp,n)
- * BN_ULONG *rp,*ap,*bp;
- * int n;
- */
-bn_sub_words:
- sra %o3,%g0,%o3 ! signx %o3
- brgz,a %o3,.L_bn_sub_words_proceed
- lduw [%o1],%o4
- retl
- clr %o0
-
-.L_bn_sub_words_proceed:
- andcc %o3,-4,%g0
- bz,pn %icc,.L_bn_sub_words_tail
- addcc %g0,0,%g0 ! clear carry flag
-
-.L_bn_sub_words_loop: ! wow! 32 aligned!
- dec 4,%o3
- lduw [%o2],%o5
- lduw [%o1+4],%g1
- lduw [%o2+4],%g2
- lduw [%o1+8],%g3
- lduw [%o2+8],%g4
- subccc %o4,%o5,%o5
- stuw %o5,[%o0]
-
- lduw [%o1+12],%o4
- lduw [%o2+12],%o5
- inc 16,%o1
- subccc %g1,%g2,%g2
- stuw %g2,[%o0+4]
-
- inc 16,%o2
- subccc %g3,%g4,%g4
- stuw %g4,[%o0+8]
-
- inc 16,%o0
- subccc %o4,%o5,%o5
- stuw %o5,[%o0-4]
- and %o3,-4,%g1
- brnz,a,pt %g1,.L_bn_sub_words_loop
- lduw [%o1],%o4
-
- brnz,a,pn %o3,.L_bn_sub_words_tail
- lduw [%o1],%o4
-.L_bn_sub_words_return:
- clr %o0
- retl
- movcs %icc,1,%o0
- nop
-
-.L_bn_sub_words_tail: ! wow! 32 aligned!
- lduw [%o2],%o5
- dec %o3
- subccc %o4,%o5,%o5
- brz,pt %o3,.L_bn_sub_words_return
- stuw %o5,[%o0]
-
- lduw [%o1+4],%o4
- lduw [%o2+4],%o5
- dec %o3
- subccc %o4,%o5,%o5
- brz,pt %o3,.L_bn_sub_words_return
- stuw %o5,[%o0+4]
-
- lduw [%o1+8],%o4
- lduw [%o2+8],%o5
- subccc %o4,%o5,%o5
- stuw %o5,[%o0+8]
- clr %o0
- retl
- movcs %icc,1,%o0
-
-.type bn_sub_words,#function
-.size bn_sub_words,(.-bn_sub_words)
-
-/*
- * Code below depends on the fact that upper parts of the %l0-%l7
- * and %i0-%i7 are zeroed by kernel after context switch. In
- * previous versions this comment stated that "the trouble is that
- * it's not feasible to implement the mumbo-jumbo in less V9
- * instructions:-(" which apparently isn't true thanks to
- * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
- * results not from the shorter code, but from elimination of
- * multicycle non-pairable 'rd %y,%rd' instructions.
- *
- * Andy.
- */
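
The 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pair mentioned above is the carry idiom
used throughout the comba code that follows: c_12 is the 64-bit running column
sum, and c_3 collects carries pre-shifted by 32 bits (t_2 holds 1<<32), so that
'or c_12,c_3,c_12' after a column is emitted rebuilds the next column's
starting value. In C terms each accumulation step is roughly the hypothetical
helper below:

    /* One 'addcc; bcs,a; add' step of the carry idiom. */
    static void
    acc_step(unsigned long long *c_12, unsigned long long *c_3,
        unsigned long long t_1)
    {
        unsigned long long old = *c_12;

        *c_12 += t_1;			/* addcc c_12,t_1,c_12 */
        if (*c_12 < old)		/* bcs,a %xcc,.+8      */
            *c_3 += 1ULL << 32;		/* add   c_3,t_2,c_3   */
    }
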
-
-/*
- * Here is register usage map for *all* routines below.
- */
-#define t_1 %o0
-#define t_2 %o1
-#define c_12 %o2
-#define c_3 %o3
-
-#define ap(I) [%i1+4*I]
-#define bp(I) [%i2+4*I]
-#define rp(I) [%i0+4*I]
-
-#define a_0 %l0
-#define a_1 %l1
-#define a_2 %l2
-#define a_3 %l3
-#define a_4 %l4
-#define a_5 %l5
-#define a_6 %l6
-#define a_7 %l7
-
-#define b_0 %i3
-#define b_1 %i4
-#define b_2 %i5
-#define b_3 %o4
-#define b_4 %o5
-#define b_5 %o7
-#define b_6 %g1
-#define b_7 %g4
-
-.align 32
-.global bn_mul_comba8
-/*
- * void bn_mul_comba8(r,a,b)
- * BN_ULONG *r,*a,*b;
- */
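
The fully unrolled code below computes the 8x8-word product column by column
("comba" style): r[k] collects every a[i]*b[j] with i+j == k plus the carry out
of the previous column. The loop version of the same arithmetic, as a sketch:

    typedef unsigned int BN_ULONG;	/* 32-bit words in the v8plus build */

    void
    bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    {
        unsigned long long acc = 0;	/* low 64 bits of the column sum */
        BN_ULONG c3 = 0;		/* overflow word above the accumulator */
        int i, k;

        for (k = 0; k < 15; k++) {
            for (i = (k > 7 ? k - 7 : 0); i <= k && i < 8; i++) {
                unsigned long long t = (unsigned long long)a[i] * b[k - i];

                acc += t;
                c3 += (acc < t);	/* carry out of the 64-bit sum */
            }
            r[k] = (BN_ULONG)acc;	/* emit one result word per column */
            acc = (acc >> 32) | ((unsigned long long)c3 << 32);
            c3 = 0;
        }
        r[15] = (BN_ULONG)acc;
    }
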
-bn_mul_comba8:
- save %sp,FRAME_SIZE,%sp
- mov 1,t_2
- lduw ap(0),a_0
- sllx t_2,32,t_2
- lduw bp(0),b_0 !=
- lduw bp(1),b_1
- mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
- srlx t_1,32,c_12
- stuw t_1,rp(0) !=!r[0]=c1;
-
- lduw ap(1),a_1
- mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(2),a_2
- mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12 !=
- stuw t_1,rp(1) !r[1]=c2;
- or c_12,c_3,c_12
-
- mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
- addcc c_12,t_1,c_12 !=
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw bp(2),b_2 !=
- mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- lduw bp(3),b_3
- mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(2) !r[2]=c3;
- or c_12,c_3,c_12 !=
-
- mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- lduw ap(3),a_3
- mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
- addcc c_12,t_1,c_12 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(4),a_4
- mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12 !=
- stuw t_1,rp(3) !r[3]=c1;
- or c_12,c_3,c_12
-
- mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
- addcc c_12,t_1,c_12 !=
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw bp(4),b_4 !=
- mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- lduw bp(5),b_5
- mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(4) !r[4]=c2;
- or c_12,c_3,c_12 !=
-
- mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- lduw ap(5),a_5
- mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
- addcc c_12,t_1,c_12 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(6),a_6
- mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12 !=
- stuw t_1,rp(5) !r[5]=c3;
- or c_12,c_3,c_12
-
- mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
- addcc c_12,t_1,c_12 !=
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw bp(6),b_6 !=
- mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- lduw bp(7),b_7
- mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(6) !r[6]=c1;
- or c_12,c_3,c_12 !=
-
- mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- lduw ap(7),a_7
- mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12 !=
- stuw t_1,rp(7) !r[7]=c2;
- or c_12,c_3,c_12
-
- mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- srlx t_1,32,c_12
- stuw t_1,rp(8) !r[8]=c3;
- or c_12,c_3,c_12
-
- mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(9) !r[9]=c1;
- or c_12,c_3,c_12 !=
-
- mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(10) !r[10]=c2;
- or c_12,c_3,c_12 !=
-
- mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(11) !r[11]=c3;
- or c_12,c_3,c_12 !=
-
- mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(12) !r[12]=c1;
- or c_12,c_3,c_12 !=
-
- mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- st t_1,rp(13) !r[13]=c2;
- or c_12,c_3,c_12 !=
-
- mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
- addcc c_12,t_1,t_1
- srlx t_1,32,c_12 !=
- stuw t_1,rp(14) !r[14]=c3;
- stuw c_12,rp(15) !r[15]=c1;
-
- ret
- restore %g0,%g0,%o0 !=
-
-.type bn_mul_comba8,#function
-.size bn_mul_comba8,(.-bn_mul_comba8)
-
-.align 32
-
-.global bn_mul_comba4
-/*
- * void bn_mul_comba4(r,a,b)
- * BN_ULONG *r,*a,*b;
- */
-bn_mul_comba4:
- save %sp,FRAME_SIZE,%sp
- lduw ap(0),a_0
- mov 1,t_2
- lduw bp(0),b_0
- sllx t_2,32,t_2 !=
- lduw bp(1),b_1
- mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
- srlx t_1,32,c_12
- stuw t_1,rp(0) !=!r[0]=c1;
-
- lduw ap(1),a_1
- mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(2),a_2
- mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12 !=
- stuw t_1,rp(1) !r[1]=c2;
- or c_12,c_3,c_12
-
- mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
- addcc c_12,t_1,c_12 !=
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw bp(2),b_2 !=
- mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3 !=
- lduw bp(3),b_3
- mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(2) !r[2]=c3;
- or c_12,c_3,c_12 !=
-
- mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8 !=
- add c_3,t_2,c_3
- lduw ap(3),a_3
- mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
- addcc c_12,t_1,c_12 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
- addcc c_12,t_1,t_1 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(3) !=!r[3]=c1;
- or c_12,c_3,c_12
-
- mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
- addcc c_12,t_1,c_12 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
- addcc c_12,t_1,t_1 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(4) !=!r[4]=c2;
- or c_12,c_3,c_12
-
- mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
- addcc c_12,t_1,t_1 !=
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(5) !=!r[5]=c3;
- or c_12,c_3,c_12
-
- mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
- addcc c_12,t_1,t_1
- srlx t_1,32,c_12 !=
- stuw t_1,rp(6) !r[6]=c1;
- stuw c_12,rp(7) !r[7]=c2;
-
- ret
- restore %g0,%g0,%o0
-
-.type bn_mul_comba4,#function
-.size bn_mul_comba4,(.-bn_mul_comba4)
-
-.align 32
-
-.global bn_sqr_comba8
-bn_sqr_comba8:
- save %sp,FRAME_SIZE,%sp
- mov 1,t_2
- lduw ap(0),a_0
- sllx t_2,32,t_2
- lduw ap(1),a_1
- mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
- srlx t_1,32,c_12
- stuw t_1,rp(0) !r[0]=c1;
-
- lduw ap(2),a_2
- mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(1) !r[1]=c2;
- or c_12,c_3,c_12
-
- mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(3),a_3
- mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(2) !r[2]=c3;
- or c_12,c_3,c_12
-
- mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(4),a_4
- mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- st t_1,rp(3) !r[3]=c1;
- or c_12,c_3,c_12
-
- mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(5),a_5
- mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(4) !r[4]=c2;
- or c_12,c_3,c_12
-
- mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(6),a_6
- mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(5) !r[5]=c3;
- or c_12,c_3,c_12
-
- mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(7),a_7
- mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(6) !r[6]=c1;
- or c_12,c_3,c_12
-
- mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(7) !r[7]=c2;
- or c_12,c_3,c_12
-
- mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(8) !r[8]=c3;
- or c_12,c_3,c_12
-
- mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(9) !r[9]=c1;
- or c_12,c_3,c_12
-
- mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(10) !r[10]=c2;
- or c_12,c_3,c_12
-
- mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(11) !r[11]=c3;
- or c_12,c_3,c_12
-
- mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(12) !r[12]=c1;
- or c_12,c_3,c_12
-
- mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(13) !r[13]=c2;
- or c_12,c_3,c_12
-
- mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
- addcc c_12,t_1,t_1
- srlx t_1,32,c_12
- stuw t_1,rp(14) !r[14]=c3;
- stuw c_12,rp(15) !r[15]=c1;
-
- ret
- restore %g0,%g0,%o0
-
-.type bn_sqr_comba8,#function
-.size bn_sqr_comba8,(.-bn_sqr_comba8)
-
-.align 32
-
-.global bn_sqr_comba4
-/*
- * void bn_sqr_comba4(r,a)
- * BN_ULONG *r,*a;
- */
-bn_sqr_comba4:
- save %sp,FRAME_SIZE,%sp
- mov 1,t_2
- lduw ap(0),a_0
- sllx t_2,32,t_2
- lduw ap(1),a_1
- mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
- srlx t_1,32,c_12
- stuw t_1,rp(0) !r[0]=c1;
-
- lduw ap(2),a_2
- mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(1) !r[1]=c2;
- or c_12,c_3,c_12
-
- mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- lduw ap(3),a_3
- mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(2) !r[2]=c3;
- or c_12,c_3,c_12
-
- mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(3) !r[3]=c1;
- or c_12,c_3,c_12
-
- mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,c_12
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(4) !r[4]=c2;
- or c_12,c_3,c_12
-
- mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
- addcc c_12,t_1,c_12
- clr c_3
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- addcc c_12,t_1,t_1
- bcs,a %xcc,.+8
- add c_3,t_2,c_3
- srlx t_1,32,c_12
- stuw t_1,rp(5) !r[5]=c3;
- or c_12,c_3,c_12
-
- mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
- addcc c_12,t_1,t_1
- srlx t_1,32,c_12
- stuw t_1,rp(6) !r[6]=c1;
- stuw c_12,rp(7) !r[7]=c2;
-
- ret
- restore %g0,%g0,%o0
-
-.type bn_sqr_comba4,#function
-.size bn_sqr_comba4,(.-bn_sqr_comba4)
-
-.align 32
+++ /dev/null
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# December 2005
-#
-# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
-# for undertaking this effort are multiple. First of all, UltraSPARC is
-# not the whole SPARCv9 universe, and other VIS-free implementations
-# deserve optimized code just as much. Secondly, the newly introduced
-# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
-# FPU-intensive paths such as sparcv9a-mont will simply sink it. Yes,
-# T1 is equipped with several integrated RSA/DSA accelerator circuits
-# accessible through a kernel driver [only(*)], but having a decent
-# user-land software implementation is important too. Finally, there
-# was the desire to experiment with a dedicated squaring procedure. Yes,
-# this module implements one, because it was easiest to draft it in
-# SPARCv9 instructions...
-
-# (*)	An engine accessing the driver in question is on my TODO list.
-#	For reference, the accelerator is estimated to give a 6 to 10
-#	times improvement on single-threaded RSA sign. It should be
-#	noted that a 6-10x improvement coefficient does not actually
-#	mean something extraordinary in terms of absolute
-#	[single-threaded] performance, as the SPARCv9 instruction set
-#	is among the least suitable for high-performance crypto of the
-#	64-bit platforms. The 6-10x factor simply places T1 in the same
-#	performance domain as, say, AMD64 and IA-64. The improvement in
-#	RSA verify doesn't appear impressive at all, but it's the sign
-#	operation which is far more critical/interesting.
-
-# You might notice that inner loops are modulo-scheduled:-) This has
-# essentially negligible impact on UltraSPARC performance; it's
-# Fujitsu SPARC64 V users who should notice and hopefully appreciate
-# the advantage... Currently this module surpasses sparcv9a-mont.pl
-# by ~20% on UltraSPARC-III and later cores, but recall that the
-# sparcv9a module still has hidden potential [see the TODO list there],
-# which is estimated to be larger than 20%...
-
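Before the register map, here is a minimal C sketch of the word-serial Montgomery multiplication this module computes with 32-bit limbs: rp = ap*bp*2^(-32*num) mod np, with the usual Montgomery constant n0 = -np[0]^(-1) mod 2^32. The arithmetic matches the .L1st/.Linner passes and the .Lsub/.Lcopy conditional subtraction below, but the assembly fuses and modulo-schedules the two multiply-accumulate streams; this is an illustrative reference, not the module itself.

#include <stdint.h>
#include <string.h>

/* rp[] = ap[] * bp[] * 2^(-32*num) mod np[], n0 = -np[0]^-1 mod 2^32 */
static int mul_mont32(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                      const uint32_t *np, uint32_t n0, int num)
{
	uint32_t tp[num];		/* t[], low num words ...           */
	uint64_t top = 0;		/* ... with the high part held here */
	uint64_t c, d;
	uint32_t m;
	int i, j, borrow;

	if (num < 4)			/* same "128 bits minimum" guard    */
		return 0;
	memset(tp, 0, sizeof(tp));

	for (i = 0; i < num; i++) {
		/* one pass of: t += ap*bp[i]; m = t[0]*n0; t += np*m; t >>= 32 */
		c = (uint64_t)ap[0] * bp[i] + tp[0];
		m = (uint32_t)c * n0;
		d = (uint64_t)np[0] * m + (uint32_t)c;	/* low word becomes 0 */
		c >>= 32;
		d >>= 32;
		for (j = 1; j < num; j++) {
			c += (uint64_t)ap[j] * bp[i] + tp[j];
			d += (uint64_t)np[j] * m + (uint32_t)c;
			tp[j - 1] = (uint32_t)d;
			c >>= 32;
			d >>= 32;
		}
		top += c + d;
		tp[num - 1] = (uint32_t)top;
		top >>= 32;
	}

	/* conditional final subtraction, as in .Lsub/.Lcopy */
	borrow = 0;
	for (j = 0; j < num; j++) {
		d = (uint64_t)tp[j] - np[j] - borrow;
		rp[j] = (uint32_t)d;
		borrow = (int)((d >> 63) & 1);
	}
	if (borrow && top == 0)		/* t < np: keep t instead of t - np */
		memcpy(rp, tp, num * sizeof(tp[0]));
	return 1;
}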
-# int bn_mul_mont(
-$rp="%i0"; # BN_ULONG *rp,
-$ap="%i1"; # const BN_ULONG *ap,
-$bp="%i2"; # const BN_ULONG *bp,
-$np="%i3"; # const BN_ULONG *np,
-$n0="%i4"; # const BN_ULONG *n0,
-$num="%i5"; # int num);
-
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=128; }
-
-$car0="%o0";
-$car1="%o1";
-$car2="%o2"; # 1 bit
-$acc0="%o3";
-$acc1="%o4";
-$mask="%g1"; # 32 bits, what a waste...
-$tmp0="%g4";
-$tmp1="%g5";
-
-$i="%l0";
-$j="%l1";
-$mul0="%l2";
-$mul1="%l3";
-$tp="%l4";
-$apj="%l5";
-$npj="%l6";
-$tpj="%l7";
-
-$fname="bn_mul_mont_int";
-
-$code=<<___;
-.section ".text",#alloc,#execinstr
-
-.global $fname
-.align 32
-$fname:
- cmp %o5,4 ! 128 bits minimum
- bge,pt %icc,.Lenter
- sethi %hi(0xffffffff),$mask
- retl
- clr %o0
-.align 32
-.Lenter:
- save %sp,-$frame,%sp
- sll $num,2,$num ! num*=4
- or $mask,%lo(0xffffffff),$mask
- ld [$n0],$n0
- cmp $ap,$bp
- and $num,$mask,$num
- ld [$bp],$mul0 ! bp[0]
- nop
-
- add %sp,$bias,%o7 ! real top of stack
- ld [$ap],$car0 ! ap[0] ! redundant in squaring context
- sub %o7,$num,%o7
- ld [$ap+4],$apj ! ap[1]
- and %o7,-1024,%o7
- ld [$np],$car1 ! np[0]
- sub %o7,$bias,%sp ! alloca
- ld [$np+4],$npj ! np[1]
- be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
- mov 12,$j
-
- mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
- mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
- and $car0,$mask,$acc0
- add %sp,$bias+$frame,$tp
- ld [$ap+8],$apj !prologue!
-
- mulx $n0,$acc0,$mul1 ! "t[0]"*n0
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
- mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- ld [$np+8],$npj !prologue!
- srlx $car1,32,$car1
- mov $tmp0,$acc0 !prologue!
-
-.L1st:
- mulx $apj,$mul0,$tmp0
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- add $acc1,$car1,$car1
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- add $j,4,$j ! j++
- mov $tmp0,$acc0
- st $car1,[$tp]
- cmp $j,$num
- mov $tmp1,$acc1
- srlx $car1,32,$car1
- bl %icc,.L1st
- add $tp,4,$tp ! tp++
-!.L1st
-
- mulx $apj,$mul0,$tmp0 !epilogue!
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0
- and $car0,$mask,$acc0
- add $acc1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $tmp0,$car0,$car0
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car1
-
- add $car0,$car1,$car1
- st $car1,[$tp+8]
- srlx $car1,32,$car2
-\f
- mov 4,$i ! i++
- ld [$bp+4],$mul0 ! bp[1]
-.Louter:
- add %sp,$bias+$frame,$tp
- ld [$ap],$car0 ! ap[0]
- ld [$ap+4],$apj ! ap[1]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- ld [$tp],$tmp1 ! tp[0]
- ld [$tp+4],$tpj ! tp[1]
- mov 12,$j
-
- mulx $car0,$mul0,$car0
- mulx $apj,$mul0,$tmp0 !prologue!
- add $tmp1,$car0,$car0
- ld [$ap+8],$apj !prologue!
- and $car0,$mask,$acc0
-
- mulx $n0,$acc0,$mul1
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1
- mulx $npj,$mul1,$acc1 !prologue!
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- ld [$np+8],$npj !prologue!
- srlx $car1,32,$car1
- mov $tmp0,$acc0 !prologue!
-
-.Linner:
- mulx $apj,$mul0,$tmp0
- mulx $npj,$mul1,$tmp1
- add $tpj,$car0,$car0
- ld [$ap+$j],$apj ! ap[j]
- add $acc0,$car0,$car0
- add $acc1,$car1,$car1
- ld [$np+$j],$npj ! np[j]
- and $car0,$mask,$acc0
- ld [$tp+8],$tpj ! tp[j]
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- add $j,4,$j ! j++
- mov $tmp0,$acc0
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
- mov $tmp1,$acc1
- cmp $j,$num
- bl %icc,.Linner
- add $tp,4,$tp ! tp++
-!.Linner
-
- mulx $apj,$mul0,$tmp0 !epilogue!
- mulx $npj,$mul1,$tmp1
- add $tpj,$car0,$car0
- add $acc0,$car0,$car0
- ld [$tp+8],$tpj ! tp[j]
- and $car0,$mask,$acc0
- add $acc1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
-
- add $tpj,$car0,$car0
- add $tmp0,$car0,$car0
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- add $acc0,$car1,$car1
- st $car1,[$tp+4] ! tp[j-1]
- srlx $car0,32,$car0
- add $i,4,$i ! i++
- srlx $car1,32,$car1
-
- add $car0,$car1,$car1
- cmp $i,$num
- add $car2,$car1,$car1
- st $car1,[$tp+8]
-
- srlx $car1,32,$car2
- bl,a %icc,.Louter
- ld [$bp+$i],$mul0 ! bp[i]
-!.Louter
-
- add $tp,12,$tp
-\f
-.Ltail:
- add $np,$num,$np
- add $rp,$num,$rp
- mov $tp,$ap
- sub %g0,$num,%o7 ! k=-num
- ba .Lsub
- subcc %g0,%g0,%g0 ! clear %icc.c
-.align 16
-.Lsub:
- ld [$tp+%o7],%o0
- ld [$np+%o7],%o1
- subccc %o0,%o1,%o1 ! tp[j]-np[j]
- add $rp,%o7,$i
- add %o7,4,%o7
- brnz %o7,.Lsub
- st %o1,[$i]
- subc $car2,0,$car2 ! handle upmost overflow bit
- and $tp,$car2,$ap
- andn $rp,$car2,$np
- or $ap,$np,$ap
- sub %g0,$num,%o7
-
-.Lcopy:
- ld [$ap+%o7],%o0 ! copy or in-place refresh
- st %g0,[$tp+%o7] ! zap tp
- st %o0,[$rp+%o7]
- add %o7,4,%o7
- brnz %o7,.Lcopy
- nop
- mov 1,%i0
- ret
- restore
-___
-\f
-########
-######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
-######## code without the dedicated squaring procedure that follows.
-########
-$sbit="%i2"; # re-use $bp!
-
-$code.=<<___;
-.align 32
-.Lbn_sqr_mont:
- mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
- mulx $apj,$mul0,$tmp0 !prologue!
- and $car0,$mask,$acc0
- add %sp,$bias+$frame,$tp
- ld [$ap+8],$apj !prologue!
-
- mulx $n0,$acc0,$mul1 ! "t[0]"*n0
- srlx $car0,32,$car0
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
- mulx $npj,$mul1,$acc1 !prologue!
- and $car0,1,$sbit
- ld [$np+8],$npj !prologue!
- srlx $car0,1,$car0
- add $acc0,$car1,$car1
- srlx $car1,32,$car1
- mov $tmp0,$acc0 !prologue!
-
-.Lsqr_1st:
- mulx $apj,$mul0,$tmp0
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0 ! ap[j]*a0+c0
- add $acc1,$car1,$car1
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- mov $tmp1,$acc1
- srlx $acc0,32,$sbit
- add $j,4,$j ! j++
- and $acc0,$mask,$acc0
- cmp $j,$num
- add $acc0,$car1,$car1
- st $car1,[$tp]
- mov $tmp0,$acc0
- srlx $car1,32,$car1
- bl %icc,.Lsqr_1st
- add $tp,4,$tp ! tp++
-!.Lsqr_1st
-
- mulx $apj,$mul0,$tmp0 ! epilogue
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0 ! ap[j]*a0+c0
- add $acc1,$car1,$car1
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $tmp0,$car0,$car0 ! ap[j]*a0+c0
- add $tmp1,$car1,$car1
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- st $car1,[$tp+8]
- srlx $car1,32,$car2
-\f
- ld [%sp+$bias+$frame],$tmp0 ! tp[0]
- ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
- ld [%sp+$bias+$frame+8],$tpj ! tp[2]
- ld [$ap+4],$mul0 ! ap[1]
- ld [$ap+8],$apj ! ap[2]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- mulx $n0,$tmp0,$mul1
-
- mulx $mul0,$mul0,$car0
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1
- mulx $npj,$mul1,$acc1
- add $tmp0,$car1,$car1
- and $car0,$mask,$acc0
- ld [$np+8],$npj ! np[2]
- srlx $car1,32,$car1
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- and $car0,1,$sbit
- add $acc1,$car1,$car1
- srlx $car0,1,$car0
- mov 12,$j
- st $car1,[%sp+$bias+$frame] ! tp[0]=
- srlx $car1,32,$car1
- add %sp,$bias+$frame+4,$tp
-
-.Lsqr_2nd:
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $acc0,$car0,$car0
- add $tpj,$car1,$car1
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc1,$car1,$car1
- ld [$tp+8],$tpj ! tp[j]
- add $acc0,$acc0,$acc0
- add $j,4,$j ! j++
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- cmp $j,$num
- add $acc0,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_2nd
- add $tp,4,$tp ! tp++
-!.Lsqr_2nd
-
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $acc0,$car0,$car0
- add $tpj,$car1,$car1
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc1,$car1,$car1
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- add $car2,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car2
-\f
- ld [%sp+$bias+$frame],$tmp1 ! tp[0]
- ld [%sp+$bias+$frame+4],$tpj ! tp[1]
- ld [$ap+8],$mul0 ! ap[2]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- mulx $n0,$tmp1,$mul1
- and $mul1,$mask,$mul1
- mov 8,$i
-
- mulx $mul0,$mul0,$car0
- mulx $car1,$mul1,$car1
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add %sp,$bias+$frame,$tp
- srlx $car1,32,$car1
- and $car0,1,$sbit
- srlx $car0,1,$car0
- mov 4,$j
-
-.Lsqr_outer:
-.Lsqr_inner1:
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $j,4,$j
- ld [$tp+8],$tpj
- cmp $j,$i
- add $acc1,$car1,$car1
- ld [$np+$j],$npj
- st $car1,[$tp]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_inner1
- add $tp,4,$tp
-!.Lsqr_inner1
-
- add $j,4,$j
- ld [$ap+$j],$apj ! ap[j]
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- ld [$np+$j],$npj ! np[j]
- add $acc0,$car1,$car1
- ld [$tp+8],$tpj ! tp[j]
- add $acc1,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $j,4,$j
- cmp $j,$num
- be,pn %icc,.Lsqr_no_inner2
- add $tp,4,$tp
-
-.Lsqr_inner2:
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $acc0,$car0,$car0
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- ld [$tp+8],$tpj ! tp[j]
- or $sbit,$acc0,$acc0
- add $j,4,$j ! j++
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- cmp $j,$num
- add $acc0,$car1,$car1
- add $acc1,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_inner2
- add $tp,4,$tp ! tp++
-
-.Lsqr_no_inner2:
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $acc0,$car0,$car0
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- add $acc1,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- add $car2,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car2
-\f
- add $i,4,$i ! i++
- ld [%sp+$bias+$frame],$tmp1 ! tp[0]
- ld [%sp+$bias+$frame+4],$tpj ! tp[1]
- ld [$ap+$i],$mul0 ! ap[j]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- mulx $n0,$tmp1,$mul1
- and $mul1,$mask,$mul1
- add $i,4,$tmp0
-
- mulx $mul0,$mul0,$car0
- mulx $car1,$mul1,$car1
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add %sp,$bias+$frame,$tp
- srlx $car1,32,$car1
- and $car0,1,$sbit
- srlx $car0,1,$car0
-
- cmp $tmp0,$num ! i<num-1
- bl %icc,.Lsqr_outer
- mov 4,$j
-\f
-.Lsqr_last:
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $j,4,$j
- ld [$tp+8],$tpj
- cmp $j,$i
- add $acc1,$car1,$car1
- ld [$np+$j],$npj
- st $car1,[$tp]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_last
- add $tp,4,$tp
-!.Lsqr_last
-
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $acc0,$car1,$car1
- add $acc1,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0 ! recover $car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- add $car2,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car2
-
- ba .Ltail
- add $tp,8,$tp
-.type $fname,#function
-.size $fname,(.-$fname)
-___
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-print $code;
-close STDOUT;
+++ /dev/null
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# October 2005
-#
-# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
-# Because unlike the integer multiplier, which simply stalls the whole
-# CPU, the FPU is fully pipelined and can effectively emit a 48-bit
-# partial product every cycle. Why not blended SPARC v9? One can argue
-# that making this module dependent on the UltraSPARC VIS extension
-# limits its binary compatibility. Well yes, it does exclude SPARC64
-# prior-V(!) implementations from the compatibility matrix. But the
-# rest, the whole Sun UltraSPARC family and Fujitsu's brand-new SPARC64
-# V, all support the VIS extension instructions used in this module.
-# This is considered good enough not to care about HAL SPARC64 users
-# [if any], who have the integer-only pure SPARCv9 module to fall
-# back on.
-
-# USI&II cores currently exhibit a uniform 2x improvement [over the
-# pre-bn_mul_mont codebase] for all key lengths and benchmarks. On
-# USIII performance improves by a few percent for shorter keys and
-# worsens by a few percent for longer keys. This is because the USIII
-# integer multiplier is >3x faster than the USI&II one, which is harder
-# to match [but see the TODO list below]. It should also be noted that
-# SPARC64 V features out-of-order execution, which *might* mean that
-# its integer multiplier is pipelined, which in turn *might* be
-# impossible to match... As an additional note, SPARC64 V implements an
-# FP Multiply-Add instruction, which is perfectly usable in this
-# context... In other words, as far as Fujitsu SPARC64 V goes, talk to
-# the author:-)
-
-# The implementation implies the following "non-natural" limitations on
-# input arguments:
-# - num may not be less than 4;
-# - num has to be even;
-# Failure to meet either condition has no fatal effects, simply
-# doesn't give any performance gain.
-
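As a caller-side illustration of those limitations: under the assumption that, as in the entry code below, the routine returns 0 to signal "unsupported input value" and 1 on success, and that the integer-only bn_mul_mont_int from the previous module is the natural fallback, a dispatcher (the wrapper name is hypothetical) could look like this.

#include <stdint.h>

typedef uint32_t BN_ULONG;

int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                    const BN_ULONG *np, const BN_ULONG *n0, int num);
int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                    const BN_ULONG *np, const BN_ULONG *n0, int num);

/* hypothetical dispatcher: try the FPU path, fall back when it declines */
static int mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                    const BN_ULONG *np, const BN_ULONG *n0, int num)
{
	if (num >= 4 && (num & 1) == 0 &&
	    bn_mul_mont_fpu(rp, ap, bp, np, n0, num))
		return 1;
	return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}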
-# TODO:
-# - modulo-schedule inner loop for better performance (on in-order
-# execution core such as UltraSPARC this shall result in further
-# noticeable(!) improvement);
-# - dedicated squaring procedure[?];
-
-######################################################################
-# November 2006
-#
-# Modulo-scheduled inner loops allow interleaving of floating-point and
-# integer instructions and minimize read-after-write penalties. This
-# results in a *further* 20-50% performance improvement [depending on
-# key length, more for longer keys] on USI&II cores and 30-80% on
-# USIII&IV.
-
-$fname="bn_mul_mont_fpu";
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-
-if ($bits==64) {
- $bias=2047;
- $frame=192;
-} else {
- $bias=0;
- $frame=128; # 96 rounded up to largest known cache-line
-}
-$locals=64;
-
-# In order to provide for 32-/64-bit ABI duality, I keep integers wider
-# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
-# exclusively for pointers, indexes and other small values...
-# int bn_mul_mont(
-$rp="%i0"; # BN_ULONG *rp,
-$ap="%i1"; # const BN_ULONG *ap,
-$bp="%i2"; # const BN_ULONG *bp,
-$np="%i3"; # const BN_ULONG *np,
-$n0="%i4"; # const BN_ULONG *n0,
-$num="%i5"; # int num);
-
-$tp="%l0"; # t[num]
-$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
-$ap_h="%l2"; # to these four vectors as double-precision FP values.
-$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
-$np_h="%l4"; # loop and L1-cache aliasing is minimized...
-$i="%l5";
-$j="%l6";
-$mask="%l7"; # 16-bit mask, 0xffff
-
-$n0="%g4"; # reassigned(!) to "64-bit" register
-$carry="%i4"; # %i4 reused(!) for a carry bit
-
-# FP register naming chart
-#
-# ..HILO
-# dcba
-# --------
-# LOa
-# LOb
-# LOc
-# LOd
-# HIa
-# HIb
-# HIc
-# HId
-# ..a
-# ..b
-$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
-$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
-$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
-$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
-
-$dota="%f24"; $dotb="%f26";
-
-$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
-$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
-$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
-$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
-
-$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
-
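The FP path works on 16-bit limbs: each operand word is split so that every double-precision multiply produces an exact partial product of at most 48 bits, and the four column sums are later carried and packed back into 64-bit words by the srlx/and/sllx/or sequences annotated "%o3.%o2[0..15].%o1[0..15].%o0[0..15]" below. Here is a small C restatement of that repacking step (illustrative only; in the real code the column sums arrive as fdtox results through the stack scratch area).

#include <stdint.h>

/* pack four 16-bit-limb column sums o0..o3 back into one 64-bit word */
static uint64_t recombine(uint64_t o0, uint64_t o1, uint64_t o2, uint64_t o3,
                          uint64_t *carry34)
{
	const uint64_t mask = 0xffff;

	o1 += o0 >> 16;			/* propagate carries between columns */
	o2 += o1 >> 16;
	o3 += o2 >> 16;
	*carry34 = o3 >> 16;		/* the "34-bit carry" kept in %g1 */

	return (o0 & mask)		/* pack the four 16-bit limbs */
	     | ((o1 & mask) << 16)
	     | ((o2 & mask) << 32)
	     | (o3 << 48);		/* high bits of o3 fall off, as in sllx */
}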
-$code=<<___;
-.section ".text",#alloc,#execinstr
-
-.global $fname
-.align 32
-$fname:
- save %sp,-$frame-$locals,%sp
-
- cmp $num,4
- bl,a,pn %icc,.Lret
- clr %i0
- andcc $num,1,%g0 ! $num has to be even...
- bnz,a,pn %icc,.Lret
- clr %i0 ! signal "unsupported input value"
-
- srl $num,1,$num
- sethi %hi(0xffff),$mask
- ld [%i4+0],$n0 ! $n0 reassigned, remember?
- or $mask,%lo(0xffff),$mask
- ld [%i4+4],%o0
- sllx %o0,32,%o0
- or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
-
- sll $num,3,$num ! num*=8
-
- add %sp,$bias,%o0 ! real top of stack
- sll $num,2,%o1
- add %o1,$num,%o1 ! %o1=num*5
- sub %o0,%o1,%o0
- and %o0,-2048,%o0 ! optimize TLB utilization
- sub %o0,$bias,%sp ! alloca(5*num*8)
-
- rd %asi,%o7 ! save %asi
- add %sp,$bias+$frame+$locals,$tp
- add $tp,$num,$ap_l
- add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
- add $ap_l,$num,$ap_h
- add $ap_h,$num,$np_l
- add $np_l,$num,$np_h
-
- wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
-
- add $rp,$num,$rp ! readjust input pointers to point
- add $ap,$num,$ap ! at the ends too...
- add $bp,$num,$bp
- add $np,$num,$np
-
- stx %o7,[%sp+$bias+$frame+48] ! save %asi
-\f
- sub %g0,$num,$i ! i=-num
- sub %g0,$num,$j ! j=-num
-
- add $ap,$j,%o3
- add $bp,$i,%o4
-
- ld [%o3+4],%g1 ! bp[0]
- ld [%o3+0],%o0
- ld [%o4+4],%g5 ! ap[0]
- sllx %g1,32,%g1
- ld [%o4+0],%o1
- sllx %g5,32,%g5
- or %g1,%o0,%o0
- or %g5,%o1,%o1
-
- add $np,$j,%o5
-
- mulx %o1,%o0,%o0 ! ap[0]*bp[0]
- mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
- stx %o0,[%sp+$bias+$frame+0]
-
- ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
- fzeros $alo
- ld [%o3+4],$ahi_
- fzeros $ahi
- ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
- fzeros $nlo
- ld [%o5+4],$nhi_
- fzeros $nhi
-
- ! transfer b[i] to FPU as 4x16-bit values
- ldda [%o4+2]%asi,$ba
- fxtod $alo,$alo
- ldda [%o4+0]%asi,$bb
- fxtod $ahi,$ahi
- ldda [%o4+6]%asi,$bc
- fxtod $nlo,$nlo
- ldda [%o4+4]%asi,$bd
- fxtod $nhi,$nhi
-
- ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
- ldda [%sp+$bias+$frame+6]%asi,$na
- fxtod $ba,$ba
- ldda [%sp+$bias+$frame+4]%asi,$nb
- fxtod $bb,$bb
- ldda [%sp+$bias+$frame+2]%asi,$nc
- fxtod $bc,$bc
- ldda [%sp+$bias+$frame+0]%asi,$nd
- fxtod $bd,$bd
-
- std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
- fxtod $na,$na
- std $ahi,[$ap_h+$j]
- fxtod $nb,$nb
- std $nlo,[$np_l+$j] ! save smashed np[j] in double format
- fxtod $nc,$nc
- std $nhi,[$np_h+$j]
- fxtod $nd,$nd
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- fmuld $alo,$bd,$alod
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- fmuld $ahi,$ba,$ahia
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- fmuld $ahi,$bb,$ahib
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- fmuld $ahi,$bc,$ahic
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- fmuld $ahi,$bd,$ahid
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
-
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- add $j,8,$j
- std $nlob,[%sp+$bias+$frame+8]
- add $ap,$j,%o4
- std $nloc,[%sp+$bias+$frame+16]
- add $np,$j,%o5
- std $nlod,[%sp+$bias+$frame+24]
-\f
- ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
- fzeros $alo
- ld [%o4+4],$ahi_
- fzeros $ahi
- ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
- fzeros $nlo
- ld [%o5+4],$nhi_
- fzeros $nhi
-
- fxtod $alo,$alo
- fxtod $ahi,$ahi
- fxtod $nlo,$nlo
- fxtod $nhi,$nhi
-
- ldx [%sp+$bias+$frame+0],%o0
- fmuld $alo,$ba,$aloa
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $nlo,$na,$nloa
- ldx [%sp+$bias+$frame+16],%o2
- fmuld $alo,$bb,$alob
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $nlo,$nb,$nlob
-
- srlx %o0,16,%o7
- std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
- fmuld $alo,$bc,$aloc
- add %o7,%o1,%o1
- std $ahi,[$ap_h+$j]
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- srlx %o1,16,%o7
- std $nlo,[$np_l+$j] ! save smashed np[j] in double format
- fmuld $alo,$bd,$alod
- add %o7,%o2,%o2
- std $nhi,[$np_h+$j]
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- srlx %o2,16,%o7
- fmuld $ahi,$ba,$ahia
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- !and %o0,$mask,%o0
- !and %o1,$mask,%o1
- !and %o2,$mask,%o2
- !sllx %o1,16,%o1
- !sllx %o2,32,%o2
- !sllx %o3,48,%o7
- !or %o1,%o0,%o0
- !or %o2,%o0,%o0
- !or %o7,%o0,%o0 ! 64-bit result
- srlx %o3,16,%g1 ! 34-bit carry
- fmuld $ahi,$bb,$ahib
-
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- fmuld $ahi,$bc,$ahic
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- fmuld $ahi,$bd,$ahid
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
-
- faddd $dota,$nloa,$nloa
- faddd $dotb,$nlob,$nlob
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- addcc $j,8,$j
- std $nloc,[%sp+$bias+$frame+16]
- bz,pn %icc,.L1stskip
- std $nlod,[%sp+$bias+$frame+24]
-\f
-.align 32 ! incidentally already aligned !
-.L1st:
- add $ap,$j,%o4
- add $np,$j,%o5
- ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
- fzeros $alo
- ld [%o4+4],$ahi_
- fzeros $ahi
- ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
- fzeros $nlo
- ld [%o5+4],$nhi_
- fzeros $nhi
-
- fxtod $alo,$alo
- fxtod $ahi,$ahi
- fxtod $nlo,$nlo
- fxtod $nhi,$nhi
-
- ldx [%sp+$bias+$frame+0],%o0
- fmuld $alo,$ba,$aloa
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $nlo,$na,$nloa
- ldx [%sp+$bias+$frame+16],%o2
- fmuld $alo,$bb,$alob
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $nlo,$nb,$nlob
-
- srlx %o0,16,%o7
- std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
- fmuld $alo,$bc,$aloc
- add %o7,%o1,%o1
- std $ahi,[$ap_h+$j]
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- srlx %o1,16,%o7
- std $nlo,[$np_l+$j] ! save smashed np[j] in double format
- fmuld $alo,$bd,$alod
- add %o7,%o2,%o2
- std $nhi,[$np_h+$j]
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- srlx %o2,16,%o7
- fmuld $ahi,$ba,$ahia
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- fmuld $ahi,$bb,$ahib
- sllx %o1,16,%o1
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- sllx %o2,32,%o2
- fmuld $ahi,$bc,$ahic
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- or %o2,%o0,%o0
- fmuld $ahi,$bd,$ahid
- or %o7,%o0,%o0 ! 64-bit result
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
- addcc %g1,%o0,%o0
- faddd $dota,$nloa,$nloa
- srlx %o3,16,%g1 ! 34-bit carry
- faddd $dotb,$nlob,$nlob
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]=
-
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- std $nloc,[%sp+$bias+$frame+16]
- std $nlod,[%sp+$bias+$frame+24]
-
- addcc $j,8,$j
- bnz,pt %icc,.L1st
- add $tp,8,$tp
-\f
-.L1stskip:
- fdtox $dota,$dota
- fdtox $dotb,$dotb
-
- ldx [%sp+$bias+$frame+0],%o0
- ldx [%sp+$bias+$frame+8],%o1
- ldx [%sp+$bias+$frame+16],%o2
- ldx [%sp+$bias+$frame+24],%o3
-
- srlx %o0,16,%o7
- std $dota,[%sp+$bias+$frame+32]
- add %o7,%o1,%o1
- std $dotb,[%sp+$bias+$frame+40]
- srlx %o1,16,%o7
- add %o7,%o2,%o2
- srlx %o2,16,%o7
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- sllx %o1,16,%o1
- sllx %o2,32,%o2
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- or %o2,%o0,%o0
- or %o7,%o0,%o0 ! 64-bit result
- ldx [%sp+$bias+$frame+32],%o4
- addcc %g1,%o0,%o0
- ldx [%sp+$bias+$frame+40],%o5
- srlx %o3,16,%g1 ! 34-bit carry
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]=
- add $tp,8,$tp
-
- srlx %o4,16,%o7
- add %o7,%o5,%o5
- and %o4,$mask,%o4
- sllx %o5,16,%o7
- or %o7,%o4,%o4
- addcc %g1,%o4,%o4
- srlx %o5,48,%g1
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- mov %g1,$carry
- stx %o4,[$tp] ! tp[num-1]=
-\f
- ba .Louter
- add $i,8,$i
-.align 32
-.Louter:
- sub %g0,$num,$j ! j=-num
- add %sp,$bias+$frame+$locals,$tp
-
- add $ap,$j,%o3
- add $bp,$i,%o4
-
- ld [%o3+4],%g1 ! bp[i]
- ld [%o3+0],%o0
- ld [%o4+4],%g5 ! ap[0]
- sllx %g1,32,%g1
- ld [%o4+0],%o1
- sllx %g5,32,%g5
- or %g1,%o0,%o0
- or %g5,%o1,%o1
-
- ldx [$tp],%o2 ! tp[0]
- mulx %o1,%o0,%o0
- addcc %o2,%o0,%o0
- mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
- stx %o0,[%sp+$bias+$frame+0]
-
- ! transfer b[i] to FPU as 4x16-bit values
- ldda [%o4+2]%asi,$ba
- ldda [%o4+0]%asi,$bb
- ldda [%o4+6]%asi,$bc
- ldda [%o4+4]%asi,$bd
-
- ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
- ldda [%sp+$bias+$frame+6]%asi,$na
- fxtod $ba,$ba
- ldda [%sp+$bias+$frame+4]%asi,$nb
- fxtod $bb,$bb
- ldda [%sp+$bias+$frame+2]%asi,$nc
- fxtod $bc,$bc
- ldda [%sp+$bias+$frame+0]%asi,$nd
- fxtod $bd,$bd
- ldd [$ap_l+$j],$alo ! load a[j] in double format
- fxtod $na,$na
- ldd [$ap_h+$j],$ahi
- fxtod $nb,$nb
- ldd [$np_l+$j],$nlo ! load n[j] in double format
- fxtod $nc,$nc
- ldd [$np_h+$j],$nhi
- fxtod $nd,$nd
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- fmuld $alo,$bd,$alod
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- fmuld $ahi,$ba,$ahia
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- fmuld $ahi,$bb,$ahib
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- fmuld $ahi,$bc,$ahic
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- fmuld $ahi,$bd,$ahid
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
-
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- std $nloc,[%sp+$bias+$frame+16]
- add $j,8,$j
- std $nlod,[%sp+$bias+$frame+24]
-\f
- ldd [$ap_l+$j],$alo ! load a[j] in double format
- ldd [$ap_h+$j],$ahi
- ldd [$np_l+$j],$nlo ! load n[j] in double format
- ldd [$np_h+$j],$nhi
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- ldx [%sp+$bias+$frame+0],%o0
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $alo,$bd,$alod
- ldx [%sp+$bias+$frame+16],%o2
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $ahi,$ba,$ahia
-
- srlx %o0,16,%o7
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- add %o7,%o1,%o1
- fmuld $ahi,$bb,$ahib
- srlx %o1,16,%o7
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- add %o7,%o2,%o2
- fmuld $ahi,$bc,$ahic
- srlx %o2,16,%o7
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- ! why?
- and %o0,$mask,%o0
- fmuld $ahi,$bd,$ahid
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
- sllx %o1,16,%o1
- faddd $dota,$nloa,$nloa
- sllx %o2,32,%o2
- faddd $dotb,$nlob,$nlob
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- faddd $ahic,$nhic,$dota ! $nhic
- or %o2,%o0,%o0
- faddd $ahid,$nhid,$dotb ! $nhid
- or %o7,%o0,%o0 ! 64-bit result
- ldx [$tp],%o7
- faddd $nloc,$nhia,$nloc
- addcc %o7,%o0,%o0
- ! end-of-why?
- faddd $nlod,$nhib,$nlod
- srlx %o3,16,%g1 ! 34-bit carry
- fdtox $nloa,$nloa
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- addcc $j,8,$j
- std $nloc,[%sp+$bias+$frame+16]
- bz,pn %icc,.Linnerskip
- std $nlod,[%sp+$bias+$frame+24]
-\f
- ba .Linner
- nop
-.align 32
-.Linner:
- ldd [$ap_l+$j],$alo ! load a[j] in double format
- ldd [$ap_h+$j],$ahi
- ldd [$np_l+$j],$nlo ! load n[j] in double format
- ldd [$np_h+$j],$nhi
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- ldx [%sp+$bias+$frame+0],%o0
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $alo,$bd,$alod
- ldx [%sp+$bias+$frame+16],%o2
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $ahi,$ba,$ahia
-
- srlx %o0,16,%o7
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- add %o7,%o1,%o1
- fmuld $ahi,$bb,$ahib
- srlx %o1,16,%o7
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- add %o7,%o2,%o2
- fmuld $ahi,$bc,$ahic
- srlx %o2,16,%o7
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- fmuld $ahi,$bd,$ahid
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
- sllx %o1,16,%o1
- faddd $dota,$nloa,$nloa
- sllx %o2,32,%o2
- faddd $dotb,$nlob,$nlob
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- faddd $ahic,$nhic,$dota ! $nhic
- or %o2,%o0,%o0
- faddd $ahid,$nhid,$dotb ! $nhid
- or %o7,%o0,%o0 ! 64-bit result
- faddd $nloc,$nhia,$nloc
- addcc %g1,%o0,%o0
- ldx [$tp+8],%o7 ! tp[j]
- faddd $nlod,$nhib,$nlod
- srlx %o3,16,%g1 ! 34-bit carry
- fdtox $nloa,$nloa
- bcs,a %xcc,.+8
- add %g1,1,%g1
- fdtox $nlob,$nlob
- addcc %o7,%o0,%o0
- fdtox $nloc,$nloc
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- std $nloc,[%sp+$bias+$frame+16]
- addcc $j,8,$j
- std $nlod,[%sp+$bias+$frame+24]
- bnz,pt %icc,.Linner
- add $tp,8,$tp
-\f
-.Linnerskip:
- fdtox $dota,$dota
- fdtox $dotb,$dotb
-
- ldx [%sp+$bias+$frame+0],%o0
- ldx [%sp+$bias+$frame+8],%o1
- ldx [%sp+$bias+$frame+16],%o2
- ldx [%sp+$bias+$frame+24],%o3
-
- srlx %o0,16,%o7
- std $dota,[%sp+$bias+$frame+32]
- add %o7,%o1,%o1
- std $dotb,[%sp+$bias+$frame+40]
- srlx %o1,16,%o7
- add %o7,%o2,%o2
- srlx %o2,16,%o7
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- sllx %o1,16,%o1
- sllx %o2,32,%o2
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- or %o2,%o0,%o0
- ldx [%sp+$bias+$frame+32],%o4
- or %o7,%o0,%o0 ! 64-bit result
- ldx [%sp+$bias+$frame+40],%o5
- addcc %g1,%o0,%o0
- ldx [$tp+8],%o7 ! tp[j]
- srlx %o3,16,%g1 ! 34-bit carry
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- addcc %o7,%o0,%o0
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]
- add $tp,8,$tp
-
- srlx %o4,16,%o7
- add %o7,%o5,%o5
- and %o4,$mask,%o4
- sllx %o5,16,%o7
- or %o7,%o4,%o4
- addcc %g1,%o4,%o4
- srlx %o5,48,%g1
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- addcc $carry,%o4,%o4
- stx %o4,[$tp] ! tp[num-1]
- mov %g1,$carry
- bcs,a %xcc,.+8
- add $carry,1,$carry
-
- addcc $i,8,$i
- bnz %icc,.Louter
- nop
-\f
- add $tp,8,$tp ! adjust tp to point at the end
- orn %g0,%g0,%g4
- sub %g0,$num,%o7 ! n=-num
- ba .Lsub
- subcc %g0,%g0,%g0 ! clear %icc.c
-
-.align 32
-.Lsub:
- ldx [$tp+%o7],%o0
- add $np,%o7,%g1
- ld [%g1+0],%o2
- ld [%g1+4],%o3
- srlx %o0,32,%o1
- subccc %o0,%o2,%o2
- add $rp,%o7,%g1
- subccc %o1,%o3,%o3
- st %o2,[%g1+0]
- add %o7,8,%o7
- brnz,pt %o7,.Lsub
- st %o3,[%g1+4]
- subc $carry,0,%g4
- sub %g0,$num,%o7 ! n=-num
- ba .Lcopy
- nop
-
-.align 32
-.Lcopy:
- ldx [$tp+%o7],%o0
- add $rp,%o7,%g1
- ld [%g1+0],%o2
- ld [%g1+4],%o3
- stx %g0,[$tp+%o7]
- and %o0,%g4,%o0
- srlx %o0,32,%o1
- andn %o2,%g4,%o2
- andn %o3,%g4,%o3
- or %o2,%o0,%o0
- or %o3,%o1,%o1
- st %o0,[%g1+0]
- add %o7,8,%o7
- brnz,pt %o7,.Lcopy
- st %o1,[%g1+4]
- sub %g0,$num,%o7 ! n=-num
-
-.Lzap:
- stx %g0,[$ap_l+%o7]
- stx %g0,[$ap_h+%o7]
- stx %g0,[$np_l+%o7]
- stx %g0,[$np_h+%o7]
- add %o7,8,%o7
- brnz,pt %o7,.Lzap
- nop
-
- ldx [%sp+$bias+$frame+48],%o7
- wr %g0,%o7,%asi ! restore %asi
-
- mov 1,%i0
-.Lret:
- ret
- restore
-.type $fname,#function
-.size $fname,(.-$fname)
-___
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-# The substitution below makes it possible to compile without demanding
-# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
-# dare to do this because VIS capability is now detected at run-time
-# and this routine is not called on CPUs not capable of executing it.
-# Do note that fzeros is not the only VIS dependency! Another dependency
-# is implicit and is just _a_ numerical value loaded into the %asi
-# register, which the assembler can't recognize as VIS-specific...
-$code =~ s/fzeros\s+%f([0-9]+)/
- sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
- /gem;
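A quick way to check the constant used by this substitution (purely illustrative; the encoding 0x81b00c20|(reg<<25) is taken from the sprintf above, and only the even registers %f16-%f22 are the ones fzeros is applied to in this module):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int reg;

	/* print the .word this regex would emit for fzeros %f16..%f22 */
	for (reg = 16; reg <= 22; reg += 2) {
		uint32_t insn = 0x81b00c20u | ((uint32_t)reg << 25);
		printf(".word\t0x%x\t! fzeros %%f%d\n", insn, reg);
	}
	return 0;
}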
-
-print $code;
-# flush
-close STDOUT;
+++ /dev/null
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Wrapper around 'rep montmul', VIA-specific instruction accessing
-# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
-# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
-#
-# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
-# different software configurations on 1.5GHz VIA Esther processor.
-# Lines marked with "software integer" denote the performance of hand-
-# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
-# refers to the hand-coded SSE2 Montgomery multiplication procedure
-# found in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm
-# routine from
-# Padlock SDK 2.0.1 available for download from VIA, which naturally
-# utilizes the magic 'repz montmul' instruction. And finally "hardware
-# this" refers to *this* implementation which also uses 'repz montmul'
-#
-# sign verify sign/s verify/s
-# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
-# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
-# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
-# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
-#
-# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
-# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
-# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
-# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
-#
-# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
-# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
-# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
-# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
-#
-# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
-# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
-# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
-# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
-#
-# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
-# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
-# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
-# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
-#
-# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
-# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
-# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
-# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
-#
-# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
-# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
-# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
-# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
-#
-# To give you some other reference point here is output for 2.4GHz P4
-# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
-# SSE2" in above terms.
-#
-# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
-# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
-# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
-# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
-# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
-# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
-# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
-#
-# Conclusions:
-# - VIA SDK leaves a *lot* of room for improvement (which this
-# implementation successfully fills:-);
-# - 'rep montmul' gives up to >3x performance improvement depending on
-# key length;
-# - in terms of absolute performance it delivers approximately as much
-# as modern out-of-order 32-bit cores [again, for longer keys].
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],"via-mont.pl");
-
-# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
-$func="bn_mul_mont_padlock";
-
-$pad=16*1; # amount of reserved bytes on top of every vector
-
-# stack layout
-$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
-$A=&DWP(4,"esp");
-$B=&DWP(8,"esp");
-$T=&DWP(12,"esp");
-$M=&DWP(16,"esp");
-$scratch=&DWP(20,"esp");
-$rp=&DWP(24,"esp"); # these are mine
-$sp=&DWP(28,"esp");
-# &DWP(32,"esp") # 32 byte scratch area
-# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
-# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
-# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
-# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
-# Note that the SDK suggests unconditionally allocating 2K per vector.
-# This has quite an impact on performance. It naturally depends on key
-# length, but to give an example, 1024-bit private RSA key operations
-# suffer a >30% penalty. I allocate only as much as is actually
-# required...
-
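Restated in C, the parameter block the wrapper builds at the top of the stack for 'rep montmul' looks roughly like this (field names and offsets follow the DWP() layout above; whether a slot holds a value or a pointer is read off the code below, so treat this as a readable restatement for 32-bit x86, not a tested driver interface):

#include <stdint.h>

/* lives at %esp when 'rep montmul' executes; 32-bit x86 layout */
struct padlock_mont_params {
	uint32_t  mZeroPrime;	/* +0:  the *n0 constant, "specified by VIA" */
	uint32_t *A;		/* +4:  padded copy of ap */
	uint32_t *B;		/* +8:  padded copy of bp */
	uint32_t *T;		/* +12: padded tp[] result/scratch vector */
	uint32_t *M;		/* +16: padded copy of np (the modulus) */
	uint32_t *scratch;	/* +20: 32-byte scratch area */
	uint32_t *rp;		/* +24: wrapper's own: where the result goes */
	uint32_t  saved_sp;	/* +28: wrapper's own: original %esp */
};

When the instruction is issued below, %esi points at this block and %ecx holds the modulus size in bits (the shl by 5 converts the 32-bit word count).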
-&function_begin($func);
- &xor ("eax","eax");
- &mov ("ecx",&wparam(5)); # num
-	# meet VIA's limitations for num [note that the specification
-	# expresses them in bits, while we work with the number of 32-bit words]
- &test ("ecx",3);
- &jnz (&label("leave")); # num % 4 != 0
- &cmp ("ecx",8);
- &jb (&label("leave")); # num < 8
- &cmp ("ecx",1024);
- &ja (&label("leave")); # num > 1024
-
- &pushf ();
- &cld ();
-
- &mov ("edi",&wparam(0)); # rp
- &mov ("eax",&wparam(1)); # ap
- &mov ("ebx",&wparam(2)); # bp
- &mov ("edx",&wparam(3)); # np
- &mov ("esi",&wparam(4)); # n0
- &mov ("esi",&DWP(0,"esi")); # *n0
-
- &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
- &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
- &neg ("ebp");
- &add ("ebp","esp");
- &and ("ebp",-64); # align to cache-line
- &xchg ("ebp","esp"); # alloca
-
- &mov ($rp,"edi"); # save rp
- &mov ($sp,"ebp"); # save esp
-
- &mov ($mZeroPrime,"esi");
- &lea ("esi",&DWP(64,"esp")); # tp
- &mov ($T,"esi");
- &lea ("edi",&DWP(32,"esp")); # scratch area
- &mov ($scratch,"edi");
- &mov ("esi","eax");
-
- &lea ("ebp",&DWP(-$pad,"ecx"));
- &shr ("ebp",2); # restore original num value in ebp
-
- &xor ("eax","eax");
-
- &mov ("ecx","ebp");
- &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
- &data_byte(0xf3,0xab); # rep stosl, bzero
-
- &mov ("ecx","ebp");
- &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
- &mov ($A,"edi");
- &data_byte(0xf3,0xa5); # rep movsl, memcpy
- &mov ("ecx",$pad/4);
- &data_byte(0xf3,0xab); # rep stosl, bzero pad
- # edi points at the end of padded ap copy...
-
- &mov ("ecx","ebp");
- &mov ("esi","ebx");
- &mov ($B,"edi");
- &data_byte(0xf3,0xa5); # rep movsl, memcpy
- &mov ("ecx",$pad/4);
- &data_byte(0xf3,0xab); # rep stosl, bzero pad
- # edi points at the end of padded bp copy...
-
- &mov ("ecx","ebp");
- &mov ("esi","edx");
- &mov ($M,"edi");
- &data_byte(0xf3,0xa5); # rep movsl, memcpy
- &mov ("ecx",$pad/4);
- &data_byte(0xf3,0xab); # rep stosl, bzero pad
- # edi points at the end of padded np copy...
-
- # let magic happen...
- &mov ("ecx","ebp");
- &mov ("esi","esp");
- &shl ("ecx",5); # convert word counter to bit counter
- &align (4);
- &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
-
- &mov ("ecx","ebp");
- &lea ("esi",&DWP(64,"esp")); # tp
- # edi still points at the end of padded np copy...
- &neg ("ebp");
- &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
- &mov ("edi",$rp); # restore rp
- &xor ("edx","edx"); # i=0 and clear CF
-
-&set_label("sub",8);
- &mov ("eax",&DWP(0,"esi","edx",4));
- &sbb ("eax",&DWP(0,"ebp","edx",4));
- &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
- &lea ("edx",&DWP(1,"edx")); # i++
- &loop (&label("sub")); # doesn't affect CF!
-
- &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
- &sbb ("eax",0);
- &and ("esi","eax");
- ¬ ("eax");
- &mov ("ebp","edi");
- &and ("ebp","eax");
- &or ("esi","ebp"); # tp=carry?tp:rp
-
- &mov ("ecx","edx"); # num
- &xor ("edx","edx"); # i=0
-
-&set_label("copy",8);
- &mov ("eax",&DWP(0,"esi","edx",4));
- &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
- &mov (&DWP(0,"edi","edx",4),"eax");
- &lea ("edx",&DWP(1,"edx")); # i++
- &loop (&label("copy"));
-
- &mov ("ebp",$sp);
- &xor ("eax","eax");
-
- &mov ("ecx",64/4);
- &mov ("edi","esp"); # zap frame including scratch area
- &data_byte(0xf3,0xab); # rep stosl, bzero
-
- # zap copies of ap, bp and np
- &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
- &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
- &data_byte(0xf3,0xab); # rep stosl, bzero
-
- &mov ("esp","ebp");
- &inc ("eax"); # signal "done"
- &popf ();
-&set_label("leave");
-&function_end($func);
-
-&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
-
-&asm_finish();
+++ /dev/null
-#!/usr/local/bin/perl
-
-# define for pentium pro friendly version
-$ppro=1;
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-require "cbc.pl";
-
-&asm_init($ARGV[0],"cast-586.pl",$ARGV[$#ARGV] eq "386");
-
-$CAST_ROUNDS=16;
-$L="edi";
-$R="esi";
-$K="ebp";
-$tmp1="ecx";
-$tmp2="ebx";
-$tmp3="eax";
-$tmp4="edx";
-$S1="CAST_S_table0";
-$S2="CAST_S_table1";
-$S3="CAST_S_table2";
-$S4="CAST_S_table3";
-
-@F1=("add","xor","sub");
-@F2=("xor","sub","add");
-@F3=("sub","add","xor");
-
-&CAST_encrypt("CAST_encrypt",1);
-&CAST_encrypt("CAST_decrypt",0);
-&cbc("CAST_cbc_encrypt","CAST_encrypt","CAST_decrypt",1,4,5,3,-1,-1) unless $main'openbsd;
-
-&asm_finish();
-
-sub CAST_encrypt {
- local($name,$enc)=@_;
-
- local($win_ex)=<<"EOF";
-EXTERN _CAST_S_table0:DWORD
-EXTERN _CAST_S_table1:DWORD
-EXTERN _CAST_S_table2:DWORD
-EXTERN _CAST_S_table3:DWORD
-EOF
- &main::external_label(
- "CAST_S_table0",
- "CAST_S_table1",
- "CAST_S_table2",
- "CAST_S_table3",
- );
-
- &function_begin_B($name,$win_ex);
-
- &comment("");
-
- &push("ebp");
- &push("ebx");
- &mov($tmp2,&wparam(0));
- &mov($K,&wparam(1));
- &push("esi");
- &push("edi");
-
- &comment("Load the 2 words");
- &mov($L,&DWP(0,$tmp2,"",0));
- &mov($R,&DWP(4,$tmp2,"",0));
-
- &comment('Get short key flag');
- &mov($tmp3,&DWP(128,$K,"",0));
- if($enc) {
- &push($tmp3);
- } else {
- &or($tmp3,$tmp3);
- &jnz(&label('cast_dec_skip'));
- }
-
- &xor($tmp3, $tmp3);
-
- # encrypting part
-
- if ($enc) {
- &E_CAST( 0,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 1,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 2,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 3,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 4,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 5,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 6,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 7,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 8,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 9,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(10,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(11,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &comment('test short key flag');
- &pop($tmp4);
- &or($tmp4,$tmp4);
- &jnz(&label('cast_enc_done'));
- &E_CAST(12,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(13,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(14,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(15,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- } else {
- &E_CAST(15,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(14,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(13,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(12,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &set_label('cast_dec_skip');
- &E_CAST(11,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST(10,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 9,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 8,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 7,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 6,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 5,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 4,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 3,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 2,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 1,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
- &E_CAST( 0,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
- }
-
- &set_label('cast_enc_done') if $enc;
-# Why the nop? - Ben 17/1/99
- &nop();
- &mov($tmp3,&wparam(0));
- &mov(&DWP(4,$tmp3,"",0),$L);
- &mov(&DWP(0,$tmp3,"",0),$R);
- &function_end($name);
-}
-
-sub E_CAST {
- local($i,$S,$L,$R,$K,$OP1,$OP2,$OP3,$tmp1,$tmp2,$tmp3,$tmp4)=@_;
- # Ri needs to have 16 pre added.
-
- &comment("round $i");
- &mov( $tmp4, &DWP($i*8,$K,"",1));
-
- &mov( $tmp1, &DWP($i*8+4,$K,"",1));
- &$OP1( $tmp4, $R);
-
- &rotl( $tmp4, &LB($tmp1));
-
- if ($ppro) {
- &mov( $tmp2, $tmp4); # B
- &xor( $tmp1, $tmp1);
-
- &movb( &LB($tmp1), &HB($tmp4)); # A
- &and( $tmp2, 0xff);
-
- &shr( $tmp4, 16); #
- &xor( $tmp3, $tmp3);
- } else {
- &mov( $tmp2, $tmp4); # B
- &movb( &LB($tmp1), &HB($tmp4)); # A # BAD BAD BAD
-
- &shr( $tmp4, 16); #
- &and( $tmp2, 0xff);
- }
-
- &movb( &LB($tmp3), &HB($tmp4)); # C # BAD BAD BAD
- &and( $tmp4, 0xff); # D
-
- &mov( $tmp1, &DWP($S1,"",$tmp1,4));
- &mov( $tmp2, &DWP($S2,"",$tmp2,4));
-
- &$OP2( $tmp1, $tmp2);
- &mov( $tmp2, &DWP($S3,"",$tmp3,4));
-
- &$OP3( $tmp1, $tmp2);
- &mov( $tmp2, &DWP($S4,"",$tmp4,4));
-
- &$OP1( $tmp1, $tmp2);
- # XXX
-
- &xor( $L, $tmp1);
- # XXX
-}
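Restated in C, one E_CAST round as generated above computes L ^= f(R), where (OP1, OP2, OP3) is one of the @F1/@F2/@F3 triples and the byte selection mirrors the HB/LB moves in the assembly. A sketch with the @F1 = (add, xor, sub) operators spelled out; illustrative only, with the rotate count taken modulo 32 as the x86 rotate does implicitly.

#include <stdint.h>

extern const uint32_t CAST_S_table0[256], CAST_S_table1[256],
                      CAST_S_table2[256], CAST_S_table3[256];

static inline uint32_t rotl32(uint32_t v, unsigned n)
{
	n &= 31;
	return n ? (v << n) | (v >> (32 - n)) : v;
}

/* one @F1-type round: OP1 = add, OP2 = xor, OP3 = sub */
static void e_cast_f1(uint32_t *L, uint32_t R, uint32_t Km, uint32_t Kr)
{
	uint32_t i = rotl32(Km + R, Kr);	/* OP1 mixes the key in */
	uint32_t f;

	f  = CAST_S_table0[(i >>  8) & 0xff];	/* "A" byte */
	f ^= CAST_S_table1[ i        & 0xff];	/* OP2 with the "B" byte */
	f -= CAST_S_table2[(i >> 24) & 0xff];	/* OP3 with the "C" byte */
	f += CAST_S_table3[(i >> 16) & 0xff];	/* OP1 with the "D" byte */
	*L ^= f;
}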
-
+++ /dev/null
-#!/usr/local/bin/perl
-#
-# The inner loop instruction sequence and the IP/FP modifications are from
-# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk>
-# I've added the stuff needed for crypt() but I've not worried about making
-# things perfect.
-#
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],"crypt586.pl");
-
-$L="edi";
-$R="esi";
-
-&external_label("DES_SPtrans");
-&fcrypt_body("fcrypt_body");
-&asm_finish();
-
-sub fcrypt_body
- {
- local($name,$do_ip)=@_;
-
- &function_begin($name);
-
- &comment("");
- &comment("Load the 2 words");
- $trans="ebp";
-
- &xor( $L, $L);
- &xor( $R, $R);
-
- # PIC-ification:-)
- &picmeup("edx","DES_SPtrans");
- #if ($cpp) { &picmeup("edx","DES_SPtrans"); }
- #else { &lea("edx",&DWP("DES_SPtrans")); }
- &push("edx"); # becomes &swtmp(1)
- #
- &mov($trans,&wparam(1)); # reloaded with DES_SPtrans in D_ENCRYPT
-
- &push(&DWC(25)); # add a variable
-
- &set_label("start");
- for ($i=0; $i<16; $i+=2)
- {
- &comment("");
- &comment("Round $i");
- &D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx");
-
- &comment("");
- &comment("Round ".sprintf("%d",$i+1));
- &D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx");
- }
- &mov("ebx", &swtmp(0));
- &mov("eax", $L);
- &dec("ebx");
- &mov($L, $R);
- &mov($R, "eax");
- &mov(&swtmp(0), "ebx");
- &jnz(&label("start"));
-
- &comment("");
- &comment("FP");
- &mov("edx",&wparam(0));
-
- &FP_new($R,$L,"eax",3);
- &mov(&DWP(0,"edx","",0),"eax");
- &mov(&DWP(4,"edx","",0),$L);
-
- &add("esp",8); # remove variables
-
- &function_end($name);
- }
-
-sub D_ENCRYPT
- {
- local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t)=@_;
-
- &mov( $u, &wparam(2)); # 2
- &mov( $t, $R);
- &shr( $t, 16); # 1
- &mov( $tmp2, &wparam(3)); # 2
- &xor( $t, $R); # 1
-
- &and( $u, $t); # 2
- &and( $t, $tmp2); # 2
-
- &mov( $tmp1, $u);
- &shl( $tmp1, 16); # 1
- &mov( $tmp2, $t);
- &shl( $tmp2, 16); # 1
- &xor( $u, $tmp1); # 2
- &xor( $t, $tmp2); # 2
- &mov( $tmp1, &DWP(&n2a($S*4),$trans,"",0)); # 2
- &xor( $u, $tmp1);
- &mov( $tmp2, &DWP(&n2a(($S+1)*4),$trans,"",0)); # 2
- &xor( $u, $R);
- &xor( $t, $R);
- &xor( $t, $tmp2);
-
- &and( $u, "0xfcfcfcfc" ); # 2
- &xor( $tmp1, $tmp1); # 1
- &and( $t, "0xcfcfcfcf" ); # 2
- &xor( $tmp2, $tmp2);
- &movb( &LB($tmp1), &LB($u) );
- &movb( &LB($tmp2), &HB($u) );
- &rotr( $t, 4 );
- &mov( $trans, &swtmp(1));
- &xor( $L, &DWP(" ",$trans,$tmp1,0));
- &movb( &LB($tmp1), &LB($t) );
- &xor( $L, &DWP("0x200",$trans,$tmp2,0));
- &movb( &LB($tmp2), &HB($t) );
- &shr( $u, 16);
- &xor( $L, &DWP("0x100",$trans,$tmp1,0));
- &movb( &LB($tmp1), &HB($u) );
- &shr( $t, 16);
- &xor( $L, &DWP("0x300",$trans,$tmp2,0));
- &movb( &LB($tmp2), &HB($t) );
- &and( $u, "0xff" );
- &and( $t, "0xff" );
- &mov( $tmp1, &DWP("0x600",$trans,$tmp1,0));
- &xor( $L, $tmp1);
- &mov( $tmp1, &DWP("0x700",$trans,$tmp2,0));
- &xor( $L, $tmp1);
- &mov( $tmp1, &DWP("0x400",$trans,$u,0));
- &xor( $L, $tmp1);
- &mov( $tmp1, &DWP("0x500",$trans,$t,0));
- &xor( $L, $tmp1);
- &mov( $trans, &wparam(1));
- }
-
-sub n2a
- {
- sprintf("%d",$_[0]);
- }
-
-# now has a side effect of rotating $a by $shift
-sub R_PERM_OP
- {
- local($a,$b,$tt,$shift,$mask,$last)=@_;
-
- &rotl( $a, $shift ) if ($shift != 0);
- &mov( $tt, $a );
- &xor( $a, $b );
- &and( $a, $mask );
- if ($notlast eq $b)
- {
- &xor( $b, $a );
- &xor( $tt, $a );
- }
- else
- {
- &xor( $tt, $a );
- &xor( $b, $a );
- }
- &comment("");
- }
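What R_PERM_OP computes, restated in C: after rotating a left by shift, the bits selected by mask are exchanged between a and b, the classic masked-swap step used by the DES initial/final permutations. Sketch of the arithmetic only; the register juggling and the last-argument special-casing above are omitted.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return n ? (v << n) | (v >> (32 - n)) : v;
}

/* swap the bits of a and b selected by mask, after rotating a by shift */
static void r_perm_op(uint32_t *a, uint32_t *b, int shift, uint32_t mask)
{
	uint32_t t;

	*a = rotl32(*a, shift);
	t  = (*a ^ *b) & mask;	/* bits that differ, restricted to mask */
	*a ^= t;		/* ... flipped in a ... */
	*b ^= t;		/* ... and in b: a masked exchange */
}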
-
-sub IP_new
- {
- local($l,$r,$tt,$lr)=@_;
-
- &R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l);
- &R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l);
- &R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
- &R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
- &R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
-
- if ($lr != 3)
- {
- if (($lr-3) < 0)
- { &rotr($tt, 3-$lr); }
- else { &rotl($tt, $lr-3); }
- }
- if ($lr != 2)
- {
- if (($lr-2) < 0)
- { &rotr($r, 2-$lr); }
- else { &rotl($r, $lr-2); }
- }
- }
-
-sub FP_new
- {
- local($l,$r,$tt,$lr)=@_;
-
- if ($lr != 2)
- {
- if (($lr-2) < 0)
- { &rotl($r, 2-$lr); }
- else { &rotr($r, $lr-2); }
- }
- if ($lr != 3)
- {
- if (($lr-3) < 0)
- { &rotl($l, 3-$lr); }
- else { &rotr($l, $lr-3); }
- }
-
- &R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r);
- &R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r);
- &R_PERM_OP($l,$r,$tt,10,"0x33333333",$l);
- &R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l);
- &R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r);
- &rotr($tt , 4);
- }
-
+++ /dev/null
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# January 2009
-#
-# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
-# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
-# Graphic Unit would make it possible to achieve higher instruction-
-# level parallelism, ILP, and thus higher performance. It should be
-# explicitly noted that ILP is the keyword, and it means that this
-# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
-# not really novel, Sun had VIS-powered implementation for a while.
-# Unlike Sun's implementation this one can process multiple unaligned
-# input blocks, and as such works as a drop-in replacement for OpenSSL
-# sha1_block_data_order. The performance improvement was measured to be
-# 40% over the pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
-# UltraSPARC-III. See below for discussion...
-#
-# The module does not present direct interest for OpenSSL, because
-# it doesn't provide better performance on contemporary SPARCv9 CPUs,
-# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
-# absolutely must score on UltraSPARC-I-IV can simply replace
-# crypto/sha/asm/sha1-sparcv9.pl with this module.
-#
-# (*) "Pipe-lined" means that even if it takes several cycles to
-# complete, next instruction using same functional unit [but not
-# depending on the result of the current instruction] can start
-# execution without having to wait for the unit. "Pairable"
-# means that two [or more] independent instructions can be
-# issued at the very same time.
-
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=112; }
-
-$output=shift;
-open STDOUT,">$output";
-
-$ctx="%i0";
-$inp="%i1";
-$len="%i2";
-$tmp0="%i3";
-$tmp1="%i4";
-$tmp2="%i5";
-$tmp3="%g5";
-
-$base="%g1";
-$align="%g4";
-$Xfer="%o5";
-$nXfer=$tmp3;
-$Xi="%o7";
-
-$A="%l0";
-$B="%l1";
-$C="%l2";
-$D="%l3";
-$E="%l4";
-@V=($A,$B,$C,$D,$E);
-
-$Actx="%o0";
-$Bctx="%o1";
-$Cctx="%o2";
-$Dctx="%o3";
-$Ectx="%o4";
-
-$fmul="%f32";
-$VK_00_19="%f34";
-$VK_20_39="%f36";
-$VK_40_59="%f38";
-$VK_60_79="%f40";
-@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
-@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
- "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
-
-# This is the reference 2x-parallelized VIS-powered Xupdate procedure.
-# It even covers the K_NN_MM addition...
-sub Xupdate {
-my ($i)=@_;
-my $K=@VK[($i+16)/20];
-my $j=($i+16)%16;
-
-# [ provided that GSR.alignaddr_offset is 5, $mul contains
-# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
-# chosen registers... ]
-$code.=<<___;
- fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
- fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
- fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
- fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
- faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
- fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
- fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
- ![fxors %f15,%f2,%f2]
- for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
- ![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
- fpadd32 $K,@X[$j],%f20
- std %f20,[$Xfer+`4*$j`]
-___
-# The numbers delimited with slashes are the earliest possible dispatch
-# cycles for the given instruction, assuming 1-cycle latency for simple
-# VIS instructions (as on UltraSPARC-I&II), 3-cycle latency (as on
-# UltraSPARC-III&IV), and 2-cycle latency(*), respectively. Being
-# 2x-parallelized, the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
-# round. As long as FPU/VIS instructions are perfectly pairable with
-# IALU ones, the round timing is defined by the maximum of the VIS and
-# IALU timings. The latter varies from round to round and averages out
-# at 6.25 ticks. This means that USI&II should operate at the IALU
-# rate, while USIII&IV should operate at the VIS rate, which explains
-# why the performance improvement varies among processors: the pure
-# IALU sha1-sparcv9.pl module exhibits virtually uniform performance
-# of ~9.3 cycles per SHA1 round. The timings above are theoretical
-# lower limits. Real-life performance was measured at 6.6 cycles per
-# SHA1 round on USIIi and 8.3 on USIII. The latter is lower than the
-# half-round VIS timing, because there are 16 Xupdate-free rounds,
-# which "push down" the average theoretical timing to 8 cycles...
-
-# (*) SPARC64-V[II] was originally believed to have 2-cycle VIS
-# latency. It might well have, but it has no dedicated VIS unit;
-# instead, VIS instructions are executed by other functional units,
-# the ones used here - by the IALU. This doesn't improve effective
-# ILP...
-}
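For reference, the recurrence that Xupdate evaluates two elements at a time is
the standard SHA-1 message schedule. A minimal scalar sketch in plain Perl (not
part of the generated assembly; the rotate-by-1 that the VIS code synthesizes
from fpadd32, faligndata and fmul8ulx16 is written out directly here):

	# W[i] = ROL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]), kept in a 16-word
	# ring buffer, so for slot $j: W[j] ^= W[j+13] ^ W[j+8] ^ W[j+2] (mod 16).
	sub xupdate_scalar {
		my ($W, $j) = @_;	# $W: ref to 16-element array, $j: slot 0..15
		my $x = $W->[$j] ^ $W->[($j+2)%16] ^ $W->[($j+8)%16] ^ $W->[($j+13)%16];
		$W->[$j] = (($x << 1) | ($x >> 31)) & 0xffffffff;	# rotate left by 1
		return $W->[$j];
	}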
-
-# The reference Xupdate procedure is then "strained" over *pairs* of
-# BODY_NN_MM and effectively modulo-scheduled with respect to
-# X[n]^=X[n+13] and the K_NN_MM addition. It "runs" 15 rounds ahead,
-# which leaves plenty of room to amortize the read-after-write hazard,
-# as well as to fetch and align the input for the next spin. The VIS
-# instructions are scheduled for a latency of 2 cycles, because there
-# are not enough IALU instructions to schedule for a latency of 3,
-# while scheduling for 1 would give no gain on USI&II anyway.
-
-sub BODY_00_19 {
-my ($i,$a,$b,$c,$d,$e)=@_;
-my $j=$i&~1;
-my $k=($j+16+2)%16; # ahead reference
-my $l=($j+16-2)%16; # behind reference
-my $K=@VK[($j+16-2)/20];
-
-$j=($j+16)%16;
-
-$code.=<<___ if (!($i&1));
- sll $a,5,$tmp0 !! $i
- and $c,$b,$tmp3
- ld [$Xfer+`4*($i%16)`],$Xi
- fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
- sll $b,30,$tmp2
- add $tmp1,$e,$e
- andn $d,$b,$tmp1
- add $Xi,$e,$e
- fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
- srl $b,2,$b
- or $tmp1,$tmp3,$tmp1
- or $tmp2,$b,$b
- add $tmp1,$e,$e
- faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
-___
-$code.=<<___ if ($i&1);
- sll $a,5,$tmp0 !! $i
- and $c,$b,$tmp3
- ld [$Xfer+`4*($i%16)`],$Xi
- fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
- sll $b,30,$tmp2
- add $tmp1,$e,$e
- fpadd32 $K,@X[$l],%f20 !
- andn $d,$b,$tmp1
- add $Xi,$e,$e
- fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
- srl $b,2,$b
- or $tmp1,$tmp3,$tmp1
- fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
- or $tmp2,$b,$b
- add $tmp1,$e,$e
-___
-$code.=<<___ if ($i&1 && $i>=2);
- std %f20,[$Xfer+`4*$l`] !
-___
-}
-
-sub BODY_20_39 {
-my ($i,$a,$b,$c,$d,$e)=@_;
-my $j=$i&~1;
-my $k=($j+16+2)%16; # ahead reference
-my $l=($j+16-2)%16; # behind reference
-my $K=@VK[($j+16-2)/20];
-
-$j=($j+16)%16;
-
-$code.=<<___ if (!($i&1) && $i<64);
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
- xor $c,$b,$tmp0
- add $tmp1,$e,$e
- sll $b,30,$tmp2
- xor $d,$tmp0,$tmp1
- fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
- srl $b,2,$b
- add $tmp1,$e,$e
- or $tmp2,$b,$b
- add $Xi,$e,$e
- faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
-___
-$code.=<<___ if ($i&1 && $i<64);
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
- xor $c,$b,$tmp0
- add $tmp1,$e,$e
- fpadd32 $K,@X[$l],%f20 !
- sll $b,30,$tmp2
- xor $d,$tmp0,$tmp1
- fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
- srl $b,2,$b
- add $tmp1,$e,$e
- fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
- or $tmp2,$b,$b
- add $Xi,$e,$e
- std %f20,[$Xfer+`4*$l`] !
-___
-$code.=<<___ if ($i==64);
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- fpadd32 $K,@X[$l],%f20
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- xor $c,$b,$tmp0
- add $tmp1,$e,$e
- sll $b,30,$tmp2
- xor $d,$tmp0,$tmp1
- std %f20,[$Xfer+`4*$l`]
- srl $b,2,$b
- add $tmp1,$e,$e
- or $tmp2,$b,$b
- add $Xi,$e,$e
-___
-$code.=<<___ if ($i>64);
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- xor $c,$b,$tmp0
- add $tmp1,$e,$e
- sll $b,30,$tmp2
- xor $d,$tmp0,$tmp1
- srl $b,2,$b
- add $tmp1,$e,$e
- or $tmp2,$b,$b
- add $Xi,$e,$e
-___
-}
-
-sub BODY_40_59 {
-my ($i,$a,$b,$c,$d,$e)=@_;
-my $j=$i&~1;
-my $k=($j+16+2)%16; # ahead reference
-my $l=($j+16-2)%16; # behind reference
-my $K=@VK[($j+16-2)/20];
-
-$j=($j+16)%16;
-
-$code.=<<___ if (!($i&1));
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
- and $c,$b,$tmp0
- add $tmp1,$e,$e
- sll $b,30,$tmp2
- or $c,$b,$tmp1
- fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
- srl $b,2,$b
- and $d,$tmp1,$tmp1
- add $Xi,$e,$e
- or $tmp1,$tmp0,$tmp1
- faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
- or $tmp2,$b,$b
- add $tmp1,$e,$e
- fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
-___
-$code.=<<___ if ($i&1);
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
- and $c,$b,$tmp0
- add $tmp1,$e,$e
- fpadd32 $K,@X[$l],%f20 !
- sll $b,30,$tmp2
- or $c,$b,$tmp1
- fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
- srl $b,2,$b
- and $d,$tmp1,$tmp1
- fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
- add $Xi,$e,$e
- or $tmp1,$tmp0,$tmp1
- or $tmp2,$b,$b
- add $tmp1,$e,$e
- std %f20,[$Xfer+`4*$l`] !
-___
-}
-
-# If there is more data to process, the data for the next iteration
-# is pre-fetched during the last ten rounds...
-sub BODY_70_79 {
-my ($i,$a,$b,$c,$d,$e)=@_;
-my $j=$i&~1;
-my $m=($i%8)*2;
-
-$j=($j+16)%16;
-
-$code.=<<___ if ($i==70);
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- ldd [$inp+64],@X[0]
- xor $c,$b,$tmp0
- add $tmp1,$e,$e
- sll $b,30,$tmp2
- xor $d,$tmp0,$tmp1
- srl $b,2,$b
- add $tmp1,$e,$e
- or $tmp2,$b,$b
- add $Xi,$e,$e
-
- and $inp,-64,$nXfer
- inc 64,$inp
- and $nXfer,255,$nXfer
- alignaddr %g0,$align,%g0
- add $base,$nXfer,$nXfer
-___
-$code.=<<___ if ($i==71);
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- xor $c,$b,$tmp0
- add $tmp1,$e,$e
- sll $b,30,$tmp2
- xor $d,$tmp0,$tmp1
- srl $b,2,$b
- add $tmp1,$e,$e
- or $tmp2,$b,$b
- add $Xi,$e,$e
-___
-$code.=<<___ if ($i>=72);
- faligndata @X[$m],@X[$m+2],@X[$m]
- sll $a,5,$tmp0 !! $i
- ld [$Xfer+`4*($i%16)`],$Xi
- srl $a,27,$tmp1
- add $tmp0,$e,$e
- xor $c,$b,$tmp0
- add $tmp1,$e,$e
- fpadd32 $VK_00_19,@X[$m],%f20
- sll $b,30,$tmp2
- xor $d,$tmp0,$tmp1
- srl $b,2,$b
- add $tmp1,$e,$e
- or $tmp2,$b,$b
- add $Xi,$e,$e
-___
-$code.=<<___ if ($i<77);
- ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
-___
-$code.=<<___ if ($i==77); # redundant if $inp was aligned
- add $align,63,$tmp0
- and $tmp0,-8,$tmp0
- ldd [$inp+$tmp0],@X[16]
-___
-$code.=<<___ if ($i>=72);
- std %f20,[$nXfer+`4*$m`]
-___
-}
-
-$code.=<<___;
-.section ".rodata",#alloc
-
-.align 64
-vis_const:
-.long 0x5a827999,0x5a827999 ! K_00_19
-.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
-.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
-.long 0xca62c1d6,0xca62c1d6 ! K_60_79
-.long 0x00000100,0x00000100
-.align 64
-.type vis_const,#object
-.size vis_const,(.-vis_const)
-
-.section ".text",#alloc,#execinstr
-.globl sha1_block_data_order
-sha1_block_data_order:
- save %sp,-$frame,%sp
- add %fp,$bias-256,$base
-
-#ifdef __PIC__
- sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %o5
- rd %pc, %o4
- or %o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
- add %o5, %o4, %o5
- set vis_const, $tmp0
- ldx [$tmp0+%o5], $tmp0
-#else
- set vis_const, $tmp0
-#endif
-
- ldd [$tmp0+0],$VK_00_19
- ldd [$tmp0+8],$VK_20_39
- ldd [$tmp0+16],$VK_40_59
- ldd [$tmp0+24],$VK_60_79
- ldd [$tmp0+32],$fmul
-
- ld [$ctx+0],$Actx
- and $base,-256,$base
- ld [$ctx+4],$Bctx
- sub $base,$bias+$frame,%sp
- ld [$ctx+8],$Cctx
- and $inp,7,$align
- ld [$ctx+12],$Dctx
- and $inp,-8,$inp
- ld [$ctx+16],$Ectx
-
- ! X[16] is maintained in FP register bank
- alignaddr %g0,$align,%g0
- ldd [$inp+0],@X[0]
- sub $inp,-64,$Xfer
- ldd [$inp+8],@X[2]
- and $Xfer,-64,$Xfer
- ldd [$inp+16],@X[4]
- and $Xfer,255,$Xfer
- ldd [$inp+24],@X[6]
- add $base,$Xfer,$Xfer
- ldd [$inp+32],@X[8]
- ldd [$inp+40],@X[10]
- ldd [$inp+48],@X[12]
- brz,pt $align,.Laligned
- ldd [$inp+56],@X[14]
-
- ldd [$inp+64],@X[16]
- faligndata @X[0],@X[2],@X[0]
- faligndata @X[2],@X[4],@X[2]
- faligndata @X[4],@X[6],@X[4]
- faligndata @X[6],@X[8],@X[6]
- faligndata @X[8],@X[10],@X[8]
- faligndata @X[10],@X[12],@X[10]
- faligndata @X[12],@X[14],@X[12]
- faligndata @X[14],@X[16],@X[14]
-
-.Laligned:
- mov 5,$tmp0
- dec 1,$len
- alignaddr %g0,$tmp0,%g0
- fpadd32 $VK_00_19,@X[0],%f16
- fpadd32 $VK_00_19,@X[2],%f18
- fpadd32 $VK_00_19,@X[4],%f20
- fpadd32 $VK_00_19,@X[6],%f22
- fpadd32 $VK_00_19,@X[8],%f24
- fpadd32 $VK_00_19,@X[10],%f26
- fpadd32 $VK_00_19,@X[12],%f28
- fpadd32 $VK_00_19,@X[14],%f30
- std %f16,[$Xfer+0]
- mov $Actx,$A
- std %f18,[$Xfer+8]
- mov $Bctx,$B
- std %f20,[$Xfer+16]
- mov $Cctx,$C
- std %f22,[$Xfer+24]
- mov $Dctx,$D
- std %f24,[$Xfer+32]
- mov $Ectx,$E
- std %f26,[$Xfer+40]
- fxors @X[13],@X[0],@X[0]
- std %f28,[$Xfer+48]
- ba .Loop
- std %f30,[$Xfer+56]
-.align 32
-.Loop:
-___
-for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
-for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
-for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
-for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- tst $len
- bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
- nop
-___
-for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- add $A,$Actx,$Actx
- add $B,$Bctx,$Bctx
- add $C,$Cctx,$Cctx
- add $D,$Dctx,$Dctx
- add $E,$Ectx,$Ectx
- mov 5,$tmp0
- fxors @X[13],@X[0],@X[0]
- mov $Actx,$A
- mov $Bctx,$B
- mov $Cctx,$C
- mov $Dctx,$D
- mov $Ectx,$E
- alignaddr %g0,$tmp0,%g0
- dec 1,$len
- ba .Loop
- mov $nXfer,$Xfer
-
-.align 32
-.Ltail:
-___
-for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- add $A,$Actx,$Actx
- add $B,$Bctx,$Bctx
- add $C,$Cctx,$Cctx
- add $D,$Dctx,$Dctx
- add $E,$Ectx,$Ectx
-
- st $Actx,[$ctx+0]
- st $Bctx,[$ctx+4]
- st $Cctx,[$ctx+8]
- st $Dctx,[$ctx+12]
- st $Ectx,[$ctx+16]
-
- ret
- restore
-.type sha1_block_data_order,#function
-.size sha1_block_data_order,(.-sha1_block_data_order)
-___
-
-# The purpose of these subroutines is to encode VIS instructions
-# explicitly, so that the module can be assembled without specifying
-# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
-# -xarch=v9a. The idea is to keep open the option of producing a
-# "universal" binary and let the programmer detect at run-time whether
-# the current CPU is VIS-capable.
-sub unvis {
-my ($mnemonic,$rs1,$rs2,$rd)=@_;
-my ($ref,$opf);
-my %visopf = ( "fmul8ulx16" => 0x037,
- "faligndata" => 0x048,
- "fpadd32" => 0x052,
- "fxor" => 0x06c,
- "fxors" => 0x06d );
-
- $ref = "$mnemonic\t$rs1,$rs2,$rd";
-
- if ($opf=$visopf{$mnemonic}) {
- foreach ($rs1,$rs2,$rd) {
- return $ref if (!/%f([0-9]{1,2})/);
- $_=$1;
- if ($1>=32) {
- return $ref if ($1&1);
- # re-encode for upper double register addressing
- $_=($1|$1>>5)&31;
- }
- }
-
- return sprintf ".word\t0x%08x !%s",
- 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
- $ref;
- } else {
- return $ref;
- }
-}
-sub unalignaddr {
-my ($mnemonic,$rs1,$rs2,$rd)=@_;
-my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
-my $ref="$mnemonic\t$rs1,$rs2,$rd";
-
- foreach ($rs1,$rs2,$rd) {
- if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
- else { return $ref; }
- }
- return sprintf ".word\t0x%08x !%s",
- 0x81b00300|$rd<<25|$rs1<<14|$rs2,
- $ref;
-}
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
- &unvis($1,$2,$3,$4)
- /gem;
-$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
- &unalignaddr($1,$2,$3,$4)
- /gem;
-print $code;
-close STDOUT;
+++ /dev/null
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# sha1_block for Thumb.
-#
-# January 2007.
-#
-# The code is not of direct interest to OpenSSL because of its low
-# performance. Its purpose is to establish a _size_ benchmark. A pretty
-# useless one, I must say, because ARMv4 code that is 30% or 88 bytes
-# larger [available on demand] is almost _twice_ as fast. It should
-# also be noted that in-lining .Lcommon and .Lrotate improves
-# performance by over 40%, while the code grows by only 10% or 32
-# bytes. But once again, the goal was to establish a _size_ benchmark,
-# not performance.
-
-$output=shift;
-open STDOUT,">$output";
-
-$inline=0;
-#$cheat_on_binutils=1;
-
-$t0="r0";
-$t1="r1";
-$t2="r2";
-$a="r3";
-$b="r4";
-$c="r5";
-$d="r6";
-$e="r7";
-$K="r8"; # "upper" registers can be used in add/sub and mov insns
-$ctx="r9";
-$inp="r10";
-$len="r11";
-$Xi="r12";
-
-sub common {
-<<___;
- sub $t0,#4
- ldr $t1,[$t0]
- add $e,$K @ E+=K_xx_xx
- lsl $t2,$a,#5
- add $t2,$e
- lsr $e,$a,#27
- add $t2,$e @ E+=ROR(A,27)
- add $t2,$t1 @ E+=X[i]
-___
-}
-sub rotate {
-<<___;
- mov $e,$d @ E=D
- mov $d,$c @ D=C
- lsl $c,$b,#30
- lsr $b,$b,#2
- orr $c,$b @ C=ROR(B,2)
- mov $b,$a @ B=A
- add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
-___
-}
-
-sub BODY_00_19 {
-$code.=$inline?&common():"\tbl .Lcommon\n";
-$code.=<<___;
- mov $t1,$c
- eor $t1,$d
- and $t1,$b
- eor $t1,$d @ F_00_19(B,C,D)
-___
-$code.=$inline?&rotate():"\tbl .Lrotate\n";
-}
-
-sub BODY_20_39 {
-$code.=$inline?&common():"\tbl .Lcommon\n";
-$code.=<<___;
- mov $t1,$b
- eor $t1,$c
- eor $t1,$d @ F_20_39(B,C,D)
-___
-$code.=$inline?&rotate():"\tbl .Lrotate\n";
-}
-
-sub BODY_40_59 {
-$code.=$inline?&common():"\tbl .Lcommon\n";
-$code.=<<___;
- mov $t1,$b
- and $t1,$c
- mov $e,$b
- orr $e,$c
- and $e,$d
- orr $t1,$e @ F_40_59(B,C,D)
-___
-$code.=$inline?&rotate():"\tbl .Lrotate\n";
-}
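The BODY_* fragments above lean on the usual boolean identities for the SHA-1
F functions to stay within the low Thumb registers; spelled out for reference
(standard equalities, matching the instruction sequences above):

	F_00_19(B,C,D) = (B & C) | (~B & D)          = ((C ^ D) & B) ^ D
	F_20_39(B,C,D) = B ^ C ^ D
	F_40_59(B,C,D) = (B & C) | (B & D) | (C & D) = (B & C) | ((B | C) & D)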
-
-$code=<<___;
-.text
-.code 16
-
-.global sha1_block_data_order
-.type sha1_block_data_order,%function
-
-.align 2
-sha1_block_data_order:
-___
-if ($cheat_on_binutils) {
-$code.=<<___;
-.code 32
- add r3,pc,#1
- bx r3 @ switch to Thumb ISA
-.code 16
-___
-}
-$code.=<<___;
- push {r4-r7}
- mov r3,r8
- mov r4,r9
- mov r5,r10
- mov r6,r11
- mov r7,r12
- push {r3-r7,lr}
- lsl r2,#6
- mov $ctx,r0 @ save context
- mov $inp,r1 @ save inp
- mov $len,r2 @ save len
- add $len,$inp @ $len to point at inp end
-
-.Lloop:
- mov $Xi,sp
- mov $t2,sp
- sub $t2,#16*4 @ [3]
-.LXload:
- ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
- ldrb $b,[$t1,#1]
- ldrb $c,[$t1,#2]
- ldrb $d,[$t1,#3]
- lsl $a,#24
- lsl $b,#16
- lsl $c,#8
- orr $a,$b
- orr $a,$c
- orr $a,$d
- add $t1,#4
- push {$a}
- cmp sp,$t2
- bne .LXload @ [+14*16]
-
- mov $inp,$t1 @ update $inp
- sub $t2,#32*4
- sub $t2,#32*4
- mov $e,#31 @ [+4]
-.LXupdate:
- ldr $a,[sp,#15*4]
- ldr $b,[sp,#13*4]
- ldr $c,[sp,#7*4]
- ldr $d,[sp,#2*4]
- eor $a,$b
- eor $a,$c
- eor $a,$d
- ror $a,$e
- push {$a}
- cmp sp,$t2
- bne .LXupdate @ [+(11+1)*64]
-
- ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
- mov $t0,$Xi
-
- ldr $t2,.LK_00_19
- mov $t1,$t0
- sub $t1,#20*4
- mov $Xi,$t1
- mov $K,$t2 @ [+7+4]
-.L_00_19:
-___
- &BODY_00_19();
-$code.=<<___;
- cmp $Xi,$t0
- bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
-
- ldr $t2,.LK_20_39
- mov $t1,$t0
- sub $t1,#20*4
- mov $Xi,$t1
- mov $K,$t2 @ [+5]
-.L_20_39_or_60_79:
-___
- &BODY_20_39();
-$code.=<<___;
- cmp $Xi,$t0
- bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
- cmp sp,$t0
- beq .Ldone @ [+2]
-
- ldr $t2,.LK_40_59
- mov $t1,$t0
- sub $t1,#20*4
- mov $Xi,$t1
- mov $K,$t2 @ [+5]
-.L_40_59:
-___
- &BODY_40_59();
-$code.=<<___;
- cmp $Xi,$t0
- bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
-
- ldr $t2,.LK_60_79
- mov $Xi,sp
- mov $K,$t2
- b .L_20_39_or_60_79 @ [+4]
-.Ldone:
- mov $t0,$ctx
- ldr $t1,[$t0,#0]
- ldr $t2,[$t0,#4]
- add $a,$t1
- ldr $t1,[$t0,#8]
- add $b,$t2
- ldr $t2,[$t0,#12]
- add $c,$t1
- ldr $t1,[$t0,#16]
- add $d,$t2
- add $e,$t1
- stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
-
- add sp,#80*4 @ deallocate stack frame
- mov $t0,$ctx @ restore ctx
- mov $t1,$inp @ restore inp
- cmp $t1,$len
- beq .Lexit
- b .Lloop @ [+6] total 3212 cycles
-.Lexit:
- pop {r2-r7}
- mov r8,r2
- mov r9,r3
- mov r10,r4
- mov r11,r5
- mov r12,r6
- mov lr,r7
- pop {r4-r7}
- bx lr
-.align 2
-___
-$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
-$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
-$code.=<<___;
-.align 2
-.LK_00_19: .word 0x5a827999
-.LK_20_39: .word 0x6ed9eba1
-.LK_40_59: .word 0x8f1bbcdc
-.LK_60_79: .word 0xca62c1d6
-.size sha1_block_data_order,.-sha1_block_data_order
-.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-print $code;
-close STDOUT; # enforce flush