Lines Matching +full:1 +full:- +full:9 +full:a +full:- +full:e
2 # SPDX-License-Identifier: GPL-2.0
12 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
15 # this file except in compliance with the License. You can obtain a copy
31 # SHA256-hw SHA256(*) SHA512
32 # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
33 # Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
34 # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
36 # X-Gene 20.0 (+100%) 12.8 (+300%(***))
41 # (**) The result is a trade-off: it's possible to improve it by
42 # 10% (or by 1 cycle per round), but at the cost of 20% loss
43 # on Cortex-A53 (or by 4 cycles per round).
44 # (***) Super-impressive coefficients over gcc-generated code are
46 # generated with -mgeneral-regs-only is significantly faster
47 # and the gap is only 40-90%.
52 # version of SHA256 for 64-bit processors. This is because performance
53 # improvement on the most widespread Cortex-A5x processors was observed
54 # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
55 # observed that 32-bit NEON SHA256 performs significantly better than
56 # 64-bit scalar version on *some* of the more recent processors. As a
57 # result, the 64-bit NEON version of SHA256 was added to provide the best
58 # all-round performance. For example it executes ~30% faster on X-Gene
60 # deliver much less improvement, likely *negative* on Cortex-A5x.
67 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
69 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
70 die "can't locate arm-xlate.pl";
83 @sigma0=(1, 8, 7);
103 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
107 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
108 my $j=($i+1)&15;
109 my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
117 $code.=<<___ if ($i<13 && ($i&1));
118 ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
124 ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
127 add $a,$a,$t1 // h+=Sigma0(a)
130 str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
132 # While ARMv8 specifies merged rotate-n-logical operation such as
137 # Cortex-A5x handles merged instructions much better than disjoint
140 ror $t0,$e,#$Sigma1[0]
142 eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
143 and $t1,$f,$e
144 bic $t2,$g,$e
146 orr $t1,$t1,$t2 // Ch(e,f,g)
147 eor $t2,$a,$b // a^b, b^c in next round
148 eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
149 ror $T0,$a,#$Sigma0[0]
150 add $h,$h,$t1 // h+=Ch(e,f,g)
151 eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
152 add $h,$h,$t0 // h+=Sigma1(e)
153 and $t3,$t3,$t2 // (b^c)&=(a^b)
155 eor $t3,$t3,$b // Maj(a,b,c)
156 eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
157 add $h,$h,$t3 // h+=Maj(a,b,c)
159 //add $h,$h,$t1 // h+=Sigma0(a)
162 ror $t0,$e,#$Sigma1[0]
164 ror $T1,@X[($j+1)&15],#$sigma0[0]
165 and $t1,$f,$e
167 bic $t2,$g,$e
168 ror $T0,$a,#$Sigma0[0]
170 eor $t0,$t0,$e,ror#$Sigma1[1]
171 eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
172 orr $t1,$t1,$t2 // Ch(e,f,g)
173 eor $t2,$a,$b // a^b, b^c in next round
174 eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
175 eor $T0,$T0,$a,ror#$Sigma0[1]
176 add $h,$h,$t1 // h+=Ch(e,f,g)
177 and $t3,$t3,$t2 // (b^c)&=(a^b)
178 eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
179 eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
180 add $h,$h,$t0 // h+=Sigma1(e)
181 eor $t3,$t3,$b // Maj(a,b,c)
182 eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
184 add @X[$j],@X[$j],@X[($j+9)&15]
186 add $h,$h,$t3 // h+=Maj(a,b,c)
189 add $h,$h,$t1 // h+=Sigma0(a)
225 stp x29,x30,[sp,#-128]!
235 ldp $A,$B,[$ctx] // load context
237 ldp $E,$F,[$ctx,#4*$SZ]
244 ldp @X[0],@X[1],[$inp],#2*$SZ
257 sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
259 ldp @X[0],@X[1],[$ctx]
263 add $A,$A,@X[0]
265 add $B,$B,@X[1]
268 stp $A,$B,[$ctx]
269 add $E,$E,@X[4]
275 stp $E,$F,[$ctx,#4*$SZ]
287 .size $func,.-$func
356 .size .LK$BITS,.-.LK$BITS
361 .long OPENSSL_armcap_P-.
363 .quad OPENSSL_armcap_P-.
384 stp x29,x30,[sp,#-16]!
391 ld1 {@MSG[0]-@MSG[3]},[$inp],#64
392 sub $num,$num,#1
395 rev32 @MSG[1],@MSG[1]
405 sha256su0 @MSG[0],@MSG[1]
421 add.i32 $W1,$W1,@MSG[1]
428 sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
447 .size sha256_block_armv8,.-sha256_block_armv8
453 # You'll surely note a lot of similarities with the sha256-armv4 module,
454 # and of course it's not a coincidence. sha256-armv4 was used as
456 # extensively re-tuned for all-round performance.
458 my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
466 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
469 $arg = "#$arg" if ($arg*1 eq $arg);
473 sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }	# "q<N>"/"v<N>" in the argument -> "d<N>"; "" if no match
474 sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }	# "q<N>"/"v<N>" in the argument -> "v<N>.d[0]"; "" if no match
475 sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }	# "q<N>"/"v<N>" in the argument -> "v<N>.d[1]"; "" if no match
481 my ($a,$b,$c,$d,$e,$f,$g,$h);
483 &ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
487 &ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
497 &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
499 &sli_32 ($T2,$T0,32-$sigma0[0]);
502 &ushr_32 ($T3,$T0,$sigma0[1]);
508 &sli_32 ($T3,$T0,32-$sigma0[1]);
514 &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
517 &sli_32 ($T4,$T7,32-$sigma1[0]);
523 &ushr_32 ($T3,$T7,$sigma1[1]);
526 &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
529 &sli_u32 ($T3,$T7,32-$sigma1[1]);
540 &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
549 &sli_32 ($T6,@X[0],32-$sigma1[0]);
551 &ushr_32 ($T5,@X[0],$sigma1[1]);
557 &sli_32 ($T5,@X[0],32-$sigma1[1]);
577 while($#insns>=1) { eval(shift(@insns)); }
588 my ($a,$b,$c,$d,$e,$f,$g,$h);
614 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
616 '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
617 '&and ($t1,$f,$e)',
618 '&bic ($t4,$g,$e)',
619 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
620 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
621 '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
622 '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
623 '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
624 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
626 '&eor ($t2,$a,$b)', # a^b, b^c in next round
627 '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
628 '&add ($h,$h,$t0)', # h+=Sigma1(e)
629 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
631 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
634 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
647 stp x29, x30, [sp, #-16]!
655 ld1.8 {@X[1]},[$inp], #16
663 rev32 @X[1],@X[1] // big-endian
668 add.32 $T1,$T1,@X[1]
670 st1.32 {$T0-$T1},[$Xfer], #32
672 st1.32 {$T2-$T3},[$Xfer]
675 ldp $A,$B,[$ctx]
677 ldp $E,$F,[$ctx,#16]
710 add $A,$A,$t4 // h+=Sigma0(a) from the past
712 add $A,$A,$t2 // h+=Maj(a,b,c) from the past
714 add $A,$A,$t0 // accumulate
720 add $E,$E,$t0
723 stp $A,$B,[$ctx,#0]
728 stp $E,$F,[$ctx,#16]
738 .size sha256_block_neon,.-sha256_block_neon
755 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
758 $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
773 s/\`([^\`]*)\`/eval($1)/ge;
775 s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
777 s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
779 s/\.[ui]?8(\s)/$1/;
781 m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;