Lines Matching +full:- +full:16 +full:g

2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
23 # SHA256-hw SHA256(*) SHA512
24 # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
25 # Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
26 # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
28 # X-Gene 20.0 (+100%) 12.8 (+300%(***))
35 # (**) The result is a trade-off: it's possible to improve it by
37 # on Cortex-A53 (or by 4 cycles per round).
38 # (***) Super-impressive coefficients over gcc-generated code are
40 # generated with -mgeneral-regs-only is significantly faster
41 # and the gap is only 40-90%.
46 # version of SHA256 for 64-bit processors. This is because the performance
47 # improvement on the most widespread Cortex-A5x processors was observed
48 # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49 # observed that 32-bit NEON SHA256 performs significantly better than
50 # 64-bit scalar version on *some* of the more recent processors. As a
51 # result, a 64-bit NEON version of SHA256 was added to provide the best
52 # all-round performance. For example it executes ~30% faster on X-Gene
54 # deliver much less improvement, likely *negative* on Cortex-A5x.
64 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
65 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
66 die "can't locate arm-xlate.pl";
100 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
101 ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
104 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
106 my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
109 $code.=<<___ if ($i<16);
121 ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
123 $code.=<<___ if ($i>0 && $i<16);
127 str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
129 # While ARMv8 specifies merged rotate-n-logical operation such as
134 # Cortex-A5x handles merged instructions much better than disjoint
139 eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
141 bic $t2,$g,$e
143 orr $t1,$t1,$t2 // Ch(e,f,g)
147 add $h,$h,$t1 // h+=Ch(e,f,g)
148 eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
164 bic $t2,$g,$e
169 orr $t1,$t1,$t2 // Ch(e,f,g)
173 add $h,$h,$t1 // h+=Ch(e,f,g)
223 stp x29,x30,[sp,#-128]!
226 stp x19,x20,[sp,#16]
236 add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
237 ldp $G,$H,[$ctx,#6*$SZ]
247 for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
270 add $G,$G,@X[6]
274 stp $G,$H,[$ctx,#6*$SZ]
277 ldp x19,x20,[x29,#16]
286 .size $func,.-$func
355 .size .LK$BITS,.-.LK$BITS
363 my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
364 my @MSG=map("v$_.16b",(4..7));
366 my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
374 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
375 stp x29,x30,[sp,#-16]!
382 ld1 {@MSG[0]-@MSG[3]},[$inp],#64
384 ld1.32 {$W0},[$Ktbl],#16
394 ld1.32 {$W1},[$Ktbl],#16
405 ld1.32 {$W1},[$Ktbl],#16
411 ld1.32 {$W0},[$Ktbl],#16
419 sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
436 ldr x29,[sp],#16
438 .size sha256_block_armv8,.-sha256_block_armv8
444 # You'll surely note a lot of similarities with the sha256-armv4 module,
445 # and of course it's not a coincidence. sha256-armv4 was used as
447 # extensively re-tuned for all-round performance.
449 my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
454 my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
457 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
464 sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }	# first arg containing "qN"/"vN" -> "dN"; "" when no such register name is found
465 sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }	# first arg containing "qN"/"vN" -> "vN.d[0]"; "" when no such register name is found
466 sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }	# first arg containing "qN"/"vN" -> "vN.d[1]"; "" when no such register name is found
472 my ($a,$b,$c,$d,$e,$f,$g,$h);
490 &sli_32 ($T2,$T0,32-$sigma0[0]);
499 &sli_32 ($T3,$T0,32-$sigma0[1]);
508 &sli_32 ($T4,$T7,32-$sigma1[0]);
520 &sli_u32 ($T3,$T7,32-$sigma1[1]);
540 &sli_32 ($T6,@X[0],32-$sigma1[0]);
548 &sli_32 ($T5,@X[0],32-$sigma1[1]);
551 &ld1_32 ("{$T0}","[$Ktbl], #16");
553 &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
563 &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
569 &st1_32 ("{$T0}","[$Xfer], #16");
579 my ($a,$b,$c,$d,$e,$f,$g,$h);
583 &ld1_8 ("{@X[0]}","[$inp],#16");
586 &ld1_32 ("{$T0}","[$Ktbl],#16");
598 &st1_32 ("{$T0}","[$Xfer], #16");
605 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
609 '&bic ($t4,$g,$e)',
610 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
612 '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
613 '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
614 '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
615 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
618 '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
639 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
640 stp x29, x30, [sp, #-16]!
642 sub sp,sp,#16*4
647 ld1.8 {@X[0]},[$inp], #16
648 ld1.8 {@X[1]},[$inp], #16
649 ld1.8 {@X[2]},[$inp], #16
650 ld1.8 {@X[3]},[$inp], #16
651 ld1.32 {$T0},[$Ktbl], #16
652 ld1.32 {$T1},[$Ktbl], #16
653 ld1.32 {$T2},[$Ktbl], #16
654 ld1.32 {$T3},[$Ktbl], #16
656 rev32 @X[1],@X[1] // big-endian
663 st1.32 {$T0-$T1},[$Xfer], #32
665 st1.32 {$T2-$T3},[$Xfer]
670 ldp $E,$F,[$ctx,#16]
671 ldp $G,$H,[$ctx,#24]
709 ldp $t0,$t1,[$ctx,#16]
717 add $G,$G,$t2
721 stp $E,$F,[$ctx,#16]
723 stp $G,$H,[$ctx,#24]
729 add sp,sp,#16*4+16
731 .size sha256_block_neon,.-sha256_block_neon
738 my @H = map("v$_.16b",(0..4));
739 my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
740 my @MSG=map("v$_.16b",(16..23));
742 my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
750 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
751 stp x29,x30,[sp,#-16]!
754 ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input
755 ld1 {@MSG[4]-@MSG[7]},[$inp],#64
757 ld1.64 {@H[0]-@H[3]},[$ctx] // load context
772 ld1.64 {$W0},[$Ktbl],#16
784 ld1.64 {$W1},[$Ktbl],#16
801 ld1.64 {$W1},[$Ktbl],#16
808 ld1 {@MSG[0]},[$inp],#16 // load next input
829 st1.64 {@H[0]-@H[3]},[$ctx] // store context
831 ldr x29,[sp],#16
833 .size sha512_block_armv8,.-sha512_block_armv8
845 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
848 $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
860 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
863 $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
883 s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
886 s/\.\w?64\b// and s/\.16b/\.2d/g or
887 s/\.\w?32\b// and s/\.16b/\.4s/g;
888 m/\bext\b/ and s/\.2d/\.16b/g or
889 m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;