
# SPDX-License-Identifier: GPL-2.0

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Profiler-assisted and platform-specific optimization resulted in 16%
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was
# done about it).
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
$A="r4";
$E="r8";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
@sigma0=( 7,18, 3);
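# FIPS 180-4 rotate/shift counts; @Sigma0/@Sigma1 drive the round
# function, @sigma0/@sigma1 the message schedule (the last entry of
# each small sigma is a plain shift, not a rotate):
#   Sigma0(x) = ROTR2(x) ^ ROTR13(x) ^ ROTR22(x)
#   Sigma1(x) = ROTR6(x) ^ ROTR11(x) ^ ROTR25(x)
#   sigma0(x) = ROTR7(x) ^ ROTR18(x) ^ SHR3(x)
#   sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ SHR10(x)
# The round code below only ever rotates by the *differences* between
# these counts and applies the common rotation once at the end, using
# ror(x,n) ^ ror(x,m) = ror(x ^ ror(x,m-n), n).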
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
#if __ARM_ARCH__>=7
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
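	@ 0xf2 is the low byte of 0xc67178f2, the last K256 entry, so
	@ this compare doubles as a cheap end-of-constants test.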
# if __ARM_ARCH__>=7
	eor	$t2,$a,$b		@ a^b, b^c in next round
	eor	$t2,$a,$b		@ a^b, b^c in next round
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2		@ (b^c)&=(a^b)
	eor	$t3,$t3,$b		@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3		@ h+=Maj(a,b,c)
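# For reference, a minimal pure-Perl model of the round that
# BODY_00_15 emits; _rotr32 and _round_model are illustrative helpers
# added for this excerpt, not part of the original script:
sub _rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
sub _round_model {
	my ($a,$b,$c,$d,$e,$f,$g,$h,$k,$w) = @_;
	my $S1  = _rotr32($e,6) ^ _rotr32($e,11) ^ _rotr32($e,25);	# Sigma1(e)
	my $ch  = ($e & $f) ^ (~$e & $g);				# Ch(e,f,g)
	my $S0  = _rotr32($a,2) ^ _rotr32($a,13) ^ _rotr32($a,22);	# Sigma0(a)
	my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);			# Maj(a,b,c)
	my $T1  = ($h + $S1 + $ch + $k + $w) & 0xffffffff;
	my $T2  = ($S0 + $maj) & 0xffffffff;
	# state rotates: new a = T1+T2, new e = d+T1, the rest shifts along
	return (($T1 + $T2) & 0xffffffff, $a, $b, $c,
		($d + $T1) & 0xffffffff, $e, $f, $g);
}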
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
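# BODY_16_XX extends the message schedule in place on the stack:
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# with the same relative-rotation trick as above applied to sigma0 and
# sigma1 before the result feeds the BODY_00_15 round.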
# define __ARM_MAX_ARCH__ 7
#if __ARM_ARCH__<7
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
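@ the NEON path below relies on this zero word: its "teq $t1,#0" spots
@ it and stops fetching K256 constants at the end of the table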
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.word	OPENSSL_armcap_P-sha256_block_data_order
#if __ARM_ARCH__<7
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
# if __ARM_ARCH__>=7
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
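# unshift(@V,pop(@V)) rotates the register list one slot per round:
# the register that held h (and received this round's result) becomes
# the next round's a, so the unrolled code never needs register moves.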
#if __ARM_ARCH__>=7
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	add	$A,$A,$t0
	add	$E,$E,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	ldmia	sp!,{r4-r11,pc}
	ldmia	sp!,{r4-r11,lr}
	bx	lr			@ interoperable with Thumb ISA :-)
.size	sha256_block_data_order,.-sha256_block_data_order
my @X=map("q$_",(0..3));
my $j=0;
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
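# a NEON quadword register qN aliases the doubleword pair d(2N) and
# d(2N+1), so e.g. &Dlo("q8") returns "d16" and &Dhi("q8") returns
# "d17"; this lets the schedule code address half an X[] vector at a time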
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
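# The thunk's body is elided in this excerpt; a minimal illustrative
# version (not necessarily the file's exact code) would catch any
# undefined call such as &vshr_u32($T2,$T0,7) and render it as an
# assembler line appended to $code:
#
#	sub AUTOLOAD {
#	    my $opcode = our $AUTOLOAD;
#	    $opcode =~ s/.*:://; $opcode =~ s/_/\./;	# vshr_u32 -> vshr.u32
#	    my $arg = pop(@_);
#	    $arg = "#$arg" if ($arg =~ /^-?\d+$/);	# bare immediates get '#'
#	    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
#	}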
my ($a,$b,$c,$d,$e,$f,$g,$h);
	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	&vshr_u32	($T2,$T0,$sigma0[0]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	&vsli_32	($T2,$T0,32-$sigma0[0]);
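# NEON has no vector rotate, so each ROTR is assembled from a pair:
# vshr.u32 puts x>>n into the destination and vsli.32 ("shift left and
# insert") merges in x<<(32-n), yielding a 32-bit rotate in all four lanes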
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	&vadd_i32	($T0,$T0,@X[0]);
my ($a,$b,$c,$d,$e,$f,$g,$h);
	&vrev32_8	(@X[0],@X[0]);
	&vadd_i32	($T0,$T0,@X[0]);
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',	# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',	# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',	# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',	# h+=Ch(e,f,g)
	'&and	($t3,$t3,$t2)',	# (b^c)&=(a^b)
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',	# Maj(a,b,c)
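# these quoted statements are deliberately kept as strings: the NEON
# driver loop eval's them one at a time, interleaving one integer round
# instruction between NEON schedule instructions so both the scalar and
# the vector pipelines stay busy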
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
	bic	$H,$H,#15		@ align for 128-bit stores
	vld1.8	{@X[0]},[$inp]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	vrev32.8	@X[1],@X[1]		@ big-endian
	vadd.i32	$T0,$T0,@X[0]
	ldmia	$ctx,{$A-$H}
	ldr	$t1,[sp,#0]
	teq	$t1,#0			@ check for K256 terminator
	ldr	$t1,[sp,#0]
	vld1.8	{@X[0]},[$inp]!		@ load next input block
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	add	$A,$A,$t0		@ accumulate
	add	$E,$E,$t0
	str	$A,[$t1],#4
	stmia	$t1,{$E-$H}
	ldrne	$t1,[sp,#0]
	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif
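# the two variants exist because Thumb-2 stores a 32-bit instruction as
# two little-endian halfwords, so the byte pairs of the ARM-mode word
# must be emitted swapped; OR'ing 0xc into the top byte maps the ARM
# NEON prefix 0xf3 of these opcodes to its Thumb-2 counterpart 0xff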
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
	vld1.8	{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8	{@MSG[2]-@MSG[3]},[$inp]!
	vrev32.8	@MSG[0],@MSG[0]
for($i=0;$i<12;$i++) {
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
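	@ sha256su0/sha256su1 together produce four new schedule words
	@ per iteration, while the sha256h/sha256h2 steps (elided from
	@ this excerpt) each run four rounds on the ABCD and EFGH
	@ halves of the state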
	vadd.i32	$W0,$W0,@MSG[0]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
open SELF,$0;
		"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
		"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40 );
    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
				     |(($2&7)<<17)|(($2&8)<<4)
				     |(($3&7)<<1) |(($3&8)<<2);
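	# $1/$2/$3 are q-register numbers; qN occupies d-register 2N,
	# so the <<13, <<17 and <<1 shifts drop (q&7)*2 into the
	# Vd/Vn/Vm fields at bits 12, 16 and 0, while the (q&8) terms
	# set the D/N/M high bits (22, 7 and 5) of the encoding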
	# since ARMv7 instructions are always encoded little-endian.
	# correct solution is to use .inst directive, but older
	# assemblers don't implement it :-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
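# 0xe12fff1e is the fixed ARM-mode encoding of "bx lr"; emitting it as
# a raw data word lets an assembler invoked with -march=armv4, which
# rejects the bx mnemonic, still process the file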