Lines Matching +full:0 +full:- +full:7 +full:a +full:- +full:e
2 # SPDX-License-Identifier: GPL-2.0
21 # This code is ~4.5 (four and a half) times faster than code generated
22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
32 # Profiler-assisted and platform-specific optimization resulted in 7%
38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
44 # Technical writers asserted that the 3-way S4 pipeline can sustain
46 # not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
47 # for further details. On a side note, Cortex-A15 processes one byte in
53 # h[0-7], namely with most significant dword at *lower* address, which
54 # was reflected in the two parameters below as 0 and 4. Now the caller is
55 # expected to maintain native byte order for whole 64-bit values.
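
The convention described above, seen from the caller's side in C (a minimal sketch; the struct and its name are illustrative, not from the source):

    #include <stdint.h>

    typedef struct {
        uint64_t h[8];  /* h[0]..h[7] kept as whole 64-bit values in native byte order */
    } sha512_state;
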
60 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
81 $Aoff=8*0;
88 $Hoff=8*7;
98 str $Tlo,[sp,#$Xoff+0]
102 ldr $t2,[sp,#$Hoff+0] @ h.lo
112 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
114 ldr $t0,[sp,#$Foff+0] @ f.lo
115 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
118 ldr $t2,[sp,#$Goff+0] @ g.lo
123 str $Elo,[sp,#$Eoff+0]
127 str $Alo,[sp,#$Aoff+0]
132 eor $t1,$t1,$t3 @ Ch(e,f,g)
136 ldr $Elo,[sp,#$Doff+0] @ d.lo
137 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
140 and $t0,$t2,#0xff
143 ldr $t2,[sp,#$Boff+0] @ b.lo
147 ldr $t3,[sp,#$Coff+0] @ c.lo
148 #if __ARM_ARCH__>=7
153 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
154 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
163 eor $t0,$t0,$Ahi,lsr#7
164 eor $t1,$t1,$Alo,lsr#7
166 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
169 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
177 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
180 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
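
The LO/HI comment at file lines 153-154 spells out how each 64-bit rotation in Sigma0(a) splits into 32-bit shifts once a is held as two registers. The same decomposition in C (a sketch; the helper name is illustrative):

    #include <stdint.h>

    /* Sigma0(a) = ROTR(a,28) ^ ROTR(a,34) ^ ROTR(a,39), on split 32-bit halves */
    static inline void Sigma0_halves(uint32_t hi, uint32_t lo,
                                     uint32_t *rhi, uint32_t *rlo)
    {
        *rlo = (lo>>28 ^ hi<<4) ^ (hi>>2 ^ lo<<30) ^ (hi>>7 ^ lo<<25);
        *rhi = (hi>>28 ^ lo<<4) ^ (lo>>2 ^ hi<<30) ^ (lo>>7 ^ hi<<25);
    }
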
190 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
191 # define VFP_ABI_POP vldmia sp!,{d8-d15}
194 # define __ARM_MAX_ARCH__ 7
200 # define LO 0
204 # define HI 0
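
Out of context the two defines above look contradictory; they come from opposite branches of an endianness conditional, of which the search shows one line each. A plausible reconstruction of the shape (an assumption, not shown by the match):

    #ifdef __ARMEL__
    # define LO 0   /* little-endian: low 32-bit word at the lower address */
    # define HI 4
    #else
    # define HI 0   /* big-endian: high 32-bit word at the lower address */
    # define LO 4
    #endif
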
210 #if __ARM_ARCH__<7
224 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
225 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
226 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
227 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
228 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
229 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
230 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
231 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
232 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
233 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
234 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
235 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
236 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
237 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
238 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
239 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
240 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
241 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
242 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
243 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
244 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
245 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
246 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
247 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
248 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
249 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
250 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
251 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
252 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
253 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
254 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
255 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
256 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
257 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
258 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
259 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
260 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
261 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
262 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
263 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
264 .size K512,.-K512
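
K512 is the standard FIPS 180-4 round-constant table (the fractional parts of the cube roots of the first eighty primes); each WORD64 entry packs two constants as hi,lo 32-bit pairs. For reference, the first entries as whole 64-bit values (array name is illustrative):

    #include <stdint.h>

    static const uint64_t K512_ref[] = {
        0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
        0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
        /* ... 76 more ... */
    };
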
265 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
267 .word OPENSSL_armcap_P-sha512_block_data_order
268 .skip 32-4
277 #if __ARM_ARCH__<7
282 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
288 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
289 stmdb sp!,{r4-r12,lr}
300 str $t0, [sp,#$Goff+0]
302 str $t2, [sp,#$Hoff+0]
312 str $Tlo,[sp,#$Boff+0]
314 str $t0, [sp,#$Coff+0]
316 str $t2, [sp,#$Doff+0]
320 str $Tlo,[sp,#$Foff+0]
324 #if __ARM_ARCH__<7
325 ldrb $Tlo,[$inp,#7]
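
On pre-ARMv7 cores the input is picked up one byte at a time (ldrb, starting at offset 7), which amounts to a big-endian 64-bit load that also tolerates unaligned input. The equivalent in C (a sketch; the helper name is illustrative):

    #include <stdint.h>

    static inline uint64_t load_be64(const unsigned char *p)
    {
        uint64_t v = 0;
        for (int i = 0; i < 8; i++)
            v = (v << 8) | p[i];   /* big-endian, alignment-safe */
        return v;
    }
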
348 &BODY_00_15(0x94);
352 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
353 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
356 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
357 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
358 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
360 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
362 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
369 eor $Tlo,$Tlo,$t0,lsr#7
370 eor $Thi,$Thi,$t1,lsr#7
386 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
389 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
391 ldr $t0,[sp,#`$Xoff+8*16`+0]
400 &BODY_00_15(0x17);
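
The X-slot loads above fetch the four inputs of the SHA-512 message-schedule recurrence from the 16-entry circular buffer on the stack. The recurrence itself, in C (a sketch; names are illustrative):

    #include <stdint.h>

    #define ROTR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))

    /* W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] */
    static uint64_t sha512_schedule(const uint64_t W[], int i)
    {
        uint64_t s0 = ROTR64(W[i-15],1)  ^ ROTR64(W[i-15],8)  ^ (W[i-15] >> 7);
        uint64_t s1 = ROTR64(W[i-2],19)  ^ ROTR64(W[i-2],61)  ^ (W[i-2]  >> 6);
        return s1 + W[i-7] + s0 + W[i-16];
    }
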
402 #if __ARM_ARCH__>=7
405 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
406 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
410 ldr $Tlo,[sp,#$Boff+0]
425 ldr $Alo,[sp,#$Coff+0]
427 ldr $Tlo,[sp,#$Doff+0]
442 ldr $Tlo,[sp,#$Foff+0]
457 ldr $Alo,[sp,#$Goff+0]
459 ldr $Tlo,[sp,#$Hoff+0]
482 ldmia sp!,{r4-r12,pc}
484 ldmia sp!,{r4-r12,lr}
487 bx lr @ interoperable with Thumb ISA:-)
489 .size sha512_block_data_order,.-sha512_block_data_order
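
The entry code's `add $len,$inp,$len,lsl#7` (file line 288) shows that $len counts 128-byte blocks. A caller-side sketch; the exact prototype is an assumption based on the usual OpenSSL convention:

    #include <stddef.h>
    #include <stdint.h>

    /* state: eight native-order u64 words; num: number of 128-byte input blocks */
    extern void sha512_block_data_order(uint64_t state[8],
                                        const unsigned char *in, size_t num);
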
495 my @sigma0=(1, 8, 7);
499 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
501 my @X=map("d$_",(0..15));
502 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
506 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
510 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
514 vshr.u64 $t1,$e,#@Sigma1[1]
515 #if $i>0
516 vadd.i64 $a,$Maj @ h+=Maj from the past
518 vshr.u64 $t2,$e,#@Sigma1[2]
522 vsli.64 $t0,$e,#`64-@Sigma1[0]`
523 vsli.64 $t1,$e,#`64-@Sigma1[1]`
524 vmov $Ch,$e
525 vsli.64 $t2,$e,#`64-@Sigma1[2]`
530 vbsl $Ch,$f,$g @ Ch(e,f,g)
531 vshr.u64 $t0,$a,#@Sigma0[0]
532 veor $t2,$t1 @ Sigma1(e)
534 vshr.u64 $t1,$a,#@Sigma0[1]
535 vsli.64 $t0,$a,#`64-@Sigma0[0]`
537 vshr.u64 $t2,$a,#@Sigma0[2]
539 vsli.64 $t1,$a,#`64-@Sigma0[1]`
540 veor $Maj,$a,$b
541 vsli.64 $t2,$a,#`64-@Sigma0[2]`
544 vbsl $Maj,$c,$b @ Maj(a,b,c)
545 veor $h,$t2 @ Sigma0(a)
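
In the NEON path each vshr.u64/vsli.64 pair builds a 64-bit rotate (shift right by n, then shift-left-insert the bits that wrapped around), and vbsl gives the bit-select forms of Ch and Maj directly. The same idea with NEON intrinsics (a sketch; helper names are illustrative):

    #include <arm_neon.h>

    /* vshr + vsli = rotate right by a constant n (1..63) */
    #define VROTR64(x,n) vsli_n_u64(vshr_n_u64((x),(n)), (x), 64-(n))

    static inline uint64x1_t ch(uint64x1_t e, uint64x1_t f, uint64x1_t g)
    {
        return vbsl_u64(e, f, g);              /* (e & f) | (~e & g) */
    }

    static inline uint64x1_t maj(uint64x1_t a, uint64x1_t b, uint64x1_t c)
    {
        return vbsl_u64(veor_u64(a, b), c, b); /* matches the veor+vbsl pair above */
    }
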
557 # 2x-vectorized, therefore runs every 2nd round
558 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
561 my $e=$_[4]; # $e from NEON_00_15
564 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
565 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
566 vadd.i64 @_[0],d30 @ h+=Maj from the past
567 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
568 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
570 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
572 vshr.u64 $t0,$s0,#@sigma0[0]
577 vsli.64 $t0,$s0,#`64-@sigma0[0]`
578 vsli.64 $t1,$s0,#`64-@sigma0[1]`
581 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
583 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
585 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
592 #if __ARM_MAX_ARCH__>=7
593 .arch armv7-a
602 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
605 sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
606 vldmia $ctx,{$A-$H} @ load context
609 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
619 vadd.i64 $A,d30 @ h+=Maj from the past
620 vldmia $ctx,{d24-d31} @ load context to temp
625 vstmia $ctx,{$A-$H} @ save context
632 .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
639 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
645 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # 0xe12fff1e encodes "bx lr"; emitting the raw word makes it possible to compile with -march=armv4
648 open SELF,$0;