Lines Matching +full:- +full:d2
2 # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
5 # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
9 # IALU(*)/gcc-4.4 NEON
11 # ARM11xx(ARMv6) 7.78/+100% -
12 # Cortex-A5 6.35/+130% 3.00
13 # Cortex-A8 6.25/+115% 2.36
14 # Cortex-A9 5.10/+95% 2.55
15 # Cortex-A15 3.85/+85% 1.25(**)
18 # (*) this is for -march=armv6, i.e. with a bunch of ldrb loading data;
19 # (**) these are trade-off results, they can be improved by ~8% but at
20 # the cost of 15/12% regression on Cortex-A5/A7, it's even possible
21 # to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
24 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
25 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
29 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
30 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
31 die "can't locate arm-xlate.pl";
68 stmdb sp!,{r4-r11}
87 mov r3,#-1
97 and r3,r10,#-4 @ 0x0ffffffc
133 orr r11,r11,#1 @ thumb-ify addresses
136 add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
138 addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
139 addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
167 ldmia sp!,{r4-r11}
173 bx lr @ interoperable with Thumb ISA:-)
175 .size poly1305_init,.-poly1305_init
186 stmdb sp!,{r3-r11,lr}
188 ands $len,$len,#-16
195 ldmia $ctx,{$h0-$r3} @ load context
201 ldmia $ctx!,{$h0-$h4} @ load hash value
205 adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
222 ldmia $ctx,{$r0-$r3} @ load key
242 ldrb r1,[lr,#-15]
243 ldrb r2,[lr,#-14]
244 ldrb r3,[lr,#-13]
246 ldrb r0,[lr,#-12]
248 ldrb r1,[lr,#-11]
250 ldrb r2,[lr,#-10]
253 ldrb r3,[lr,#-9]
255 ldrb r0,[lr,#-8]
257 ldrb r1,[lr,#-7]
259 ldrb r2,[lr,#-6]
262 ldrb r3,[lr,#-5]
264 ldrb r0,[lr,#-4]
266 ldrb r1,[lr,#-3]
268 ldrb r2,[lr,#-2]
271 ldrb r3,[lr,#-1]
281 ldr r1,[lr,#-12]
282 ldr r2,[lr,#-8]
283 ldr r3,[lr,#-4]
334 adds $h2,lr,r0 @ d2+=d1>>32
337 adds $h3,r2,r1 @ d3+=d2>>32
342 and r1,$h4,#-4
356 stmdb $ctx,{$h0-$h4} @ store the result
360 ldmia sp!,{r3-r11,pc}
362 ldmia sp!,{r3-r11,lr}
365 bx lr @ interoperable with Thumb ISA:-)
367 .size poly1305_blocks,.-poly1305_blocks
380 stmdb sp!,{r4-r11}
382 ldmia $ctx,{$h0-$h4}
387 adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
484 ldmia sp!,{r4-r11}
490 bx lr @ interoperable with Thumb ISA:-)
492 .size poly1305_emit,.-poly1305_emit
496 my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
510 cmp r3,#-1 @ is value impossible?
518 and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
550 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
556 vmull.u32 $D2,$R2,${R0}[1]
562 vmlal.u32 $D2,$R1,${R1}[1]
569 vmlal.u32 $D2,$R0,${R2}[1]
575 vmlal.u32 $D2,$R4,${S3}[1]
581 vmlal.u32 $D2,$R3,${S4}[1]
593 @ Result of multiplication of n-bit number by m-bit number is
594 @ n+m bits wide. However! Even though 2^n is an n+1-bit number,
595 @ m-bit number multiplied by 2^n is still n+m bits wide.
597 @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
598 @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
606 @ of 52-bit numbers as long as the number of addends is not a
618 @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
619 @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
622 @ instruction accepts 2x32-bit input and writes 2x64-bit result.
624 @ loop wrap-around. This can be done in the process of reduction
626 @ 128-bit instructions, which benefits low-end processors], but
637 vadd.i64 $D4,$D4,$T0 @ h3 -> h4
639 vadd.i64 $D1,$D1,$T1 @ h0 -> h1
646 vadd.i64 $D2,$D2,$T1 @ h1 -> h2
652 vshrn.u64 $T1#lo,$D2,#26
653 vmovn.i64 $D2#lo,$D2
654 vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
655 vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
656 vbic.i32 $D2#lo,#0xfc000000
662 vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
663 vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
672 vtrn.32 $R2,$D2#lo
703 vshl.u32 $S2,$D2#lo,#2
704 vmov $R2,$D2#lo
710 vadd.i32 $S2,$S2,$D2#lo
723 .size poly1305_init_neon,.-poly1305_init_neon
734 stmdb sp!,{r4-r7}
735 vstmdb sp!,{d8-d15} @ ABI specification says so
740 stmdb sp!,{r1-r3,lr}
749 and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
757 veor $D2#lo,$D2#lo,$D2#lo
771 vmov.32 $D2#lo[0],r4
776 ldmia sp!,{r1-r3,lr}
786 veor $D2#lo,$D2#lo,$D2#lo
789 vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
811 vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
828 vadd.i32 $H2#hi,$H2#lo,$D2#lo
860 vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
899 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
906 vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
907 vmull.u32 $D2,$H2#hi,${R0}[1]
912 vmlal.u32 $D2,$H1#hi,${R1}[1]
931 vmlal.u32 $D2,$H0#hi,${R2}[1]
937 vmlal.u32 $D2,$H4#hi,${S3}[1]
943 vmlal.u32 $D2,$H3#hi,${S4}[1]
955 vmlal.u32 $D2,$H2#lo,${R0}[0]
962 vmlal.u32 $D2,$H1#lo,${R1}[0]
968 vmlal.u32 $D2,$H0#lo,${R2}[0]
976 vmlal.u32 $D2,$H4#lo,${S3}[0]
981 vmlal.u32 $D2,$H3#lo,${S4}[0]
993 @ lazy reduction interleaved with base 2^32 -> base 2^26 of
994 @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
1000 vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1002 vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
1003 vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1011 vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1020 vshrn.u64 $T1#lo,$D2,#26
1021 vmovn.i64 $D2#lo,$D2
1022 vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
1024 vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
1026 vbic.i32 $D2#lo,#0xfc000000
1029 vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
1036 vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
1037 vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
1053 vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
1063 vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
1064 vmull.u32 $D2,$H2#hi,$R0
1080 vmlal.u32 $D2,$H1#hi,$R1
1088 vmlal.u32 $D2,$H0#hi,$R2
1098 vmlal.u32 $D2,$H4#hi,$S3
1101 vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
1106 vmlal.u32 $D2,$H3#hi,$S4
1116 vmlal.u32 $D2,$H2#lo,$R0
1128 vmlal.u32 $D2,$H1#lo,$R1
1136 vmlal.u32 $D2,$H0#lo,$R2
1142 vmlal.u32 $D2,$H4#lo,$S3
1145 vorn $MASK,$MASK,$MASK @ all-ones
1150 vmlal.u32 $D2,$H3#lo,$S4
1160 vadd.i64 $D2#lo,$D2#lo,$D2#hi
1169 vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1170 vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1176 vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1180 vshr.u64 $T1,$D2,#26
1181 vand.i64 $D2,$D2,$MASK
1182 vadd.i64 $D0,$D0,$T0 @ h4 -> h0
1183 vadd.i64 $D3,$D3,$T1 @ h2 -> h3
1189 vadd.i64 $D1,$D1,$T0 @ h0 -> h1
1190 vadd.i64 $D4,$D4,$T1 @ h3 -> h4
1198 vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
1201 vldmia sp!,{d8-d15} @ epilogue
1202 ldmia sp!,{r4-r7}
1204 .size poly1305_blocks_neon,.-poly1305_blocks_neon
1214 .word OPENSSL_armcap_P-.Lpoly1305_init
1223 .asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
1230 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
1232 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4