Lines Matching +full:multiply +full:- +full:accumulate

2 # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
5 # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
15 # IALU/gcc-4.9 NEON
18 # Cortex-A53 2.69/+58% 1.47
19 # Cortex-A57 2.70/+7% 1.14
21 # X-Gene 2.13/+68% 2.27
35 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
36 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
37 die "can't locate arm-xlate.pl";
85 and $s1,$s1,#-4
87 mov w#$s1,#-1
109 .size poly1305_init,.-poly1305_init
115 ands $len,$len,#-16
136 add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
158 adds $h0,$h0,$t0 // accumulate input
186 and $t0,$d2,#-4 // final reduction
200 .size poly1305_blocks,.-poly1305_blocks
224 add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
242 tst $d2,#-4 // see if it's carried/borrowed
251 adds $h0,$h0,$t0 // accumulate nonce
260 .size poly1305_emit,.-poly1305_emit
300 and $t0,$d2,#-4 // final reduction
308 .size poly1305_mult,.-poly1305_mult
313 and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
335 .size poly1305_splat,.-poly1305_splat
349 stp x29,x30,[sp,#-80]!
368 add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
385 adds $h0,$h0,$d0 // accumulate input
391 and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
417 adds $h0,$h0,$d0 // accumulate input
425 and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
432 cmp w17,#-1 // is value impossible?
488 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
524 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
543 movi $MASK.2d,#-1
587 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
633 // (hash+inp[0:1])*r^4 and accumulate
654 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
704 // [see discussion in poly1305-armv4 module]
710 add $ACC4,$ACC4,$T0.2d // h3 -> h4
712 add $ACC1,$ACC1,$T1.2d // h0 -> h1
719 add $ACC2,$ACC2,$T1.2d // h1 -> h2
725 add $ACC0,$ACC0,$T0.2d // h4 -> h0
727 add $H3,$H3,$T1.2s // h2 -> h3
735 add $H1,$H1,$T0.2s // h0 -> h1
736 add $H4,$H4,$T1.2s // h3 -> h4
745 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
794 // (hash+inp[0:1])*r^4:r^3 and accumulate
853 add $ACC4,$ACC4,$T0.2d // h3 -> h4
854 add $ACC1,$ACC1,$T1.2d // h0 -> h1
860 add $ACC2,$ACC2,$T1.2d // h1 -> h2
866 add $ACC0,$ACC0,$T0.2d // h4 -> h0
867 add $ACC3,$ACC3,$T1.2d // h2 -> h3
873 add $ACC1,$ACC1,$T0.2d // h0 -> h1
874 add $ACC4,$ACC4,$T1.2d // h3 -> h4
887 .size poly1305_blocks_neon,.-poly1305_blocks_neon
893 .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
904 s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
905 s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
910 (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
913 s/w#x([0-9]+)/w$1/g;