Lines Matching +full:in0 +full:- +full:in1

2 # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
5 # Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
16 # R1x000 ~5.5/+130% (big-endian)
17 # Octeon II 2.50/+70% (little-endian)
21 # Add 32-bit code path.
25 # Modulo-scheduling reduction allows to omit dependency chain at the
30 # R1x000 ~9.8/? (big-endian)
31 # Octeon II 3.65/+140% (little-endian)
32 # MT7621/1004K 4.75/? (little-endian)
48 # - never ever touch $tp, "thread pointer", former $gp [o32 can be
50 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
52 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
72 # 64-bit code path
76 my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
134 ld $in0,0($inp)
135 ld $in1,8($inp)
141 dsllv $in0,$in0,$tmp0
142 dsrlv $tmp3,$in1,$tmp1
143 dsllv $in1,$in1,$tmp0
146 dsrlv $in0,$in0,$tmp0
147 dsllv $tmp3,$in1,$tmp1
148 dsrlv $in1,$in1,$tmp0
151 or $in0,$in0,$tmp3
152 or $in1,$in1,$tmp2
155 ldl $in0,0+MSB($inp)
156 ldl $in1,8+MSB($inp)
157 ldr $in0,0+LSB($inp)
158 ldr $in1,8+LSB($inp)
162 dsbh $in0,$in0 # byte swap
163 dsbh $in1,$in1
164 dshd $in0,$in0
165 dshd $in1,$in1
171 and $tmp1,$in0,$tmp0 # byte swap
172 and $tmp3,$in1,$tmp0
173 dsrl $tmp2,$in0,24
174 dsrl $tmp4,$in1,24
182 and $tmp2,$in0,$tmp0
183 and $tmp4,$in1,$tmp0
184 dsrl $in0,8
185 dsrl $in1,8
188 and $in0,$tmp0
189 and $in1,$tmp0
192 or $in0,$tmp1
193 or $in1,$tmp3
194 dsrl $tmp1,$in0,32
195 dsrl $tmp3,$in1,32
196 dsll $in0,32
197 dsll $in1,32
198 or $in0,$tmp1
199 or $in1,$tmp3
204 daddiu $tmp0,-63 # 0x00000000ffffffc1
206 daddiu $tmp0,-1 # 0x0ffffffc0fffffff
208 and $in0,$tmp0
209 daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
210 and $in1,$tmp0
212 sd $in0,24($ctx)
213 dsrl $tmp0,$in1,2
214 sd $in1,32($ctx)
215 daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
227 ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
249 .mask $SAVED_REGS_MASK|0x000c0000,-8
255 .mask $SAVED_REGS_MASK,-8
261 $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
292 ld $in0,0($inp) # load input
293 ld $in1,8($inp)
298 dsllv $in0,$in0,$shr
299 dsrlv $tmp3,$in1,$shl
300 dsllv $in1,$in1,$shr
303 dsrlv $in0,$in0,$shr
304 dsllv $tmp3,$in1,$shl
305 dsrlv $in1,$in1,$shr
308 or $in0,$in0,$tmp3
309 or $in1,$in1,$tmp2
312 ldl $in0,0+MSB($inp) # load input
313 ldl $in1,8+MSB($inp)
314 ldr $in0,0+LSB($inp)
315 ldr $in1,8+LSB($inp)
320 dsbh $in0,$in0 # byte swap
321 dsbh $in1,$in1
322 dshd $in0,$in0
323 dshd $in1,$in1
329 and $tmp1,$in0,$tmp0 # byte swap
330 and $tmp3,$in1,$tmp0
331 dsrl $tmp2,$in0,24
332 dsrl $tmp4,$in1,24
340 and $tmp2,$in0,$tmp0
341 and $tmp4,$in1,$tmp0
342 dsrl $in0,8
343 dsrl $in1,8
346 and $in0,$tmp0
347 and $in1,$tmp0
350 or $in0,$tmp1
351 or $in1,$tmp3
352 dsrl $tmp1,$in0,32
353 dsrl $tmp3,$in1,32
354 dsll $in0,32
355 dsll $in1,32
356 or $in0,$tmp1
357 or $in1,$tmp3
360 dsrl $tmp1,$h2,2 # modulo-scheduled reduction
364 daddu $d0,$h0,$in0 # accumulate input
369 daddu $d1,$h1,$in1
430 $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
461 li $in0,-4 # final reduction
462 dsrl $in1,$tmp2,2
463 and $in0,$tmp2
465 daddu $in0,$in1
467 daddu $tmp0,$tmp0,$in0
468 sltu $in1,$tmp0,$in0
469 daddiu $in0,$tmp0,5 # compare to modulus
470 daddu $tmp1,$tmp1,$in1
471 sltiu $tmp3,$in0,5
472 sltu $tmp4,$tmp1,$in1
473 daddu $in1,$tmp1,$tmp3
475 sltu $tmp3,$in1,$tmp3
481 xor $in0,$tmp0
482 xor $in1,$tmp1
483 and $in0,$tmp2
484 and $in1,$tmp2
485 xor $in0,$tmp0
486 xor $in1,$tmp1
497 daddu $in0,$tmp0 # accumulate nonce
498 daddu $in1,$tmp2
499 sltu $tmp0,$in0,$tmp0
500 daddu $in1,$tmp0
502 dsrl $tmp0,$in0,8 # write mac value
503 dsrl $tmp1,$in0,16
504 dsrl $tmp2,$in0,24
505 sb $in0,0($mac)
506 dsrl $tmp3,$in0,32
508 dsrl $tmp0,$in0,40
510 dsrl $tmp1,$in0,48
512 dsrl $tmp2,$in0,56
514 dsrl $tmp3,$in1,8
516 dsrl $tmp0,$in1,16
518 dsrl $tmp1,$in1,24
521 sb $in1,8($mac)
522 dsrl $tmp2,$in1,32
524 dsrl $tmp3,$in1,40
526 dsrl $tmp0,$in1,48
528 dsrl $tmp1,$in1,56
537 .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
543 # 32-bit code path
547 my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
608 lw $in0,0($inp)
609 lw $in1,4($inp)
617 sllv $in0,$in0,$tmp0
618 srlv $tmp3,$in1,$tmp1
619 sllv $in1,$in1,$tmp0
620 or $in0,$in0,$tmp3
623 or $in1,$in1,$tmp3
630 srlv $in0,$in0,$tmp0
631 sllv $tmp3,$in1,$tmp1
632 srlv $in1,$in1,$tmp0
633 or $in0,$in0,$tmp3
636 or $in1,$in1,$tmp3
645 lwl $in0,0+MSB($inp)
646 lwl $in1,4+MSB($inp)
649 lwr $in0,0+LSB($inp)
650 lwr $in1,4+LSB($inp)
656 wsbh $in0,$in0 # byte swap
657 wsbh $in1,$in1
660 rotr $in0,$in0,16
661 rotr $in1,$in1,16
665 srl $tmp0,$in0,24 # byte swap
666 srl $tmp1,$in0,8
667 andi $tmp2,$in0,0xFF00
668 sll $in0,$in0,24
671 or $in0,$tmp0
672 srl $tmp0,$in1,24
674 srl $tmp2,$in1,8
675 or $in0,$tmp1
676 andi $tmp1,$in1,0xFF00
677 sll $in1,$in1,24
680 or $in1,$tmp0
684 or $in1,$tmp2
705 and $in0,$in0,$tmp0
707 and $in1,$in1,$tmp0
711 sw $in0,20($ctx)
712 sw $in1,24($ctx)
716 srl $tmp1,$in1,2
719 addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
722 sw $in1,36($ctx)
746 .mask $SAVED_REGS_MASK,-4
758 $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
893 srl $t0,$h4,2 # modulo-scheduled reduction
1140 $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1169 li $in0,-4 # final reduction
1171 and $in0,$in0,$tmp4
1173 addu $ctx,$ctx,$in0
1177 addiu $in0,$tmp0,5 # compare to modulus
1179 sltiu $in1,$in0,5
1181 addu $in1,$in1,$tmp1
1183 sltu $in2,$in1,$tmp1
1197 xor $in0,$tmp0
1198 xor $in1,$tmp1
1201 and $in0,$ctx
1202 and $in1,$ctx
1205 xor $in0,$tmp0
1206 xor $in1,$tmp1
1215 addu $in0,$tmp0 # accumulate nonce
1216 sltu $ctx,$in0,$tmp0
1218 addu $in1,$tmp1
1219 sltu $tmp1,$in1,$tmp1
1220 addu $in1,$ctx
1221 sltu $ctx,$in1,$ctx
1233 srl $tmp0,$in0,8 # write mac value
1234 srl $tmp1,$in0,16
1235 srl $tmp2,$in0,24
1236 sb $in0, 0($mac)
1238 srl $tmp0,$in1,8
1240 srl $tmp1,$in1,16
1242 srl $tmp2,$in1,24
1243 sb $in1, 4($mac)
1265 .asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"