#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#			IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling reduction allows to omit dependency chain at the
# end of inner loop and improve performance. Also optimize MIPS32R2
# code path for MIPS 1004K core. Per René von Dorst's suggestions.
#
#			IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
# NOTE: these map symbolic register names (as used in the heredocs
# below) to raw "$N" register numbers under the NUBI layout.
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

$flavour = shift || "64";	# supported flavours are o32,n32,64,nubi32,nubi64

# Return-value register: NUBI returns in $a0, the other ABIs in $t0
# (see the coding rules above).
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

# The generated assembly is accumulated into $code and printed at the
# very end of the script. Everything inside the <<___ heredocs is
# emitted text; Perl only interpolates the register aliases.
if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

# $ctx points at the state: bytes 0..23 hold the 130-bit accumulator
# h0..h2, bytes 24..47 hold the clamped key r0,r1 and the precomputed
# s1 = r1 + (r1>>2) (stored by poly1305_init below).
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# The cpp macros below paper over the pre-R6 vs R6 multiply difference:
# pre-R6 uses hi/lo-based dmultu + mflo/mfhi, R6 uses the three-operand
# dmulu/dmuhu (and the dmultu macro expands to nothing).
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)		dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init poly1305_block_init
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
# poly1305_blocks: processes $len bytes as whole 16-byte blocks; the
# reduction is modulo-scheduled, i.e. the carry from one iteration is
# folded in at the start of the next (see ".Loop" below).
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef	MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	daddu	$d2,$h2,$padbit
	sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	daddu	$d2,$tmp1
	daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	daddu	$h0,$tmp0
	daddu	$h1,$tmp1
	sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	daddu	$h1,$tmp0
	daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	daddu	$h1,$tmp0
	daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
# poly1305_emit: final reduction mod 2^130-5, add the 128-bit nonce and
# store the 16-byte tag byte-by-byte (so $mac may be unaligned).
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#

# 32-bit state layout in $ctx: bytes 0..19 hold the accumulator
# h0..h4, bytes 20..35 the clamped key r0..r3, bytes 36..47 the
# precomputed sN = rN + (rN >> 2) for N = 1..3.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

# Same multiply-macro trick as the 64-bit path: pre-R6 hi/lo-based
# multu/mflo/mfhi vs R6 three-operand mulu/muhu.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init poly1305_block_init
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in1,8
	or	$in0,$tmp1
	andi	$tmp1,$in1,0xFF00
	sll	$in1,$in1,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in1,$tmp0
	srl	$tmp0,$in2,24
	or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in3,8
	or	$in2,$tmp1
	andi	$tmp1,$in3,0xFF00
	sll	$in3,$in3,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in3,$tmp0
	or	$tmp2,$tmp1
	or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
# 32-bit poly1305_blocks. Note that $shr (R6 unaligned-input shift)
# and $one (R2 maddu constant) deliberately share $t2 — the two uses
# are mutually exclusive at compile time.
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,16*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
$code.=<<___;
	.set	reorder

	srl	$len,4			# number of complete blocks
	li	$one,1
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef	MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)		# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	srl	$at,$d1,24
	or	$t0,$t1
	srl	$t1,$d1,8
	or	$d0,$t0
	andi	$t0,$d1,0xFF00
	sll	$d1,$d1,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d1,$at
	srl	$at,$d2,24
	or	$t1,$t0
	srl	$t0,$d2,8
	or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	srl	$at,$d3,24
	or	$t0,$t1
	srl	$t1,$d3,8
	or	$d2,$t0
	andi	$t0,$d3,0xFF00
	sll	$d3,$d3,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d3,$at
	or	$t1,$t0
	or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	sltu	$h2,$d3,$h2
	addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	addu	$h4,$h4,$padbit
	addiu	$inp,$inp,16
	addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	addu	$h0,$h0,$a3
	addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	sltu	$a3,$h0,$a3
	addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	addu	$h1,$h1,$a3
	addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	sltu	$a3,$h1,$a3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at


	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	addu	$h2,$h2,$at
	addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	sltu	$at,$h2,$at
	addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3


	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	addu	$h3,$h3,$a3
	addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
# 32-bit poly1305_emit. Once the state words are loaded, $ctx itself
# is reused as a scratch/carry register (see "srl $ctx,$tmp4,2").
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3
	addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

# Redirect stdout to the output file named by the last remaining
# argument (if any), then dump the accumulated assembly.
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT;