#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
# ====================================================================
#
# Poly1305 hash for RISC-V.
#
# February 2019
#
# In essence it's a pretty straightforward transliteration of the MIPS
# module [without the big-endian option].
#
# 1.8 cycles per byte on U74, >100% faster than compiler-generated
# code. 1.9 cpb on C910, ~75% improvement. 3.3 cpb on Spacemit X60,
# ~69% improvement.
#
# June 2024
#
# Add CHERI support.
#
######################################################################
#
($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
#
######################################################################

$flavour = shift || "64";

for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";

$code.=<<___;
#ifdef __KERNEL__
# ifdef __riscv_zicfilp
#  undef __riscv_zicfilp		// calls are expected to be direct
# endif
#endif

#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
# define __riscv_misaligned_fast 1
#endif
___

if ($flavour =~ /64/) {{{
######################################################################
# 64-bit code path...
#
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);

$code.=<<___;
#if __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sd
#  define POP	ld
# endif
#else
# error "unsupported __riscv_xlen"
#endif

.option	pic
.text

.globl	poly1305_init
.type	poly1305_init,\@function
poly1305_init:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#ifndef	__riscv_misaligned_fast
	andi	$tmp0,$inp,7		# $inp % 8
	andi	$inp,$inp,-8		# align $inp
	slli	$tmp0,$tmp0,3		# byte to bit offset
#endif
	ld	$in0,0($inp)
	ld	$in1,8($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$tmp0,.Laligned_key

	ld	$tmp2,16($inp)
	neg	$tmp1,$tmp0		# implicit &63 in sll
	srl	$in0,$in0,$tmp0
	sll	$tmp3,$in1,$tmp1
	srl	$in1,$in1,$tmp0
	sll	$tmp2,$tmp2,$tmp1
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2

.Laligned_key:
#endif
	li	$tmp0,1
	slli	$tmp0,$tmp0,32		# 0x0000000100000000
	addi	$tmp0,$tmp0,-63		# 0x00000000ffffffc1
	slli	$tmp0,$tmp0,28		# 0x0ffffffc10000000
	addi	$tmp0,$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$in0,$tmp0
	addi	$tmp0,$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$in1,$tmp0

	sd	$in0,24($ctx)
	srli	$tmp0,$in1,2
	sd	$in1,32($ctx)
	add	$tmp0,$tmp0,$in1	# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$a0,0			# return 0
	ret
.size	poly1305_init,.-poly1305_init
___
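######################################################################
# Clamping masks the key halves with 0x0ffffffc0fffffff (r0) and
# 0x0ffffffc0ffffffc (r1), and s1 = r1 + (r1>>2) precomputes r1*5/4,
# which is exact because clamping zeroes the two least significant
# bits of r1. It's useful because 2^130 == 5 (mod 2^130-5), so a
# partial product at bit 128 folds back to bit 0 as a multiple of
# 5/4. In perl terms the above amounts to (a sketch only, never
# executed, assuming 64-bit perl):
#
#	sub poly1305_key_sketch {
#	    my ($r0,$r1) = @_;		# key halves, loaded little-endian
#	    $r0 &= 0x0ffffffc0fffffff;	# constant built by li/slli/addi
#	    $r1 &= 0x0ffffffc0ffffffc;
#	    my $s1 = $r1 + ($r1>>2);	# r1*5/4, division is exact
#	    return ($r0,$r1,$s1);	# stored at 24/32/40($ctx)
#	}
#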
{
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
my ($shr,$shl) = ($t5,$t6);		# used if input is misaligned

$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
poly1305_blocks:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	andi	$len,$len,-16		# complete blocks only
	beqz	$len,.Lno_data

	caddi	$sp,$sp,-4*__SIZEOF_POINTER__
	PUSH	$s0,3*__SIZEOF_POINTER__($sp)
	PUSH	$s1,2*__SIZEOF_POINTER__($sp)
	PUSH	$s2,1*__SIZEOF_POINTER__($sp)
	PUSH	$s3,0*__SIZEOF_POINTER__($sp)

#ifndef	__riscv_misaligned_fast
	andi	$shr,$inp,7
	andi	$inp,$inp,-8		# align $inp
	slli	$shr,$shr,3		# byte to bit offset
	neg	$shl,$shr		# implicit &63 in sll
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	add	$len,$len,$inp		# end of buffer

.Loop:
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
	srl	$in0,$in0,$shr
	sll	$tmp3,$in1,$shl
	srl	$in1,$in1,$shr
	sll	$tmp2,$tmp2,$shl
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2

.Laligned_inp:
#endif
	caddi	$inp,$inp,16

	andi	$tmp0,$h2,-4		# modulo-scheduled reduction
	srli	$tmp1,$h2,2
	andi	$h2,$h2,3

	add	$d0,$h0,$in0		# accumulate input
	add	$tmp1,$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	add	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	add	$d1,$h1,$in1
	add	$tmp0,$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	add	$d1,$d1,$tmp0

	add	$d2,$h2,$padbit
	sltu	$tmp0,$d1,$tmp0
	mulhu	$h1,$r0,$d0		# h0*r0
	mul	$h0,$r0,$d0

	add	$d2,$d2,$tmp1
	add	$d2,$d2,$tmp0
	mulhu	$tmp1,$rs1,$d1		# h1*5*r1
	mul	$tmp0,$rs1,$d1

	mulhu	$h2,$r1,$d0		# h0*r1
	mul	$tmp2,$r1,$d0
	add	$h0,$h0,$tmp0
	add	$h1,$h1,$tmp1
	sltu	$tmp0,$h0,$tmp0

	add	$h1,$h1,$tmp0
	add	$h1,$h1,$tmp2
	mulhu	$tmp1,$r0,$d1		# h1*r0
	mul	$tmp0,$r0,$d1

	sltu	$tmp2,$h1,$tmp2
	add	$h2,$h2,$tmp2
	mul	$tmp2,$rs1,$d2		# h2*5*r1

	add	$h1,$h1,$tmp0
	add	$h2,$h2,$tmp1
	mul	$tmp3,$r0,$d2		# h2*r0
	sltu	$tmp0,$h1,$tmp0
	add	$h2,$h2,$tmp0

	add	$h1,$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	add	$h2,$h2,$tmp2
	add	$h2,$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	POP	$s0,3*__SIZEOF_POINTER__($sp)	# epilogue
	POP	$s1,2*__SIZEOF_POINTER__($sp)
	POP	$s2,1*__SIZEOF_POINTER__($sp)
	POP	$s3,0*__SIZEOF_POINTER__($sp)
	caddi	$sp,$sp,4*__SIZEOF_POINTER__

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks
___
}
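######################################################################
# poly1305_blocks keeps h = h0 + 2^64*h1 + 2^128*h2 only partially
# reduced: whatever accumulates in h2 beyond its bottom two bits is
# folded back at the top of the next iteration as
# 5*(h2>>2) = (h2 & ~3) + (h2>>2), once more using 2^130 == 5
# (mod 2^130-5). The 2x2-limb schoolbook multiplication handles the
# h1*r1 term, which lands at 2^128, by multiplying with rs1 = r1*5/4
# at bit 0 instead. One iteration amounts to (a sketch only, never
# executed):
#
#	use Math::BigInt;
#	sub poly1305_block_sketch {
#	    my ($h,$r,$blk,$padbit) = @_;	# Math::BigInt values
#	    my $p = Math::BigInt->new(1)->blsft(130)->bsub(5);
#	    $h->badd($blk)->badd(Math::BigInt->new($padbit)->blsft(128));
#	    return $h->bmul($r)->bmod($p);
#	}
#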
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
poly1305_emit:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	andi	$in0,$tmp2,-4		# final reduction
	srl	$in1,$tmp2,2
	andi	$tmp2,$tmp2,3
	add	$in0,$in0,$in1

	add	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	addi	$in0,$tmp0,5		# compare to modulus
	add	$tmp1,$tmp1,$in1
	sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	add	$in1,$tmp1,$tmp3
	add	$tmp2,$tmp2,$tmp4
	sltu	$tmp3,$in1,$tmp3
	add	$tmp2,$tmp2,$tmp3

	srli	$tmp2,$tmp2,2		# see if it carried/borrowed
	neg	$tmp2,$tmp2

	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	and	$in0,$in0,$tmp2
	and	$in1,$in1,$tmp2
	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	slli	$tmp1,$tmp1,32
	slli	$tmp3,$tmp3,32
	or	$tmp0,$tmp0,$tmp1
	or	$tmp2,$tmp2,$tmp3

	add	$in0,$in0,$tmp0		# accumulate nonce
	add	$in1,$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	add	$in1,$in1,$tmp0

#ifdef	__riscv_misaligned_fast
	sd	$in0,0($mac)		# write mac value
	sd	$in1,8($mac)
#else
	srli	$tmp0,$in0,8		# write mac value
	srli	$tmp1,$in0,16
	srli	$tmp2,$in0,24
	sb	$in0,0($mac)
	srli	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	srli	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	srli	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	srli	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	srli	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	srli	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	srli	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	srli	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	srli	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	srli	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	srli	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)
#endif

	ret
.size	poly1305_emit,.-poly1305_emit
.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
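######################################################################
# The final reduction in poly1305_emit computes both h and h+5; if
# h+5 carries out of bit 130, then h >= 2^130-5 and h+5 (mod 2^128)
# is the canonical residue, otherwise h is. The carry surfaces as
# bit 2 of the top limb, so negating (top>>2) yields an all-ones or
# all-zero mask, and the xor/and/xor sequence is a constant-time
# select (a sketch only, never executed):
#
#	$mask = -(($hp5 >> 130) & 1);		# -1 if h+5 overflowed
#	$res  = (($hp5 ^ $h) & $mask) ^ $h;	# pick $hp5 or $h, no branch
#	$mac  = ($res + $nonce) % 2**128;	# emitted little-endian
#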
}}} else {{{
######################################################################
# 32-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);

$code.=<<___;
#if __riscv_xlen == 32
# if __SIZEOF_POINTER__ == 8
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sw
#  define POP	lw
# endif
# define MULX(hi,lo,a,b)	mulhu hi,a,b; mul lo,a,b
# define srliw	srli
# define srlw	srl
# define sllw	sll
# define addw	add
# define addiw	addi
# define mulw	mul
#elif __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sd
#  define POP	ld
# endif
# define MULX(hi,lo,a,b)	slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
#else
# error "unsupported __riscv_xlen"
#endif

.option	pic
.text

.globl	poly1305_init
.type	poly1305_init,\@function
poly1305_init:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#ifndef	__riscv_misaligned_fast
	andi	$tmp0,$inp,3		# $inp % 4
	sub	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
#endif
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	sub	$tmp1,$zero,$tmp0
	srlw	$in0,$in0,$tmp0
	sllw	$tmp3,$in1,$tmp1
	srlw	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllw	$tmp3,$in2,$tmp1
	srlw	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllw	$tmp3,$in3,$tmp1
	srlw	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllw	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
.Laligned_key:
#endif

	lui	$tmp0,0x10000
	addi	$tmp0,$tmp0,-1		# 0x0fffffff
	and	$in0,$in0,$tmp0
	addi	$tmp0,$tmp0,-3		# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srlw	$tmp1,$in1,2
	srlw	$tmp2,$in2,2
	srlw	$tmp3,$in3,2
	addw	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addw	$in2,$in2,$tmp2
	addw	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$a0,0
	ret
.size	poly1305_init,.-poly1305_init
___
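######################################################################
# The 32-bit path splits the state over five 32-bit limbs h0..h4 and
# the key over four limbs r0..r3, with s1..s3 = r1..r3 + (r1..r3>>2)
# precomputed just as in the 64-bit path. MULX(hi,lo,a,b) abstracts
# a 32x32->64-bit multiplication:
#
#	# MULX(hi,lo,a,b): (hi,lo) = (a*b >> 32, a*b & 0xffffffff)
#
# On RV32 it's a mulhu/mul pair, while on RV64 a single mul of the
# zero-extended b yields the whole product, which addiw/srai then
# split into sign-extended halves; a is always a clamped key limb or
# the small h4, hence non-negative, and the addw/sltu carry chains
# work consistently on sign-extended 32-bit values, so that suffices.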
{
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $ra;				# used if input is misaligned

$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
poly1305_blocks:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	andi	$len,$len,-16		# complete blocks only
	beqz	$len,.Labort

#ifdef	__riscv_zcmp
	cm.push	{ra,s0-s8}, -48
#else
	caddi	$sp,$sp,-__SIZEOF_POINTER__*12
	PUSH	$ra, __SIZEOF_POINTER__*11($sp)
	PUSH	$s0, __SIZEOF_POINTER__*10($sp)
	PUSH	$s1, __SIZEOF_POINTER__*9($sp)
	PUSH	$s2, __SIZEOF_POINTER__*8($sp)
	PUSH	$s3, __SIZEOF_POINTER__*7($sp)
	PUSH	$s4, __SIZEOF_POINTER__*6($sp)
	PUSH	$s5, __SIZEOF_POINTER__*5($sp)
	PUSH	$s6, __SIZEOF_POINTER__*4($sp)
	PUSH	$s7, __SIZEOF_POINTER__*3($sp)
	PUSH	$s8, __SIZEOF_POINTER__*2($sp)
#endif

#ifndef	__riscv_misaligned_fast
	andi	$shr,$inp,3
	andi	$inp,$inp,-4		# align $inp
	slli	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	add	$len,$len,$inp		# end of buffer

.Loop:
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
#ifndef	__riscv_misaligned_fast
	beqz	$shr,.Laligned_inp

	lw	$t4,16($inp)
	sub	$t5,$zero,$shr
	srlw	$d0,$d0,$shr
	sllw	$t3,$d1,$t5
	srlw	$d1,$d1,$shr
	or	$d0,$d0,$t3
	sllw	$t3,$d2,$t5
	srlw	$d2,$d2,$shr
	or	$d1,$d1,$t3
	sllw	$t3,$d3,$t5
	srlw	$d3,$d3,$shr
	or	$d2,$d2,$t3
	sllw	$t4,$t4,$t5
	or	$d3,$d3,$t4

.Laligned_inp:
#endif
	srliw	$t3,$h4,2		# modulo-scheduled reduction
	andi	$t4,$h4,-4
	andi	$h4,$h4,3

	addw	$d0,$d0,$h0		# accumulate input
	addw	$t4,$t4,$t3
	sltu	$h0,$d0,$h0
	addw	$d0,$d0,$t4		# ... and residue
	sltu	$t4,$d0,$t4

	addw	$d1,$d1,$h1
	addw	$h0,$h0,$t4		# carry
	sltu	$h1,$d1,$h1
	addw	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addw	$d2,$d2,$h2
	addw	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addw	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addw	$d3,$d3,$h3
	addw	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addw	$d3,$d3,$h2

	MULX	($h1,$h0,$r0,$d0)	# d0*r0

	sltu	$h2,$d3,$h2
	addw	$h3,$h3,$h2		# carry

	MULX	($t4,$t3,$rs3,$d1)	# d1*s3

	addw	$h4,$h4,$padbit
	caddi	$inp,$inp,16
	addw	$h4,$h4,$h3

	MULX	($t6,$a3,$rs2,$d2)	# d2*s2
	addw	$h0,$h0,$t3
	addw	$h1,$h1,$t4
	sltu	$t3,$h0,$t3
	addw	$h1,$h1,$t3

	MULX	($t4,$t3,$rs1,$d3)	# d3*s1
	addw	$h0,$h0,$a3
	addw	$h1,$h1,$t6
	sltu	$a3,$h0,$a3
	addw	$h1,$h1,$a3


	MULX	($h2,$a3,$r1,$d0)	# d0*r1
	addw	$h0,$h0,$t3
	addw	$h1,$h1,$t4
	sltu	$t3,$h0,$t3
	addw	$h1,$h1,$t3

	MULX	($t4,$t3,$r0,$d1)	# d1*r0
	addw	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	addw	$h2,$h2,$a3

	MULX	($t6,$a3,$rs3,$d2)	# d2*s3
	addw	$h1,$h1,$t3
	addw	$h2,$h2,$t4
	sltu	$t3,$h1,$t3
	addw	$h2,$h2,$t3

	MULX	($t4,$t3,$rs2,$d3)	# d3*s2
	addw	$h1,$h1,$a3
	addw	$h2,$h2,$t6
	sltu	$a3,$h1,$a3
	addw	$h2,$h2,$a3

	mulw	$a3,$rs1,$h4		# h4*s1
	addw	$h1,$h1,$t3
	addw	$h2,$h2,$t4
	sltu	$t3,$h1,$t3
	addw	$h2,$h2,$t3


	MULX	($h3,$t3,$r2,$d0)	# d0*r2
	addw	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	addw	$h2,$h2,$a3

	MULX	($t6,$a3,$r1,$d1)	# d1*r1
	addw	$h2,$h2,$t3
	sltu	$t3,$h2,$t3
	addw	$h3,$h3,$t3

	MULX	($t4,$t3,$r0,$d2)	# d2*r0
	addw	$h2,$h2,$a3
	addw	$h3,$h3,$t6
	sltu	$a3,$h2,$a3
	addw	$h3,$h3,$a3

	MULX	($t6,$a3,$rs3,$d3)	# d3*s3
	addw	$h2,$h2,$t3
	addw	$h3,$h3,$t4
	sltu	$t3,$h2,$t3
	addw	$h3,$h3,$t3

	mulw	$t3,$rs2,$h4		# h4*s2
	addw	$h2,$h2,$a3
	addw	$h3,$h3,$t6
	sltu	$a3,$h2,$a3
	addw	$h3,$h3,$a3


	MULX	($t6,$a3,$r3,$d0)	# d0*r3
	addw	$h2,$h2,$t3
	sltu	$t3,$h2,$t3
	addw	$h3,$h3,$t3

	MULX	($t4,$t3,$r2,$d1)	# d1*r2
	addw	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addw	$t6,$t6,$a3

	MULX	($a3,$d3,$r0,$d3)	# d3*r0
	addw	$h3,$h3,$t3
	addw	$t6,$t6,$t4
	sltu	$t3,$h3,$t3
	addw	$t6,$t6,$t3

	MULX	($t4,$t3,$r1,$d2)	# d2*r1
	addw	$h3,$h3,$d3
	addw	$t6,$t6,$a3
	sltu	$d3,$h3,$d3
	addw	$t6,$t6,$d3

	mulw	$a3,$rs3,$h4		# h4*s3
	addw	$h3,$h3,$t3
	addw	$t6,$t6,$t4
	sltu	$t3,$h3,$t3
	addw	$t6,$t6,$t3


	mulw	$h4,$r0,$h4		# h4*r0
	addw	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addw	$t6,$t6,$a3
	addw	$h4,$t6,$h4

	li	$padbit,1		# if we loop, padbit is 1

	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

#ifdef	__riscv_zcmp
	cm.popret	{ra,s0-s8}, 48
#else
	POP	$ra, __SIZEOF_POINTER__*11($sp)
	POP	$s0, __SIZEOF_POINTER__*10($sp)
	POP	$s1, __SIZEOF_POINTER__*9($sp)
	POP	$s2, __SIZEOF_POINTER__*8($sp)
	POP	$s3, __SIZEOF_POINTER__*7($sp)
	POP	$s4, __SIZEOF_POINTER__*6($sp)
	POP	$s5, __SIZEOF_POINTER__*5($sp)
	POP	$s6, __SIZEOF_POINTER__*4($sp)
	POP	$s7, __SIZEOF_POINTER__*3($sp)
	POP	$s8, __SIZEOF_POINTER__*2($sp)
	caddi	$sp,$sp,__SIZEOF_POINTER__*12
#endif
.Labort:
	ret
.size	poly1305_blocks,.-poly1305_blocks
___
}
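######################################################################
# When the Zcmp extension is available, cm.push/cm.popret replace the
# whole prologue and epilogue: cm.push {ra,s0-s8},-48 allocates the
# frame and spills ra,s0-s8 in one instruction, and cm.popret restores
# them, releases the frame and returns. poly1305_blocks makes no
# calls; $ra is saved only because it doubles as $shr in the
# misaligned-input path. The twelve-slot frame in the fallback path
# keeps the stack 16-byte aligned, as the ABI requires.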
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
poly1305_emit:
#ifdef	__riscv_zicfilp
	lpad	0
#endif
	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	srliw	$ctx,$tmp4,2		# final reduction
	andi	$in0,$tmp4,-4
	andi	$tmp4,$tmp4,3
	addw	$ctx,$ctx,$in0

	addw	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiw	$in0,$tmp0,5		# compare to modulus
	addw	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addw	$in1,$in1,$tmp1
	addw	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addw	$in2,$in2,$tmp2
	addw	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addw	$in3,$in3,$tmp3
	addw	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3
	addw	$ctx,$ctx,$tmp4

	srl	$ctx,$ctx,2		# see if it carried/borrowed
	sub	$ctx,$zero,$ctx

	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	xor	$in2,$in2,$tmp2
	xor	$in3,$in3,$tmp3
	and	$in0,$in0,$ctx
	and	$in1,$in1,$ctx
	and	$in2,$in2,$ctx
	and	$in3,$in3,$ctx
	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	xor	$in2,$in2,$tmp2
	xor	$in3,$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addw	$in0,$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addw	$in1,$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addw	$in1,$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addw	$ctx,$ctx,$tmp1

	addw	$in2,$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addw	$in2,$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addw	$ctx,$ctx,$tmp2

	addw	$in3,$in3,$tmp3
	addw	$in3,$in3,$ctx

#ifdef	__riscv_misaligned_fast
	sw	$in0,0($mac)		# write mac value
	sw	$in1,4($mac)
	sw	$in2,8($mac)
	sw	$in3,12($mac)
#else
	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)
#endif

	ret
.size	poly1305_emit,.-poly1305_emit
.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}}

foreach (split("\n", $code)) {
	if ($flavour =~ /^cheri/) {	# rewrite for capability mode
		s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;	# loads/stores via capabilities
		s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
		s/\b(ret|jal)\b/c$1/;
		s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
		m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
	} else {
		s/\bcaddi?\b/add/ or	# plain RISC-V: pointer arithmetic is ordinary arithmetic
		s/\bcmove\b/mv/;
	}
	print $_, "\n";
}

close STDOUT;