#! /usr/bin/env perl
# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8 on the other hand seems to trip on own
# shoelaces when handling longer carry chains. As base 2^51 has just
# single-carry pairs, it's 25% faster than base 2^64. Since PPC970 is
# pretty old, base 2^64 implementation is not engaged. Comparison to
# compiler-generated code is complicated by the fact that not all
# compilers support 128-bit integers. When compiler doesn't, like xlc,
# this module delivers more than 2x improvement, and when it does,
# from 12% to 30% improvement was measured...

# Command line: the first argument is the flavour; the remaining
# arguments are consumed until one looks like a file name with an
# extension, which becomes the output file.
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Look for the ppc-xlate.pl translator next to this script, then under
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be started instead of
# silently generating nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register names shared by both implementations: stack pointer and the
# three argument registers (result, first operand, second operand).
my $sp = "r1";
my ($rp,$ap,$bp) = map("r$_",3..5);

####################################################### base 2^64
# This section is disabled by the constant-false guard, so none of the
# x25519_fe64_* routines below are emitted (see the header comment for
# the rationale).
if (0) {
# $bi..$t1 map to r6..r12, $t2/$t3 to r22/r23, $acc0..$acc7 to r24..r31.
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
# Rows for b[1]..b[3].  @acc slides by one register per row.  On the
# first row the register that becomes the new top word has not been
# written yet, so $zero is used as its initial value instead.
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	adde	@acc[5],$zero,$zero
___
}
# Tail of x25519_fe64_mul (the upper four words are multiplied by 38
# and added to the lower four), followed by the remaining base 2^64
# routines up to the first half of x25519_fe64_tobytes.
$code.=<<___;
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	adde	$acc4,$zero,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below mark carrying into high part of
	#  multiplication result, which can't overflow, because it
	#  can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zero

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	li	$bi,38
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3

	mulld	$t0,$acc4,$bi
	mulld	$t1,$acc5,$bi
	mulld	$t2,$acc6,$bi
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function
.align	5
x25519_fe64_mul121666:
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap, $ap

	mulli	$ap,$ap,38

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	li	$t0,38			# constant for the final carry fold; li leaves CA intact
	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp

	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1

	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3

	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1

	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)

	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most signifcant bit cleared

	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most signifcant bit cleared

	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

___
# Store each 64-bit word one byte at a time, lowest byte first, using
# pre-incrementing byte stores ($rp was decremented by one above).
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}
####################################################### base 2^51
{
# $bi maps to r6, $a0..$a4 to r7..r11, $t0 to r12, $t1 to r21 and the
# five lo/hi accumulator pairs to r22..r31.
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi

	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi

	mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	mulhdu	$h4hi,$a4,$bi
	ld	$ap,8($bp)
	mulli	$a4,$a4,19

	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi

	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
# Rows for b[1]..b[3].  The names $ap and $bi are swapped before each
# row so that the word of b loaded during the previous row is the
# current multiplier while the next one is loaded into the other
# register.  @a is rotated after every row, and within each row the
# limb at @a[3] is multiplied by 19 after its last plain use.
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,@a[3],$bi
	mulhdu	$t1,@a[3],$bi
	ld	$ap,`8*($i+1)`($bp)
	mulli	@a[3],@a[3],19
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}
	($ap,$bi) = ($bi,$ap);
# Last row (b[4]) followed by the shared reduction, which the squaring
# and mul121666 routines below also branch to.
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

.Lfe51_reduce:
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	srdi	$t1,$h0lo,51
	and	$a0,$h0lo,$mask
	insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	addc	$h1lo,$h1lo,$t1
	addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	srdi	$t1,$h1lo,51
	and	$a1,$h1lo,$mask
	insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0
	mulli	$t0,$t0,19		# (h4 >> 51) * 19

	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
{
# Private copies of the register names: the squaring code below swaps
# some of them, and the swaps must not leak into x25519_fe51_mul121666.
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
	# From here on $a4 names the register holding a[4]*19 and $t1 the
	# one holding the original a[4].
	($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulli	$bp,$a3,19		# a[3]*19

	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
	# Likewise $a3 now names the register holding a[3]*19 (computed into
	# $bp above) and $t1 the one holding the original a[3].
	($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}
$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}

# Evaluate the back-quoted arithmetic expressions embedded in the
# generated text, then hand the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing the pipe is where a write error or a failing translator
# becomes visible; report it rather than exiting successfully.
close STDOUT or die "error closing STDOUT: $!";