#! /usr/bin/env perl
# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
# project. The module is dual licensed under OpenSSL and CRYPTOGAMS
# licenses depending on where you obtain it. For further details see
# https://github.com/dot-asm/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
#			-m32		-m64
#
# Freescale e300	14.8/+80%	-
# PPC74x0		7.60/+60%	-
# PPC970		7.00/+114%	3.51/+205%
# POWER7		3.75/+260%	1.93/+100%
# POWER8		-		2.03/+200%
# POWER9		-		2.00/+150%
#
# Do we need a floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
# compiler-generated code. On the other hand it's known that floating-
# point performance can be dominated by FPU latency, which means that
# there is a limit even for ideally optimized (and even vectorized)
# code. And this limit is estimated to be higher than the above -m64
# results. In other words, a floating-point implementation is worth
# considering only in a 32-bit application context. We probably have to
# recognize that 32-bit builds are getting less popular on high-end
# systems and therefore tend to target embedded ones, which might not
# even have an FPU...
#
# On a side note, Power ISA 2.07 enables a vector base 2^26
# implementation, and POWER8 might have the capacity to break the
# 1.0 cycle per byte barrier...
#
# January 2019
#
# ... Unfortunately not:-( The estimate was a projection of the ARM
# result, but ARM has a vector multiply-and-add instruction, while
# PowerISA does not, at least not one usable in this context.
# Improvement is ~40% over the -m64 result above, i.e. ~1.43 cycles
# per byte on little-endian systems.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T;

$sp="r1";
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($mac,$nonce)=($inp,$len);
my $mask = "r0";

$code=<<___;
.machine	"any"
.text
___
					if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
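	# Each 16-byte block m is absorbed as h = (h + m + padbit*2^128)*r
	# modulo 2^130-5. The 130-bit h lives in h0:h1:h2, with h2 holding
	# the topmost bits; in the reduction below, bits of h2 above the
	# low 2 are folded back in multiplied by 5, since 2^130 = 5 mod p.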
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
{
my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));

$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	$h0,0($ctx)	# load hash value base 2^26
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	lwz	r0,24($ctx)	# is_base2_26

	sldi	$h1,$h1,26	# base 2^26 -> base 2^64
	sldi	$t0,$h2,52
	srdi	$h2,$h2,12
	sldi	$h3,$h3,14
	add	$h0,$h0,$h1
	addc	$h0,$h0,$t0
	sldi	$t0,$h4,40
	srdi	$h4,$h4,24
	adde	$h1,$h2,$h3
	addc	$h1,$h1,$t0
	addze	$h2,$h4

	ld	$h3,0($ctx)	# load hash value base 2^64
	ld	$h4,8($ctx)
	ld	$t0,16($ctx)

	neg	r0,r0
	xor	$h0,$h0,$h3	# choose between radixes
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0
	and	$h0,$h0,r0
	and	$h1,$h1,r0
	and	$h2,$h2,r0
	xor	$h0,$h0,$h3
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0

	addic	$h3,$h0,5	# compare to modulus
	addze	$h4,$h1
	addze	$t0,$h2

	srdi	$t0,$t0,2	# see if it carried/borrowed
	neg	$t0,$t0

	andc	$h0,$h0,$t0
	and	$h3,$h3,$t0
	andc	$h1,$h1,$t0
	and	$h4,$h4,$t0
	or	$h0,$h0,$h3
	or	$h1,$h1,$h4

	lwz	$t0,4($nonce)
	lwz	$h2,12($nonce)
	lwz	$h3,0($nonce)
	lwz	$h4,8($nonce)

	insrdi	$h3,$t0,32,0
	insrdi	$h4,$h2,32,0

	addc	$h0,$h0,$h3	# accumulate nonce
	adde	$h1,$h1,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

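	# Write the 16-byte tag byte by byte, little-endian regardless of
	# host endianness: one cursor (pre-decremented ctx) walks bytes
	# 0-7 emitting h0, the other (mac advanced by 7) walks bytes 8-15
	# emitting h1.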
	stbu	$h0,1($ctx)	# write [little-endian] result
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	stbu	$h1,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
}					} else {
###############################################################################
# base 2^32 implementation

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..12,14..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	adde	$h3,$h3,$d3
	adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h4,$s1		# h4*s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
{
my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));

$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	r0,24($ctx)	# is_base2_26
	lwz	$h0,0($ctx)	# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	cmplwi	r0,0
	beq	Lemit_base2_32

	slwi	$t0,$h1,26	# base 2^26 -> base 2^32
	srwi	$h1,$h1,6
	slwi	$t1,$h2,20
	srwi	$h2,$h2,12
	addc	$h0,$h0,$t0
	slwi	$t0,$h3,14
	srwi	$h3,$h3,18
	adde	$h1,$h1,$t1
	slwi	$t1,$h4,8
	srwi	$h4,$h4,24
	adde	$h2,$h2,$t0
	adde	$h3,$h3,$t1
	addze	$h4,$h4

Lemit_base2_32:
	addic	r0,$h0,5	# compare to modulus
	addze	r0,$h1
	addze	r0,$h2
	addze	r0,$h3
	addze	r0,$h4

	srwi	r0,r0,2		# see if it carried/borrowed
	neg	r0,r0
	andi.	r0,r0,5

	addc	$h0,$h0,r0
	lwz	r0,0($nonce)
	addze	$h1,$h1
	lwz	$t0,4($nonce)
	addze	$h2,$h2
	lwz	$t1,8($nonce)
	addze	$h3,$h3
	lwz	$h4,12($nonce)

	addc	$h0,$h0,r0	# accumulate nonce
	adde	$h1,$h1,$t0
	adde	$h2,$h2,$t1
	adde	$h3,$h3,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

	stbu	$h0,1($ctx)	# write [little-endian] result
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	stbu	$h2,1($mac)

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	stbu	$h3,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
}					}
{{{
########################################################################
# PowerISA 2.07/VSX section						#
########################################################################

my $LOCALS= 6*$SIZE_T;
my $VSXFRAME = $LOCALS + 6*$SIZE_T;
   $VSXFRAME += 128;	# local variables
   $VSXFRAME += 12*16;	# v20-v31 offload

my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;

########################################################################
# The layout of the opaque area is as follows:
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 pad;
#	unsigned __int32 is_base2_26, pad;
#	unsigned __int64 r[2];		# key value base 2^64
#	struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
#
# where r^n are base 2^26 digits of powers of the multiplier key.
# There are 5 digits, but the last four are interleaved with multiples
# of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3,
# r4, 5*r4. The order of powers is as they appear in a register, not
# in memory.
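#
# For orientation, the same area viewed as a C struct (an illustrative
# sketch only; the code addresses fields by byte offset, not by name):
#
#	typedef struct {
#		uint32_t h[5];		/* hash, base 2^26, offset 0     */
#		uint32_t pad;
#		uint32_t is_base2_26;	/* offset 24                     */
#		uint32_t pad2;
#		uint64_t r[2];		/* key, base 2^64, offset 32     */
#		uint32_t r_pow[9][4];	/* offset 48, {r^2,r^4,r^1,r^3}  */
#	} poly1305_ctx;
#
# Each of the 9 rows holds one base 2^26 digit (or 5*digit) of all four
# powers, ready for lvx_splt/lvx_u in the VSX code path below.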
my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
my ($ctx_,$_ctx,$const) = map("r$_",(10..12));

					if ($flavour =~ /64/) {
###############################################################################
# setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms,
# but the base 2^26 computational part is the same...

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
my $mask = "r0";

$code.=<<___;
.globl	.poly1305_blocks_vsx
.align	5
.poly1305_blocks_vsx:
	lwz	r7,24($ctx)		# is_base2_26
	cmpldi	$len,128
	bge	__poly1305_blocks_vsx

	neg	r0,r7			# is_base2_26 as mask
	lwz	r7,0($ctx)		# load hash base 2^26
	lwz	r8,4($ctx)
	lwz	r9,8($ctx)
	lwz	r10,12($ctx)
	lwz	r11,16($ctx)

	sldi	r8,r8,26		# base 2^26 -> base 2^64
	sldi	r12,r9,52
	add	r7,r7,r8
	srdi	r9,r9,12
	sldi	r10,r10,14
	addc	r7,r7,r12
	sldi	r8,r11,40
	adde	r9,r9,r10
	srdi	r11,r11,24
	addc	r9,r9,r8
	addze	r11,r11

	ld	r8,0($ctx)		# load hash base 2^64
	ld	r10,8($ctx)
	ld	r12,16($ctx)

	xor	r7,r7,r8		# select between radixes
	xor	r9,r9,r10
	xor	r11,r11,r12
	and	r7,r7,r0
	and	r9,r9,r0
	and	r11,r11,r0
	xor	r7,r7,r8
	xor	r9,r9,r10
	xor	r11,r11,r12

	li	r0,0
	std	r7,0($ctx)		# store hash base 2^64
	std	r9,8($ctx)
	std	r11,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	b	Lpoly1305_blocks
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx

.align	5
__poly1305_mul:
	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_mul,.-__poly1305_mul

.align	5
__poly1305_splat:
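	# Split the base 2^64 value h0:h1:h2 into five 26-bit digits and
	# store each digit (and, for the upper four, also 5*digit) as one
	# 32-bit column of the 9-row power table at ctx+48; t1 points at
	# the column belonging to the power being recorded.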
	extrdi	$d0,$h0,26,38
	extrdi	$d1,$h0,26,12
	stw	$d0,0x00($t1)

	extrdi	$d2,$h0,12,0
	slwi	$d0,$d1,2
	stw	$d1,0x10($t1)
	add	$d0,$d0,$d1		# * 5
	stw	$d0,0x20($t1)

	insrdi	$d2,$h1,14,38
	slwi	$d0,$d2,2
	stw	$d2,0x30($t1)
	add	$d0,$d0,$d2		# * 5
	stw	$d0,0x40($t1)

	extrdi	$d1,$h1,26,24
	extrdi	$d2,$h1,24,0
	slwi	$d0,$d1,2
	stw	$d1,0x50($t1)
	add	$d0,$d0,$d1		# * 5
	stw	$d0,0x60($t1)

	insrdi	$d2,$h2,3,37
	slwi	$d0,$d2,2
	stw	$d2,0x70($t1)
	add	$d0,$d0,$d2		# * 5
	stw	$d0,0x80($t1)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_splat,.-__poly1305_splat

.align	5
__poly1305_blocks_vsx:
	$STU	$sp,-$VSXFRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mfspr	r12,256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# save vrsave
	li	r12,-1
	mtspr	256,r12			# preserve all AltiVec registers
	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)

	bl	LPICmeup

	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	lvx_u	$mask26,$x00,$const
	lvx_u	$_26,$x10,$const
	lvx_u	$_40,$x20,$const
	lvx_u	$I2perm,$x30,$const
	lvx_u	$padbits,$x40,$const

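	# First call (is_base2_26 still clear): build the power table.
	# r^1 is splatted as-is, then __poly1305_mul advances the running
	# power by r, so r^2, r^3 and r^4 each land in their own 32-bit
	# column, ready for 4-way vector multiplication.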
	cmplwi	r7,0			# is_base2_26?
	bne	Lskip_init_vsx

	ld	$r0,32($ctx)		# load key base 2^64
	ld	$r1,40($ctx)
	srdi	$s1,$r1,2
	li	$mask,3
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2

	mr	$h0,$r0			# "calculate" r^1
	mr	$h1,$r1
	li	$h2,0
	addi	$t1,$ctx,`48+(12^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^2
	addi	$t1,$ctx,`48+(4^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^3
	addi	$t1,$ctx,`48+(8^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^4
	addi	$t1,$ctx,`48+(0^$BIG_ENDIAN)`
	bl	__poly1305_splat

	ld	$h0,0($ctx)		# load hash
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	extrdi	$d0,$h0,26,38		# base 2^64 -> base 2^26
	extrdi	$d1,$h0,26,12
	extrdi	$d2,$h0,12,0
	mtvrwz	$H0,$d0
	insrdi	$d2,$h1,14,38
	mtvrwz	$H1,$d1
	extrdi	$d1,$h1,26,24
	mtvrwz	$H2,$d2
	extrdi	$d2,$h1,24,0
	mtvrwz	$H3,$d1
	insrdi	$d2,$h2,3,37
	mtvrwz	$H4,$d2
___
					} else {
###############################################################################
# 32-bit initialization

my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);

$code.=<<___;
.globl	.poly1305_blocks_vsx
.align	5
.poly1305_blocks_vsx:
	lwz	r7,24($ctx)		# is_base2_26
	cmplwi	$len,128
	bge	__poly1305_blocks_vsx
	cmplwi	r7,0
	beq	Lpoly1305_blocks

	lwz	$h0,0($ctx)		# load hash
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	slwi	$t0,$h1,26		# base 2^26 -> base 2^32
	srwi	$h1,$h1,6
	slwi	$t1,$h2,20
	srwi	$h2,$h2,12
	addc	$h0,$h0,$t0
	slwi	$t0,$h3,14
	srwi	$h3,$h3,18
	adde	$h1,$h1,$t1
	slwi	$t1,$h4,8
	srwi	$h4,$h4,24
	adde	$h2,$h2,$t0
	li	$t0,0
	adde	$h3,$h3,$t1
	addze	$h4,$h4

	stw	$h0,0($ctx)		# store hash base 2^32
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)
	stw	$t0,24($ctx)		# clear is_base2_26

	b	Lpoly1305_blocks
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx

.align	5
__poly1305_mul:
	vmulouw	$ACC0,$H0,$R0
	vmulouw	$ACC1,$H1,$R0
	vmulouw	$ACC2,$H2,$R0
	vmulouw	$ACC3,$H3,$R0
	vmulouw	$ACC4,$H4,$R0

	vmulouw	$T0,$H4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H3,$R1
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H2,$R2
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H1,$R3
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H0,$R4
	vaddudm	$ACC4,$ACC4,$T0

	################################################################
	# lazy reduction
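	# (carry propagation: each limb's overflow is added to the limb
	# above; the top limb's overflow re-enters at the bottom
	# multiplied by 5, i.e. carry + (carry << 2), since
	# 2^130 = 5 mod 2^130-5)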

	vspltisb	$T0,2
	vsrd	$H4,$ACC3,$_26
	vsrd	$H1,$ACC0,$_26
	vand	$H3,$ACC3,$mask26
	vand	$H0,$ACC0,$mask26
	vaddudm	$H4,$H4,$ACC4		# h3 -> h4
	vaddudm	$H1,$H1,$ACC1		# h0 -> h1

	vsrd	$ACC4,$H4,$_26
	vsrd	$ACC1,$H1,$_26
	vand	$H4,$H4,$mask26
	vand	$H1,$H1,$mask26
	vaddudm	$H0,$H0,$ACC4
	vaddudm	$H2,$ACC2,$ACC1		# h1 -> h2

	vsld	$ACC4,$ACC4,$T0		# <<2
	vsrd	$ACC2,$H2,$_26
	vand	$H2,$H2,$mask26
	vaddudm	$H0,$H0,$ACC4		# h4 -> h0
	vaddudm	$H3,$H3,$ACC2		# h2 -> h3

	vsrd	$ACC0,$H0,$_26
	vsrd	$ACC3,$H3,$_26
	vand	$H0,$H0,$mask26
	vand	$H3,$H3,$mask26
	vaddudm	$H1,$H1,$ACC0		# h0 -> h1
	vaddudm	$H4,$H4,$ACC3		# h3 -> h4

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_mul,.-__poly1305_mul

.align	5
__poly1305_blocks_vsx:
	$STU	$sp,-$VSXFRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mfspr	r12,256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# save vrsave
	li	r12,-1
	mtspr	256,r12			# preserve all AltiVec registers
	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)

	bl	LPICmeup

	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	lvx_u	$mask26,$x00,$const
	lvx_u	$_26,$x10,$const
	lvx_u	$_40,$x20,$const
	lvx_u	$I2perm,$x30,$const
	lvx_u	$padbits,$x40,$const

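	# First call: build the power table with vector multiplications.
	# The base 2^32 key is recoded to base 2^26; one __poly1305_mul
	# squares r into r^2, a second multiplies r^2:r^2 by r^2:r^1 to
	# get r^4:r^3, and vmrgow interleaves the word lanes so the
	# stored powers read r^2:r^4:r^1:r^3.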
	cmplwi	r7,0			# is_base2_26?
	bne	Lskip_init_vsx

	lwz	$h1,32($ctx)		# load key base 2^32
	lwz	$h2,36($ctx)
	lwz	$h3,40($ctx)
	lwz	$h4,44($ctx)

	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
	extrwi	$h1,$h1,6,0
	insrwi	$h1,$h2,20,6
	extrwi	$h2,$h2,12,0
	insrwi	$h2,$h3,14,6
	extrwi	$h3,$h3,18,0
	insrwi	$h3,$h4,8,6
	extrwi	$h4,$h4,24,0

	mtvrwz	$R0,$h0
	slwi	$h0,$h1,2
	mtvrwz	$R1,$h1
	add	$h1,$h1,$h0
	mtvrwz	$S1,$h1
	slwi	$h1,$h2,2
	mtvrwz	$R2,$h2
	add	$h2,$h2,$h1
	mtvrwz	$S2,$h2
	slwi	$h2,$h3,2
	mtvrwz	$R3,$h3
	add	$h3,$h3,$h2
	mtvrwz	$S3,$h3
	slwi	$h3,$h4,2
	mtvrwz	$R4,$h4
	add	$h4,$h4,$h3
	mtvrwz	$S4,$h4

	vmr	$H0,$R0
	vmr	$H1,$R1
	vmr	$H2,$R2
	vmr	$H3,$R3
	vmr	$H4,$R4

	bl	__poly1305_mul		# r^1:- * r^1:-

	vpermdi	$R0,$H0,$R0,0b00
	vpermdi	$R1,$H1,$R1,0b00
	vpermdi	$R2,$H2,$R2,0b00
	vpermdi	$R3,$H3,$R3,0b00
	vpermdi	$R4,$H4,$R4,0b00
	vpermdi	$H0,$H0,$H0,0b00
	vpermdi	$H1,$H1,$H1,0b00
	vpermdi	$H2,$H2,$H2,0b00
	vpermdi	$H3,$H3,$H3,0b00
	vpermdi	$H4,$H4,$H4,0b00
	vsld	$S1,$R1,$T0		# <<2
	vsld	$S2,$R2,$T0
	vsld	$S3,$R3,$T0
	vsld	$S4,$R4,$T0
	vaddudm	$S1,$S1,$R1
	vaddudm	$S2,$S2,$R2
	vaddudm	$S3,$S3,$R3
	vaddudm	$S4,$S4,$R4

	bl	__poly1305_mul		# r^2:r^2 * r^2:r^1

	addi	$h0,$ctx,0x60
	lwz	$h1,0($ctx)		# load hash
	lwz	$h2,4($ctx)
	lwz	$h3,8($ctx)
	lwz	$h4,12($ctx)
	lwz	$t0,16($ctx)

	vmrgow	$R0,$R0,$H0		# r^2:r^4:r^1:r^3
	vmrgow	$R1,$R1,$H1
	vmrgow	$R2,$R2,$H2
	vmrgow	$R3,$R3,$H3
	vmrgow	$R4,$R4,$H4
	vslw	$S1,$R1,$T0		# <<2
	vslw	$S2,$R2,$T0
	vslw	$S3,$R3,$T0
	vslw	$S4,$R4,$T0
	vadduwm	$S1,$S1,$R1
	vadduwm	$S2,$S2,$R2
	vadduwm	$S3,$S3,$R3
	vadduwm	$S4,$S4,$R4

	stvx_u	$R0,$x30,$ctx
	stvx_u	$R1,$x40,$ctx
	stvx_u	$S1,$x50,$ctx
	stvx_u	$R2,$x00,$h0
	stvx_u	$S2,$x10,$h0
	stvx_u	$R3,$x20,$h0
	stvx_u	$S3,$x30,$h0
	stvx_u	$R4,$x40,$h0
	stvx_u	$S4,$x50,$h0

	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
	extrwi	$h1,$h1,6,0
	mtvrwz	$H0,$h0
	insrwi	$h1,$h2,20,6
	extrwi	$h2,$h2,12,0
	mtvrwz	$H1,$h1
	insrwi	$h2,$h3,14,6
	extrwi	$h3,$h3,18,0
	mtvrwz	$H2,$h2
	insrwi	$h3,$h4,8,6
	extrwi	$h4,$h4,24,0
	mtvrwz	$H3,$h3
	insrwi	$h4,$t0,3,5
	mtvrwz	$H4,$h4
___
					}
$code.=<<___;
	li	r0,1
	stw	r0,24($ctx)		# set is_base2_26
	b	Loaded_vsx

.align	4
Lskip_init_vsx:
	li	$x10,4
	li	$x20,8
	li	$x30,12
	li	$x40,16
	lvwzx_u	$H0,$x00,$ctx
	lvwzx_u	$H1,$x10,$ctx
	lvwzx_u	$H2,$x20,$ctx
	lvwzx_u	$H3,$x30,$ctx
	lvwzx_u	$H4,$x40,$ctx

Loaded_vsx:
	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	li	$x60,0x60
	li	$x70,0x70
	addi	$ctx_,$ctx,64		# &ctx->r[1]
	addi	$_ctx,$sp,`$LOCALS+15`	# &ctx->r[1], r^2:r^4 shadow

	vxor	$T0,$T0,$T0		# ensure second half is zero
	vpermdi	$H0,$H0,$T0,0b00
	vpermdi	$H1,$H1,$T0,0b00
	vpermdi	$H2,$H2,$T0,0b00
	vpermdi	$H3,$H3,$T0,0b00
	vpermdi	$H4,$H4,$T0,0b00

	be?lvx_u	$_4,$x50,$const		# byte swap mask
	lvx_u	$T1,$x00,$inp		# load first input block
	lvx_u	$T2,$x10,$inp
	lvx_u	$T3,$x20,$inp
	lvx_u	$T4,$x30,$inp
	be?vperm	$T1,$T1,$T1,$_4
	be?vperm	$T2,$T2,$T2,$_4
	be?vperm	$T3,$T3,$T3,$_4
	be?vperm	$T4,$T4,$T4,$_4
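
	# Recode two 16-byte blocks per vector into base 2^26: limbs 0-1
	# come from the low doubleword, limb 2 straddles the doubleword
	# boundary (hence the I2perm byte-permute plus 4-bit shift), and
	# limbs 3-4 come from the high doubleword; vmrgow then pairs the
	# blocks as inp[2]:inp[0]:inp[3]:inp[1] to match the power order.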
	vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
	vspltisb	$_4,4
	vperm	$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	vspltisb	$_14,14
	vpermdi	$I3,$T1,$T2,0b11

	vsrd	$I1,$I0,$_26
	vsrd	$I2,$I2,$_4
	vsrd	$I4,$I3,$_40
	vsrd	$I3,$I3,$_14
	vand	$I0,$I0,$mask26
	vand	$I1,$I1,$mask26
	vand	$I2,$I2,$mask26
	vand	$I3,$I3,$mask26

	vpermdi	$T1,$T3,$T4,0b00
	vperm	$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$T3,$T3,$T4,0b11

	vsrd	$T0,$T1,$_26
	vsrd	$T2,$T2,$_4
	vsrd	$T4,$T3,$_40
	vsrd	$T3,$T3,$_14
	vand	$T1,$T1,$mask26
	vand	$T0,$T0,$mask26
	vand	$T2,$T2,$mask26
	vand	$T3,$T3,$mask26

	# inp[2]:inp[0]:inp[3]:inp[1]
	vmrgow	$I4,$T4,$I4
	vmrgow	$I0,$T1,$I0
	vmrgow	$I1,$T0,$I1
	vmrgow	$I2,$T2,$I2
	vmrgow	$I3,$T3,$I3
	vor	$I4,$I4,$padbits

	lvx_splt	$R0,$x30,$ctx		# taking lvx_splt out of loop
	lvx_splt	$R1,$x00,$ctx_		# gives ~8% improvement
	lvx_splt	$S1,$x10,$ctx_
	lvx_splt	$R2,$x20,$ctx_
	lvx_splt	$S2,$x30,$ctx_
	lvx_splt	$T1,$x40,$ctx_
	lvx_splt	$T2,$x50,$ctx_
	lvx_splt	$T3,$x60,$ctx_
	lvx_splt	$T4,$x70,$ctx_
	stvx	$R1,$x00,$_ctx
	stvx	$S1,$x10,$_ctx
	stvx	$R2,$x20,$_ctx
	stvx	$S2,$x30,$_ctx
	stvx	$T1,$x40,$_ctx
	stvx	$T2,$x50,$_ctx
	stvx	$T3,$x60,$_ctx
	stvx	$T4,$x70,$_ctx

	addi	$inp,$inp,0x40
	addi	$const,$const,0x50
	addi	r0,$len,-64
	srdi	r0,r0,6
	mtctr	r0
	b	Loop_vsx

.align	4
Loop_vsx:
	################################################################
	## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	## \___________________/
	##
	## Note that we start with inp[2:3]*r^2. This is because it
	## doesn't depend on reduction in previous iteration.
	################################################################
	## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
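	## (the 5*r_j terms come from the precomputed s_j = 5*r_j limbs;
	## they account for limb products wrapping past 2^130, since
	## 2^130 = 5 mod 2^130-5)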

	vmuleuw	$ACC0,$I0,$R0
	vmuleuw	$ACC1,$I0,$R1
	vmuleuw	$ACC2,$I0,$R2
	vmuleuw	$ACC3,$I1,$R2

	vmuleuw	$T0,$I1,$R0
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$ACC4,$I2,$R2
	vmuleuw	$T0,$I4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S3,$x50,$_ctx
	vmuleuw	$T0,$I3,$R1
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R3,$x40,$_ctx

	vaddudm	$H2,$H2,$I2
	vaddudm	$H0,$H0,$I0
	vaddudm	$H3,$H3,$I3
	vaddudm	$H1,$H1,$I1
	vaddudm	$H4,$H4,$I4

	vmuleuw	$T0,$I3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I2,$R0
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I3,$R0
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S4,$x70,$_ctx
	vmuleuw	$T0,$I4,$R0
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R4,$x60,$_ctx

	vmuleuw	$T0,$I2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I1,$R3
	vaddudm	$ACC4,$ACC4,$T0

	be?lvx_u	$_4,$x00,$const		# byte swap mask
	lvx_u	$T1,$x00,$inp		# load next input block
	lvx_u	$T2,$x10,$inp
	lvx_u	$T3,$x20,$inp
	lvx_u	$T4,$x30,$inp
	be?vperm	$T1,$T1,$T1,$_4
	be?vperm	$T2,$T2,$T2,$_4
	be?vperm	$T3,$T3,$T3,$_4
	be?vperm	$T4,$T4,$T4,$_4

	vmuleuw	$T0,$I1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I0,$R4
	vaddudm	$ACC4,$ACC4,$T0

	vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
	vspltisb	$_4,4
	vperm	$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$I3,$T1,$T2,0b11

	# (hash + inp[0:1]) * r^4
	vmulouw	$T0,$H0,$R0
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H1,$R0
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H2,$R0
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H3,$R0
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H4,$R0
	vaddudm	$ACC4,$ACC4,$T0

	vpermdi	$T1,$T3,$T4,0b00
	vperm	$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$T3,$T3,$T4,0b11

	vmulouw	$T0,$H2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S1,$x10,$_ctx
	vmulouw	$T0,$H1,$R3
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R1,$x00,$_ctx

	vsrd	$I1,$I0,$_26
	vsrd	$I2,$I2,$_4
	vsrd	$I4,$I3,$_40
	vsrd	$I3,$I3,$_14

	vmulouw	$T0,$H1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S2,$x30,$_ctx
	vmulouw	$T0,$H0,$R4
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R2,$x20,$_ctx

	vand	$I0,$I0,$mask26
	vand	$I1,$I1,$mask26
	vand	$I2,$I2,$mask26
	vand	$I3,$I3,$mask26

	vmulouw	$T0,$H4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H3,$R1
	vaddudm	$ACC4,$ACC4,$T0

	vsrd	$T2,$T2,$_4
	vsrd	$_4,$T1,$_26
	vsrd	$T4,$T3,$_40
	vsrd	$T3,$T3,$_14

	vmulouw	$T0,$H3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H2,$R2
	vaddudm	$ACC4,$ACC4,$T0

	vand	$T1,$T1,$mask26
	vand	$_4,$_4,$mask26
	vand	$T2,$T2,$mask26
	vand	$T3,$T3,$mask26

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe

	vspltisb	$T0,2
	vsrd	$H4,$ACC3,$_26
	vsrd	$H1,$ACC0,$_26
	vand	$H3,$ACC3,$mask26
	vand	$H0,$ACC0,$mask26
	vaddudm	$H4,$H4,$ACC4		# h3 -> h4
	vaddudm	$H1,$H1,$ACC1		# h0 -> h1

	vmrgow	$I4,$T4,$I4
	vmrgow	$I0,$T1,$I0
	vmrgow	$I1,$_4,$I1
	vmrgow	$I2,$T2,$I2
	vmrgow	$I3,$T3,$I3
	vor	$I4,$I4,$padbits

	vsrd	$ACC4,$H4,$_26
	vsrd	$ACC1,$H1,$_26
	vand	$H4,$H4,$mask26
	vand	$H1,$H1,$mask26
	vaddudm	$H0,$H0,$ACC4
	vaddudm	$H2,$ACC2,$ACC1		# h1 -> h2

	vsld	$ACC4,$ACC4,$T0		# <<2
	vsrd	$ACC2,$H2,$_26
	vand	$H2,$H2,$mask26
	vaddudm	$H0,$H0,$ACC4		# h4 -> h0
	vaddudm	$H3,$H3,$ACC2		# h2 -> h3

	vsrd	$ACC0,$H0,$_26
	vsrd	$ACC3,$H3,$_26
	vand	$H0,$H0,$mask26
	vand	$H3,$H3,$mask26
	vaddudm	$H1,$H1,$ACC0		# h0 -> h1
	vaddudm	$H4,$H4,$ACC3		# h3 -> h4

	addi	$inp,$inp,0x40
	bdnz	Loop_vsx

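	# The counted loop leaves the 64 bytes it just loaded plus up to
	# 48 more unprocessed. len becomes the shortfall of the last
	# 64-byte chunk; inp is rewound by that much so one final 64-byte
	# load covers the true tail, and the "magic tail masks" selected
	# via const+len below zero the redundant lane(s).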
	neg	$len,$len
	andi.	$len,$len,0x30
	sub	$inp,$inp,$len

	lvx_u	$R0,$x30,$ctx		# load all powers
	lvx_u	$R1,$x00,$ctx_
	lvx_u	$S1,$x10,$ctx_
	lvx_u	$R2,$x20,$ctx_
	lvx_u	$S2,$x30,$ctx_

Last_vsx:
	vmuleuw	$ACC0,$I0,$R0
	vmuleuw	$ACC1,$I1,$R0
	vmuleuw	$ACC2,$I2,$R0
	vmuleuw	$ACC3,$I3,$R0
	vmuleuw	$ACC4,$I4,$R0

	vmuleuw	$T0,$I4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S3,$x50,$ctx_
	vmuleuw	$T0,$I3,$R1
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R3,$x40,$ctx_

	vaddudm	$H2,$H2,$I2
	vaddudm	$H0,$H0,$I0
	vaddudm	$H3,$H3,$I3
	vaddudm	$H1,$H1,$I1
	vaddudm	$H4,$H4,$I4

	vmuleuw	$T0,$I3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S4,$x70,$ctx_
	vmuleuw	$T0,$I2,$R2
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R4,$x60,$ctx_

	vmuleuw	$T0,$I2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I1,$R3
	vaddudm	$ACC4,$ACC4,$T0

	vmuleuw	$T0,$I1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I0,$R4
	vaddudm	$ACC4,$ACC4,$T0

	# (hash + inp[0:1]) * r^4
	vmulouw	$T0,$H0,$R0
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H1,$R0
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H2,$R0
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H3,$R0
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H4,$R0
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S1,$x10,$ctx_
	vmulouw	$T0,$H1,$R3
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R1,$x00,$ctx_

	vmulouw	$T0,$H1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S2,$x30,$ctx_
	vmulouw	$T0,$H0,$R4
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R2,$x20,$ctx_

	vmulouw	$T0,$H4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H3,$R1
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H2,$R2
	vaddudm	$ACC4,$ACC4,$T0

	################################################################
	# horizontal addition

	vpermdi	$H0,$ACC0,$ACC0,0b10
	vpermdi	$H1,$ACC1,$ACC1,0b10
	vpermdi	$H2,$ACC2,$ACC2,0b10
	vpermdi	$H3,$ACC3,$ACC3,0b10
	vpermdi	$H4,$ACC4,$ACC4,0b10
	vaddudm	$ACC0,$ACC0,$H0
	vaddudm	$ACC1,$ACC1,$H1
	vaddudm	$ACC2,$ACC2,$H2
	vaddudm	$ACC3,$ACC3,$H3
	vaddudm	$ACC4,$ACC4,$H4
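	# (vpermdi swapped the doubleword halves, so each ACC now holds
	# the sum of both 64-bit lanes, i.e. of all interleaved
	# block/power products)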

	################################################################
	# lazy reduction

	vspltisb	$T0,2
	vsrd	$H4,$ACC3,$_26
	vsrd	$H1,$ACC0,$_26
	vand	$H3,$ACC3,$mask26
	vand	$H0,$ACC0,$mask26
	vaddudm	$H4,$H4,$ACC4		# h3 -> h4
	vaddudm	$H1,$H1,$ACC1		# h0 -> h1

	vsrd	$ACC4,$H4,$_26
	vsrd	$ACC1,$H1,$_26
	vand	$H4,$H4,$mask26
	vand	$H1,$H1,$mask26
	vaddudm	$H0,$H0,$ACC4
	vaddudm	$H2,$ACC2,$ACC1		# h1 -> h2

	vsld	$ACC4,$ACC4,$T0		# <<2
	vsrd	$ACC2,$H2,$_26
	vand	$H2,$H2,$mask26
	vaddudm	$H0,$H0,$ACC4		# h4 -> h0
	vaddudm	$H3,$H3,$ACC2		# h2 -> h3

	vsrd	$ACC0,$H0,$_26
	vsrd	$ACC3,$H3,$_26
	vand	$H0,$H0,$mask26
	vand	$H3,$H3,$mask26
	vaddudm	$H1,$H1,$ACC0		# h0 -> h1
	vaddudm	$H4,$H4,$ACC3		# h3 -> h4

	beq	Ldone_vsx

	add	r6,$const,$len

	be?lvx_u	$_4,$x00,$const		# byte swap mask
	lvx_u	$T1,$x00,$inp		# load last partial input block
	lvx_u	$T2,$x10,$inp
	lvx_u	$T3,$x20,$inp
	lvx_u	$T4,$x30,$inp
	be?vperm	$T1,$T1,$T1,$_4
	be?vperm	$T2,$T2,$T2,$_4
	be?vperm	$T3,$T3,$T3,$_4
	be?vperm	$T4,$T4,$T4,$_4

	vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
	vspltisb	$_4,4
	vperm	$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$I3,$T1,$T2,0b11

	vsrd	$I1,$I0,$_26
	vsrd	$I2,$I2,$_4
	vsrd	$I4,$I3,$_40
	vsrd	$I3,$I3,$_14
	vand	$I0,$I0,$mask26
	vand	$I1,$I1,$mask26
	vand	$I2,$I2,$mask26
	vand	$I3,$I3,$mask26

	vpermdi	$T0,$T3,$T4,0b00
	vperm	$T1,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$T2,$T3,$T4,0b11

	lvx_u	$ACC0,$x00,r6
	lvx_u	$ACC1,$x30,r6

	vsrd	$T3,$T0,$_26
	vsrd	$T1,$T1,$_4
	vsrd	$T4,$T2,$_40
	vsrd	$T2,$T2,$_14
	vand	$T0,$T0,$mask26
	vand	$T3,$T3,$mask26
	vand	$T1,$T1,$mask26
	vand	$T2,$T2,$mask26

	# inp[2]:inp[0]:inp[3]:inp[1]
	vmrgow	$I4,$T4,$I4
	vmrgow	$I0,$T0,$I0
	vmrgow	$I1,$T3,$I1
	vmrgow	$I2,$T1,$I2
	vmrgow	$I3,$T2,$I3
	vor	$I4,$I4,$padbits

	vperm	$H0,$H0,$H0,$ACC0	# move hash to right lane
	vand	$I0,$I0,$ACC1		# mask redundant input lane[s]
	vperm	$H1,$H1,$H1,$ACC0
	vand	$I1,$I1,$ACC1
	vperm	$H2,$H2,$H2,$ACC0
	vand	$I2,$I2,$ACC1
	vperm	$H3,$H3,$H3,$ACC0
	vand	$I3,$I3,$ACC1
	vperm	$H4,$H4,$H4,$ACC0
	vand	$I4,$I4,$ACC1

	vaddudm	$I0,$I0,$H0		# accumulate hash
	vxor	$H0,$H0,$H0		# wipe hash value
	vaddudm	$I1,$I1,$H1
	vxor	$H1,$H1,$H1
	vaddudm	$I2,$I2,$H2
	vxor	$H2,$H2,$H2
	vaddudm	$I3,$I3,$H3
	vxor	$H3,$H3,$H3
	vaddudm	$I4,$I4,$H4
	vxor	$H4,$H4,$H4

	xor.	$len,$len,$len
	b	Last_vsx

.align	4
Ldone_vsx:
	$POP	r0,`$VSXFRAME+$LRSAVE`($sp)
	li	$x10,4
	li	$x20,8
	li	$x30,12
	li	$x40,16
	stvwx_u	$H0,$x00,$ctx		# store hash
	stvwx_u	$H1,$x10,$ctx
	stvwx_u	$H2,$x20,$ctx
	stvwx_u	$H3,$x30,$ctx
	stvwx_u	$H4,$x40,$ctx

	lwz	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# pull vrsave
	mtlr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mtspr	256,r12			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	$POP	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$POP	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$POP	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$POP	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$POP	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$VSXFRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,5,4,0
	.long	0
.size	__poly1305_blocks_vsx,.-__poly1305_blocks_vsx

.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$const		# vvvvvv "distance" between . and 1st data entry
	addi	$const,$const,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`

.quad	0x0000000003ffffff,0x0000000003ffffff	# mask26
.quad	0x000000000000001a,0x000000000000001a	# _26
.quad	0x0000000000000028,0x0000000000000028	# _40
.quad	0x000000000e0f0001,0x000000001e1f1011	# I2perm
.quad	0x0100000001000000,0x0100000001000000	# padbits
.quad	0x0706050403020100,0x0f0e0d0c0b0a0908	# byte swap for big-endian

.quad	0x0000000000000000,0x0000000004050607	# magic tail masks
.quad	0x0405060700000000,0x0000000000000000
.quad	0x0000000000000000,0x0405060700000000

.quad	0xffffffff00000000,0xffffffffffffffff
.quad	0xffffffff00000000,0xffffffff00000000
.quad	0x0000000000000000,0xffffffff00000000
___
}}}
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour !~ /le$/) {	# big-endian
	    s/be\?//		or
	    s/le\?/#le#/
	} else {			# little-endian
	    s/le\?//		or
	    s/be\?/#be#/
	}

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";