#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that is...
#
# AIX performance
#
# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
# The following is the performance of 32-bit compiler
# generated code:
#
# OpenSSL 0.9.6c 21 dec 2001
# built on: Tue Jun 11 11:06:51 EDT 2002
# options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
# Number of operations increases by almost 75%
#
# Here are performance numbers for 64-bit compiler
# generated code:
#
# OpenSSL 0.9.6g [engine] 9 Aug 2002
# built on: Fri Apr 18 16:59:20 EDT 2003
# options:bn(64,64) ...
# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
# Again, performance increases by about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
# OpenSSL 0.9.7c 30 Sep 2003
#
# Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
# Performance increase of ~60%
#
# If you have comments or suggestions to improve code send
# me a note at schari@us.ibm.com
#

$flavour = shift;

if ($flavour =~ /32/) {
        $BITS=  32;
        $BNSZ=  $BITS/8;
        $ISA=   "\"ppc\"";

        $LD=    "lwz";          # load
        $LDU=   "lwzu";         # load and update
        $ST=    "stw";          # store
        $STU=   "stwu";         # store and update
        $UMULL= "mullw";        # unsigned multiply low
        $UMULH= "mulhwu";       # unsigned multiply high
        $UDIV=  "divwu";        # unsigned divide
        $UCMPI= "cmplwi";       # unsigned compare with immediate
        $UCMP=  "cmplw";        # unsigned compare
        $CNTLZ= "cntlzw";       # count leading zeros
        $SHL=   "slw";          # shift left
        $SHR=   "srw";          # unsigned shift right
        $SHRI=  "srwi";         # unsigned shift right by immediate
        $SHLI=  "slwi";         # shift left by immediate
        $CLRU=  "clrlwi";       # clear upper bits
        $INSR=  "insrwi";       # insert right
        $ROTL=  "rotlwi";       # rotate left by immediate
        $TR=    "tw";           # conditional trap
} elsif ($flavour =~ /64/) {
        $BITS=  64;
        $BNSZ=  $BITS/8;
        $ISA=   "\"ppc64\"";

        # same as above, but 64-bit mnemonics...
        $LD=    "ld";           # load
        $LDU=   "ldu";          # load and update
        $ST=    "std";          # store
        $STU=   "stdu";         # store and update
        $UMULL= "mulld";        # unsigned multiply low
        $UMULH= "mulhdu";       # unsigned multiply high
        $UDIV=  "divdu";        # unsigned divide
        $UCMPI= "cmpldi";       # unsigned compare with immediate
        $UCMP=  "cmpld";        # unsigned compare
        $CNTLZ= "cntlzd";       # count leading zeros
        $SHL=   "sld";          # shift left
        $SHR=   "srd";          # unsigned shift right
        $SHRI=  "srdi";         # unsigned shift right by immediate
        $SHLI=  "sldi";         # shift left by immediate
        $CLRU=  "clrldi";       # clear upper bits
        $INSR=  "insrdi";       # insert right
        $ROTL=  "rotldi";       # rotate left by immediate
        $TR=    "td";           # conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
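# The assembly below is emitted as one here-document. It is effectively
# interpolated twice: once by Perl itself, expanding $LD, $BNSZ and
# friends to the flavour-specific mnemonics chosen above, and once by the
# s/`...`/eval/ substitution at the bottom of this file, which folds
# backquoted arithmetic such as `3*$BNSZ` down to a plain immediate
# (12 for the 32-bit flavours, 24 for the 64-bit ones). For example
# (illustration only):
#
#	$LD  r6,`3*$BNSZ`(r4)    becomes    lwz  r6,12(r4)
#
# on a 32-bit flavour. The result is piped through ppc-xlate.pl, which
# handles per-platform name decoration and the like; a typical invocation
# is of the form "perl ppc.pl linux32 ppc32.s" (flavour, then output file).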
$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#       File:           ppc32.s
#
#       Created by:     Suresh Chari
#                       IBM Thomas J. Watson Research Library
#                       Hawthorne, NY
#
#
#       Description:    Optimized assembly routines for OpenSSL crypto
#                       on the 32-bit PowerPC platform.
#
#
#       Version History
#
#       2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#          cleaned up code. Also made a single version which can
#          be used for both the AIX and Linux compilers. See NOTE
#          below.
#                               12/05/03                Suresh Chari
#                       (with lots of help from)        Andy Polyakov
#
#       1. Initial version      10/20/02                Suresh Chari
#
#
#       The following file works for the xlc, cc
#       and gcc compilers.
#
#       NOTE:   To get the file to link correctly with the gcc compiler
#               you have to change the names of the routines and remove
#               the first . (dot) character. This should automatically
#               be done in the build process.
#
#       Hand-optimized assembly code for the following routines
#
#       bn_sqr_comba4
#       bn_sqr_comba8
#       bn_mul_comba4
#       bn_mul_comba8
#       bn_sub_words
#       bn_add_words
#       bn_div_words
#       bn_sqr_words
#       bn_mul_words
#       bn_mul_add_words
#
#       NOTE:   It is possible to optimize this code more for
#       specific PowerPC or Power architectures. On the Northstar
#       architecture the optimizations in this file do
#       NOT provide much improvement.
#
#       If you have comments or suggestions to improve code send
#       me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#       Defines to be used in the assembly code.
#
#.set r0,0      # we use it as storage for value of 0
#.set SP,1      # preserved
#.set RTOC,2    # preserved
#.set r3,3      # 1st argument/return value
#.set r4,4      # 2nd argument/volatile register
#.set r5,5      # 3rd argument/volatile register
#.set r6,6      # ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13    # not used, nor any other "below" it...

#       Declare function names to be global
#       NOTE:   For gcc these names MUST be changed to remove
#               the first . i.e. for example change ".bn_sqr_comba4"
#               to "bn_sqr_comba4". This should be automatically done
#               in the build.

        .globl  .bn_sqr_comba4
        .globl  .bn_sqr_comba8
        .globl  .bn_mul_comba4
        .globl  .bn_mul_comba8
        .globl  .bn_sub_words
        .globl  .bn_add_words
        .globl  .bn_div_words
        .globl  .bn_sqr_words
        .globl  .bn_mul_words
        .globl  .bn_mul_add_words

# .text section

        .machine        "any"

#
#       NOTE:   The following label name should be changed to
#               "bn_sqr_comba4" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
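# The sqr_add_c / sqr_add_c2 annotations below refer to the macros used by
# the portable C implementation. As a rough sketch of their meaning (not
# the literal OpenSSL source), with (hi,lo) a double-width product and
# (c3,c2,c1) acting as a three-limb accumulator:
#
#	sqr_add_c(a,i,c1,c2,c3):    (hi,lo) = a[i]*a[i];
#	                            c1 += lo; c2 += hi + carry; c3 += carry;
#	sqr_add_c2(a,i,j,c1,c2,c3): same, but with the product 2*a[i]*a[j],
#	                            whose doubling may carry an extra bit
#	                            that must also land in c2/c3.
#
# After each result limb r[k] is stored, the three accumulators rotate
# roles, which is why the macro arguments permute from step to step.
#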
# Here's the assembly
#
        xor     r0,r0,r0                # set r0 = 0. Used in the addze
                                        # instructions below.

        #sqr_add_c(a,0,c1,c2,c3)
        $LD     r5,`0*$BNSZ`(r4)
        $UMULL  r9,r5,r5
        $UMULH  r10,r5,r5               # in first iteration. No need
                                        # to add since c1=c2=c3=0.
                                        # Note: c3 (r11) is NOT set to 0
                                        # here, but will be below.

        $ST     r9,`0*$BNSZ`(r3)        # r[0]=c1;
        #sqr_add_c2(a,1,0,c2,c3,c1);
        $LD     r6,`1*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r7,r7,r7                # compute (r7,r8)=2*(r7,r8)
        adde    r8,r8,r8
        addze   r9,r0                   # catch carry if any.
                                        # r9 = r0(=0) + carry

        addc    r10,r7,r10              # now add to temp result.
        addze   r11,r8                  # r8 added to r11 which is 0
        addze   r9,r9

        $ST     r10,`1*$BNSZ`(r3)       # r[1]=c2;
        #sqr_add_c(a,1,c3,c1,c2)
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r0
        #sqr_add_c2(a,2,0,c3,c1,c2)
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r7,r7,r7
        adde    r8,r8,r8
        addze   r10,r10

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        $ST     r11,`2*$BNSZ`(r3)       # r[2]=c3
        #sqr_add_c2(a,3,0,c1,c2,c3);
        $LD     r6,`3*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r7,r7,r7
        adde    r8,r8,r8
        addze   r11,r0

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        #sqr_add_c2(a,2,1,c1,c2,c3);
        $LD     r5,`1*$BNSZ`(r4)
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r7,r7,r7
        adde    r8,r8,r8
        addze   r11,r11
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        $ST     r9,`3*$BNSZ`(r3)        # r[3]=c1
        #sqr_add_c(a,2,c2,c3,c1);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r0
        #sqr_add_c2(a,3,1,c2,c3,c1);
        $LD     r6,`3*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r7,r7,r7
        adde    r8,r8,r8
        addze   r9,r9

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        $ST     r10,`4*$BNSZ`(r3)       # r[4]=c2
        #sqr_add_c2(a,3,2,c3,c1,c2);
        $LD     r5,`2*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r7,r7,r7
        adde    r8,r8,r8
        addze   r10,r0

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        $ST     r11,`5*$BNSZ`(r3)       # r[5]=c3
        #sqr_add_c(a,3,c1,c2,c3);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r9,r7,r9
        adde    r10,r8,r10

        $ST     r9,`6*$BNSZ`(r3)        # r[6]=c1
        $ST     r10,`7*$BNSZ`(r3)       # r[7]=c2
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,2,0
        .long   0
.size   .bn_sqr_comba4,.-.bn_sqr_comba4

#
#       NOTE:   The following label name should be changed to
#               "bn_sqr_comba8" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#
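# Implementation note: bn_sqr_comba4 above doubles each cross product up
# front (addc r7,r7,r7 / adde r8,r8,r8) before accumulating it once; this
# routine instead realizes the factor of 2 in sqr_add_c2 by accumulating
# the undoubled product (r8,r7) into the three-limb total twice. The two
# forms are equivalent; doubling a product can carry one bit past the
# double-width result, and both variants catch that bit with addze.
#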
        xor     r0,r0,r0                # set r0 = 0. Used in addze
                                        # instructions below.

        #sqr_add_c(a,0,c1,c2,c3);
        $LD     r5,`0*$BNSZ`(r4)
        $UMULL  r9,r5,r5                # 1st iteration: no carries.
        $UMULH  r10,r5,r5
        $ST     r9,`0*$BNSZ`(r3)        # r[0]=c1;
        #sqr_add_c2(a,1,0,c2,c3,c1);
        $LD     r6,`1*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r10,r7,r10              # add the two-register number
        adde    r11,r8,r0               # (r8,r7) to the three-register
        addze   r9,r0                   # number (r9,r11,r10). NOTE: r0=0

        addc    r10,r7,r10              # add the two-register number
        adde    r11,r8,r11              # (r8,r7) to the three-register
        addze   r9,r9                   # number (r9,r11,r10).

        $ST     r10,`1*$BNSZ`(r3)       # r[1]=c2

        #sqr_add_c(a,1,c3,c1,c2);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r0
        #sqr_add_c2(a,2,0,c3,c1,c2);
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10

        $ST     r11,`2*$BNSZ`(r3)       # r[2]=c3
        #sqr_add_c2(a,3,0,c1,c2,c3);
        $LD     r6,`3*$BNSZ`(r4)        # r6 = a[3]. r5 is already a[0].
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r0

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        #sqr_add_c2(a,2,1,c1,c2,c3);
        $LD     r5,`1*$BNSZ`(r4)
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11

        $ST     r9,`3*$BNSZ`(r3)        # r[3]=c1;
        #sqr_add_c(a,2,c2,c3,c1);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r0
        #sqr_add_c2(a,3,1,c2,c3,c1);
        $LD     r6,`3*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        #sqr_add_c2(a,4,0,c2,c3,c1);
        $LD     r5,`0*$BNSZ`(r4)
        $LD     r6,`4*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        $ST     r10,`4*$BNSZ`(r3)       # r[4]=c2;
        #sqr_add_c2(a,5,0,c3,c1,c2);
        $LD     r6,`5*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r0

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        #sqr_add_c2(a,4,1,c3,c1,c2);
        $LD     r5,`1*$BNSZ`(r4)
        $LD     r6,`4*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        #sqr_add_c2(a,3,2,c3,c1,c2);
        $LD     r5,`2*$BNSZ`(r4)
        $LD     r6,`3*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        $ST     r11,`5*$BNSZ`(r3)       # r[5]=c3;
        #sqr_add_c(a,3,c1,c2,c3);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r0
        #sqr_add_c2(a,4,2,c1,c2,c3);
        $LD     r6,`4*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        #sqr_add_c2(a,5,1,c1,c2,c3);
        $LD     r5,`1*$BNSZ`(r4)
        $LD     r6,`5*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        #sqr_add_c2(a,6,0,c1,c2,c3);
        $LD     r5,`0*$BNSZ`(r4)
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        $ST     r9,`6*$BNSZ`(r3)        # r[6]=c1;
        #sqr_add_c2(a,7,0,c2,c3,c1);
        $LD     r6,`7*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r0
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        #sqr_add_c2(a,6,1,c2,c3,c1);
        $LD     r5,`1*$BNSZ`(r4)
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        #sqr_add_c2(a,5,2,c2,c3,c1);
        $LD     r5,`2*$BNSZ`(r4)
        $LD     r6,`5*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        #sqr_add_c2(a,4,3,c2,c3,c1);
        $LD     r5,`3*$BNSZ`(r4)
        $LD     r6,`4*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        $ST     r10,`7*$BNSZ`(r3)       # r[7]=c2;
        #sqr_add_c(a,4,c3,c1,c2);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r0
        #sqr_add_c2(a,5,3,c3,c1,c2);
        $LD     r6,`5*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        #sqr_add_c2(a,6,2,c3,c1,c2);
        $LD     r5,`2*$BNSZ`(r4)
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10

        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        #sqr_add_c2(a,7,1,c3,c1,c2);
        $LD     r5,`1*$BNSZ`(r4)
        $LD     r6,`7*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        $ST     r11,`8*$BNSZ`(r3)       # r[8]=c3;
        #sqr_add_c2(a,7,2,c1,c2,c3);
        $LD     r5,`2*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6

        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r0
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        #sqr_add_c2(a,6,3,c1,c2,c3);
        $LD     r5,`3*$BNSZ`(r4)
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        #sqr_add_c2(a,5,4,c1,c2,c3);
        $LD     r5,`4*$BNSZ`(r4)
        $LD     r6,`5*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        $ST     r9,`9*$BNSZ`(r3)        # r[9]=c1;
        #sqr_add_c(a,5,c2,c3,c1);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r0
        #sqr_add_c2(a,6,4,c2,c3,c1);
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        #sqr_add_c2(a,7,3,c2,c3,c1);
        $LD     r5,`3*$BNSZ`(r4)
        $LD     r6,`7*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        $ST     r10,`10*$BNSZ`(r3)      # r[10]=c2;
        #sqr_add_c2(a,7,4,c3,c1,c2);
        $LD     r5,`4*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r0
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        #sqr_add_c2(a,6,5,c3,c1,c2);
        $LD     r5,`5*$BNSZ`(r4)
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        addc    r11,r7,r11
        adde    r9,r8,r9
        addze   r10,r10
        $ST     r11,`11*$BNSZ`(r3)      # r[11]=c3;
        #sqr_add_c(a,6,c1,c2,c3);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r0
        #sqr_add_c2(a,7,5,c1,c2,c3)
        $LD     r6,`7*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        addc    r9,r7,r9
        adde    r10,r8,r10
        addze   r11,r11
        $ST     r9,`12*$BNSZ`(r3)       # r[12]=c1;

        #sqr_add_c2(a,7,6,c2,c3,c1)
        $LD     r5,`6*$BNSZ`(r4)
        $UMULL  r7,r5,r6
        $UMULH  r8,r5,r6
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r0
        addc    r10,r7,r10
        adde    r11,r8,r11
        addze   r9,r9
        $ST     r10,`13*$BNSZ`(r3)      # r[13]=c2;
        #sqr_add_c(a,7,c3,c1,c2);
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        addc    r11,r7,r11
        adde    r9,r8,r9
        $ST     r11,`14*$BNSZ`(r3)      # r[14]=c3;
        $ST     r9,`15*$BNSZ`(r3)       # r[15]=c1;

        blr
        .long   0
        .byte   0,12,0x14,0,0,0,2,0
        .long   0
.size   .bn_sqr_comba8,.-.bn_sqr_comba8

#
#       NOTE:   The following label name should be changed to
#               "bn_mul_comba4" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
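# The mul_add_c annotations below refer to the macro used by the portable
# C implementation. As a rough sketch of its meaning (not the literal
# OpenSSL source), with (hi,lo) the double-width product:
#
#	mul_add_c(a,b,c1,c2,c3):    (hi,lo) = a*b;
#	                            c1 += lo; c2 += hi + carry; c3 += carry;
#
# Result limb r[k] collects every product a[i]*b[j] with i+j == k, and the
# accumulator registers rotate roles after each store, as in the squaring
# routines above.
#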
        xor     r0,r0,r0                # r0=0. Used in addze below.
        #mul_add_c(a[0],b[0],c1,c2,c3);
        $LD     r6,`0*$BNSZ`(r4)
        $LD     r7,`0*$BNSZ`(r5)
        $UMULL  r10,r6,r7
        $UMULH  r11,r6,r7
        $ST     r10,`0*$BNSZ`(r3)       # r[0]=c1
        #mul_add_c(a[0],b[1],c2,c3,c1);
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r8,r11
        adde    r12,r9,r0
        addze   r10,r0
        #mul_add_c(a[1],b[0],c2,c3,c1);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`0*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r8,r11
        adde    r12,r9,r12
        addze   r10,r10
        $ST     r11,`1*$BNSZ`(r3)       # r[1]=c2
        #mul_add_c(a[2],b[0],c3,c1,c2);
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r8,r12
        adde    r10,r9,r10
        addze   r11,r0
        #mul_add_c(a[1],b[1],c3,c1,c2);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r8,r12
        adde    r10,r9,r10
        addze   r11,r11
        #mul_add_c(a[0],b[2],c3,c1,c2);
        $LD     r6,`0*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r8,r12
        adde    r10,r9,r10
        addze   r11,r11
        $ST     r12,`2*$BNSZ`(r3)       # r[2]=c3
        #mul_add_c(a[0],b[3],c1,c2,c3);
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r8,r10
        adde    r11,r9,r11
        addze   r12,r0
        #mul_add_c(a[1],b[2],c1,c2,c3);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r8,r10
        adde    r11,r9,r11
        addze   r12,r12
        #mul_add_c(a[2],b[1],c1,c2,c3);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r8,r10
        adde    r11,r9,r11
        addze   r12,r12
        #mul_add_c(a[3],b[0],c1,c2,c3);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`0*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r8,r10
        adde    r11,r9,r11
        addze   r12,r12
        $ST     r10,`3*$BNSZ`(r3)       # r[3]=c1
        #mul_add_c(a[3],b[1],c2,c3,c1);
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r8,r11
        adde    r12,r9,r12
        addze   r10,r0
        #mul_add_c(a[2],b[2],c2,c3,c1);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r8,r11
        adde    r12,r9,r12
        addze   r10,r10
        #mul_add_c(a[1],b[3],c2,c3,c1);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r8,r11
        adde    r12,r9,r12
        addze   r10,r10
        $ST     r11,`4*$BNSZ`(r3)       # r[4]=c2
        #mul_add_c(a[2],b[3],c3,c1,c2);
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r8,r12
        adde    r10,r9,r10
        addze   r11,r0
        #mul_add_c(a[3],b[2],c3,c1,c2);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r8,r12
        adde    r10,r9,r10
        addze   r11,r11
        $ST     r12,`5*$BNSZ`(r3)       # r[5]=c3
        #mul_add_c(a[3],b[3],c1,c2,c3);
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r8,r10
        adde    r11,r9,r11

        $ST     r10,`6*$BNSZ`(r3)       # r[6]=c1
        $ST     r11,`7*$BNSZ`(r3)       # r[7]=c2
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,3,0
        .long   0
.size   .bn_mul_comba4,.-.bn_mul_comba4

#
#       NOTE:   The following label name should be changed to
#               "bn_mul_comba8" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64-bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
        xor     r0,r0,r0                # r0=0. Used in addze below.

        #mul_add_c(a[0],b[0],c1,c2,c3);
        $LD     r6,`0*$BNSZ`(r4)        # a[0]
        $LD     r7,`0*$BNSZ`(r5)        # b[0]
        $UMULL  r10,r6,r7
        $UMULH  r11,r6,r7
        $ST     r10,`0*$BNSZ`(r3)       # r[0]=c1;
        #mul_add_c(a[0],b[1],c2,c3,c1);
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        addze   r12,r9                  # since we didn't set r12 to zero before.
        addze   r10,r0
        #mul_add_c(a[1],b[0],c2,c3,c1);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`0*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        $ST     r11,`1*$BNSZ`(r3)       # r[1]=c2;
        #mul_add_c(a[2],b[0],c3,c1,c2);
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r0
        #mul_add_c(a[1],b[1],c3,c1,c2);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[0],b[2],c3,c1,c2);
        $LD     r6,`0*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        $ST     r12,`2*$BNSZ`(r3)       # r[2]=c3;
        #mul_add_c(a[0],b[3],c1,c2,c3);
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r0
        #mul_add_c(a[1],b[2],c1,c2,c3);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12

        #mul_add_c(a[2],b[1],c1,c2,c3);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[3],b[0],c1,c2,c3);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`0*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        $ST     r10,`3*$BNSZ`(r3)       # r[3]=c1;
        #mul_add_c(a[4],b[0],c2,c3,c1);
        $LD     r6,`4*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r0
        #mul_add_c(a[3],b[1],c2,c3,c1);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[2],b[2],c2,c3,c1);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[1],b[3],c2,c3,c1);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[0],b[4],c2,c3,c1);
        $LD     r6,`0*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        $ST     r11,`4*$BNSZ`(r3)       # r[4]=c2;
        #mul_add_c(a[0],b[5],c3,c1,c2);
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r0
        #mul_add_c(a[1],b[4],c3,c1,c2);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[2],b[3],c3,c1,c2);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[3],b[2],c3,c1,c2);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[4],b[1],c3,c1,c2);
        $LD     r6,`4*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[5],b[0],c3,c1,c2);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`0*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        $ST     r12,`5*$BNSZ`(r3)       # r[5]=c3;
        #mul_add_c(a[6],b[0],c1,c2,c3);
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r0
        #mul_add_c(a[5],b[1],c1,c2,c3);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[4],b[2],c1,c2,c3);
        $LD     r6,`4*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[3],b[3],c1,c2,c3);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[2],b[4],c1,c2,c3);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[1],b[5],c1,c2,c3);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[0],b[6],c1,c2,c3);
        $LD     r6,`0*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        $ST     r10,`6*$BNSZ`(r3)       # r[6]=c1;
        #mul_add_c(a[0],b[7],c2,c3,c1);
        $LD     r7,`7*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r0
        #mul_add_c(a[1],b[6],c2,c3,c1);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[2],b[5],c2,c3,c1);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[3],b[4],c2,c3,c1);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[4],b[3],c2,c3,c1);
        $LD     r6,`4*$BNSZ`(r4)
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[5],b[2],c2,c3,c1);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[6],b[1],c2,c3,c1);
        $LD     r6,`6*$BNSZ`(r4)
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[7],b[0],c2,c3,c1);
        $LD     r6,`7*$BNSZ`(r4)
        $LD     r7,`0*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        $ST     r11,`7*$BNSZ`(r3)       # r[7]=c2;
        #mul_add_c(a[7],b[1],c3,c1,c2);
        $LD     r7,`1*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r0
        #mul_add_c(a[6],b[2],c3,c1,c2);
        $LD     r6,`6*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[5],b[3],c3,c1,c2);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[4],b[4],c3,c1,c2);
        $LD     r6,`4*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[3],b[5],c3,c1,c2);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[2],b[6],c3,c1,c2);
        $LD     r6,`2*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[1],b[7],c3,c1,c2);
        $LD     r6,`1*$BNSZ`(r4)
        $LD     r7,`7*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        $ST     r12,`8*$BNSZ`(r3)       # r[8]=c3;
        #mul_add_c(a[2],b[7],c1,c2,c3);
        $LD     r6,`2*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r0
        #mul_add_c(a[3],b[6],c1,c2,c3);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[4],b[5],c1,c2,c3);
        $LD     r6,`4*$BNSZ`(r4)
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[5],b[4],c1,c2,c3);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[6],b[3],c1,c2,c3);
        $LD     r6,`6*$BNSZ`(r4)
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[7],b[2],c1,c2,c3);
        $LD     r6,`7*$BNSZ`(r4)
        $LD     r7,`2*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        $ST     r10,`9*$BNSZ`(r3)       # r[9]=c1;
        #mul_add_c(a[7],b[3],c2,c3,c1);
        $LD     r7,`3*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r0
        #mul_add_c(a[6],b[4],c2,c3,c1);
        $LD     r6,`6*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[5],b[5],c2,c3,c1);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[4],b[6],c2,c3,c1);
        $LD     r6,`4*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        #mul_add_c(a[3],b[7],c2,c3,c1);
        $LD     r6,`3*$BNSZ`(r4)
        $LD     r7,`7*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        $ST     r11,`10*$BNSZ`(r3)      # r[10]=c2;
        #mul_add_c(a[4],b[7],c3,c1,c2);
        $LD     r6,`4*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r0
        #mul_add_c(a[5],b[6],c3,c1,c2);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[6],b[5],c3,c1,c2);
        $LD     r6,`6*$BNSZ`(r4)
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        #mul_add_c(a[7],b[4],c3,c1,c2);
        $LD     r6,`7*$BNSZ`(r4)
        $LD     r7,`4*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        addze   r11,r11
        $ST     r12,`11*$BNSZ`(r3)      # r[11]=c3;
        #mul_add_c(a[7],b[5],c1,c2,c3);
        $LD     r7,`5*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r0
        #mul_add_c(a[6],b[6],c1,c2,c3);
        $LD     r6,`6*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        #mul_add_c(a[5],b[7],c1,c2,c3);
        $LD     r6,`5*$BNSZ`(r4)
        $LD     r7,`7*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r10,r10,r8
        adde    r11,r11,r9
        addze   r12,r12
        $ST     r10,`12*$BNSZ`(r3)      # r[12]=c1;
        #mul_add_c(a[6],b[7],c2,c3,c1);
        $LD     r6,`6*$BNSZ`(r4)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r0
        #mul_add_c(a[7],b[6],c2,c3,c1);
        $LD     r6,`7*$BNSZ`(r4)
        $LD     r7,`6*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r11,r11,r8
        adde    r12,r12,r9
        addze   r10,r10
        $ST     r11,`13*$BNSZ`(r3)      # r[13]=c2;
        #mul_add_c(a[7],b[7],c3,c1,c2);
        $LD     r7,`7*$BNSZ`(r5)
        $UMULL  r8,r6,r7
        $UMULH  r9,r6,r7
        addc    r12,r12,r8
        adde    r10,r10,r9
        $ST     r12,`14*$BNSZ`(r3)      # r[14]=c3;
        $ST     r10,`15*$BNSZ`(r3)      # r[15]=c1;
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,3,0
        .long   0
.size   .bn_mul_comba8,.-.bn_mul_comba8

#
#       NOTE:   The following label name should be changed to
#               "bn_sub_words" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#
#
.align  4
.bn_sub_words:
#
# Handcoded version of bn_sub_words
#
# BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note: No loop unrolling done since this is not a performance
#       critical loop.

        xor     r0,r0,r0        # set r0 = 0
#
#       check for r6 = 0 AND set carry bit.
#
        subfc.  r7,r0,r6        # If r6 is 0 then result is 0.
                                # If r6 > 0 then result != 0.
                                # In either case the carry bit is set.
        beq     Lppcasm_sub_adios
        addi    r4,r4,-$BNSZ
        addi    r3,r3,-$BNSZ
        addi    r5,r5,-$BNSZ
        mtctr   r6
Lppcasm_sub_mainloop:
        $LDU    r7,$BNSZ(r4)
        $LDU    r8,$BNSZ(r5)
        subfe   r6,r8,r7        # r6 = r7 + carry bit + ones-complement(r8);
                                # if carry = 1 this is r7-r8, else it
                                # is r7-r8-1, as we need.
        $STU    r6,$BNSZ(r3)
        bdnz    Lppcasm_sub_mainloop
Lppcasm_sub_adios:
        subfze  r3,r0           # if carry bit is set then r3 = 0 else -1
        andi.   r3,r3,1         # keep only last bit.
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,4,0
        .long   0
.size   .bn_sub_words,.-.bn_sub_words
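#
# Note on the borrow convention used above: the PowerPC carry bit CA acts
# as "not borrow" during subtraction, because subfe computes
# rD = rB + ~rA + CA. A rough C sketch of the loop's effect (not the
# OpenSSL source):
#
#	borrow = 0;
#	for (i = 0; i < n; i++) {
#		t = a[i] - b[i] - borrow;
#		borrow = (a[i] < b[i] + borrow);	/* conceptually */
#		r[i] = t;
#	}
#	return borrow;		/* recovered by the subfze/andi. pair */
#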
#
#       NOTE:   The following label name should be changed to
#               "bn_add_words" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_add_words:
#
# Handcoded version of bn_add_words
#
# BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note: No loop unrolling done since this is not a performance
#       critical loop.

        xor     r0,r0,r0
#
#       check for r6 = 0. Is this needed?
#
        addic.  r6,r6,0         # test r6 and clear carry bit.
        beq     Lppcasm_add_adios
        addi    r4,r4,-$BNSZ
        addi    r3,r3,-$BNSZ
        addi    r5,r5,-$BNSZ
        mtctr   r6
Lppcasm_add_mainloop:
        $LDU    r7,$BNSZ(r4)
        $LDU    r8,$BNSZ(r5)
        adde    r8,r7,r8
        $STU    r8,$BNSZ(r3)
        bdnz    Lppcasm_add_mainloop
Lppcasm_add_adios:
        addze   r3,r0           # return carry bit.
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,4,0
        .long   0
.size   .bn_add_words,.-.bn_add_words

#
#       NOTE:   The following label name should be changed to
#               "bn_div_words" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_div_words:
#
# This is a cleaned-up version of code generated by
# the AIX compiler. The only optimization is to use
# the PPC instruction to count leading zeros instead
# of a call to num_bits_word. Since this was compiled
# only at -O2, it can probably be squeezed further.
#
# r3 = h
# r4 = l
# r5 = d
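#
# The algorithm: after normalizing h, l and d so that the top bit of d is
# set, the quotient of the two-limb value (h,l) by d is formed from two
# half-limb (BN_BITS4-bit) digits. Each digit is first estimated as
# q = h/dh, using only the top half of d; the inner loop below then
# decrements q (a small, bounded number of times) until q*d no longer
# exceeds the partial remainder, after which the remainder is updated and
# shifted up by BN_BITS4 for the next digit.
#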
        $UCMPI  0,r5,0                  # compare r5 and 0
        bne     Lppcasm_div1            # proceed if d!=0
        li      r3,-1                   # d=0 return -1
        blr
Lppcasm_div1:
        xor     r0,r0,r0                # r0=0
        li      r8,$BITS
        $CNTLZ. r7,r5                   # r7 = num leading 0s in d.
        beq     Lppcasm_div2            # proceed if no leading zeros
        subf    r8,r7,r8                # r8 = BN_num_bits_word(d)
        $SHR.   r9,r3,r8                # are there any bits above r8'th?
        $TR     16,r9,r0                # if there are, signal to dump core...
Lppcasm_div2:
        $UCMP   0,r3,r5                 # h>=d?
        blt     Lppcasm_div3            # goto Lppcasm_div3 if not
        subf    r3,r5,r3                # h-=d ;
Lppcasm_div3:                           # r8 = BN_BITS2-i, so r7 = i
        cmpi    0,0,r7,0                # is (i == 0)?
        beq     Lppcasm_div4
        $SHL    r3,r3,r7                # h = (h<<i)
        $SHR    r8,r4,r8                # r8 = (l >> BN_BITS2-i)
        $SHL    r5,r5,r7                # d<<=i
        or      r3,r3,r8                # h = (h<<i)|(l>>(BN_BITS2-i))
        $SHL    r4,r4,r7                # l<<=i
Lppcasm_div4:
        $SHRI   r9,r5,`$BITS/2`         # r9 = dh
                                        # dl will be computed when needed
                                        # as it saves registers.
        li      r6,2                    # r6=2
        mtctr   r6                      # counter will be in count.
Lppcasm_divouterloop:
        $SHRI   r8,r3,`$BITS/2`         # r8 = (h>>BN_BITS4)
        $SHRI   r11,r4,`$BITS/2`        # r11 = (l&BN_MASK2h)>>BN_BITS4
                                        # compute here for innerloop.
        $UCMP   0,r8,r9                 # is (h>>BN_BITS4)==dh
        bne     Lppcasm_div5            # goto Lppcasm_div5 if not

        li      r8,-1
        $CLRU   r8,r8,`$BITS/2`         # q = BN_MASK2l
        b       Lppcasm_div6
Lppcasm_div5:
        $UDIV   r8,r3,r9                # q = h/dh
Lppcasm_div6:
        $UMULL  r12,r9,r8               # th = q*dh
        $CLRU   r10,r5,`$BITS/2`        # r10 = dl
        $UMULL  r6,r8,r10               # tl = q*dl

Lppcasm_divinnerloop:
        subf    r10,r12,r3              # t = h-th
        $SHRI   r7,r10,`$BITS/2`        # r7 = (t&BN_MASK2H), sort of...
        addic.  r7,r7,0                 # test if r7 == 0. used below.
                                        # now want to compute
                                        # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
                                        # the following 2 instructions do that
        $SHLI   r7,r10,`$BITS/2`        # r7 = (t<<BN_BITS4)
        or      r7,r7,r11               # r7 |= ((l&BN_MASK2h)>>BN_BITS4)
        $UCMP   cr1,r6,r7               # compare tl and r7
        bne     Lppcasm_divinnerexit
        ble     cr1,Lppcasm_divinnerexit
        addi    r8,r8,-1                # q--
        subf    r12,r9,r12              # th -= dh
        $CLRU   r10,r5,`$BITS/2`        # r10 = dl. t is no longer needed in loop.
        subf    r6,r10,r6               # tl -= dl
        b       Lppcasm_divinnerloop
Lppcasm_divinnerexit:
        $SHRI   r10,r6,`$BITS/2`        # t = (tl>>BN_BITS4)
        $SHLI   r11,r6,`$BITS/2`        # tl = (tl<<BN_BITS4)&BN_MASK2h;
        $UCMP   cr1,r4,r11              # compare l and tl
        add     r12,r12,r10             # th += t
        bge     cr1,Lppcasm_div7        # if (l>=tl) goto Lppcasm_div7
        addi    r12,r12,1               # th++
Lppcasm_div7:
        subf    r11,r11,r4              # r11 = l-tl
        $UCMP   cr1,r3,r12              # compare h and th
        bge     cr1,Lppcasm_div8        # if (h>=th) goto Lppcasm_div8
        addi    r8,r8,-1                # q--
        add     r3,r5,r3                # h += d
Lppcasm_div8:
        subf    r12,r12,r3              # r12 = h-th
        $SHLI   r4,r11,`$BITS/2`        # l = (l&BN_MASK2l)<<BN_BITS4
                                        # want to compute
                                        # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
                                        # the following 2 instructions will do this.
        $INSR   r11,r12,`$BITS/2`,`$BITS/2`     # r11 is the value we want rotated $BITS/2.
        $ROTL   r3,r11,`$BITS/2`        # rotate by $BITS/2 and store in r3
        bdz     Lppcasm_div9            # if (count==0) break ;
        $SHLI   r0,r8,`$BITS/2`         # ret = q<<BN_BITS4
        b       Lppcasm_divouterloop
Lppcasm_div9:
        or      r3,r8,r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,3,0
        .long   0
.size   .bn_div_words,.-.bn_div_words

#
#       NOTE:   The following label name should be changed to
#               "bn_sqr_words" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#
.align  4
.bn_sqr_words:
#
# Optimized version of bn_sqr_words
#
# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
# r3 = r
# r4 = a
# r5 = n
#
# r6 = a[i].
# r7,r8 = product.
#
# No unrolling done here. Not performance critical.

        addic.  r5,r5,0                 # test r5.
        beq     Lppcasm_sqr_adios
        addi    r4,r4,-$BNSZ
        addi    r3,r3,-$BNSZ
        mtctr   r5
Lppcasm_sqr_mainloop:
        #sqr(r[0],r[1],a[0]);
        $LDU    r6,$BNSZ(r4)
        $UMULL  r7,r6,r6
        $UMULH  r8,r6,r6
        $STU    r7,$BNSZ(r3)
        $STU    r8,$BNSZ(r3)
        bdnz    Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,3,0
        .long   0
.size   .bn_sqr_words,.-.bn_sqr_words

#
#       NOTE:   The following label name should be changed to
#               "bn_mul_words" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
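#
# The 4x-unrolled loop below keeps the carry chain alive across iterations
# instead of collapsing it with addze each time: the high half of one
# product is folded into the next product's low half with adde, and only
# the fourth step of each spin collects the carry into r12. As a rough C
# sketch of what one step computes (not the OpenSSL source):
#
#	t = (double-limb)w * ap[i] + c;
#	rp[i] = LOW_HALF(t);
#	c     = HIGH_HALF(t);
#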
        xor     r0,r0,r0
        xor     r12,r12,r12             # used for carry
        rlwinm. r7,r5,30,2,31           # num >> 2
        beq     Lppcasm_mw_REM
        mtctr   r7
Lppcasm_mw_LOOP:
        #mul(rp[0],ap[0],w,c1);
        $LD     r8,`0*$BNSZ`(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        addc    r9,r9,r12
        #addze  r10,r10                 # carry is NOT ignored.
                                        # It will be taken care of
                                        # in the second spin below
                                        # using adde.
        $ST     r9,`0*$BNSZ`(r3)
        #mul(rp[1],ap[1],w,c1);
        $LD     r8,`1*$BNSZ`(r4)
        $UMULL  r11,r6,r8
        $UMULH  r12,r6,r8
        adde    r11,r11,r10
        #addze  r12,r12
        $ST     r11,`1*$BNSZ`(r3)
        #mul(rp[2],ap[2],w,c1);
        $LD     r8,`2*$BNSZ`(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        adde    r9,r9,r12
        #addze  r10,r10
        $ST     r9,`2*$BNSZ`(r3)
        #mul(rp[3],ap[3],w,c1);
        $LD     r8,`3*$BNSZ`(r4)
        $UMULL  r11,r6,r8
        $UMULH  r12,r6,r8
        adde    r11,r11,r10
        addze   r12,r12                 # this spin we collect the carry
                                        # into r12
        $ST     r11,`3*$BNSZ`(r3)

        addi    r3,r3,`4*$BNSZ`
        addi    r4,r4,`4*$BNSZ`
        bdnz    Lppcasm_mw_LOOP

Lppcasm_mw_REM:
        andi.   r5,r5,0x3
        beq     Lppcasm_mw_OVER
        #mul(rp[0],ap[0],w,c1);
        $LD     r8,`0*$BNSZ`(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        addc    r9,r9,r12
        addze   r10,r10
        $ST     r9,`0*$BNSZ`(r3)
        addi    r12,r10,0

        addi    r5,r5,-1
        cmpli   0,0,r5,0
        beq     Lppcasm_mw_OVER

        #mul(rp[1],ap[1],w,c1);
        $LD     r8,`1*$BNSZ`(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        addc    r9,r9,r12
        addze   r10,r10
        $ST     r9,`1*$BNSZ`(r3)
        addi    r12,r10,0

        addi    r5,r5,-1
        cmpli   0,0,r5,0
        beq     Lppcasm_mw_OVER

        #mul(rp[2],ap[2],w,c1);
        $LD     r8,`2*$BNSZ`(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        addc    r9,r9,r12
        addze   r10,r10
        $ST     r9,`2*$BNSZ`(r3)
        addi    r12,r10,0

Lppcasm_mw_OVER:
        addi    r3,r12,0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,4,0
        .long   0
.size   .bn_mul_words,.-.bn_mul_words

#
#       NOTE:   The following label name should be changed to
#               "bn_mul_add_words" i.e. remove the first dot
#               for the gcc compiler. This should be automatically
#               done in the build
#

.align  4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Empirical evidence suggests that the unrolled version performs best.
#
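# Same pipelined-carry idea as bn_mul_words above, except that the word
# already present at rp[i] is folded in as well. A rough C sketch of one
# step (not the OpenSSL source):
#
#	t = (double-limb)w * ap[i] + rp[i] + c;
#	rp[i] = LOW_HALF(t);
#	c     = HIGH_HALF(t);
#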
        xor     r0,r0,r0                # r0 = 0
        xor     r12,r12,r12             # r12 = 0. used for carry
        rlwinm. r7,r5,30,2,31           # num >> 2
        beq     Lppcasm_maw_leftover    # if (num < 4) go to Lppcasm_maw_leftover
        mtctr   r7
Lppcasm_maw_mainloop:
        #mul_add(rp[0],ap[0],w,c1);
        $LD     r8,`0*$BNSZ`(r4)
        $LD     r11,`0*$BNSZ`(r3)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        addc    r9,r9,r12               # r12 is carry.
        addze   r10,r10
        addc    r9,r9,r11
        #addze  r10,r10
                                        # the above addze is NOT needed.
                                        # The carry will NOT be ignored:
                                        # it's not affected by the multiply
                                        # and will be collected in the
                                        # next spin.
        $ST     r9,`0*$BNSZ`(r3)

        #mul_add(rp[1],ap[1],w,c1);
        $LD     r8,`1*$BNSZ`(r4)
        $LD     r9,`1*$BNSZ`(r3)
        $UMULL  r11,r6,r8
        $UMULH  r12,r6,r8
        adde    r11,r11,r10             # r10 is carry.
        addze   r12,r12
        addc    r11,r11,r9
        #addze  r12,r12
        $ST     r11,`1*$BNSZ`(r3)

        #mul_add(rp[2],ap[2],w,c1);
        $LD     r8,`2*$BNSZ`(r4)
        $UMULL  r9,r6,r8
        $LD     r11,`2*$BNSZ`(r3)
        $UMULH  r10,r6,r8
        adde    r9,r9,r12
        addze   r10,r10
        addc    r9,r9,r11
        #addze  r10,r10
        $ST     r9,`2*$BNSZ`(r3)

        #mul_add(rp[3],ap[3],w,c1);
        $LD     r8,`3*$BNSZ`(r4)
        $UMULL  r11,r6,r8
        $LD     r9,`3*$BNSZ`(r3)
        $UMULH  r12,r6,r8
        adde    r11,r11,r10
        addze   r12,r12
        addc    r11,r11,r9
        addze   r12,r12
        $ST     r11,`3*$BNSZ`(r3)
        addi    r3,r3,`4*$BNSZ`
        addi    r4,r4,`4*$BNSZ`
        bdnz    Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
        andi.   r5,r5,0x3
        beq     Lppcasm_maw_adios
        addi    r3,r3,-$BNSZ
        addi    r4,r4,-$BNSZ
        #mul_add(rp[0],ap[0],w,c1);
        mtctr   r5
        $LDU    r8,$BNSZ(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        $LDU    r11,$BNSZ(r3)
        addc    r9,r9,r11
        addze   r10,r10
        addc    r9,r9,r12
        addze   r12,r10
        $ST     r9,0(r3)

        bdz     Lppcasm_maw_adios
        #mul_add(rp[1],ap[1],w,c1);
        $LDU    r8,$BNSZ(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        $LDU    r11,$BNSZ(r3)
        addc    r9,r9,r11
        addze   r10,r10
        addc    r9,r9,r12
        addze   r12,r10
        $ST     r9,0(r3)

        bdz     Lppcasm_maw_adios
        #mul_add(rp[2],ap[2],w,c1);
        $LDU    r8,$BNSZ(r4)
        $UMULL  r9,r6,r8
        $UMULH  r10,r6,r8
        $LDU    r11,$BNSZ(r3)
        addc    r9,r9,r11
        addze   r10,r10
        addc    r9,r9,r12
        addze   r12,r10
        $ST     r9,0(r3)

Lppcasm_maw_adios:
        addi    r3,r12,0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,4,0
        .long   0
.size   .bn_mul_add_words,.-.bn_mul_add_words
        .align  4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;