#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there're several PowerPC ABI in use. Most notably
# Linux and AIX use different 32-bit ABIs. Good news are that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
# AIX performance
#
# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
# The following is the performance of 32-bit compiler
# generated code:
#
# OpenSSL 0.9.6c 21 dec 2001
# built on: Tue Jun 11 11:06:51 EDT 2002
# options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
# Number of operations increases by almost 75%
#
# Here are performance numbers for 64-bit compiler
# generated code:
#
# OpenSSL 0.9.6g [engine] 9 Aug 2002
# built on: Fri Apr 18 16:59:20 EDT 2003
# options:bn(64,64) ...
# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
# Again, performance increases by about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
# OpenSSL 0.9.7c 30 Sep 2003
#
# Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
# Performance increase of ~60%
#
# If you have comments or suggestions to improve code send
# me a note at schari@us.ibm.com
#

# The requested output file name is the single mandatory argument;
# the target ISA (32- vs 64-bit mnemonics) is derived from its name.
# The variables set below stay package globals on purpose: they are
# interpolated into the assembly heredoc built by data().
$opf = shift;
die "usage: $0 <output>32.s | <output>64.s\n" unless defined $opf;

if ($opf =~ /32\.s/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($opf =~ /64\.s/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $opf"; }

# A second (optional) argument means "leave STDOUT alone"; otherwise
# redirect STDOUT to the requested output file.  Three-arg open is
# used so leading characters of $opf can never be misread as an
# open() mode (the old two-arg form ">$opf" was mode-injectable).
( defined shift || open STDOUT, '>', $opf ) || die "can't open $opf: $!";

# function entry points from the AIX code
#
# There are other, more elegant, ways to handle this. We (IBM) chose
# this approach as it plays well with scripts we run to 'namespace'
# OpenSSL .i.e. we add a prefix to all the public symbols so we can
# co-exist in the same process with other implementations of OpenSSL.
# 'cleverer' ways of doing these substitutions tend to hide data we
# need to be obvious.
164# 165my @items = ("bn_sqr_comba4", 166 "bn_sqr_comba8", 167 "bn_mul_comba4", 168 "bn_mul_comba8", 169 "bn_sub_words", 170 "bn_add_words", 171 "bn_div_words", 172 "bn_sqr_words", 173 "bn_mul_words", 174 "bn_mul_add_words"); 175 176if ($opf =~ /linux/) { do_linux(); } 177elsif ($opf =~ /aix/) { do_aix(); } 178elsif ($opf =~ /osx/) { do_osx(); } 179else { do_bsd(); } 180 181sub do_linux { 182 $d=&data(); 183 184 if ($BITS==64) { 185 foreach $t (@items) { 186 $d =~ s/\.$t:/\ 187\t.section\t".opd","aw"\ 188\t.align\t3\ 189\t.globl\t$t\ 190$t:\ 191\t.quad\t.$t,.TOC.\@tocbase,0\ 192\t.size\t$t,24\ 193\t.previous\n\ 194\t.type\t.$t,\@function\ 195\t.globl\t.$t\ 196.$t:/g; 197 } 198 } 199 else { 200 foreach $t (@items) { 201 $d=~s/\.$t/$t/g; 202 } 203 } 204 # hide internal labels to avoid pollution of name table... 205 $d=~s/Lppcasm_/.Lppcasm_/gm; 206 print $d; 207} 208 209sub do_aix { 210 # AIX assembler is smart enough to please the linker without 211 # making us do something special... 212 print &data(); 213} 214 215# MacOSX 32 bit 216sub do_osx { 217 $d=&data(); 218 # Change the bn symbol prefix from '.' to '_' 219 foreach $t (@items) { 220 $d=~s/\.$t/_$t/g; 221 } 222 # Change .machine to something OS X asm will accept 223 $d=~s/\.machine.*/.text/g; 224 $d=~s/\#/;/g; # change comment from '#' to ';' 225 print $d; 226} 227 228# BSD (Untested) 229sub do_bsd { 230 $d=&data(); 231 foreach $t (@items) { 232 $d=~s/\.$t/_$t/g; 233 } 234 print $d; 235} 236 237sub data { 238 local($data)=<<EOF; 239#-------------------------------------------------------------------- 240# 241# 242# 243# 244# File: ppc32.s 245# 246# Created by: Suresh Chari 247# IBM Thomas J. Watson Research Library 248# Hawthorne, NY 249# 250# 251# Description: Optimized assembly routines for OpenSSL crypto 252# on the 32 bitPowerPC platform. 253# 254# 255# Version History 256# 257# 2. Fixed bn_add,bn_sub and bn_div_words, added comments, 258# cleaned up code. 
Also made a single version which can 259# be used for both the AIX and Linux compilers. See NOTE 260# below. 261# 12/05/03 Suresh Chari 262# (with lots of help from) Andy Polyakov 263## 264# 1. Initial version 10/20/02 Suresh Chari 265# 266# 267# The following file works for the xlc,cc 268# and gcc compilers. 269# 270# NOTE: To get the file to link correctly with the gcc compiler 271# you have to change the names of the routines and remove 272# the first .(dot) character. This should automatically 273# be done in the build process. 274# 275# Hand optimized assembly code for the following routines 276# 277# bn_sqr_comba4 278# bn_sqr_comba8 279# bn_mul_comba4 280# bn_mul_comba8 281# bn_sub_words 282# bn_add_words 283# bn_div_words 284# bn_sqr_words 285# bn_mul_words 286# bn_mul_add_words 287# 288# NOTE: It is possible to optimize this code more for 289# specific PowerPC or Power architectures. On the Northstar 290# architecture the optimizations in this file do 291# NOT provide much improvement. 292# 293# If you have comments or suggestions to improve code send 294# me a note at schari\@us.ibm.com 295# 296#-------------------------------------------------------------------------- 297# 298# Defines to be used in the assembly code. 299# 300.set r0,0 # we use it as storage for value of 0 301.set SP,1 # preserved 302.set RTOC,2 # preserved 303.set r3,3 # 1st argument/return value 304.set r4,4 # 2nd argument/volatile register 305.set r5,5 # 3rd argument/volatile register 306.set r6,6 # ... 307.set r7,7 308.set r8,8 309.set r9,9 310.set r10,10 311.set r11,11 312.set r12,12 313.set r13,13 # not used, nor any other "below" it... 314 315.set BO_IF_NOT,4 316.set BO_IF,12 317.set BO_dCTR_NZERO,16 318.set BO_dCTR_ZERO,18 319.set BO_ALWAYS,20 320.set CR0_LT,0; 321.set CR0_GT,1; 322.set CR0_EQ,2 323.set CR1_FX,4; 324.set CR1_FEX,5; 325.set CR1_VX,6 326.set LR,8 327 328# Declare function names to be global 329# NOTE: For gcc these names MUST be changed to remove 330# the first . 
i.e. for example change ".bn_sqr_comba4" 331# to "bn_sqr_comba4". This should be automatically done 332# in the build. 333 334 .globl .bn_sqr_comba4 335 .globl .bn_sqr_comba8 336 .globl .bn_mul_comba4 337 .globl .bn_mul_comba8 338 .globl .bn_sub_words 339 .globl .bn_add_words 340 .globl .bn_div_words 341 .globl .bn_sqr_words 342 .globl .bn_mul_words 343 .globl .bn_mul_add_words 344 345# .text section 346 347 .machine $ISA 348 349# 350# NOTE: The following label name should be changed to 351# "bn_sqr_comba4" i.e. remove the first dot 352# for the gcc compiler. This should be automatically 353# done in the build 354# 355 356.align 4 357.bn_sqr_comba4: 358# 359# Optimized version of bn_sqr_comba4. 360# 361# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) 362# r3 contains r 363# r4 contains a 364# 365# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 366# 367# r5,r6 are the two BN_ULONGs being multiplied. 368# r7,r8 are the results of the 32x32 giving 64 bit multiply. 369# r9,r10, r11 are the equivalents of c1,c2, c3. 370# Here's the assembly 371# 372# 373 xor r0,r0,r0 # set r0 = 0. Used in the addze 374 # instructions below 375 376 #sqr_add_c(a,0,c1,c2,c3) 377 $LD r5,`0*$BNSZ`(r4) 378 $UMULL r9,r5,r5 379 $UMULH r10,r5,r5 #in first iteration. No need 380 #to add since c1=c2=c3=0. 381 # Note c3(r11) is NOT set to 0 382 # but will be. 383 384 $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 385 # sqr_add_c2(a,1,0,c2,c3,c1); 386 $LD r6,`1*$BNSZ`(r4) 387 $UMULL r7,r5,r6 388 $UMULH r8,r5,r6 389 390 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) 391 adde r8,r8,r8 392 addze r9,r0 # catch carry if any. 393 # r9= r0(=0) and carry 394 395 addc r10,r7,r10 # now add to temp result. 
396 addze r11,r8 # r8 added to r11 which is 0 397 addze r9,r9 398 399 $ST r10,`1*$BNSZ`(r3) #r[1]=c2; 400 #sqr_add_c(a,1,c3,c1,c2) 401 $UMULL r7,r6,r6 402 $UMULH r8,r6,r6 403 addc r11,r7,r11 404 adde r9,r8,r9 405 addze r10,r0 406 #sqr_add_c2(a,2,0,c3,c1,c2) 407 $LD r6,`2*$BNSZ`(r4) 408 $UMULL r7,r5,r6 409 $UMULH r8,r5,r6 410 411 addc r7,r7,r7 412 adde r8,r8,r8 413 addze r10,r10 414 415 addc r11,r7,r11 416 adde r9,r8,r9 417 addze r10,r10 418 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 419 #sqr_add_c2(a,3,0,c1,c2,c3); 420 $LD r6,`3*$BNSZ`(r4) 421 $UMULL r7,r5,r6 422 $UMULH r8,r5,r6 423 addc r7,r7,r7 424 adde r8,r8,r8 425 addze r11,r0 426 427 addc r9,r7,r9 428 adde r10,r8,r10 429 addze r11,r11 430 #sqr_add_c2(a,2,1,c1,c2,c3); 431 $LD r5,`1*$BNSZ`(r4) 432 $LD r6,`2*$BNSZ`(r4) 433 $UMULL r7,r5,r6 434 $UMULH r8,r5,r6 435 436 addc r7,r7,r7 437 adde r8,r8,r8 438 addze r11,r11 439 addc r9,r7,r9 440 adde r10,r8,r10 441 addze r11,r11 442 $ST r9,`3*$BNSZ`(r3) #r[3]=c1 443 #sqr_add_c(a,2,c2,c3,c1); 444 $UMULL r7,r6,r6 445 $UMULH r8,r6,r6 446 addc r10,r7,r10 447 adde r11,r8,r11 448 addze r9,r0 449 #sqr_add_c2(a,3,1,c2,c3,c1); 450 $LD r6,`3*$BNSZ`(r4) 451 $UMULL r7,r5,r6 452 $UMULH r8,r5,r6 453 addc r7,r7,r7 454 adde r8,r8,r8 455 addze r9,r9 456 457 addc r10,r7,r10 458 adde r11,r8,r11 459 addze r9,r9 460 $ST r10,`4*$BNSZ`(r3) #r[4]=c2 461 #sqr_add_c2(a,3,2,c3,c1,c2); 462 $LD r5,`2*$BNSZ`(r4) 463 $UMULL r7,r5,r6 464 $UMULH r8,r5,r6 465 addc r7,r7,r7 466 adde r8,r8,r8 467 addze r10,r0 468 469 addc r11,r7,r11 470 adde r9,r8,r9 471 addze r10,r10 472 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 473 #sqr_add_c(a,3,c1,c2,c3); 474 $UMULL r7,r6,r6 475 $UMULH r8,r6,r6 476 addc r9,r7,r9 477 adde r10,r8,r10 478 479 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 480 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 481 bclr BO_ALWAYS,CR0_LT 482 .long 0x00000000 483 484# 485# NOTE: The following label name should be changed to 486# "bn_sqr_comba8" i.e. remove the first dot 487# for the gcc compiler. 
This should be automatically 488# done in the build 489# 490 491.align 4 492.bn_sqr_comba8: 493# 494# This is an optimized version of the bn_sqr_comba8 routine. 495# Tightly uses the adde instruction 496# 497# 498# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 499# r3 contains r 500# r4 contains a 501# 502# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 503# 504# r5,r6 are the two BN_ULONGs being multiplied. 505# r7,r8 are the results of the 32x32 giving 64 bit multiply. 506# r9,r10, r11 are the equivalents of c1,c2, c3. 507# 508# Possible optimization of loading all 8 longs of a into registers 509# doesnt provide any speedup 510# 511 512 xor r0,r0,r0 #set r0 = 0.Used in addze 513 #instructions below. 514 515 #sqr_add_c(a,0,c1,c2,c3); 516 $LD r5,`0*$BNSZ`(r4) 517 $UMULL r9,r5,r5 #1st iteration: no carries. 518 $UMULH r10,r5,r5 519 $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 520 #sqr_add_c2(a,1,0,c2,c3,c1); 521 $LD r6,`1*$BNSZ`(r4) 522 $UMULL r7,r5,r6 523 $UMULH r8,r5,r6 524 525 addc r10,r7,r10 #add the two register number 526 adde r11,r8,r0 # (r8,r7) to the three register 527 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 528 529 addc r10,r7,r10 #add the two register number 530 adde r11,r8,r11 # (r8,r7) to the three register 531 addze r9,r9 # number (r9,r11,r10). 532 533 $ST r10,`1*$BNSZ`(r3) # r[1]=c2 534 535 #sqr_add_c(a,1,c3,c1,c2); 536 $UMULL r7,r6,r6 537 $UMULH r8,r6,r6 538 addc r11,r7,r11 539 adde r9,r8,r9 540 addze r10,r0 541 #sqr_add_c2(a,2,0,c3,c1,c2); 542 $LD r6,`2*$BNSZ`(r4) 543 $UMULL r7,r5,r6 544 $UMULH r8,r5,r6 545 546 addc r11,r7,r11 547 adde r9,r8,r9 548 addze r10,r10 549 550 addc r11,r7,r11 551 adde r9,r8,r9 552 addze r10,r10 553 554 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 555 #sqr_add_c2(a,3,0,c1,c2,c3); 556 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 
557 $UMULL r7,r5,r6 558 $UMULH r8,r5,r6 559 560 addc r9,r7,r9 561 adde r10,r8,r10 562 addze r11,r0 563 564 addc r9,r7,r9 565 adde r10,r8,r10 566 addze r11,r11 567 #sqr_add_c2(a,2,1,c1,c2,c3); 568 $LD r5,`1*$BNSZ`(r4) 569 $LD r6,`2*$BNSZ`(r4) 570 $UMULL r7,r5,r6 571 $UMULH r8,r5,r6 572 573 addc r9,r7,r9 574 adde r10,r8,r10 575 addze r11,r11 576 577 addc r9,r7,r9 578 adde r10,r8,r10 579 addze r11,r11 580 581 $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 582 #sqr_add_c(a,2,c2,c3,c1); 583 $UMULL r7,r6,r6 584 $UMULH r8,r6,r6 585 586 addc r10,r7,r10 587 adde r11,r8,r11 588 addze r9,r0 589 #sqr_add_c2(a,3,1,c2,c3,c1); 590 $LD r6,`3*$BNSZ`(r4) 591 $UMULL r7,r5,r6 592 $UMULH r8,r5,r6 593 594 addc r10,r7,r10 595 adde r11,r8,r11 596 addze r9,r9 597 598 addc r10,r7,r10 599 adde r11,r8,r11 600 addze r9,r9 601 #sqr_add_c2(a,4,0,c2,c3,c1); 602 $LD r5,`0*$BNSZ`(r4) 603 $LD r6,`4*$BNSZ`(r4) 604 $UMULL r7,r5,r6 605 $UMULH r8,r5,r6 606 607 addc r10,r7,r10 608 adde r11,r8,r11 609 addze r9,r9 610 611 addc r10,r7,r10 612 adde r11,r8,r11 613 addze r9,r9 614 $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 615 #sqr_add_c2(a,5,0,c3,c1,c2); 616 $LD r6,`5*$BNSZ`(r4) 617 $UMULL r7,r5,r6 618 $UMULH r8,r5,r6 619 620 addc r11,r7,r11 621 adde r9,r8,r9 622 addze r10,r0 623 624 addc r11,r7,r11 625 adde r9,r8,r9 626 addze r10,r10 627 #sqr_add_c2(a,4,1,c3,c1,c2); 628 $LD r5,`1*$BNSZ`(r4) 629 $LD r6,`4*$BNSZ`(r4) 630 $UMULL r7,r5,r6 631 $UMULH r8,r5,r6 632 633 addc r11,r7,r11 634 adde r9,r8,r9 635 addze r10,r10 636 637 addc r11,r7,r11 638 adde r9,r8,r9 639 addze r10,r10 640 #sqr_add_c2(a,3,2,c3,c1,c2); 641 $LD r5,`2*$BNSZ`(r4) 642 $LD r6,`3*$BNSZ`(r4) 643 $UMULL r7,r5,r6 644 $UMULH r8,r5,r6 645 646 addc r11,r7,r11 647 adde r9,r8,r9 648 addze r10,r10 649 650 addc r11,r7,r11 651 adde r9,r8,r9 652 addze r10,r10 653 $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 654 #sqr_add_c(a,3,c1,c2,c3); 655 $UMULL r7,r6,r6 656 $UMULH r8,r6,r6 657 addc r9,r7,r9 658 adde r10,r8,r10 659 addze r11,r0 660 #sqr_add_c2(a,4,2,c1,c2,c3); 661 $LD r6,`4*$BNSZ`(r4) 662 
$UMULL r7,r5,r6 663 $UMULH r8,r5,r6 664 665 addc r9,r7,r9 666 adde r10,r8,r10 667 addze r11,r11 668 669 addc r9,r7,r9 670 adde r10,r8,r10 671 addze r11,r11 672 #sqr_add_c2(a,5,1,c1,c2,c3); 673 $LD r5,`1*$BNSZ`(r4) 674 $LD r6,`5*$BNSZ`(r4) 675 $UMULL r7,r5,r6 676 $UMULH r8,r5,r6 677 678 addc r9,r7,r9 679 adde r10,r8,r10 680 addze r11,r11 681 682 addc r9,r7,r9 683 adde r10,r8,r10 684 addze r11,r11 685 #sqr_add_c2(a,6,0,c1,c2,c3); 686 $LD r5,`0*$BNSZ`(r4) 687 $LD r6,`6*$BNSZ`(r4) 688 $UMULL r7,r5,r6 689 $UMULH r8,r5,r6 690 addc r9,r7,r9 691 adde r10,r8,r10 692 addze r11,r11 693 addc r9,r7,r9 694 adde r10,r8,r10 695 addze r11,r11 696 $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 697 #sqr_add_c2(a,7,0,c2,c3,c1); 698 $LD r6,`7*$BNSZ`(r4) 699 $UMULL r7,r5,r6 700 $UMULH r8,r5,r6 701 702 addc r10,r7,r10 703 adde r11,r8,r11 704 addze r9,r0 705 addc r10,r7,r10 706 adde r11,r8,r11 707 addze r9,r9 708 #sqr_add_c2(a,6,1,c2,c3,c1); 709 $LD r5,`1*$BNSZ`(r4) 710 $LD r6,`6*$BNSZ`(r4) 711 $UMULL r7,r5,r6 712 $UMULH r8,r5,r6 713 714 addc r10,r7,r10 715 adde r11,r8,r11 716 addze r9,r9 717 addc r10,r7,r10 718 adde r11,r8,r11 719 addze r9,r9 720 #sqr_add_c2(a,5,2,c2,c3,c1); 721 $LD r5,`2*$BNSZ`(r4) 722 $LD r6,`5*$BNSZ`(r4) 723 $UMULL r7,r5,r6 724 $UMULH r8,r5,r6 725 addc r10,r7,r10 726 adde r11,r8,r11 727 addze r9,r9 728 addc r10,r7,r10 729 adde r11,r8,r11 730 addze r9,r9 731 #sqr_add_c2(a,4,3,c2,c3,c1); 732 $LD r5,`3*$BNSZ`(r4) 733 $LD r6,`4*$BNSZ`(r4) 734 $UMULL r7,r5,r6 735 $UMULH r8,r5,r6 736 737 addc r10,r7,r10 738 adde r11,r8,r11 739 addze r9,r9 740 addc r10,r7,r10 741 adde r11,r8,r11 742 addze r9,r9 743 $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 744 #sqr_add_c(a,4,c3,c1,c2); 745 $UMULL r7,r6,r6 746 $UMULH r8,r6,r6 747 addc r11,r7,r11 748 adde r9,r8,r9 749 addze r10,r0 750 #sqr_add_c2(a,5,3,c3,c1,c2); 751 $LD r6,`5*$BNSZ`(r4) 752 $UMULL r7,r5,r6 753 $UMULH r8,r5,r6 754 addc r11,r7,r11 755 adde r9,r8,r9 756 addze r10,r10 757 addc r11,r7,r11 758 adde r9,r8,r9 759 addze r10,r10 760 #sqr_add_c2(a,6,2,c3,c1,c2); 
761 $LD r5,`2*$BNSZ`(r4) 762 $LD r6,`6*$BNSZ`(r4) 763 $UMULL r7,r5,r6 764 $UMULH r8,r5,r6 765 addc r11,r7,r11 766 adde r9,r8,r9 767 addze r10,r10 768 769 addc r11,r7,r11 770 adde r9,r8,r9 771 addze r10,r10 772 #sqr_add_c2(a,7,1,c3,c1,c2); 773 $LD r5,`1*$BNSZ`(r4) 774 $LD r6,`7*$BNSZ`(r4) 775 $UMULL r7,r5,r6 776 $UMULH r8,r5,r6 777 addc r11,r7,r11 778 adde r9,r8,r9 779 addze r10,r10 780 addc r11,r7,r11 781 adde r9,r8,r9 782 addze r10,r10 783 $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 784 #sqr_add_c2(a,7,2,c1,c2,c3); 785 $LD r5,`2*$BNSZ`(r4) 786 $UMULL r7,r5,r6 787 $UMULH r8,r5,r6 788 789 addc r9,r7,r9 790 adde r10,r8,r10 791 addze r11,r0 792 addc r9,r7,r9 793 adde r10,r8,r10 794 addze r11,r11 795 #sqr_add_c2(a,6,3,c1,c2,c3); 796 $LD r5,`3*$BNSZ`(r4) 797 $LD r6,`6*$BNSZ`(r4) 798 $UMULL r7,r5,r6 799 $UMULH r8,r5,r6 800 addc r9,r7,r9 801 adde r10,r8,r10 802 addze r11,r11 803 addc r9,r7,r9 804 adde r10,r8,r10 805 addze r11,r11 806 #sqr_add_c2(a,5,4,c1,c2,c3); 807 $LD r5,`4*$BNSZ`(r4) 808 $LD r6,`5*$BNSZ`(r4) 809 $UMULL r7,r5,r6 810 $UMULH r8,r5,r6 811 addc r9,r7,r9 812 adde r10,r8,r10 813 addze r11,r11 814 addc r9,r7,r9 815 adde r10,r8,r10 816 addze r11,r11 817 $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 818 #sqr_add_c(a,5,c2,c3,c1); 819 $UMULL r7,r6,r6 820 $UMULH r8,r6,r6 821 addc r10,r7,r10 822 adde r11,r8,r11 823 addze r9,r0 824 #sqr_add_c2(a,6,4,c2,c3,c1); 825 $LD r6,`6*$BNSZ`(r4) 826 $UMULL r7,r5,r6 827 $UMULH r8,r5,r6 828 addc r10,r7,r10 829 adde r11,r8,r11 830 addze r9,r9 831 addc r10,r7,r10 832 adde r11,r8,r11 833 addze r9,r9 834 #sqr_add_c2(a,7,3,c2,c3,c1); 835 $LD r5,`3*$BNSZ`(r4) 836 $LD r6,`7*$BNSZ`(r4) 837 $UMULL r7,r5,r6 838 $UMULH r8,r5,r6 839 addc r10,r7,r10 840 adde r11,r8,r11 841 addze r9,r9 842 addc r10,r7,r10 843 adde r11,r8,r11 844 addze r9,r9 845 $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 846 #sqr_add_c2(a,7,4,c3,c1,c2); 847 $LD r5,`4*$BNSZ`(r4) 848 $UMULL r7,r5,r6 849 $UMULH r8,r5,r6 850 addc r11,r7,r11 851 adde r9,r8,r9 852 addze r10,r0 853 addc r11,r7,r11 854 adde r9,r8,r9 855 
addze r10,r10 856 #sqr_add_c2(a,6,5,c3,c1,c2); 857 $LD r5,`5*$BNSZ`(r4) 858 $LD r6,`6*$BNSZ`(r4) 859 $UMULL r7,r5,r6 860 $UMULH r8,r5,r6 861 addc r11,r7,r11 862 adde r9,r8,r9 863 addze r10,r10 864 addc r11,r7,r11 865 adde r9,r8,r9 866 addze r10,r10 867 $ST r11,`11*$BNSZ`(r3) #r[11]=c3; 868 #sqr_add_c(a,6,c1,c2,c3); 869 $UMULL r7,r6,r6 870 $UMULH r8,r6,r6 871 addc r9,r7,r9 872 adde r10,r8,r10 873 addze r11,r0 874 #sqr_add_c2(a,7,5,c1,c2,c3) 875 $LD r6,`7*$BNSZ`(r4) 876 $UMULL r7,r5,r6 877 $UMULH r8,r5,r6 878 addc r9,r7,r9 879 adde r10,r8,r10 880 addze r11,r11 881 addc r9,r7,r9 882 adde r10,r8,r10 883 addze r11,r11 884 $ST r9,`12*$BNSZ`(r3) #r[12]=c1; 885 886 #sqr_add_c2(a,7,6,c2,c3,c1) 887 $LD r5,`6*$BNSZ`(r4) 888 $UMULL r7,r5,r6 889 $UMULH r8,r5,r6 890 addc r10,r7,r10 891 adde r11,r8,r11 892 addze r9,r0 893 addc r10,r7,r10 894 adde r11,r8,r11 895 addze r9,r9 896 $ST r10,`13*$BNSZ`(r3) #r[13]=c2; 897 #sqr_add_c(a,7,c3,c1,c2); 898 $UMULL r7,r6,r6 899 $UMULH r8,r6,r6 900 addc r11,r7,r11 901 adde r9,r8,r9 902 $ST r11,`14*$BNSZ`(r3) #r[14]=c3; 903 $ST r9, `15*$BNSZ`(r3) #r[15]=c1; 904 905 906 bclr BO_ALWAYS,CR0_LT 907 908 .long 0x00000000 909 910# 911# NOTE: The following label name should be changed to 912# "bn_mul_comba4" i.e. remove the first dot 913# for the gcc compiler. This should be automatically 914# done in the build 915# 916 917.align 4 918.bn_mul_comba4: 919# 920# This is an optimized version of the bn_mul_comba4 routine. 921# 922# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 923# r3 contains r 924# r4 contains a 925# r5 contains b 926# r6, r7 are the 2 BN_ULONGs being multiplied. 927# r8, r9 are the results of the 32x32 giving 64 multiply. 928# r10, r11, r12 are the equivalents of c1, c2, and c3. 929# 930 xor r0,r0,r0 #r0=0. Used in addze below. 
931 #mul_add_c(a[0],b[0],c1,c2,c3); 932 $LD r6,`0*$BNSZ`(r4) 933 $LD r7,`0*$BNSZ`(r5) 934 $UMULL r10,r6,r7 935 $UMULH r11,r6,r7 936 $ST r10,`0*$BNSZ`(r3) #r[0]=c1 937 #mul_add_c(a[0],b[1],c2,c3,c1); 938 $LD r7,`1*$BNSZ`(r5) 939 $UMULL r8,r6,r7 940 $UMULH r9,r6,r7 941 addc r11,r8,r11 942 adde r12,r9,r0 943 addze r10,r0 944 #mul_add_c(a[1],b[0],c2,c3,c1); 945 $LD r6, `1*$BNSZ`(r4) 946 $LD r7, `0*$BNSZ`(r5) 947 $UMULL r8,r6,r7 948 $UMULH r9,r6,r7 949 addc r11,r8,r11 950 adde r12,r9,r12 951 addze r10,r10 952 $ST r11,`1*$BNSZ`(r3) #r[1]=c2 953 #mul_add_c(a[2],b[0],c3,c1,c2); 954 $LD r6,`2*$BNSZ`(r4) 955 $UMULL r8,r6,r7 956 $UMULH r9,r6,r7 957 addc r12,r8,r12 958 adde r10,r9,r10 959 addze r11,r0 960 #mul_add_c(a[1],b[1],c3,c1,c2); 961 $LD r6,`1*$BNSZ`(r4) 962 $LD r7,`1*$BNSZ`(r5) 963 $UMULL r8,r6,r7 964 $UMULH r9,r6,r7 965 addc r12,r8,r12 966 adde r10,r9,r10 967 addze r11,r11 968 #mul_add_c(a[0],b[2],c3,c1,c2); 969 $LD r6,`0*$BNSZ`(r4) 970 $LD r7,`2*$BNSZ`(r5) 971 $UMULL r8,r6,r7 972 $UMULH r9,r6,r7 973 addc r12,r8,r12 974 adde r10,r9,r10 975 addze r11,r11 976 $ST r12,`2*$BNSZ`(r3) #r[2]=c3 977 #mul_add_c(a[0],b[3],c1,c2,c3); 978 $LD r7,`3*$BNSZ`(r5) 979 $UMULL r8,r6,r7 980 $UMULH r9,r6,r7 981 addc r10,r8,r10 982 adde r11,r9,r11 983 addze r12,r0 984 #mul_add_c(a[1],b[2],c1,c2,c3); 985 $LD r6,`1*$BNSZ`(r4) 986 $LD r7,`2*$BNSZ`(r5) 987 $UMULL r8,r6,r7 988 $UMULH r9,r6,r7 989 addc r10,r8,r10 990 adde r11,r9,r11 991 addze r12,r12 992 #mul_add_c(a[2],b[1],c1,c2,c3); 993 $LD r6,`2*$BNSZ`(r4) 994 $LD r7,`1*$BNSZ`(r5) 995 $UMULL r8,r6,r7 996 $UMULH r9,r6,r7 997 addc r10,r8,r10 998 adde r11,r9,r11 999 addze r12,r12 1000 #mul_add_c(a[3],b[0],c1,c2,c3); 1001 $LD r6,`3*$BNSZ`(r4) 1002 $LD r7,`0*$BNSZ`(r5) 1003 $UMULL r8,r6,r7 1004 $UMULH r9,r6,r7 1005 addc r10,r8,r10 1006 adde r11,r9,r11 1007 addze r12,r12 1008 $ST r10,`3*$BNSZ`(r3) #r[3]=c1 1009 #mul_add_c(a[3],b[1],c2,c3,c1); 1010 $LD r7,`1*$BNSZ`(r5) 1011 $UMULL r8,r6,r7 1012 $UMULH r9,r6,r7 1013 addc r11,r8,r11 1014 adde 
r12,r9,r12 1015 addze r10,r0 1016 #mul_add_c(a[2],b[2],c2,c3,c1); 1017 $LD r6,`2*$BNSZ`(r4) 1018 $LD r7,`2*$BNSZ`(r5) 1019 $UMULL r8,r6,r7 1020 $UMULH r9,r6,r7 1021 addc r11,r8,r11 1022 adde r12,r9,r12 1023 addze r10,r10 1024 #mul_add_c(a[1],b[3],c2,c3,c1); 1025 $LD r6,`1*$BNSZ`(r4) 1026 $LD r7,`3*$BNSZ`(r5) 1027 $UMULL r8,r6,r7 1028 $UMULH r9,r6,r7 1029 addc r11,r8,r11 1030 adde r12,r9,r12 1031 addze r10,r10 1032 $ST r11,`4*$BNSZ`(r3) #r[4]=c2 1033 #mul_add_c(a[2],b[3],c3,c1,c2); 1034 $LD r6,`2*$BNSZ`(r4) 1035 $UMULL r8,r6,r7 1036 $UMULH r9,r6,r7 1037 addc r12,r8,r12 1038 adde r10,r9,r10 1039 addze r11,r0 1040 #mul_add_c(a[3],b[2],c3,c1,c2); 1041 $LD r6,`3*$BNSZ`(r4) 1042 $LD r7,`2*$BNSZ`(r4) 1043 $UMULL r8,r6,r7 1044 $UMULH r9,r6,r7 1045 addc r12,r8,r12 1046 adde r10,r9,r10 1047 addze r11,r11 1048 $ST r12,`5*$BNSZ`(r3) #r[5]=c3 1049 #mul_add_c(a[3],b[3],c1,c2,c3); 1050 $LD r7,`3*$BNSZ`(r5) 1051 $UMULL r8,r6,r7 1052 $UMULH r9,r6,r7 1053 addc r10,r8,r10 1054 adde r11,r9,r11 1055 1056 $ST r10,`6*$BNSZ`(r3) #r[6]=c1 1057 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 1058 bclr BO_ALWAYS,CR0_LT 1059 .long 0x00000000 1060 1061# 1062# NOTE: The following label name should be changed to 1063# "bn_mul_comba8" i.e. remove the first dot 1064# for the gcc compiler. This should be automatically 1065# done in the build 1066# 1067 1068.align 4 1069.bn_mul_comba8: 1070# 1071# Optimized version of the bn_mul_comba8 routine. 1072# 1073# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1074# r3 contains r 1075# r4 contains a 1076# r5 contains b 1077# r6, r7 are the 2 BN_ULONGs being multiplied. 1078# r8, r9 are the results of the 32x32 giving 64 multiply. 1079# r10, r11, r12 are the equivalents of c1, c2, and c3. 1080# 1081 xor r0,r0,r0 #r0=0. Used in addze below. 
1082 1083 #mul_add_c(a[0],b[0],c1,c2,c3); 1084 $LD r6,`0*$BNSZ`(r4) #a[0] 1085 $LD r7,`0*$BNSZ`(r5) #b[0] 1086 $UMULL r10,r6,r7 1087 $UMULH r11,r6,r7 1088 $ST r10,`0*$BNSZ`(r3) #r[0]=c1; 1089 #mul_add_c(a[0],b[1],c2,c3,c1); 1090 $LD r7,`1*$BNSZ`(r5) 1091 $UMULL r8,r6,r7 1092 $UMULH r9,r6,r7 1093 addc r11,r11,r8 1094 addze r12,r9 # since we didnt set r12 to zero before. 1095 addze r10,r0 1096 #mul_add_c(a[1],b[0],c2,c3,c1); 1097 $LD r6,`1*$BNSZ`(r4) 1098 $LD r7,`0*$BNSZ`(r5) 1099 $UMULL r8,r6,r7 1100 $UMULH r9,r6,r7 1101 addc r11,r11,r8 1102 adde r12,r12,r9 1103 addze r10,r10 1104 $ST r11,`1*$BNSZ`(r3) #r[1]=c2; 1105 #mul_add_c(a[2],b[0],c3,c1,c2); 1106 $LD r6,`2*$BNSZ`(r4) 1107 $UMULL r8,r6,r7 1108 $UMULH r9,r6,r7 1109 addc r12,r12,r8 1110 adde r10,r10,r9 1111 addze r11,r0 1112 #mul_add_c(a[1],b[1],c3,c1,c2); 1113 $LD r6,`1*$BNSZ`(r4) 1114 $LD r7,`1*$BNSZ`(r5) 1115 $UMULL r8,r6,r7 1116 $UMULH r9,r6,r7 1117 addc r12,r12,r8 1118 adde r10,r10,r9 1119 addze r11,r11 1120 #mul_add_c(a[0],b[2],c3,c1,c2); 1121 $LD r6,`0*$BNSZ`(r4) 1122 $LD r7,`2*$BNSZ`(r5) 1123 $UMULL r8,r6,r7 1124 $UMULH r9,r6,r7 1125 addc r12,r12,r8 1126 adde r10,r10,r9 1127 addze r11,r11 1128 $ST r12,`2*$BNSZ`(r3) #r[2]=c3; 1129 #mul_add_c(a[0],b[3],c1,c2,c3); 1130 $LD r7,`3*$BNSZ`(r5) 1131 $UMULL r8,r6,r7 1132 $UMULH r9,r6,r7 1133 addc r10,r10,r8 1134 adde r11,r11,r9 1135 addze r12,r0 1136 #mul_add_c(a[1],b[2],c1,c2,c3); 1137 $LD r6,`1*$BNSZ`(r4) 1138 $LD r7,`2*$BNSZ`(r5) 1139 $UMULL r8,r6,r7 1140 $UMULH r9,r6,r7 1141 addc r10,r10,r8 1142 adde r11,r11,r9 1143 addze r12,r12 1144 1145 #mul_add_c(a[2],b[1],c1,c2,c3); 1146 $LD r6,`2*$BNSZ`(r4) 1147 $LD r7,`1*$BNSZ`(r5) 1148 $UMULL r8,r6,r7 1149 $UMULH r9,r6,r7 1150 addc r10,r10,r8 1151 adde r11,r11,r9 1152 addze r12,r12 1153 #mul_add_c(a[3],b[0],c1,c2,c3); 1154 $LD r6,`3*$BNSZ`(r4) 1155 $LD r7,`0*$BNSZ`(r5) 1156 $UMULL r8,r6,r7 1157 $UMULH r9,r6,r7 1158 addc r10,r10,r8 1159 adde r11,r11,r9 1160 addze r12,r12 1161 $ST r10,`3*$BNSZ`(r3) #r[3]=c1; 1162 
#mul_add_c(a[4],b[0],c2,c3,c1); 1163 $LD r6,`4*$BNSZ`(r4) 1164 $UMULL r8,r6,r7 1165 $UMULH r9,r6,r7 1166 addc r11,r11,r8 1167 adde r12,r12,r9 1168 addze r10,r0 1169 #mul_add_c(a[3],b[1],c2,c3,c1); 1170 $LD r6,`3*$BNSZ`(r4) 1171 $LD r7,`1*$BNSZ`(r5) 1172 $UMULL r8,r6,r7 1173 $UMULH r9,r6,r7 1174 addc r11,r11,r8 1175 adde r12,r12,r9 1176 addze r10,r10 1177 #mul_add_c(a[2],b[2],c2,c3,c1); 1178 $LD r6,`2*$BNSZ`(r4) 1179 $LD r7,`2*$BNSZ`(r5) 1180 $UMULL r8,r6,r7 1181 $UMULH r9,r6,r7 1182 addc r11,r11,r8 1183 adde r12,r12,r9 1184 addze r10,r10 1185 #mul_add_c(a[1],b[3],c2,c3,c1); 1186 $LD r6,`1*$BNSZ`(r4) 1187 $LD r7,`3*$BNSZ`(r5) 1188 $UMULL r8,r6,r7 1189 $UMULH r9,r6,r7 1190 addc r11,r11,r8 1191 adde r12,r12,r9 1192 addze r10,r10 1193 #mul_add_c(a[0],b[4],c2,c3,c1); 1194 $LD r6,`0*$BNSZ`(r4) 1195 $LD r7,`4*$BNSZ`(r5) 1196 $UMULL r8,r6,r7 1197 $UMULH r9,r6,r7 1198 addc r11,r11,r8 1199 adde r12,r12,r9 1200 addze r10,r10 1201 $ST r11,`4*$BNSZ`(r3) #r[4]=c2; 1202 #mul_add_c(a[0],b[5],c3,c1,c2); 1203 $LD r7,`5*$BNSZ`(r5) 1204 $UMULL r8,r6,r7 1205 $UMULH r9,r6,r7 1206 addc r12,r12,r8 1207 adde r10,r10,r9 1208 addze r11,r0 1209 #mul_add_c(a[1],b[4],c3,c1,c2); 1210 $LD r6,`1*$BNSZ`(r4) 1211 $LD r7,`4*$BNSZ`(r5) 1212 $UMULL r8,r6,r7 1213 $UMULH r9,r6,r7 1214 addc r12,r12,r8 1215 adde r10,r10,r9 1216 addze r11,r11 1217 #mul_add_c(a[2],b[3],c3,c1,c2); 1218 $LD r6,`2*$BNSZ`(r4) 1219 $LD r7,`3*$BNSZ`(r5) 1220 $UMULL r8,r6,r7 1221 $UMULH r9,r6,r7 1222 addc r12,r12,r8 1223 adde r10,r10,r9 1224 addze r11,r11 1225 #mul_add_c(a[3],b[2],c3,c1,c2); 1226 $LD r6,`3*$BNSZ`(r4) 1227 $LD r7,`2*$BNSZ`(r5) 1228 $UMULL r8,r6,r7 1229 $UMULH r9,r6,r7 1230 addc r12,r12,r8 1231 adde r10,r10,r9 1232 addze r11,r11 1233 #mul_add_c(a[4],b[1],c3,c1,c2); 1234 $LD r6,`4*$BNSZ`(r4) 1235 $LD r7,`1*$BNSZ`(r5) 1236 $UMULL r8,r6,r7 1237 $UMULH r9,r6,r7 1238 addc r12,r12,r8 1239 adde r10,r10,r9 1240 addze r11,r11 1241 #mul_add_c(a[5],b[0],c3,c1,c2); 1242 $LD r6,`5*$BNSZ`(r4) 1243 $LD r7,`0*$BNSZ`(r5) 1244 $UMULL 
r8,r6,r7 1245 $UMULH r9,r6,r7 1246 addc r12,r12,r8 1247 adde r10,r10,r9 1248 addze r11,r11 1249 $ST r12,`5*$BNSZ`(r3) #r[5]=c3; 1250 #mul_add_c(a[6],b[0],c1,c2,c3); 1251 $LD r6,`6*$BNSZ`(r4) 1252 $UMULL r8,r6,r7 1253 $UMULH r9,r6,r7 1254 addc r10,r10,r8 1255 adde r11,r11,r9 1256 addze r12,r0 1257 #mul_add_c(a[5],b[1],c1,c2,c3); 1258 $LD r6,`5*$BNSZ`(r4) 1259 $LD r7,`1*$BNSZ`(r5) 1260 $UMULL r8,r6,r7 1261 $UMULH r9,r6,r7 1262 addc r10,r10,r8 1263 adde r11,r11,r9 1264 addze r12,r12 1265 #mul_add_c(a[4],b[2],c1,c2,c3); 1266 $LD r6,`4*$BNSZ`(r4) 1267 $LD r7,`2*$BNSZ`(r5) 1268 $UMULL r8,r6,r7 1269 $UMULH r9,r6,r7 1270 addc r10,r10,r8 1271 adde r11,r11,r9 1272 addze r12,r12 1273 #mul_add_c(a[3],b[3],c1,c2,c3); 1274 $LD r6,`3*$BNSZ`(r4) 1275 $LD r7,`3*$BNSZ`(r5) 1276 $UMULL r8,r6,r7 1277 $UMULH r9,r6,r7 1278 addc r10,r10,r8 1279 adde r11,r11,r9 1280 addze r12,r12 1281 #mul_add_c(a[2],b[4],c1,c2,c3); 1282 $LD r6,`2*$BNSZ`(r4) 1283 $LD r7,`4*$BNSZ`(r5) 1284 $UMULL r8,r6,r7 1285 $UMULH r9,r6,r7 1286 addc r10,r10,r8 1287 adde r11,r11,r9 1288 addze r12,r12 1289 #mul_add_c(a[1],b[5],c1,c2,c3); 1290 $LD r6,`1*$BNSZ`(r4) 1291 $LD r7,`5*$BNSZ`(r5) 1292 $UMULL r8,r6,r7 1293 $UMULH r9,r6,r7 1294 addc r10,r10,r8 1295 adde r11,r11,r9 1296 addze r12,r12 1297 #mul_add_c(a[0],b[6],c1,c2,c3); 1298 $LD r6,`0*$BNSZ`(r4) 1299 $LD r7,`6*$BNSZ`(r5) 1300 $UMULL r8,r6,r7 1301 $UMULH r9,r6,r7 1302 addc r10,r10,r8 1303 adde r11,r11,r9 1304 addze r12,r12 1305 $ST r10,`6*$BNSZ`(r3) #r[6]=c1; 1306 #mul_add_c(a[0],b[7],c2,c3,c1); 1307 $LD r7,`7*$BNSZ`(r5) 1308 $UMULL r8,r6,r7 1309 $UMULH r9,r6,r7 1310 addc r11,r11,r8 1311 adde r12,r12,r9 1312 addze r10,r0 1313 #mul_add_c(a[1],b[6],c2,c3,c1); 1314 $LD r6,`1*$BNSZ`(r4) 1315 $LD r7,`6*$BNSZ`(r5) 1316 $UMULL r8,r6,r7 1317 $UMULH r9,r6,r7 1318 addc r11,r11,r8 1319 adde r12,r12,r9 1320 addze r10,r10 1321 #mul_add_c(a[2],b[5],c2,c3,c1); 1322 $LD r6,`2*$BNSZ`(r4) 1323 $LD r7,`5*$BNSZ`(r5) 1324 $UMULL r8,r6,r7 1325 $UMULH r9,r6,r7 1326 addc r11,r11,r8 1327 
adde r12,r12,r9 1328 addze r10,r10 1329 #mul_add_c(a[3],b[4],c2,c3,c1); 1330 $LD r6,`3*$BNSZ`(r4) 1331 $LD r7,`4*$BNSZ`(r5) 1332 $UMULL r8,r6,r7 1333 $UMULH r9,r6,r7 1334 addc r11,r11,r8 1335 adde r12,r12,r9 1336 addze r10,r10 1337 #mul_add_c(a[4],b[3],c2,c3,c1); 1338 $LD r6,`4*$BNSZ`(r4) 1339 $LD r7,`3*$BNSZ`(r5) 1340 $UMULL r8,r6,r7 1341 $UMULH r9,r6,r7 1342 addc r11,r11,r8 1343 adde r12,r12,r9 1344 addze r10,r10 1345 #mul_add_c(a[5],b[2],c2,c3,c1); 1346 $LD r6,`5*$BNSZ`(r4) 1347 $LD r7,`2*$BNSZ`(r5) 1348 $UMULL r8,r6,r7 1349 $UMULH r9,r6,r7 1350 addc r11,r11,r8 1351 adde r12,r12,r9 1352 addze r10,r10 1353 #mul_add_c(a[6],b[1],c2,c3,c1); 1354 $LD r6,`6*$BNSZ`(r4) 1355 $LD r7,`1*$BNSZ`(r5) 1356 $UMULL r8,r6,r7 1357 $UMULH r9,r6,r7 1358 addc r11,r11,r8 1359 adde r12,r12,r9 1360 addze r10,r10 1361 #mul_add_c(a[7],b[0],c2,c3,c1); 1362 $LD r6,`7*$BNSZ`(r4) 1363 $LD r7,`0*$BNSZ`(r5) 1364 $UMULL r8,r6,r7 1365 $UMULH r9,r6,r7 1366 addc r11,r11,r8 1367 adde r12,r12,r9 1368 addze r10,r10 1369 $ST r11,`7*$BNSZ`(r3) #r[7]=c2; 1370 #mul_add_c(a[7],b[1],c3,c1,c2); 1371 $LD r7,`1*$BNSZ`(r5) 1372 $UMULL r8,r6,r7 1373 $UMULH r9,r6,r7 1374 addc r12,r12,r8 1375 adde r10,r10,r9 1376 addze r11,r0 1377 #mul_add_c(a[6],b[2],c3,c1,c2); 1378 $LD r6,`6*$BNSZ`(r4) 1379 $LD r7,`2*$BNSZ`(r5) 1380 $UMULL r8,r6,r7 1381 $UMULH r9,r6,r7 1382 addc r12,r12,r8 1383 adde r10,r10,r9 1384 addze r11,r11 1385 #mul_add_c(a[5],b[3],c3,c1,c2); 1386 $LD r6,`5*$BNSZ`(r4) 1387 $LD r7,`3*$BNSZ`(r5) 1388 $UMULL r8,r6,r7 1389 $UMULH r9,r6,r7 1390 addc r12,r12,r8 1391 adde r10,r10,r9 1392 addze r11,r11 1393 #mul_add_c(a[4],b[4],c3,c1,c2); 1394 $LD r6,`4*$BNSZ`(r4) 1395 $LD r7,`4*$BNSZ`(r5) 1396 $UMULL r8,r6,r7 1397 $UMULH r9,r6,r7 1398 addc r12,r12,r8 1399 adde r10,r10,r9 1400 addze r11,r11 1401 #mul_add_c(a[3],b[5],c3,c1,c2); 1402 $LD r6,`3*$BNSZ`(r4) 1403 $LD r7,`5*$BNSZ`(r5) 1404 $UMULL r8,r6,r7 1405 $UMULH r9,r6,r7 1406 addc r12,r12,r8 1407 adde r10,r10,r9 1408 addze r11,r11 1409 
#mul_add_c(a[2],b[6],c3,c1,c2); 1410 $LD r6,`2*$BNSZ`(r4) 1411 $LD r7,`6*$BNSZ`(r5) 1412 $UMULL r8,r6,r7 1413 $UMULH r9,r6,r7 1414 addc r12,r12,r8 1415 adde r10,r10,r9 1416 addze r11,r11 1417 #mul_add_c(a[1],b[7],c3,c1,c2); 1418 $LD r6,`1*$BNSZ`(r4) 1419 $LD r7,`7*$BNSZ`(r5) 1420 $UMULL r8,r6,r7 1421 $UMULH r9,r6,r7 1422 addc r12,r12,r8 1423 adde r10,r10,r9 1424 addze r11,r11 1425 $ST r12,`8*$BNSZ`(r3) #r[8]=c3; 1426 #mul_add_c(a[2],b[7],c1,c2,c3); 1427 $LD r6,`2*$BNSZ`(r4) 1428 $UMULL r8,r6,r7 1429 $UMULH r9,r6,r7 1430 addc r10,r10,r8 1431 adde r11,r11,r9 1432 addze r12,r0 1433 #mul_add_c(a[3],b[6],c1,c2,c3); 1434 $LD r6,`3*$BNSZ`(r4) 1435 $LD r7,`6*$BNSZ`(r5) 1436 $UMULL r8,r6,r7 1437 $UMULH r9,r6,r7 1438 addc r10,r10,r8 1439 adde r11,r11,r9 1440 addze r12,r12 1441 #mul_add_c(a[4],b[5],c1,c2,c3); 1442 $LD r6,`4*$BNSZ`(r4) 1443 $LD r7,`5*$BNSZ`(r5) 1444 $UMULL r8,r6,r7 1445 $UMULH r9,r6,r7 1446 addc r10,r10,r8 1447 adde r11,r11,r9 1448 addze r12,r12 1449 #mul_add_c(a[5],b[4],c1,c2,c3); 1450 $LD r6,`5*$BNSZ`(r4) 1451 $LD r7,`4*$BNSZ`(r5) 1452 $UMULL r8,r6,r7 1453 $UMULH r9,r6,r7 1454 addc r10,r10,r8 1455 adde r11,r11,r9 1456 addze r12,r12 1457 #mul_add_c(a[6],b[3],c1,c2,c3); 1458 $LD r6,`6*$BNSZ`(r4) 1459 $LD r7,`3*$BNSZ`(r5) 1460 $UMULL r8,r6,r7 1461 $UMULH r9,r6,r7 1462 addc r10,r10,r8 1463 adde r11,r11,r9 1464 addze r12,r12 1465 #mul_add_c(a[7],b[2],c1,c2,c3); 1466 $LD r6,`7*$BNSZ`(r4) 1467 $LD r7,`2*$BNSZ`(r5) 1468 $UMULL r8,r6,r7 1469 $UMULH r9,r6,r7 1470 addc r10,r10,r8 1471 adde r11,r11,r9 1472 addze r12,r12 1473 $ST r10,`9*$BNSZ`(r3) #r[9]=c1; 1474 #mul_add_c(a[7],b[3],c2,c3,c1); 1475 $LD r7,`3*$BNSZ`(r5) 1476 $UMULL r8,r6,r7 1477 $UMULH r9,r6,r7 1478 addc r11,r11,r8 1479 adde r12,r12,r9 1480 addze r10,r0 1481 #mul_add_c(a[6],b[4],c2,c3,c1); 1482 $LD r6,`6*$BNSZ`(r4) 1483 $LD r7,`4*$BNSZ`(r5) 1484 $UMULL r8,r6,r7 1485 $UMULH r9,r6,r7 1486 addc r11,r11,r8 1487 adde r12,r12,r9 1488 addze r10,r10 1489 #mul_add_c(a[5],b[5],c2,c3,c1); 1490 $LD r6,`5*$BNSZ`(r4) 
1491 $LD r7,`5*$BNSZ`(r5) 1492 $UMULL r8,r6,r7 1493 $UMULH r9,r6,r7 1494 addc r11,r11,r8 1495 adde r12,r12,r9 1496 addze r10,r10 1497 #mul_add_c(a[4],b[6],c2,c3,c1); 1498 $LD r6,`4*$BNSZ`(r4) 1499 $LD r7,`6*$BNSZ`(r5) 1500 $UMULL r8,r6,r7 1501 $UMULH r9,r6,r7 1502 addc r11,r11,r8 1503 adde r12,r12,r9 1504 addze r10,r10 1505 #mul_add_c(a[3],b[7],c2,c3,c1); 1506 $LD r6,`3*$BNSZ`(r4) 1507 $LD r7,`7*$BNSZ`(r5) 1508 $UMULL r8,r6,r7 1509 $UMULH r9,r6,r7 1510 addc r11,r11,r8 1511 adde r12,r12,r9 1512 addze r10,r10 1513 $ST r11,`10*$BNSZ`(r3) #r[10]=c2; 1514 #mul_add_c(a[4],b[7],c3,c1,c2); 1515 $LD r6,`4*$BNSZ`(r4) 1516 $UMULL r8,r6,r7 1517 $UMULH r9,r6,r7 1518 addc r12,r12,r8 1519 adde r10,r10,r9 1520 addze r11,r0 1521 #mul_add_c(a[5],b[6],c3,c1,c2); 1522 $LD r6,`5*$BNSZ`(r4) 1523 $LD r7,`6*$BNSZ`(r5) 1524 $UMULL r8,r6,r7 1525 $UMULH r9,r6,r7 1526 addc r12,r12,r8 1527 adde r10,r10,r9 1528 addze r11,r11 1529 #mul_add_c(a[6],b[5],c3,c1,c2); 1530 $LD r6,`6*$BNSZ`(r4) 1531 $LD r7,`5*$BNSZ`(r5) 1532 $UMULL r8,r6,r7 1533 $UMULH r9,r6,r7 1534 addc r12,r12,r8 1535 adde r10,r10,r9 1536 addze r11,r11 1537 #mul_add_c(a[7],b[4],c3,c1,c2); 1538 $LD r6,`7*$BNSZ`(r4) 1539 $LD r7,`4*$BNSZ`(r5) 1540 $UMULL r8,r6,r7 1541 $UMULH r9,r6,r7 1542 addc r12,r12,r8 1543 adde r10,r10,r9 1544 addze r11,r11 1545 $ST r12,`11*$BNSZ`(r3) #r[11]=c3; 1546 #mul_add_c(a[7],b[5],c1,c2,c3); 1547 $LD r7,`5*$BNSZ`(r5) 1548 $UMULL r8,r6,r7 1549 $UMULH r9,r6,r7 1550 addc r10,r10,r8 1551 adde r11,r11,r9 1552 addze r12,r0 1553 #mul_add_c(a[6],b[6],c1,c2,c3); 1554 $LD r6,`6*$BNSZ`(r4) 1555 $LD r7,`6*$BNSZ`(r5) 1556 $UMULL r8,r6,r7 1557 $UMULH r9,r6,r7 1558 addc r10,r10,r8 1559 adde r11,r11,r9 1560 addze r12,r12 1561 #mul_add_c(a[5],b[7],c1,c2,c3); 1562 $LD r6,`5*$BNSZ`(r4) 1563 $LD r7,`7*$BNSZ`(r5) 1564 $UMULL r8,r6,r7 1565 $UMULH r9,r6,r7 1566 addc r10,r10,r8 1567 adde r11,r11,r9 1568 addze r12,r12 1569 $ST r10,`12*$BNSZ`(r3) #r[12]=c1; 1570 #mul_add_c(a[6],b[7],c2,c3,c1); 1571 $LD r6,`6*$BNSZ`(r4) 1572 $UMULL 
r8,r6,r7 1573 $UMULH r9,r6,r7 1574 addc r11,r11,r8 1575 adde r12,r12,r9 1576 addze r10,r0 1577 #mul_add_c(a[7],b[6],c2,c3,c1); 1578 $LD r6,`7*$BNSZ`(r4) 1579 $LD r7,`6*$BNSZ`(r5) 1580 $UMULL r8,r6,r7 1581 $UMULH r9,r6,r7 1582 addc r11,r11,r8 1583 adde r12,r12,r9 1584 addze r10,r10 1585 $ST r11,`13*$BNSZ`(r3) #r[13]=c2; 1586 #mul_add_c(a[7],b[7],c3,c1,c2); 1587 $LD r7,`7*$BNSZ`(r5) 1588 $UMULL r8,r6,r7 1589 $UMULH r9,r6,r7 1590 addc r12,r12,r8 1591 adde r10,r10,r9 1592 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; 1593 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; 1594 bclr BO_ALWAYS,CR0_LT 1595 .long 0x00000000 1596 1597# 1598# NOTE: The following label name should be changed to 1599# "bn_sub_words" i.e. remove the first dot 1600# for the gcc compiler. This should be automatically 1601# done in the build 1602# 1603# 1604.align 4 1605.bn_sub_words: 1606# 1607# Handcoded version of bn_sub_words 1608# 1609#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1610# 1611# r3 = r 1612# r4 = a 1613# r5 = b 1614# r6 = n 1615# 1616# Note: No loop unrolling done since this is not a performance 1617# critical loop. 1618 1619 xor r0,r0,r0 #set r0 = 0 1620# 1621# check for r6 = 0 AND set carry bit. 1622# 1623 subfc. r7,r0,r6 # If r6 is 0 then result is 0. 1624 # if r6 > 0 then result !=0 1625 # In either case carry bit is set. 1626 bc BO_IF,CR0_EQ,Lppcasm_sub_adios 1627 addi r4,r4,-$BNSZ 1628 addi r3,r3,-$BNSZ 1629 addi r5,r5,-$BNSZ 1630 mtctr r6 1631Lppcasm_sub_mainloop: 1632 $LDU r7,$BNSZ(r4) 1633 $LDU r8,$BNSZ(r5) 1634 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) 1635 # if carry = 1 this is r7-r8. Else it 1636 # is r7-r8 -1 as we need. 1637 $STU r6,$BNSZ(r3) 1638 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop 1639Lppcasm_sub_adios: 1640 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 1641 andi. r3,r3,1 # keep only last bit. 
1642 bclr BO_ALWAYS,CR0_LT 1643 .long 0x00000000 1644 1645 1646# 1647# NOTE: The following label name should be changed to 1648# "bn_add_words" i.e. remove the first dot 1649# for the gcc compiler. This should be automatically 1650# done in the build 1651# 1652 1653.align 4 1654.bn_add_words: 1655# 1656# Handcoded version of bn_add_words 1657# 1658#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1659# 1660# r3 = r 1661# r4 = a 1662# r5 = b 1663# r6 = n 1664# 1665# Note: No loop unrolling done since this is not a performance 1666# critical loop. 1667 1668 xor r0,r0,r0 1669# 1670# check for r6 = 0. Is this needed? 1671# 1672 addic. r6,r6,0 #test r6 and clear carry bit. 1673 bc BO_IF,CR0_EQ,Lppcasm_add_adios 1674 addi r4,r4,-$BNSZ 1675 addi r3,r3,-$BNSZ 1676 addi r5,r5,-$BNSZ 1677 mtctr r6 1678Lppcasm_add_mainloop: 1679 $LDU r7,$BNSZ(r4) 1680 $LDU r8,$BNSZ(r5) 1681 adde r8,r7,r8 1682 $STU r8,$BNSZ(r3) 1683 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop 1684Lppcasm_add_adios: 1685 addze r3,r0 #return carry bit. 1686 bclr BO_ALWAYS,CR0_LT 1687 .long 0x00000000 1688 1689# 1690# NOTE: The following label name should be changed to 1691# "bn_div_words" i.e. remove the first dot 1692# for the gcc compiler. This should be automatically 1693# done in the build 1694# 1695 1696.align 4 1697.bn_div_words: 1698# 1699# This is a cleaned up version of code generated by 1700# the AIX compiler. The only optimization is to use 1701# the PPC instruction to count leading zeros instead 1702# of call to num_bits_word. Since this was compiled 1703# only at level -O2 we can possibly squeeze it more? 1704# 1705# r3 = h 1706# r4 = l 1707# r5 = d 1708 1709 $UCMPI 0,r5,0 # compare r5 and 0 1710 bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0 1711 li r3,-1 # d=0 return -1 1712 bclr BO_ALWAYS,CR0_LT 1713Lppcasm_div1: 1714 xor r0,r0,r0 #r0=0 1715 li r8,$BITS 1716 $CNTLZ. r7,r5 #r7 = num leading 0s in d. 
1717 bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if no leading zeros 1718 subf r8,r7,r8 #r8 = BN_num_bits_word(d) 1719 $SHR. r9,r3,r8 #are there any bits above r8'th? 1720 $TR 16,r9,r0 #if there're, signal to dump core... 1721Lppcasm_div2: 1722 $UCMP 0,r3,r5 #h>=d? 1723 bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not 1724 subf r3,r5,r3 #h-=d ; 1725Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i 1726 cmpi 0,0,r7,0 # is (i == 0)? 1727 bc BO_IF,CR0_EQ,Lppcasm_div4 1728 $SHL r3,r3,r7 # h = (h<< i) 1729 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) 1730 $SHL r5,r5,r7 # d<<=i 1731 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) 1732 $SHL r4,r4,r7 # l <<=i 1733Lppcasm_div4: 1734 $SHRI r9,r5,`$BITS/2` # r9 = dh 1735 # dl will be computed when needed 1736 # as it saves registers. 1737 li r6,2 #r6=2 1738 mtctr r6 #counter will be in count. 1739Lppcasm_divouterloop: 1740 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) 1741 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 1742 # compute here for innerloop. 1743 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh 1744 bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not 1745 1746 li r8,-1 1747 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l 1748 b Lppcasm_div6 1749Lppcasm_div5: 1750 $UDIV r8,r3,r9 #q = h/dh 1751Lppcasm_div6: 1752 $UMULL r12,r9,r8 #th = q*dh 1753 $CLRU r10,r5,`$BITS/2` #r10=dl 1754 $UMULL r6,r8,r10 #tl = q*dl 1755 1756Lppcasm_divinnerloop: 1757 subf r10,r12,r3 #t = h -th 1758 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... 1759 addic. r7,r7,0 #test if r7 == 0. used below. 1760 # now want to compute 1761 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) 1762 # the following 2 instructions do that 1763 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) 1764 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) 1765 $UCMP 1,r6,r7 # compare (tl <= r7) 1766 bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit 1767 bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit 1768 addi r8,r8,-1 #q-- 1769 subf r12,r9,r12 #th -=dh 1770 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. 
1771 subf r6,r10,r6 #tl -=dl 1772 b Lppcasm_divinnerloop 1773Lppcasm_divinnerexit: 1774 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) 1775 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; 1776 $UCMP 1,r4,r11 # compare l and tl 1777 add r12,r12,r10 # th+=t 1778 bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 1779 addi r12,r12,1 # th++ 1780Lppcasm_div7: 1781 subf r11,r11,r4 #r11=l-tl 1782 $UCMP 1,r3,r12 #compare h and th 1783 bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 1784 addi r8,r8,-1 # q-- 1785 add r3,r5,r3 # h+=d 1786Lppcasm_div8: 1787 subf r12,r12,r3 #r12 = h-th 1788 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 1789 # want to compute 1790 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 1791 # the following 2 instructions will do this. 1792 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. 1793 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 1794 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; 1795 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 1796 b Lppcasm_divouterloop 1797Lppcasm_div9: 1798 or r3,r8,r0 1799 bclr BO_ALWAYS,CR0_LT 1800 .long 0x00000000 1801 1802# 1803# NOTE: The following label name should be changed to 1804# "bn_sqr_words" i.e. remove the first dot 1805# for the gcc compiler. This should be automatically 1806# done in the build 1807# 1808.align 4 1809.bn_sqr_words: 1810# 1811# Optimized version of bn_sqr_words 1812# 1813# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) 1814# 1815# r3 = r 1816# r4 = a 1817# r5 = n 1818# 1819# r6 = a[i]. 1820# r7,r8 = product. 1821# 1822# No unrolling done here. Not performance critical. 1823 1824 addic. r5,r5,0 #test r5. 
1825 bc BO_IF,CR0_EQ,Lppcasm_sqr_adios 1826 addi r4,r4,-$BNSZ 1827 addi r3,r3,-$BNSZ 1828 mtctr r5 1829Lppcasm_sqr_mainloop: 1830 #sqr(r[0],r[1],a[0]); 1831 $LDU r6,$BNSZ(r4) 1832 $UMULL r7,r6,r6 1833 $UMULH r8,r6,r6 1834 $STU r7,$BNSZ(r3) 1835 $STU r8,$BNSZ(r3) 1836 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop 1837Lppcasm_sqr_adios: 1838 bclr BO_ALWAYS,CR0_LT 1839 .long 0x00000000 1840 1841 1842# 1843# NOTE: The following label name should be changed to 1844# "bn_mul_words" i.e. remove the first dot 1845# for the gcc compiler. This should be automatically 1846# done in the build 1847# 1848 1849.align 4 1850.bn_mul_words: 1851# 1852# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1853# 1854# r3 = rp 1855# r4 = ap 1856# r5 = num 1857# r6 = w 1858 xor r0,r0,r0 1859 xor r12,r12,r12 # used for carry 1860 rlwinm. r7,r5,30,2,31 # num >> 2 1861 bc BO_IF,CR0_EQ,Lppcasm_mw_REM 1862 mtctr r7 1863Lppcasm_mw_LOOP: 1864 #mul(rp[0],ap[0],w,c1); 1865 $LD r8,`0*$BNSZ`(r4) 1866 $UMULL r9,r6,r8 1867 $UMULH r10,r6,r8 1868 addc r9,r9,r12 1869 #addze r10,r10 #carry is NOT ignored. 1870 #will be taken care of 1871 #in second spin below 1872 #using adde. 1873 $ST r9,`0*$BNSZ`(r3) 1874 #mul(rp[1],ap[1],w,c1); 1875 $LD r8,`1*$BNSZ`(r4) 1876 $UMULL r11,r6,r8 1877 $UMULH r12,r6,r8 1878 adde r11,r11,r10 1879 #addze r12,r12 1880 $ST r11,`1*$BNSZ`(r3) 1881 #mul(rp[2],ap[2],w,c1); 1882 $LD r8,`2*$BNSZ`(r4) 1883 $UMULL r9,r6,r8 1884 $UMULH r10,r6,r8 1885 adde r9,r9,r12 1886 #addze r10,r10 1887 $ST r9,`2*$BNSZ`(r3) 1888 #mul_add(rp[3],ap[3],w,c1); 1889 $LD r8,`3*$BNSZ`(r4) 1890 $UMULL r11,r6,r8 1891 $UMULH r12,r6,r8 1892 adde r11,r11,r10 1893 addze r12,r12 #this spin we collect carry into 1894 #r12 1895 $ST r11,`3*$BNSZ`(r3) 1896 1897 addi r3,r3,`4*$BNSZ` 1898 addi r4,r4,`4*$BNSZ` 1899 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP 1900 1901Lppcasm_mw_REM: 1902 andi. 
r5,r5,0x3 1903 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1904 #mul(rp[0],ap[0],w,c1); 1905 $LD r8,`0*$BNSZ`(r4) 1906 $UMULL r9,r6,r8 1907 $UMULH r10,r6,r8 1908 addc r9,r9,r12 1909 addze r10,r10 1910 $ST r9,`0*$BNSZ`(r3) 1911 addi r12,r10,0 1912 1913 addi r5,r5,-1 1914 cmpli 0,0,r5,0 1915 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1916 1917 1918 #mul(rp[1],ap[1],w,c1); 1919 $LD r8,`1*$BNSZ`(r4) 1920 $UMULL r9,r6,r8 1921 $UMULH r10,r6,r8 1922 addc r9,r9,r12 1923 addze r10,r10 1924 $ST r9,`1*$BNSZ`(r3) 1925 addi r12,r10,0 1926 1927 addi r5,r5,-1 1928 cmpli 0,0,r5,0 1929 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1930 1931 #mul_add(rp[2],ap[2],w,c1); 1932 $LD r8,`2*$BNSZ`(r4) 1933 $UMULL r9,r6,r8 1934 $UMULH r10,r6,r8 1935 addc r9,r9,r12 1936 addze r10,r10 1937 $ST r9,`2*$BNSZ`(r3) 1938 addi r12,r10,0 1939 1940Lppcasm_mw_OVER: 1941 addi r3,r12,0 1942 bclr BO_ALWAYS,CR0_LT 1943 .long 0x00000000 1944 1945# 1946# NOTE: The following label name should be changed to 1947# "bn_mul_add_words" i.e. remove the first dot 1948# for the gcc compiler. This should be automatically 1949# done in the build 1950# 1951 1952.align 4 1953.bn_mul_add_words: 1954# 1955# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1956# 1957# r3 = rp 1958# r4 = ap 1959# r5 = num 1960# r6 = w 1961# 1962# empirical evidence suggests that unrolled version performs best!! 1963# 1964 xor r0,r0,r0 #r0 = 0 1965 xor r12,r12,r12 #r12 = 0 . used for carry 1966 rlwinm. r7,r5,30,2,31 # num >> 2 1967 bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover 1968 mtctr r7 1969Lppcasm_maw_mainloop: 1970 #mul_add(rp[0],ap[0],w,c1); 1971 $LD r8,`0*$BNSZ`(r4) 1972 $LD r11,`0*$BNSZ`(r3) 1973 $UMULL r9,r6,r8 1974 $UMULH r10,r6,r8 1975 addc r9,r9,r12 #r12 is carry. 1976 addze r10,r10 1977 addc r9,r9,r11 1978 #addze r10,r10 1979 #the above instruction addze 1980 #is NOT needed. Carry will NOT 1981 #be ignored. 
It's not affected 1982 #by multiply and will be collected 1983 #in the next spin 1984 $ST r9,`0*$BNSZ`(r3) 1985 1986 #mul_add(rp[1],ap[1],w,c1); 1987 $LD r8,`1*$BNSZ`(r4) 1988 $LD r9,`1*$BNSZ`(r3) 1989 $UMULL r11,r6,r8 1990 $UMULH r12,r6,r8 1991 adde r11,r11,r10 #r10 is carry. 1992 addze r12,r12 1993 addc r11,r11,r9 1994 #addze r12,r12 1995 $ST r11,`1*$BNSZ`(r3) 1996 1997 #mul_add(rp[2],ap[2],w,c1); 1998 $LD r8,`2*$BNSZ`(r4) 1999 $UMULL r9,r6,r8 2000 $LD r11,`2*$BNSZ`(r3) 2001 $UMULH r10,r6,r8 2002 adde r9,r9,r12 2003 addze r10,r10 2004 addc r9,r9,r11 2005 #addze r10,r10 2006 $ST r9,`2*$BNSZ`(r3) 2007 2008 #mul_add(rp[3],ap[3],w,c1); 2009 $LD r8,`3*$BNSZ`(r4) 2010 $UMULL r11,r6,r8 2011 $LD r9,`3*$BNSZ`(r3) 2012 $UMULH r12,r6,r8 2013 adde r11,r11,r10 2014 addze r12,r12 2015 addc r11,r11,r9 2016 addze r12,r12 2017 $ST r11,`3*$BNSZ`(r3) 2018 addi r3,r3,`4*$BNSZ` 2019 addi r4,r4,`4*$BNSZ` 2020 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop 2021 2022Lppcasm_maw_leftover: 2023 andi. r5,r5,0x3 2024 bc BO_IF,CR0_EQ,Lppcasm_maw_adios 2025 addi r3,r3,-$BNSZ 2026 addi r4,r4,-$BNSZ 2027 #mul_add(rp[0],ap[0],w,c1); 2028 mtctr r5 2029 $LDU r8,$BNSZ(r4) 2030 $UMULL r9,r6,r8 2031 $UMULH r10,r6,r8 2032 $LDU r11,$BNSZ(r3) 2033 addc r9,r9,r11 2034 addze r10,r10 2035 addc r9,r9,r12 2036 addze r12,r10 2037 $ST r9,0(r3) 2038 2039 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 2040 #mul_add(rp[1],ap[1],w,c1); 2041 $LDU r8,$BNSZ(r4) 2042 $UMULL r9,r6,r8 2043 $UMULH r10,r6,r8 2044 $LDU r11,$BNSZ(r3) 2045 addc r9,r9,r11 2046 addze r10,r10 2047 addc r9,r9,r12 2048 addze r12,r10 2049 $ST r9,0(r3) 2050 2051 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 2052 #mul_add(rp[2],ap[2],w,c1); 2053 $LDU r8,$BNSZ(r4) 2054 $UMULL r9,r6,r8 2055 $UMULH r10,r6,r8 2056 $LDU r11,$BNSZ(r3) 2057 addc r9,r9,r11 2058 addze r10,r10 2059 addc r9,r9,r12 2060 addze r12,r10 2061 $ST r9,0(r3) 2062 2063Lppcasm_maw_adios: 2064 addi r3,r12,0 2065 bclr BO_ALWAYS,CR0_LT 2066 .long 0x00000000 2067 .align 4 2068EOF 2069 $data =~ 
s/\`([^\`]*)\`/eval $1/gem; 2070 2071 # if some assembler chokes on some simplified mnemonic, 2072 # this is the spot to fix it up, e.g.: 2073 # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare 2074 $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm; 2075 # assembler X doesn't accept li, load immediate value 2076 #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm; 2077 return($data); 2078} 2079