#! /usr/bin/env perl
# Copyright 2004-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file.  We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*.  In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use.  Most notably
# Linux and AIX use different 32-bit ABIs.  The good news is that these
# ABIs are similar enough to implement leaf(!) functions, which would be
# ABI neutral.  And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that is...
#
#	AIX performance
#
#	Measurements with cc on a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options: bn(64,32) ...
#	compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#
#	               sign     verify   sign/s  verify/s
#	rsa  512 bits  0.0098s  0.0009s   102.0    1170.6
#	rsa 1024 bits  0.0507s  0.0026s    19.7     387.5
#	rsa 2048 bits  0.3036s  0.0085s     3.3     117.1
#	rsa 4096 bits  2.0040s  0.0299s     0.5      33.4
#	dsa  512 bits  0.0087s  0.0106s   114.3      94.5
#	dsa 1024 bits  0.0256s  0.0313s    39.0      32.0
#
#	Same benchmark with this assembler code:
#
#	rsa  512 bits  0.0056s  0.0005s   178.6    2049.2
#	rsa 1024 bits  0.0283s  0.0015s    35.3     674.1
#	rsa 2048 bits  0.1744s  0.0050s     5.7     201.2
#	rsa 4096 bits  1.1644s  0.0179s     0.9      55.7
#	dsa  512 bits  0.0052s  0.0062s   191.6     162.0
#	dsa 1024 bits  0.0149s  0.0180s    67.0      55.5
#
#	The number of operations increases by almost 75%.
#
#	Here are the performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options: bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#
#	               sign     verify   sign/s  verify/s
#	rsa  512 bits  0.0028s  0.0003s   357.1    3844.4
#	rsa 1024 bits  0.0148s  0.0008s    67.5    1239.7
#	rsa 2048 bits  0.0963s  0.0028s    10.4     353.0
#	rsa 4096 bits  0.6538s  0.0102s     1.5      98.1
#	dsa  512 bits  0.0026s  0.0032s   382.5     313.7
#	dsa 1024 bits  0.0081s  0.0099s   122.8     100.6
#
#	Same benchmark with this assembler code:
#
#	rsa  512 bits  0.0020s  0.0002s   510.4    6273.7
#	rsa 1024 bits  0.0088s  0.0005s   114.1    2128.3
#	rsa 2048 bits  0.0540s  0.0016s    18.5     622.5
#	rsa 4096 bits  0.3700s  0.0058s     2.7     171.0
#	dsa  512 bits  0.0016s  0.0020s   610.7     507.1
#	dsa 1024 bits  0.0047s  0.0058s   212.5     173.2
#
#	Again, performance increases by about 75%.
#
#	Mac OS X, Apple G5 1.8GHz (note this is 32-bit code)
#	OpenSSL 0.9.7c 30 Sep 2003
#
#	Original code:
#
#	rsa  512 bits  0.0011s  0.0001s   906.1   11012.5
#	rsa 1024 bits  0.0060s  0.0003s   166.6    3363.1
#	rsa 2048 bits  0.0370s  0.0010s    27.1     982.4
#	rsa 4096 bits  0.2426s  0.0036s     4.1     280.4
#	dsa  512 bits  0.0010s  0.0012s  1038.1     841.5
#	dsa 1024 bits  0.0030s  0.0037s   329.6     269.7
#	dsa 2048 bits  0.0101s  0.0127s    98.9      78.6
#
#	Same benchmark with this assembler code:
#
#	rsa  512 bits  0.0007s  0.0001s  1416.2   16645.9
#	rsa 1024 bits  0.0036s  0.0002s   274.4    5380.6
#	rsa 2048 bits  0.0222s  0.0006s    45.1    1589.5
#	rsa 4096 bits  0.1469s  0.0022s     6.8     449.6
#	dsa  512 bits  0.0006s  0.0007s  1664.2    1376.2
#	dsa 1024 bits  0.0018s  0.0023s   545.0     442.2
#	dsa 2048 bits  0.0061s  0.0075s   163.5     132.8
#
#	Performance increase of ~60%.
#	Based on submission from Suresh N. Chari of IBM

$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code.  Also made a single version which can
#	   be used for both the AIX and Linux compilers.  See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)	Andy Polyakov
#
#	1. Initial version	10/20/02	Suresh Chari
#
#
#	The following file works with the xlc, cc and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character.  This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines:
#
#		bn_sqr_comba4
#		bn_sqr_comba8
#		bn_mul_comba4
#		bn_mul_comba8
#		bn_sub_words
#		bn_add_words
#		bn_div_words
#		bn_sqr_words
#		bn_mul_words
#		bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures.  On the Northstar
#	architecture the optimizations in this file do NOT provide
#	much improvement.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4".  This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler.  This should be automatically
#		done in the build.
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64-bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly:
#
	xor	r0,r0,r0		# set r0 = 0.  Used in the addze
					# instructions below.

					#sqr_add_c(a,0,c1,c2,c3)
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5
	$UMULH	r10,r5,r5		#in first iteration.  No need
					#to add since c1=c2=c3=0.
					# Note c3(r11) is NOT set to 0
					# but will be.

	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
					# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0			# catch carry if any.
					# r9= r0(=0) and carry

	addc	r10,r7,r10		# now add to temp result.
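#
# For reference: the sqr_add_c/sqr_add_c2 names used in the comments refer
# to the portable C macros in OpenSSL's crypto/bn/bn_asm.c.  A rough C
# sketch of the doubled cross-product step is shown below, illustration
# only, not part of the generated code; the BN_ULONG/BN_BITS2/dword_t
# definitions are assumptions matching the 32-bit build.
#
#	typedef unsigned int       BN_ULONG;	/* 32-bit limb            */
#	typedef unsigned long long dword_t;	/* double-width product   */
#	enum { BN_BITS2 = 32 };
#
#	/* sqr_add_c2(a,i,j,c1,c2,c3): (c3:c2:c1) += 2*a[i]*a[j], i != j */
#	/* sqr_add_c(a,i,c1,c2,c3) is the same accumulation without the  */
#	/* doubling.                                                     */
#	static void sqr_add_c2(const BN_ULONG *a, int i, int j,
#			       BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
#	{
#		dword_t  t = (dword_t)a[i] * a[j];
#		BN_ULONG lo, hi;
#
#		*c3 += (BN_ULONG)(t >> (2 * BN_BITS2 - 1));	/* bit lost by doubling: addze */
#		t  += t;					/* the addc/adde pair above    */
#		lo  = (BN_ULONG)t;
#		hi  = (BN_ULONG)(t >> BN_BITS2);
#		*c1 += lo;				/* addc: carry out of low limb */
#		if (*c1 < lo && ++hi == 0)		/* hi wrapped, so the carry    */
#			(*c3)++;			/* skips ahead to the top limb */
#		*c2 += hi;				/* adde                        */
#		if (*c2 < hi)				/* addze: carry out of middle  */
#			(*c3)++;
#	}
#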
309 addze r11,r8 # r8 added to r11 which is 0 310 addze r9,r9 311 312 $ST r10,`1*$BNSZ`(r3) #r[1]=c2; 313 #sqr_add_c(a,1,c3,c1,c2) 314 $UMULL r7,r6,r6 315 $UMULH r8,r6,r6 316 addc r11,r7,r11 317 adde r9,r8,r9 318 addze r10,r0 319 #sqr_add_c2(a,2,0,c3,c1,c2) 320 $LD r6,`2*$BNSZ`(r4) 321 $UMULL r7,r5,r6 322 $UMULH r8,r5,r6 323 324 addc r7,r7,r7 325 adde r8,r8,r8 326 addze r10,r10 327 328 addc r11,r7,r11 329 adde r9,r8,r9 330 addze r10,r10 331 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 332 #sqr_add_c2(a,3,0,c1,c2,c3); 333 $LD r6,`3*$BNSZ`(r4) 334 $UMULL r7,r5,r6 335 $UMULH r8,r5,r6 336 addc r7,r7,r7 337 adde r8,r8,r8 338 addze r11,r0 339 340 addc r9,r7,r9 341 adde r10,r8,r10 342 addze r11,r11 343 #sqr_add_c2(a,2,1,c1,c2,c3); 344 $LD r5,`1*$BNSZ`(r4) 345 $LD r6,`2*$BNSZ`(r4) 346 $UMULL r7,r5,r6 347 $UMULH r8,r5,r6 348 349 addc r7,r7,r7 350 adde r8,r8,r8 351 addze r11,r11 352 addc r9,r7,r9 353 adde r10,r8,r10 354 addze r11,r11 355 $ST r9,`3*$BNSZ`(r3) #r[3]=c1 356 #sqr_add_c(a,2,c2,c3,c1); 357 $UMULL r7,r6,r6 358 $UMULH r8,r6,r6 359 addc r10,r7,r10 360 adde r11,r8,r11 361 addze r9,r0 362 #sqr_add_c2(a,3,1,c2,c3,c1); 363 $LD r6,`3*$BNSZ`(r4) 364 $UMULL r7,r5,r6 365 $UMULH r8,r5,r6 366 addc r7,r7,r7 367 adde r8,r8,r8 368 addze r9,r9 369 370 addc r10,r7,r10 371 adde r11,r8,r11 372 addze r9,r9 373 $ST r10,`4*$BNSZ`(r3) #r[4]=c2 374 #sqr_add_c2(a,3,2,c3,c1,c2); 375 $LD r5,`2*$BNSZ`(r4) 376 $UMULL r7,r5,r6 377 $UMULH r8,r5,r6 378 addc r7,r7,r7 379 adde r8,r8,r8 380 addze r10,r0 381 382 addc r11,r7,r11 383 adde r9,r8,r9 384 addze r10,r10 385 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 386 #sqr_add_c(a,3,c1,c2,c3); 387 $UMULL r7,r6,r6 388 $UMULH r8,r6,r6 389 addc r9,r7,r9 390 adde r10,r8,r10 391 392 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 393 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 394 blr 395 .long 0 396 .byte 0,12,0x14,0,0,0,2,0 397 .long 0 398.size .bn_sqr_comba4,.-.bn_sqr_comba4 399 400# 401# NOTE: The following label name should be changed to 402# "bn_sqr_comba8" i.e. remove the first dot 403# for the gcc compiler. This should be automatically 404# done in the build 405# 406 407.align 4 408.bn_sqr_comba8: 409# 410# This is an optimized version of the bn_sqr_comba8 routine. 411# Tightly uses the adde instruction 412# 413# 414# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 415# r3 contains r 416# r4 contains a 417# 418# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 419# 420# r5,r6 are the two BN_ULONGs being multiplied. 421# r7,r8 are the results of the 32x32 giving 64 bit multiply. 422# r9,r10, r11 are the equivalents of c1,c2, c3. 423# 424# Possible optimization of loading all 8 longs of a into registers 425# doesn't provide any speedup 426# 427 428 xor r0,r0,r0 #set r0 = 0.Used in addze 429 #instructions below. 430 431 #sqr_add_c(a,0,c1,c2,c3); 432 $LD r5,`0*$BNSZ`(r4) 433 $UMULL r9,r5,r5 #1st iteration: no carries. 434 $UMULH r10,r5,r5 435 $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 436 #sqr_add_c2(a,1,0,c2,c3,c1); 437 $LD r6,`1*$BNSZ`(r4) 438 $UMULL r7,r5,r6 439 $UMULH r8,r5,r6 440 441 addc r10,r7,r10 #add the two register number 442 adde r11,r8,r0 # (r8,r7) to the three register 443 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 444 445 addc r10,r7,r10 #add the two register number 446 adde r11,r8,r11 # (r8,r7) to the three register 447 addze r9,r9 # number (r9,r11,r10). 
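#
# For reference: the straight-line code in bn_sqr_comba4/bn_sqr_comba8
# follows the usual comba ("column-wise") schedule, one result word per
# column.  A rough C sketch of that schedule is given below; it is only an
# illustration of the order the hand-unrolled steps follow, with
# sqr_add_c/sqr_add_c2 as sketched earlier and c1/c2/c3 standing for the
# rotating register triple (r9/r10/r11).
#
#	/* r[] has 2*n words, a[] has n words (n = 4 or 8 here) */
#	static void sqr_comba_ref(BN_ULONG *r, const BN_ULONG *a, int n)
#	{
#		BN_ULONG c1 = 0, c2 = 0, c3 = 0;
#		int k, i;
#
#		for (k = 0; k < 2 * n - 1; k++) {
#			/* all pairs (i, k-i) with i >= k-i, both in range */
#			for (i = (k < n) ? k : n - 1; 2 * i >= k; i--) {
#				if (2 * i == k)
#					sqr_add_c (a, i,        &c1, &c2, &c3);
#				else
#					sqr_add_c2(a, i, k - i, &c1, &c2, &c3);
#			}
#			r[k] = c1;		/* retire one output word per column */
#			c1 = c2; c2 = c3; c3 = 0;
#		}
#		r[2 * n - 1] = c1;	/* top word; no carry can remain above it */
#	}
#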
448 449 $ST r10,`1*$BNSZ`(r3) # r[1]=c2 450 451 #sqr_add_c(a,1,c3,c1,c2); 452 $UMULL r7,r6,r6 453 $UMULH r8,r6,r6 454 addc r11,r7,r11 455 adde r9,r8,r9 456 addze r10,r0 457 #sqr_add_c2(a,2,0,c3,c1,c2); 458 $LD r6,`2*$BNSZ`(r4) 459 $UMULL r7,r5,r6 460 $UMULH r8,r5,r6 461 462 addc r11,r7,r11 463 adde r9,r8,r9 464 addze r10,r10 465 466 addc r11,r7,r11 467 adde r9,r8,r9 468 addze r10,r10 469 470 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 471 #sqr_add_c2(a,3,0,c1,c2,c3); 472 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 473 $UMULL r7,r5,r6 474 $UMULH r8,r5,r6 475 476 addc r9,r7,r9 477 adde r10,r8,r10 478 addze r11,r0 479 480 addc r9,r7,r9 481 adde r10,r8,r10 482 addze r11,r11 483 #sqr_add_c2(a,2,1,c1,c2,c3); 484 $LD r5,`1*$BNSZ`(r4) 485 $LD r6,`2*$BNSZ`(r4) 486 $UMULL r7,r5,r6 487 $UMULH r8,r5,r6 488 489 addc r9,r7,r9 490 adde r10,r8,r10 491 addze r11,r11 492 493 addc r9,r7,r9 494 adde r10,r8,r10 495 addze r11,r11 496 497 $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 498 #sqr_add_c(a,2,c2,c3,c1); 499 $UMULL r7,r6,r6 500 $UMULH r8,r6,r6 501 502 addc r10,r7,r10 503 adde r11,r8,r11 504 addze r9,r0 505 #sqr_add_c2(a,3,1,c2,c3,c1); 506 $LD r6,`3*$BNSZ`(r4) 507 $UMULL r7,r5,r6 508 $UMULH r8,r5,r6 509 510 addc r10,r7,r10 511 adde r11,r8,r11 512 addze r9,r9 513 514 addc r10,r7,r10 515 adde r11,r8,r11 516 addze r9,r9 517 #sqr_add_c2(a,4,0,c2,c3,c1); 518 $LD r5,`0*$BNSZ`(r4) 519 $LD r6,`4*$BNSZ`(r4) 520 $UMULL r7,r5,r6 521 $UMULH r8,r5,r6 522 523 addc r10,r7,r10 524 adde r11,r8,r11 525 addze r9,r9 526 527 addc r10,r7,r10 528 adde r11,r8,r11 529 addze r9,r9 530 $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 531 #sqr_add_c2(a,5,0,c3,c1,c2); 532 $LD r6,`5*$BNSZ`(r4) 533 $UMULL r7,r5,r6 534 $UMULH r8,r5,r6 535 536 addc r11,r7,r11 537 adde r9,r8,r9 538 addze r10,r0 539 540 addc r11,r7,r11 541 adde r9,r8,r9 542 addze r10,r10 543 #sqr_add_c2(a,4,1,c3,c1,c2); 544 $LD r5,`1*$BNSZ`(r4) 545 $LD r6,`4*$BNSZ`(r4) 546 $UMULL r7,r5,r6 547 $UMULH r8,r5,r6 548 549 addc r11,r7,r11 550 adde r9,r8,r9 551 addze r10,r10 552 553 addc r11,r7,r11 554 adde r9,r8,r9 555 addze r10,r10 556 #sqr_add_c2(a,3,2,c3,c1,c2); 557 $LD r5,`2*$BNSZ`(r4) 558 $LD r6,`3*$BNSZ`(r4) 559 $UMULL r7,r5,r6 560 $UMULH r8,r5,r6 561 562 addc r11,r7,r11 563 adde r9,r8,r9 564 addze r10,r10 565 566 addc r11,r7,r11 567 adde r9,r8,r9 568 addze r10,r10 569 $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 570 #sqr_add_c(a,3,c1,c2,c3); 571 $UMULL r7,r6,r6 572 $UMULH r8,r6,r6 573 addc r9,r7,r9 574 adde r10,r8,r10 575 addze r11,r0 576 #sqr_add_c2(a,4,2,c1,c2,c3); 577 $LD r6,`4*$BNSZ`(r4) 578 $UMULL r7,r5,r6 579 $UMULH r8,r5,r6 580 581 addc r9,r7,r9 582 adde r10,r8,r10 583 addze r11,r11 584 585 addc r9,r7,r9 586 adde r10,r8,r10 587 addze r11,r11 588 #sqr_add_c2(a,5,1,c1,c2,c3); 589 $LD r5,`1*$BNSZ`(r4) 590 $LD r6,`5*$BNSZ`(r4) 591 $UMULL r7,r5,r6 592 $UMULH r8,r5,r6 593 594 addc r9,r7,r9 595 adde r10,r8,r10 596 addze r11,r11 597 598 addc r9,r7,r9 599 adde r10,r8,r10 600 addze r11,r11 601 #sqr_add_c2(a,6,0,c1,c2,c3); 602 $LD r5,`0*$BNSZ`(r4) 603 $LD r6,`6*$BNSZ`(r4) 604 $UMULL r7,r5,r6 605 $UMULH r8,r5,r6 606 addc r9,r7,r9 607 adde r10,r8,r10 608 addze r11,r11 609 addc r9,r7,r9 610 adde r10,r8,r10 611 addze r11,r11 612 $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 613 #sqr_add_c2(a,7,0,c2,c3,c1); 614 $LD r6,`7*$BNSZ`(r4) 615 $UMULL r7,r5,r6 616 $UMULH r8,r5,r6 617 618 addc r10,r7,r10 619 adde r11,r8,r11 620 addze r9,r0 621 addc r10,r7,r10 622 adde r11,r8,r11 623 addze r9,r9 624 #sqr_add_c2(a,6,1,c2,c3,c1); 625 $LD r5,`1*$BNSZ`(r4) 626 $LD r6,`6*$BNSZ`(r4) 627 $UMULL r7,r5,r6 628 $UMULH r8,r5,r6 629 630 addc r10,r7,r10 631 adde 
r11,r8,r11 632 addze r9,r9 633 addc r10,r7,r10 634 adde r11,r8,r11 635 addze r9,r9 636 #sqr_add_c2(a,5,2,c2,c3,c1); 637 $LD r5,`2*$BNSZ`(r4) 638 $LD r6,`5*$BNSZ`(r4) 639 $UMULL r7,r5,r6 640 $UMULH r8,r5,r6 641 addc r10,r7,r10 642 adde r11,r8,r11 643 addze r9,r9 644 addc r10,r7,r10 645 adde r11,r8,r11 646 addze r9,r9 647 #sqr_add_c2(a,4,3,c2,c3,c1); 648 $LD r5,`3*$BNSZ`(r4) 649 $LD r6,`4*$BNSZ`(r4) 650 $UMULL r7,r5,r6 651 $UMULH r8,r5,r6 652 653 addc r10,r7,r10 654 adde r11,r8,r11 655 addze r9,r9 656 addc r10,r7,r10 657 adde r11,r8,r11 658 addze r9,r9 659 $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 660 #sqr_add_c(a,4,c3,c1,c2); 661 $UMULL r7,r6,r6 662 $UMULH r8,r6,r6 663 addc r11,r7,r11 664 adde r9,r8,r9 665 addze r10,r0 666 #sqr_add_c2(a,5,3,c3,c1,c2); 667 $LD r6,`5*$BNSZ`(r4) 668 $UMULL r7,r5,r6 669 $UMULH r8,r5,r6 670 addc r11,r7,r11 671 adde r9,r8,r9 672 addze r10,r10 673 addc r11,r7,r11 674 adde r9,r8,r9 675 addze r10,r10 676 #sqr_add_c2(a,6,2,c3,c1,c2); 677 $LD r5,`2*$BNSZ`(r4) 678 $LD r6,`6*$BNSZ`(r4) 679 $UMULL r7,r5,r6 680 $UMULH r8,r5,r6 681 addc r11,r7,r11 682 adde r9,r8,r9 683 addze r10,r10 684 685 addc r11,r7,r11 686 adde r9,r8,r9 687 addze r10,r10 688 #sqr_add_c2(a,7,1,c3,c1,c2); 689 $LD r5,`1*$BNSZ`(r4) 690 $LD r6,`7*$BNSZ`(r4) 691 $UMULL r7,r5,r6 692 $UMULH r8,r5,r6 693 addc r11,r7,r11 694 adde r9,r8,r9 695 addze r10,r10 696 addc r11,r7,r11 697 adde r9,r8,r9 698 addze r10,r10 699 $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 700 #sqr_add_c2(a,7,2,c1,c2,c3); 701 $LD r5,`2*$BNSZ`(r4) 702 $UMULL r7,r5,r6 703 $UMULH r8,r5,r6 704 705 addc r9,r7,r9 706 adde r10,r8,r10 707 addze r11,r0 708 addc r9,r7,r9 709 adde r10,r8,r10 710 addze r11,r11 711 #sqr_add_c2(a,6,3,c1,c2,c3); 712 $LD r5,`3*$BNSZ`(r4) 713 $LD r6,`6*$BNSZ`(r4) 714 $UMULL r7,r5,r6 715 $UMULH r8,r5,r6 716 addc r9,r7,r9 717 adde r10,r8,r10 718 addze r11,r11 719 addc r9,r7,r9 720 adde r10,r8,r10 721 addze r11,r11 722 #sqr_add_c2(a,5,4,c1,c2,c3); 723 $LD r5,`4*$BNSZ`(r4) 724 $LD r6,`5*$BNSZ`(r4) 725 $UMULL r7,r5,r6 726 $UMULH r8,r5,r6 727 addc r9,r7,r9 728 adde r10,r8,r10 729 addze r11,r11 730 addc r9,r7,r9 731 adde r10,r8,r10 732 addze r11,r11 733 $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 734 #sqr_add_c(a,5,c2,c3,c1); 735 $UMULL r7,r6,r6 736 $UMULH r8,r6,r6 737 addc r10,r7,r10 738 adde r11,r8,r11 739 addze r9,r0 740 #sqr_add_c2(a,6,4,c2,c3,c1); 741 $LD r6,`6*$BNSZ`(r4) 742 $UMULL r7,r5,r6 743 $UMULH r8,r5,r6 744 addc r10,r7,r10 745 adde r11,r8,r11 746 addze r9,r9 747 addc r10,r7,r10 748 adde r11,r8,r11 749 addze r9,r9 750 #sqr_add_c2(a,7,3,c2,c3,c1); 751 $LD r5,`3*$BNSZ`(r4) 752 $LD r6,`7*$BNSZ`(r4) 753 $UMULL r7,r5,r6 754 $UMULH r8,r5,r6 755 addc r10,r7,r10 756 adde r11,r8,r11 757 addze r9,r9 758 addc r10,r7,r10 759 adde r11,r8,r11 760 addze r9,r9 761 $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 762 #sqr_add_c2(a,7,4,c3,c1,c2); 763 $LD r5,`4*$BNSZ`(r4) 764 $UMULL r7,r5,r6 765 $UMULH r8,r5,r6 766 addc r11,r7,r11 767 adde r9,r8,r9 768 addze r10,r0 769 addc r11,r7,r11 770 adde r9,r8,r9 771 addze r10,r10 772 #sqr_add_c2(a,6,5,c3,c1,c2); 773 $LD r5,`5*$BNSZ`(r4) 774 $LD r6,`6*$BNSZ`(r4) 775 $UMULL r7,r5,r6 776 $UMULH r8,r5,r6 777 addc r11,r7,r11 778 adde r9,r8,r9 779 addze r10,r10 780 addc r11,r7,r11 781 adde r9,r8,r9 782 addze r10,r10 783 $ST r11,`11*$BNSZ`(r3) #r[11]=c3; 784 #sqr_add_c(a,6,c1,c2,c3); 785 $UMULL r7,r6,r6 786 $UMULH r8,r6,r6 787 addc r9,r7,r9 788 adde r10,r8,r10 789 addze r11,r0 790 #sqr_add_c2(a,7,5,c1,c2,c3) 791 $LD r6,`7*$BNSZ`(r4) 792 $UMULL r7,r5,r6 793 $UMULH r8,r5,r6 794 addc r9,r7,r9 795 adde r10,r8,r10 796 addze r11,r11 797 addc r9,r7,r9 798 adde 
r10,r8,r10 799 addze r11,r11 800 $ST r9,`12*$BNSZ`(r3) #r[12]=c1; 801 802 #sqr_add_c2(a,7,6,c2,c3,c1) 803 $LD r5,`6*$BNSZ`(r4) 804 $UMULL r7,r5,r6 805 $UMULH r8,r5,r6 806 addc r10,r7,r10 807 adde r11,r8,r11 808 addze r9,r0 809 addc r10,r7,r10 810 adde r11,r8,r11 811 addze r9,r9 812 $ST r10,`13*$BNSZ`(r3) #r[13]=c2; 813 #sqr_add_c(a,7,c3,c1,c2); 814 $UMULL r7,r6,r6 815 $UMULH r8,r6,r6 816 addc r11,r7,r11 817 adde r9,r8,r9 818 $ST r11,`14*$BNSZ`(r3) #r[14]=c3; 819 $ST r9, `15*$BNSZ`(r3) #r[15]=c1; 820 821 822 blr 823 .long 0 824 .byte 0,12,0x14,0,0,0,2,0 825 .long 0 826.size .bn_sqr_comba8,.-.bn_sqr_comba8 827 828# 829# NOTE: The following label name should be changed to 830# "bn_mul_comba4" i.e. remove the first dot 831# for the gcc compiler. This should be automatically 832# done in the build 833# 834 835.align 4 836.bn_mul_comba4: 837# 838# This is an optimized version of the bn_mul_comba4 routine. 839# 840# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 841# r3 contains r 842# r4 contains a 843# r5 contains b 844# r6, r7 are the 2 BN_ULONGs being multiplied. 845# r8, r9 are the results of the 32x32 giving 64 multiply. 846# r10, r11, r12 are the equivalents of c1, c2, and c3. 847# 848 xor r0,r0,r0 #r0=0. Used in addze below. 849 #mul_add_c(a[0],b[0],c1,c2,c3); 850 $LD r6,`0*$BNSZ`(r4) 851 $LD r7,`0*$BNSZ`(r5) 852 $UMULL r10,r6,r7 853 $UMULH r11,r6,r7 854 $ST r10,`0*$BNSZ`(r3) #r[0]=c1 855 #mul_add_c(a[0],b[1],c2,c3,c1); 856 $LD r7,`1*$BNSZ`(r5) 857 $UMULL r8,r6,r7 858 $UMULH r9,r6,r7 859 addc r11,r8,r11 860 adde r12,r9,r0 861 addze r10,r0 862 #mul_add_c(a[1],b[0],c2,c3,c1); 863 $LD r6, `1*$BNSZ`(r4) 864 $LD r7, `0*$BNSZ`(r5) 865 $UMULL r8,r6,r7 866 $UMULH r9,r6,r7 867 addc r11,r8,r11 868 adde r12,r9,r12 869 addze r10,r10 870 $ST r11,`1*$BNSZ`(r3) #r[1]=c2 871 #mul_add_c(a[2],b[0],c3,c1,c2); 872 $LD r6,`2*$BNSZ`(r4) 873 $UMULL r8,r6,r7 874 $UMULH r9,r6,r7 875 addc r12,r8,r12 876 adde r10,r9,r10 877 addze r11,r0 878 #mul_add_c(a[1],b[1],c3,c1,c2); 879 $LD r6,`1*$BNSZ`(r4) 880 $LD r7,`1*$BNSZ`(r5) 881 $UMULL r8,r6,r7 882 $UMULH r9,r6,r7 883 addc r12,r8,r12 884 adde r10,r9,r10 885 addze r11,r11 886 #mul_add_c(a[0],b[2],c3,c1,c2); 887 $LD r6,`0*$BNSZ`(r4) 888 $LD r7,`2*$BNSZ`(r5) 889 $UMULL r8,r6,r7 890 $UMULH r9,r6,r7 891 addc r12,r8,r12 892 adde r10,r9,r10 893 addze r11,r11 894 $ST r12,`2*$BNSZ`(r3) #r[2]=c3 895 #mul_add_c(a[0],b[3],c1,c2,c3); 896 $LD r7,`3*$BNSZ`(r5) 897 $UMULL r8,r6,r7 898 $UMULH r9,r6,r7 899 addc r10,r8,r10 900 adde r11,r9,r11 901 addze r12,r0 902 #mul_add_c(a[1],b[2],c1,c2,c3); 903 $LD r6,`1*$BNSZ`(r4) 904 $LD r7,`2*$BNSZ`(r5) 905 $UMULL r8,r6,r7 906 $UMULH r9,r6,r7 907 addc r10,r8,r10 908 adde r11,r9,r11 909 addze r12,r12 910 #mul_add_c(a[2],b[1],c1,c2,c3); 911 $LD r6,`2*$BNSZ`(r4) 912 $LD r7,`1*$BNSZ`(r5) 913 $UMULL r8,r6,r7 914 $UMULH r9,r6,r7 915 addc r10,r8,r10 916 adde r11,r9,r11 917 addze r12,r12 918 #mul_add_c(a[3],b[0],c1,c2,c3); 919 $LD r6,`3*$BNSZ`(r4) 920 $LD r7,`0*$BNSZ`(r5) 921 $UMULL r8,r6,r7 922 $UMULH r9,r6,r7 923 addc r10,r8,r10 924 adde r11,r9,r11 925 addze r12,r12 926 $ST r10,`3*$BNSZ`(r3) #r[3]=c1 927 #mul_add_c(a[3],b[1],c2,c3,c1); 928 $LD r7,`1*$BNSZ`(r5) 929 $UMULL r8,r6,r7 930 $UMULH r9,r6,r7 931 addc r11,r8,r11 932 adde r12,r9,r12 933 addze r10,r0 934 #mul_add_c(a[2],b[2],c2,c3,c1); 935 $LD r6,`2*$BNSZ`(r4) 936 $LD r7,`2*$BNSZ`(r5) 937 $UMULL r8,r6,r7 938 $UMULH r9,r6,r7 939 addc r11,r8,r11 940 adde r12,r9,r12 941 addze r10,r10 942 #mul_add_c(a[1],b[3],c2,c3,c1); 943 $LD r6,`1*$BNSZ`(r4) 944 $LD r7,`3*$BNSZ`(r5) 945 $UMULL r8,r6,r7 
946 $UMULH r9,r6,r7 947 addc r11,r8,r11 948 adde r12,r9,r12 949 addze r10,r10 950 $ST r11,`4*$BNSZ`(r3) #r[4]=c2 951 #mul_add_c(a[2],b[3],c3,c1,c2); 952 $LD r6,`2*$BNSZ`(r4) 953 $UMULL r8,r6,r7 954 $UMULH r9,r6,r7 955 addc r12,r8,r12 956 adde r10,r9,r10 957 addze r11,r0 958 #mul_add_c(a[3],b[2],c3,c1,c2); 959 $LD r6,`3*$BNSZ`(r4) 960 $LD r7,`2*$BNSZ`(r5) 961 $UMULL r8,r6,r7 962 $UMULH r9,r6,r7 963 addc r12,r8,r12 964 adde r10,r9,r10 965 addze r11,r11 966 $ST r12,`5*$BNSZ`(r3) #r[5]=c3 967 #mul_add_c(a[3],b[3],c1,c2,c3); 968 $LD r7,`3*$BNSZ`(r5) 969 $UMULL r8,r6,r7 970 $UMULH r9,r6,r7 971 addc r10,r8,r10 972 adde r11,r9,r11 973 974 $ST r10,`6*$BNSZ`(r3) #r[6]=c1 975 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 976 blr 977 .long 0 978 .byte 0,12,0x14,0,0,0,3,0 979 .long 0 980.size .bn_mul_comba4,.-.bn_mul_comba4 981 982# 983# NOTE: The following label name should be changed to 984# "bn_mul_comba8" i.e. remove the first dot 985# for the gcc compiler. This should be automatically 986# done in the build 987# 988 989.align 4 990.bn_mul_comba8: 991# 992# Optimized version of the bn_mul_comba8 routine. 993# 994# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 995# r3 contains r 996# r4 contains a 997# r5 contains b 998# r6, r7 are the 2 BN_ULONGs being multiplied. 999# r8, r9 are the results of the 32x32 giving 64 multiply. 1000# r10, r11, r12 are the equivalents of c1, c2, and c3. 1001# 1002 xor r0,r0,r0 #r0=0. Used in addze below. 1003 1004 #mul_add_c(a[0],b[0],c1,c2,c3); 1005 $LD r6,`0*$BNSZ`(r4) #a[0] 1006 $LD r7,`0*$BNSZ`(r5) #b[0] 1007 $UMULL r10,r6,r7 1008 $UMULH r11,r6,r7 1009 $ST r10,`0*$BNSZ`(r3) #r[0]=c1; 1010 #mul_add_c(a[0],b[1],c2,c3,c1); 1011 $LD r7,`1*$BNSZ`(r5) 1012 $UMULL r8,r6,r7 1013 $UMULH r9,r6,r7 1014 addc r11,r11,r8 1015 addze r12,r9 # since we didn't set r12 to zero before. 
1016 addze r10,r0 1017 #mul_add_c(a[1],b[0],c2,c3,c1); 1018 $LD r6,`1*$BNSZ`(r4) 1019 $LD r7,`0*$BNSZ`(r5) 1020 $UMULL r8,r6,r7 1021 $UMULH r9,r6,r7 1022 addc r11,r11,r8 1023 adde r12,r12,r9 1024 addze r10,r10 1025 $ST r11,`1*$BNSZ`(r3) #r[1]=c2; 1026 #mul_add_c(a[2],b[0],c3,c1,c2); 1027 $LD r6,`2*$BNSZ`(r4) 1028 $UMULL r8,r6,r7 1029 $UMULH r9,r6,r7 1030 addc r12,r12,r8 1031 adde r10,r10,r9 1032 addze r11,r0 1033 #mul_add_c(a[1],b[1],c3,c1,c2); 1034 $LD r6,`1*$BNSZ`(r4) 1035 $LD r7,`1*$BNSZ`(r5) 1036 $UMULL r8,r6,r7 1037 $UMULH r9,r6,r7 1038 addc r12,r12,r8 1039 adde r10,r10,r9 1040 addze r11,r11 1041 #mul_add_c(a[0],b[2],c3,c1,c2); 1042 $LD r6,`0*$BNSZ`(r4) 1043 $LD r7,`2*$BNSZ`(r5) 1044 $UMULL r8,r6,r7 1045 $UMULH r9,r6,r7 1046 addc r12,r12,r8 1047 adde r10,r10,r9 1048 addze r11,r11 1049 $ST r12,`2*$BNSZ`(r3) #r[2]=c3; 1050 #mul_add_c(a[0],b[3],c1,c2,c3); 1051 $LD r7,`3*$BNSZ`(r5) 1052 $UMULL r8,r6,r7 1053 $UMULH r9,r6,r7 1054 addc r10,r10,r8 1055 adde r11,r11,r9 1056 addze r12,r0 1057 #mul_add_c(a[1],b[2],c1,c2,c3); 1058 $LD r6,`1*$BNSZ`(r4) 1059 $LD r7,`2*$BNSZ`(r5) 1060 $UMULL r8,r6,r7 1061 $UMULH r9,r6,r7 1062 addc r10,r10,r8 1063 adde r11,r11,r9 1064 addze r12,r12 1065 1066 #mul_add_c(a[2],b[1],c1,c2,c3); 1067 $LD r6,`2*$BNSZ`(r4) 1068 $LD r7,`1*$BNSZ`(r5) 1069 $UMULL r8,r6,r7 1070 $UMULH r9,r6,r7 1071 addc r10,r10,r8 1072 adde r11,r11,r9 1073 addze r12,r12 1074 #mul_add_c(a[3],b[0],c1,c2,c3); 1075 $LD r6,`3*$BNSZ`(r4) 1076 $LD r7,`0*$BNSZ`(r5) 1077 $UMULL r8,r6,r7 1078 $UMULH r9,r6,r7 1079 addc r10,r10,r8 1080 adde r11,r11,r9 1081 addze r12,r12 1082 $ST r10,`3*$BNSZ`(r3) #r[3]=c1; 1083 #mul_add_c(a[4],b[0],c2,c3,c1); 1084 $LD r6,`4*$BNSZ`(r4) 1085 $UMULL r8,r6,r7 1086 $UMULH r9,r6,r7 1087 addc r11,r11,r8 1088 adde r12,r12,r9 1089 addze r10,r0 1090 #mul_add_c(a[3],b[1],c2,c3,c1); 1091 $LD r6,`3*$BNSZ`(r4) 1092 $LD r7,`1*$BNSZ`(r5) 1093 $UMULL r8,r6,r7 1094 $UMULH r9,r6,r7 1095 addc r11,r11,r8 1096 adde r12,r12,r9 1097 addze r10,r10 1098 #mul_add_c(a[2],b[2],c2,c3,c1); 1099 $LD r6,`2*$BNSZ`(r4) 1100 $LD r7,`2*$BNSZ`(r5) 1101 $UMULL r8,r6,r7 1102 $UMULH r9,r6,r7 1103 addc r11,r11,r8 1104 adde r12,r12,r9 1105 addze r10,r10 1106 #mul_add_c(a[1],b[3],c2,c3,c1); 1107 $LD r6,`1*$BNSZ`(r4) 1108 $LD r7,`3*$BNSZ`(r5) 1109 $UMULL r8,r6,r7 1110 $UMULH r9,r6,r7 1111 addc r11,r11,r8 1112 adde r12,r12,r9 1113 addze r10,r10 1114 #mul_add_c(a[0],b[4],c2,c3,c1); 1115 $LD r6,`0*$BNSZ`(r4) 1116 $LD r7,`4*$BNSZ`(r5) 1117 $UMULL r8,r6,r7 1118 $UMULH r9,r6,r7 1119 addc r11,r11,r8 1120 adde r12,r12,r9 1121 addze r10,r10 1122 $ST r11,`4*$BNSZ`(r3) #r[4]=c2; 1123 #mul_add_c(a[0],b[5],c3,c1,c2); 1124 $LD r7,`5*$BNSZ`(r5) 1125 $UMULL r8,r6,r7 1126 $UMULH r9,r6,r7 1127 addc r12,r12,r8 1128 adde r10,r10,r9 1129 addze r11,r0 1130 #mul_add_c(a[1],b[4],c3,c1,c2); 1131 $LD r6,`1*$BNSZ`(r4) 1132 $LD r7,`4*$BNSZ`(r5) 1133 $UMULL r8,r6,r7 1134 $UMULH r9,r6,r7 1135 addc r12,r12,r8 1136 adde r10,r10,r9 1137 addze r11,r11 1138 #mul_add_c(a[2],b[3],c3,c1,c2); 1139 $LD r6,`2*$BNSZ`(r4) 1140 $LD r7,`3*$BNSZ`(r5) 1141 $UMULL r8,r6,r7 1142 $UMULH r9,r6,r7 1143 addc r12,r12,r8 1144 adde r10,r10,r9 1145 addze r11,r11 1146 #mul_add_c(a[3],b[2],c3,c1,c2); 1147 $LD r6,`3*$BNSZ`(r4) 1148 $LD r7,`2*$BNSZ`(r5) 1149 $UMULL r8,r6,r7 1150 $UMULH r9,r6,r7 1151 addc r12,r12,r8 1152 adde r10,r10,r9 1153 addze r11,r11 1154 #mul_add_c(a[4],b[1],c3,c1,c2); 1155 $LD r6,`4*$BNSZ`(r4) 1156 $LD r7,`1*$BNSZ`(r5) 1157 $UMULL r8,r6,r7 1158 $UMULH r9,r6,r7 1159 addc r12,r12,r8 1160 adde r10,r10,r9 1161 addze r11,r11 1162 
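#
# For reference: each mul_add_c(a[i],b[j],...) block in bn_mul_comba4 and
# bn_mul_comba8 is one column-accumulation step.  A rough C sketch of its
# effect is shown below, illustration only, mirroring the portable macro
# in bn_asm.c; types are as in the earlier sketch.
#
#	/* mul_add_c(a,b,c1,c2,c3): (c3:c2:c1) += a*b */
#	static void mul_add_c(BN_ULONG a, BN_ULONG b,
#			      BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
#	{
#		dword_t  t  = (dword_t)a * b;
#		BN_ULONG lo = (BN_ULONG)t;
#		BN_ULONG hi = (BN_ULONG)(t >> BN_BITS2);
#
#		*c1 += lo;			/* addc                          */
#		hi  += (*c1 < lo);		/* cannot wrap: hi <= 0xFFFFFFFE */
#		*c2 += hi;			/* adde                          */
#		*c3 += (*c2 < hi);		/* addze                         */
#	}
#
# Unlike the squaring code there is no doubling here, so only two carry
# catches per step are needed, and a[i]*b[j] and a[j]*b[i] appear as two
# separate steps.
#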
#mul_add_c(a[5],b[0],c3,c1,c2); 1163 $LD r6,`5*$BNSZ`(r4) 1164 $LD r7,`0*$BNSZ`(r5) 1165 $UMULL r8,r6,r7 1166 $UMULH r9,r6,r7 1167 addc r12,r12,r8 1168 adde r10,r10,r9 1169 addze r11,r11 1170 $ST r12,`5*$BNSZ`(r3) #r[5]=c3; 1171 #mul_add_c(a[6],b[0],c1,c2,c3); 1172 $LD r6,`6*$BNSZ`(r4) 1173 $UMULL r8,r6,r7 1174 $UMULH r9,r6,r7 1175 addc r10,r10,r8 1176 adde r11,r11,r9 1177 addze r12,r0 1178 #mul_add_c(a[5],b[1],c1,c2,c3); 1179 $LD r6,`5*$BNSZ`(r4) 1180 $LD r7,`1*$BNSZ`(r5) 1181 $UMULL r8,r6,r7 1182 $UMULH r9,r6,r7 1183 addc r10,r10,r8 1184 adde r11,r11,r9 1185 addze r12,r12 1186 #mul_add_c(a[4],b[2],c1,c2,c3); 1187 $LD r6,`4*$BNSZ`(r4) 1188 $LD r7,`2*$BNSZ`(r5) 1189 $UMULL r8,r6,r7 1190 $UMULH r9,r6,r7 1191 addc r10,r10,r8 1192 adde r11,r11,r9 1193 addze r12,r12 1194 #mul_add_c(a[3],b[3],c1,c2,c3); 1195 $LD r6,`3*$BNSZ`(r4) 1196 $LD r7,`3*$BNSZ`(r5) 1197 $UMULL r8,r6,r7 1198 $UMULH r9,r6,r7 1199 addc r10,r10,r8 1200 adde r11,r11,r9 1201 addze r12,r12 1202 #mul_add_c(a[2],b[4],c1,c2,c3); 1203 $LD r6,`2*$BNSZ`(r4) 1204 $LD r7,`4*$BNSZ`(r5) 1205 $UMULL r8,r6,r7 1206 $UMULH r9,r6,r7 1207 addc r10,r10,r8 1208 adde r11,r11,r9 1209 addze r12,r12 1210 #mul_add_c(a[1],b[5],c1,c2,c3); 1211 $LD r6,`1*$BNSZ`(r4) 1212 $LD r7,`5*$BNSZ`(r5) 1213 $UMULL r8,r6,r7 1214 $UMULH r9,r6,r7 1215 addc r10,r10,r8 1216 adde r11,r11,r9 1217 addze r12,r12 1218 #mul_add_c(a[0],b[6],c1,c2,c3); 1219 $LD r6,`0*$BNSZ`(r4) 1220 $LD r7,`6*$BNSZ`(r5) 1221 $UMULL r8,r6,r7 1222 $UMULH r9,r6,r7 1223 addc r10,r10,r8 1224 adde r11,r11,r9 1225 addze r12,r12 1226 $ST r10,`6*$BNSZ`(r3) #r[6]=c1; 1227 #mul_add_c(a[0],b[7],c2,c3,c1); 1228 $LD r7,`7*$BNSZ`(r5) 1229 $UMULL r8,r6,r7 1230 $UMULH r9,r6,r7 1231 addc r11,r11,r8 1232 adde r12,r12,r9 1233 addze r10,r0 1234 #mul_add_c(a[1],b[6],c2,c3,c1); 1235 $LD r6,`1*$BNSZ`(r4) 1236 $LD r7,`6*$BNSZ`(r5) 1237 $UMULL r8,r6,r7 1238 $UMULH r9,r6,r7 1239 addc r11,r11,r8 1240 adde r12,r12,r9 1241 addze r10,r10 1242 #mul_add_c(a[2],b[5],c2,c3,c1); 1243 $LD r6,`2*$BNSZ`(r4) 1244 $LD r7,`5*$BNSZ`(r5) 1245 $UMULL r8,r6,r7 1246 $UMULH r9,r6,r7 1247 addc r11,r11,r8 1248 adde r12,r12,r9 1249 addze r10,r10 1250 #mul_add_c(a[3],b[4],c2,c3,c1); 1251 $LD r6,`3*$BNSZ`(r4) 1252 $LD r7,`4*$BNSZ`(r5) 1253 $UMULL r8,r6,r7 1254 $UMULH r9,r6,r7 1255 addc r11,r11,r8 1256 adde r12,r12,r9 1257 addze r10,r10 1258 #mul_add_c(a[4],b[3],c2,c3,c1); 1259 $LD r6,`4*$BNSZ`(r4) 1260 $LD r7,`3*$BNSZ`(r5) 1261 $UMULL r8,r6,r7 1262 $UMULH r9,r6,r7 1263 addc r11,r11,r8 1264 adde r12,r12,r9 1265 addze r10,r10 1266 #mul_add_c(a[5],b[2],c2,c3,c1); 1267 $LD r6,`5*$BNSZ`(r4) 1268 $LD r7,`2*$BNSZ`(r5) 1269 $UMULL r8,r6,r7 1270 $UMULH r9,r6,r7 1271 addc r11,r11,r8 1272 adde r12,r12,r9 1273 addze r10,r10 1274 #mul_add_c(a[6],b[1],c2,c3,c1); 1275 $LD r6,`6*$BNSZ`(r4) 1276 $LD r7,`1*$BNSZ`(r5) 1277 $UMULL r8,r6,r7 1278 $UMULH r9,r6,r7 1279 addc r11,r11,r8 1280 adde r12,r12,r9 1281 addze r10,r10 1282 #mul_add_c(a[7],b[0],c2,c3,c1); 1283 $LD r6,`7*$BNSZ`(r4) 1284 $LD r7,`0*$BNSZ`(r5) 1285 $UMULL r8,r6,r7 1286 $UMULH r9,r6,r7 1287 addc r11,r11,r8 1288 adde r12,r12,r9 1289 addze r10,r10 1290 $ST r11,`7*$BNSZ`(r3) #r[7]=c2; 1291 #mul_add_c(a[7],b[1],c3,c1,c2); 1292 $LD r7,`1*$BNSZ`(r5) 1293 $UMULL r8,r6,r7 1294 $UMULH r9,r6,r7 1295 addc r12,r12,r8 1296 adde r10,r10,r9 1297 addze r11,r0 1298 #mul_add_c(a[6],b[2],c3,c1,c2); 1299 $LD r6,`6*$BNSZ`(r4) 1300 $LD r7,`2*$BNSZ`(r5) 1301 $UMULL r8,r6,r7 1302 $UMULH r9,r6,r7 1303 addc r12,r12,r8 1304 adde r10,r10,r9 1305 addze r11,r11 1306 #mul_add_c(a[5],b[3],c3,c1,c2); 1307 $LD r6,`5*$BNSZ`(r4) 1308 $LD 
r7,`3*$BNSZ`(r5) 1309 $UMULL r8,r6,r7 1310 $UMULH r9,r6,r7 1311 addc r12,r12,r8 1312 adde r10,r10,r9 1313 addze r11,r11 1314 #mul_add_c(a[4],b[4],c3,c1,c2); 1315 $LD r6,`4*$BNSZ`(r4) 1316 $LD r7,`4*$BNSZ`(r5) 1317 $UMULL r8,r6,r7 1318 $UMULH r9,r6,r7 1319 addc r12,r12,r8 1320 adde r10,r10,r9 1321 addze r11,r11 1322 #mul_add_c(a[3],b[5],c3,c1,c2); 1323 $LD r6,`3*$BNSZ`(r4) 1324 $LD r7,`5*$BNSZ`(r5) 1325 $UMULL r8,r6,r7 1326 $UMULH r9,r6,r7 1327 addc r12,r12,r8 1328 adde r10,r10,r9 1329 addze r11,r11 1330 #mul_add_c(a[2],b[6],c3,c1,c2); 1331 $LD r6,`2*$BNSZ`(r4) 1332 $LD r7,`6*$BNSZ`(r5) 1333 $UMULL r8,r6,r7 1334 $UMULH r9,r6,r7 1335 addc r12,r12,r8 1336 adde r10,r10,r9 1337 addze r11,r11 1338 #mul_add_c(a[1],b[7],c3,c1,c2); 1339 $LD r6,`1*$BNSZ`(r4) 1340 $LD r7,`7*$BNSZ`(r5) 1341 $UMULL r8,r6,r7 1342 $UMULH r9,r6,r7 1343 addc r12,r12,r8 1344 adde r10,r10,r9 1345 addze r11,r11 1346 $ST r12,`8*$BNSZ`(r3) #r[8]=c3; 1347 #mul_add_c(a[2],b[7],c1,c2,c3); 1348 $LD r6,`2*$BNSZ`(r4) 1349 $UMULL r8,r6,r7 1350 $UMULH r9,r6,r7 1351 addc r10,r10,r8 1352 adde r11,r11,r9 1353 addze r12,r0 1354 #mul_add_c(a[3],b[6],c1,c2,c3); 1355 $LD r6,`3*$BNSZ`(r4) 1356 $LD r7,`6*$BNSZ`(r5) 1357 $UMULL r8,r6,r7 1358 $UMULH r9,r6,r7 1359 addc r10,r10,r8 1360 adde r11,r11,r9 1361 addze r12,r12 1362 #mul_add_c(a[4],b[5],c1,c2,c3); 1363 $LD r6,`4*$BNSZ`(r4) 1364 $LD r7,`5*$BNSZ`(r5) 1365 $UMULL r8,r6,r7 1366 $UMULH r9,r6,r7 1367 addc r10,r10,r8 1368 adde r11,r11,r9 1369 addze r12,r12 1370 #mul_add_c(a[5],b[4],c1,c2,c3); 1371 $LD r6,`5*$BNSZ`(r4) 1372 $LD r7,`4*$BNSZ`(r5) 1373 $UMULL r8,r6,r7 1374 $UMULH r9,r6,r7 1375 addc r10,r10,r8 1376 adde r11,r11,r9 1377 addze r12,r12 1378 #mul_add_c(a[6],b[3],c1,c2,c3); 1379 $LD r6,`6*$BNSZ`(r4) 1380 $LD r7,`3*$BNSZ`(r5) 1381 $UMULL r8,r6,r7 1382 $UMULH r9,r6,r7 1383 addc r10,r10,r8 1384 adde r11,r11,r9 1385 addze r12,r12 1386 #mul_add_c(a[7],b[2],c1,c2,c3); 1387 $LD r6,`7*$BNSZ`(r4) 1388 $LD r7,`2*$BNSZ`(r5) 1389 $UMULL r8,r6,r7 1390 $UMULH r9,r6,r7 1391 addc r10,r10,r8 1392 adde r11,r11,r9 1393 addze r12,r12 1394 $ST r10,`9*$BNSZ`(r3) #r[9]=c1; 1395 #mul_add_c(a[7],b[3],c2,c3,c1); 1396 $LD r7,`3*$BNSZ`(r5) 1397 $UMULL r8,r6,r7 1398 $UMULH r9,r6,r7 1399 addc r11,r11,r8 1400 adde r12,r12,r9 1401 addze r10,r0 1402 #mul_add_c(a[6],b[4],c2,c3,c1); 1403 $LD r6,`6*$BNSZ`(r4) 1404 $LD r7,`4*$BNSZ`(r5) 1405 $UMULL r8,r6,r7 1406 $UMULH r9,r6,r7 1407 addc r11,r11,r8 1408 adde r12,r12,r9 1409 addze r10,r10 1410 #mul_add_c(a[5],b[5],c2,c3,c1); 1411 $LD r6,`5*$BNSZ`(r4) 1412 $LD r7,`5*$BNSZ`(r5) 1413 $UMULL r8,r6,r7 1414 $UMULH r9,r6,r7 1415 addc r11,r11,r8 1416 adde r12,r12,r9 1417 addze r10,r10 1418 #mul_add_c(a[4],b[6],c2,c3,c1); 1419 $LD r6,`4*$BNSZ`(r4) 1420 $LD r7,`6*$BNSZ`(r5) 1421 $UMULL r8,r6,r7 1422 $UMULH r9,r6,r7 1423 addc r11,r11,r8 1424 adde r12,r12,r9 1425 addze r10,r10 1426 #mul_add_c(a[3],b[7],c2,c3,c1); 1427 $LD r6,`3*$BNSZ`(r4) 1428 $LD r7,`7*$BNSZ`(r5) 1429 $UMULL r8,r6,r7 1430 $UMULH r9,r6,r7 1431 addc r11,r11,r8 1432 adde r12,r12,r9 1433 addze r10,r10 1434 $ST r11,`10*$BNSZ`(r3) #r[10]=c2; 1435 #mul_add_c(a[4],b[7],c3,c1,c2); 1436 $LD r6,`4*$BNSZ`(r4) 1437 $UMULL r8,r6,r7 1438 $UMULH r9,r6,r7 1439 addc r12,r12,r8 1440 adde r10,r10,r9 1441 addze r11,r0 1442 #mul_add_c(a[5],b[6],c3,c1,c2); 1443 $LD r6,`5*$BNSZ`(r4) 1444 $LD r7,`6*$BNSZ`(r5) 1445 $UMULL r8,r6,r7 1446 $UMULH r9,r6,r7 1447 addc r12,r12,r8 1448 adde r10,r10,r9 1449 addze r11,r11 1450 #mul_add_c(a[6],b[5],c3,c1,c2); 1451 $LD r6,`6*$BNSZ`(r4) 1452 $LD r7,`5*$BNSZ`(r5) 1453 $UMULL r8,r6,r7 1454 $UMULH r9,r6,r7 1455 
addc r12,r12,r8 1456 adde r10,r10,r9 1457 addze r11,r11 1458 #mul_add_c(a[7],b[4],c3,c1,c2); 1459 $LD r6,`7*$BNSZ`(r4) 1460 $LD r7,`4*$BNSZ`(r5) 1461 $UMULL r8,r6,r7 1462 $UMULH r9,r6,r7 1463 addc r12,r12,r8 1464 adde r10,r10,r9 1465 addze r11,r11 1466 $ST r12,`11*$BNSZ`(r3) #r[11]=c3; 1467 #mul_add_c(a[7],b[5],c1,c2,c3); 1468 $LD r7,`5*$BNSZ`(r5) 1469 $UMULL r8,r6,r7 1470 $UMULH r9,r6,r7 1471 addc r10,r10,r8 1472 adde r11,r11,r9 1473 addze r12,r0 1474 #mul_add_c(a[6],b[6],c1,c2,c3); 1475 $LD r6,`6*$BNSZ`(r4) 1476 $LD r7,`6*$BNSZ`(r5) 1477 $UMULL r8,r6,r7 1478 $UMULH r9,r6,r7 1479 addc r10,r10,r8 1480 adde r11,r11,r9 1481 addze r12,r12 1482 #mul_add_c(a[5],b[7],c1,c2,c3); 1483 $LD r6,`5*$BNSZ`(r4) 1484 $LD r7,`7*$BNSZ`(r5) 1485 $UMULL r8,r6,r7 1486 $UMULH r9,r6,r7 1487 addc r10,r10,r8 1488 adde r11,r11,r9 1489 addze r12,r12 1490 $ST r10,`12*$BNSZ`(r3) #r[12]=c1; 1491 #mul_add_c(a[6],b[7],c2,c3,c1); 1492 $LD r6,`6*$BNSZ`(r4) 1493 $UMULL r8,r6,r7 1494 $UMULH r9,r6,r7 1495 addc r11,r11,r8 1496 adde r12,r12,r9 1497 addze r10,r0 1498 #mul_add_c(a[7],b[6],c2,c3,c1); 1499 $LD r6,`7*$BNSZ`(r4) 1500 $LD r7,`6*$BNSZ`(r5) 1501 $UMULL r8,r6,r7 1502 $UMULH r9,r6,r7 1503 addc r11,r11,r8 1504 adde r12,r12,r9 1505 addze r10,r10 1506 $ST r11,`13*$BNSZ`(r3) #r[13]=c2; 1507 #mul_add_c(a[7],b[7],c3,c1,c2); 1508 $LD r7,`7*$BNSZ`(r5) 1509 $UMULL r8,r6,r7 1510 $UMULH r9,r6,r7 1511 addc r12,r12,r8 1512 adde r10,r10,r9 1513 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; 1514 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; 1515 blr 1516 .long 0 1517 .byte 0,12,0x14,0,0,0,3,0 1518 .long 0 1519.size .bn_mul_comba8,.-.bn_mul_comba8 1520 1521# 1522# NOTE: The following label name should be changed to 1523# "bn_sub_words" i.e. remove the first dot 1524# for the gcc compiler. This should be automatically 1525# done in the build 1526# 1527# 1528.align 4 1529.bn_sub_words: 1530# 1531# Handcoded version of bn_sub_words 1532# 1533#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1534# 1535# r3 = r 1536# r4 = a 1537# r5 = b 1538# r6 = n 1539# 1540# Note: No loop unrolling done since this is not a performance 1541# critical loop. 1542 1543 xor r0,r0,r0 #set r0 = 0 1544# 1545# check for r6 = 0 AND set carry bit. 1546# 1547 subfc. r7,r0,r6 # If r6 is 0 then result is 0. 1548 # if r6 > 0 then result !=0 1549 # In either case carry bit is set. 1550 beq Lppcasm_sub_adios 1551 addi r4,r4,-$BNSZ 1552 addi r3,r3,-$BNSZ 1553 addi r5,r5,-$BNSZ 1554 mtctr r6 1555Lppcasm_sub_mainloop: 1556 $LDU r7,$BNSZ(r4) 1557 $LDU r8,$BNSZ(r5) 1558 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) 1559 # if carry = 1 this is r7-r8. Else it 1560 # is r7-r8 -1 as we need. 1561 $STU r6,$BNSZ(r3) 1562 bdnz Lppcasm_sub_mainloop 1563Lppcasm_sub_adios: 1564 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 1565 andi. r3,r3,1 # keep only last bit. 1566 blr 1567 .long 0 1568 .byte 0,12,0x14,0,0,0,4,0 1569 .long 0 1570.size .bn_sub_words,.-.bn_sub_words 1571 1572# 1573# NOTE: The following label name should be changed to 1574# "bn_add_words" i.e. remove the first dot 1575# for the gcc compiler. This should be automatically 1576# done in the build 1577# 1578 1579.align 4 1580.bn_add_words: 1581# 1582# Handcoded version of bn_add_words 1583# 1584#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1585# 1586# r3 = r 1587# r4 = a 1588# r5 = b 1589# r6 = n 1590# 1591# Note: No loop unrolling done since this is not a performance 1592# critical loop. 1593 1594 xor r0,r0,r0 1595# 1596# check for r6 = 0. Is this needed? 1597# 1598 addic. 
r6,r6,0 #test r6 and clear carry bit. 1599 beq Lppcasm_add_adios 1600 addi r4,r4,-$BNSZ 1601 addi r3,r3,-$BNSZ 1602 addi r5,r5,-$BNSZ 1603 mtctr r6 1604Lppcasm_add_mainloop: 1605 $LDU r7,$BNSZ(r4) 1606 $LDU r8,$BNSZ(r5) 1607 adde r8,r7,r8 1608 $STU r8,$BNSZ(r3) 1609 bdnz Lppcasm_add_mainloop 1610Lppcasm_add_adios: 1611 addze r3,r0 #return carry bit. 1612 blr 1613 .long 0 1614 .byte 0,12,0x14,0,0,0,4,0 1615 .long 0 1616.size .bn_add_words,.-.bn_add_words 1617 1618# 1619# NOTE: The following label name should be changed to 1620# "bn_div_words" i.e. remove the first dot 1621# for the gcc compiler. This should be automatically 1622# done in the build 1623# 1624 1625.align 4 1626.bn_div_words: 1627# 1628# This is a cleaned up version of code generated by 1629# the AIX compiler. The only optimization is to use 1630# the PPC instruction to count leading zeros instead 1631# of call to num_bits_word. Since this was compiled 1632# only at level -O2 we can possibly squeeze it more? 1633# 1634# r3 = h 1635# r4 = l 1636# r5 = d 1637 1638 $UCMPI 0,r5,0 # compare r5 and 0 1639 bne Lppcasm_div1 # proceed if d!=0 1640 li r3,-1 # d=0 return -1 1641 blr 1642Lppcasm_div1: 1643 xor r0,r0,r0 #r0=0 1644 li r8,$BITS 1645 $CNTLZ. r7,r5 #r7 = num leading 0s in d. 1646 beq Lppcasm_div2 #proceed if no leading zeros 1647 subf r8,r7,r8 #r8 = BN_num_bits_word(d) 1648 $SHR. r9,r3,r8 #are there any bits above r8'th? 1649 $TR 16,r9,r0 #if there're, signal to dump core... 1650Lppcasm_div2: 1651 $UCMP 0,r3,r5 #h>=d? 1652 blt Lppcasm_div3 #goto Lppcasm_div3 if not 1653 subf r3,r5,r3 #h-=d ; 1654Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i 1655 cmpi 0,0,r7,0 # is (i == 0)? 1656 beq Lppcasm_div4 1657 $SHL r3,r3,r7 # h = (h<< i) 1658 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) 1659 $SHL r5,r5,r7 # d<<=i 1660 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) 1661 $SHL r4,r4,r7 # l <<=i 1662Lppcasm_div4: 1663 $SHRI r9,r5,`$BITS/2` # r9 = dh 1664 # dl will be computed when needed 1665 # as it saves registers. 1666 li r6,2 #r6=2 1667 mtctr r6 #counter will be in count. 1668Lppcasm_divouterloop: 1669 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) 1670 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 1671 # compute here for innerloop. 1672 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh 1673 bne Lppcasm_div5 # goto Lppcasm_div5 if not 1674 1675 li r8,-1 1676 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l 1677 b Lppcasm_div6 1678Lppcasm_div5: 1679 $UDIV r8,r3,r9 #q = h/dh 1680Lppcasm_div6: 1681 $UMULL r12,r9,r8 #th = q*dh 1682 $CLRU r10,r5,`$BITS/2` #r10=dl 1683 $UMULL r6,r8,r10 #tl = q*dl 1684 1685Lppcasm_divinnerloop: 1686 subf r10,r12,r3 #t = h -th 1687 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... 1688 addic. r7,r7,0 #test if r7 == 0. used below. 1689 # now want to compute 1690 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) 1691 # the following 2 instructions do that 1692 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) 1693 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) 1694 $UCMP cr1,r6,r7 # compare (tl <= r7) 1695 bne Lppcasm_divinnerexit 1696 ble cr1,Lppcasm_divinnerexit 1697 addi r8,r8,-1 #q-- 1698 subf r12,r9,r12 #th -=dh 1699 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. 
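#
# For reference: bn_div_words divides the two-word value (h,l) by d and
# returns a one-word quotient.  After normalising d (shift left until its
# top bit is set), each of the two passes of the outer loop produces one
# BN_BITS4-bit quotient digit: it estimates the digit from the top halves
# and then corrects it downwards a few times at most.  A rough C sketch of
# that estimate/correct step is given below, modelled on the portable
# bn_div_words in bn_asm.c; the local names are illustrative only.
#
#	/* dh = top half of d, dl = bottom half of d, d already normalised */
#	if ((h >> BN_BITS4) == dh)
#		q = BN_MASK2l;			/* estimate saturates         */
#	else
#		q = h / dh;			/* one-word estimate          */
#	th = q * dh;
#	tl = q * dl;
#	for (;;) {
#		t = h - th;			/* partial remainder so far   */
#		if ((t & BN_MASK2h) ||
#		    tl <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))
#			break;			/* q is not too large         */
#		q--;				/* otherwise back off by one  */
#		th -= dh;
#		tl -= dl;
#	}
#	/* ...then q*d is subtracted from (h,l), the next half word of l is
#	   shifted in, and q is accumulated into the returned quotient.     */
#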
1700 subf r6,r10,r6 #tl -=dl 1701 b Lppcasm_divinnerloop 1702Lppcasm_divinnerexit: 1703 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) 1704 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; 1705 $UCMP cr1,r4,r11 # compare l and tl 1706 add r12,r12,r10 # th+=t 1707 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 1708 addi r12,r12,1 # th++ 1709Lppcasm_div7: 1710 subf r11,r11,r4 #r11=l-tl 1711 $UCMP cr1,r3,r12 #compare h and th 1712 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 1713 addi r8,r8,-1 # q-- 1714 add r3,r5,r3 # h+=d 1715Lppcasm_div8: 1716 subf r12,r12,r3 #r12 = h-th 1717 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 1718 # want to compute 1719 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 1720 # the following 2 instructions will do this. 1721 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. 1722 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 1723 bdz Lppcasm_div9 #if (count==0) break ; 1724 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 1725 b Lppcasm_divouterloop 1726Lppcasm_div9: 1727 or r3,r8,r0 1728 blr 1729 .long 0 1730 .byte 0,12,0x14,0,0,0,3,0 1731 .long 0 1732.size .bn_div_words,.-.bn_div_words 1733 1734# 1735# NOTE: The following label name should be changed to 1736# "bn_sqr_words" i.e. remove the first dot 1737# for the gcc compiler. This should be automatically 1738# done in the build 1739# 1740.align 4 1741.bn_sqr_words: 1742# 1743# Optimized version of bn_sqr_words 1744# 1745# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) 1746# 1747# r3 = r 1748# r4 = a 1749# r5 = n 1750# 1751# r6 = a[i]. 1752# r7,r8 = product. 1753# 1754# No unrolling done here. Not performance critical. 1755 1756 addic. r5,r5,0 #test r5. 1757 beq Lppcasm_sqr_adios 1758 addi r4,r4,-$BNSZ 1759 addi r3,r3,-$BNSZ 1760 mtctr r5 1761Lppcasm_sqr_mainloop: 1762 #sqr(r[0],r[1],a[0]); 1763 $LDU r6,$BNSZ(r4) 1764 $UMULL r7,r6,r6 1765 $UMULH r8,r6,r6 1766 $STU r7,$BNSZ(r3) 1767 $STU r8,$BNSZ(r3) 1768 bdnz Lppcasm_sqr_mainloop 1769Lppcasm_sqr_adios: 1770 blr 1771 .long 0 1772 .byte 0,12,0x14,0,0,0,3,0 1773 .long 0 1774.size .bn_sqr_words,.-.bn_sqr_words 1775 1776# 1777# NOTE: The following label name should be changed to 1778# "bn_mul_words" i.e. remove the first dot 1779# for the gcc compiler. This should be automatically 1780# done in the build 1781# 1782 1783.align 4 1784.bn_mul_words: 1785# 1786# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1787# 1788# r3 = rp 1789# r4 = ap 1790# r5 = num 1791# r6 = w 1792 xor r0,r0,r0 1793 xor r12,r12,r12 # used for carry 1794 rlwinm. r7,r5,30,2,31 # num >> 2 1795 beq Lppcasm_mw_REM 1796 mtctr r7 1797Lppcasm_mw_LOOP: 1798 #mul(rp[0],ap[0],w,c1); 1799 $LD r8,`0*$BNSZ`(r4) 1800 $UMULL r9,r6,r8 1801 $UMULH r10,r6,r8 1802 addc r9,r9,r12 1803 #addze r10,r10 #carry is NOT ignored. 1804 #will be taken care of 1805 #in second spin below 1806 #using adde. 
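#
# For reference: each unrolled step of bn_mul_words computes one word of
# rp[] and passes the high half of the product, plus the carry, on to the
# next step; instead of finishing the carry with addze every time, the
# carry alternates between r10 and r12 and is folded in by the next step's
# adde.  A rough C sketch of the per-word operation is shown below,
# mirroring the mul() macro of the portable code; names are illustrative.
#
#	/* mul(r, a, w, c):  r = low(a*w + c),  c = high(a*w + c) */
#	static void mul_word(BN_ULONG *r, BN_ULONG a, BN_ULONG w, BN_ULONG *c)
#	{
#		dword_t t = (dword_t)a * w + *c;	/* adding the carry cannot    */
#		*r = (BN_ULONG)t;			/* overflow: high(a*w) is at  */
#		*c = (BN_ULONG)(t >> BN_BITS2);		/* most 0xFFFFFFFE            */
#	}
#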
1807 $ST r9,`0*$BNSZ`(r3) 1808 #mul(rp[1],ap[1],w,c1); 1809 $LD r8,`1*$BNSZ`(r4) 1810 $UMULL r11,r6,r8 1811 $UMULH r12,r6,r8 1812 adde r11,r11,r10 1813 #addze r12,r12 1814 $ST r11,`1*$BNSZ`(r3) 1815 #mul(rp[2],ap[2],w,c1); 1816 $LD r8,`2*$BNSZ`(r4) 1817 $UMULL r9,r6,r8 1818 $UMULH r10,r6,r8 1819 adde r9,r9,r12 1820 #addze r10,r10 1821 $ST r9,`2*$BNSZ`(r3) 1822 #mul_add(rp[3],ap[3],w,c1); 1823 $LD r8,`3*$BNSZ`(r4) 1824 $UMULL r11,r6,r8 1825 $UMULH r12,r6,r8 1826 adde r11,r11,r10 1827 addze r12,r12 #this spin we collect carry into 1828 #r12 1829 $ST r11,`3*$BNSZ`(r3) 1830 1831 addi r3,r3,`4*$BNSZ` 1832 addi r4,r4,`4*$BNSZ` 1833 bdnz Lppcasm_mw_LOOP 1834 1835Lppcasm_mw_REM: 1836 andi. r5,r5,0x3 1837 beq Lppcasm_mw_OVER 1838 #mul(rp[0],ap[0],w,c1); 1839 $LD r8,`0*$BNSZ`(r4) 1840 $UMULL r9,r6,r8 1841 $UMULH r10,r6,r8 1842 addc r9,r9,r12 1843 addze r10,r10 1844 $ST r9,`0*$BNSZ`(r3) 1845 addi r12,r10,0 1846 1847 addi r5,r5,-1 1848 cmpli 0,0,r5,0 1849 beq Lppcasm_mw_OVER 1850 1851 1852 #mul(rp[1],ap[1],w,c1); 1853 $LD r8,`1*$BNSZ`(r4) 1854 $UMULL r9,r6,r8 1855 $UMULH r10,r6,r8 1856 addc r9,r9,r12 1857 addze r10,r10 1858 $ST r9,`1*$BNSZ`(r3) 1859 addi r12,r10,0 1860 1861 addi r5,r5,-1 1862 cmpli 0,0,r5,0 1863 beq Lppcasm_mw_OVER 1864 1865 #mul_add(rp[2],ap[2],w,c1); 1866 $LD r8,`2*$BNSZ`(r4) 1867 $UMULL r9,r6,r8 1868 $UMULH r10,r6,r8 1869 addc r9,r9,r12 1870 addze r10,r10 1871 $ST r9,`2*$BNSZ`(r3) 1872 addi r12,r10,0 1873 1874Lppcasm_mw_OVER: 1875 addi r3,r12,0 1876 blr 1877 .long 0 1878 .byte 0,12,0x14,0,0,0,4,0 1879 .long 0 1880.size .bn_mul_words,.-.bn_mul_words 1881 1882# 1883# NOTE: The following label name should be changed to 1884# "bn_mul_add_words" i.e. remove the first dot 1885# for the gcc compiler. This should be automatically 1886# done in the build 1887# 1888 1889.align 4 1890.bn_mul_add_words: 1891# 1892# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1893# 1894# r3 = rp 1895# r4 = ap 1896# r5 = num 1897# r6 = w 1898# 1899# empirical evidence suggests that unrolled version performs best!! 1900# 1901 xor r0,r0,r0 #r0 = 0 1902 xor r12,r12,r12 #r12 = 0 . used for carry 1903 rlwinm. r7,r5,30,2,31 # num >> 2 1904 beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover 1905 mtctr r7 1906Lppcasm_maw_mainloop: 1907 #mul_add(rp[0],ap[0],w,c1); 1908 $LD r8,`0*$BNSZ`(r4) 1909 $LD r11,`0*$BNSZ`(r3) 1910 $UMULL r9,r6,r8 1911 $UMULH r10,r6,r8 1912 addc r9,r9,r12 #r12 is carry. 1913 addze r10,r10 1914 addc r9,r9,r11 1915 #addze r10,r10 1916 #the above instruction addze 1917 #is NOT needed. Carry will NOT 1918 #be ignored. It's not affected 1919 #by multiply and will be collected 1920 #in the next spin 1921 $ST r9,`0*$BNSZ`(r3) 1922 1923 #mul_add(rp[1],ap[1],w,c1); 1924 $LD r8,`1*$BNSZ`(r4) 1925 $LD r9,`1*$BNSZ`(r3) 1926 $UMULL r11,r6,r8 1927 $UMULH r12,r6,r8 1928 adde r11,r11,r10 #r10 is carry. 
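#
# For reference: bn_mul_add_words differs from bn_mul_words in that the
# existing contents of rp[i] are added in as well, which is why each step
# above needs two separate carry catches (one for the incoming carry, one
# for the old rp[i]).  A rough C sketch of the per-word operation is shown
# below, mirroring the mul_add() macro of the portable code; names are
# illustrative.
#
#	/* mul_add(r, a, w, c):  r = low(r + a*w + c),  c = high(r + a*w + c) */
#	static void mul_add_word(BN_ULONG *r, BN_ULONG a, BN_ULONG w, BN_ULONG *c)
#	{
#		dword_t t = (dword_t)a * w + *c;	/* a*w + carry fits in the   */
#		t += *r;				/* double-width type, and so */
#		*r = (BN_ULONG)t;			/* does adding the old r     */
#		*c = (BN_ULONG)(t >> BN_BITS2);
#	}
#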
1929 addze r12,r12 1930 addc r11,r11,r9 1931 #addze r12,r12 1932 $ST r11,`1*$BNSZ`(r3) 1933 1934 #mul_add(rp[2],ap[2],w,c1); 1935 $LD r8,`2*$BNSZ`(r4) 1936 $UMULL r9,r6,r8 1937 $LD r11,`2*$BNSZ`(r3) 1938 $UMULH r10,r6,r8 1939 adde r9,r9,r12 1940 addze r10,r10 1941 addc r9,r9,r11 1942 #addze r10,r10 1943 $ST r9,`2*$BNSZ`(r3) 1944 1945 #mul_add(rp[3],ap[3],w,c1); 1946 $LD r8,`3*$BNSZ`(r4) 1947 $UMULL r11,r6,r8 1948 $LD r9,`3*$BNSZ`(r3) 1949 $UMULH r12,r6,r8 1950 adde r11,r11,r10 1951 addze r12,r12 1952 addc r11,r11,r9 1953 addze r12,r12 1954 $ST r11,`3*$BNSZ`(r3) 1955 addi r3,r3,`4*$BNSZ` 1956 addi r4,r4,`4*$BNSZ` 1957 bdnz Lppcasm_maw_mainloop 1958 1959Lppcasm_maw_leftover: 1960 andi. r5,r5,0x3 1961 beq Lppcasm_maw_adios 1962 addi r3,r3,-$BNSZ 1963 addi r4,r4,-$BNSZ 1964 #mul_add(rp[0],ap[0],w,c1); 1965 mtctr r5 1966 $LDU r8,$BNSZ(r4) 1967 $UMULL r9,r6,r8 1968 $UMULH r10,r6,r8 1969 $LDU r11,$BNSZ(r3) 1970 addc r9,r9,r11 1971 addze r10,r10 1972 addc r9,r9,r12 1973 addze r12,r10 1974 $ST r9,0(r3) 1975 1976 bdz Lppcasm_maw_adios 1977 #mul_add(rp[1],ap[1],w,c1); 1978 $LDU r8,$BNSZ(r4) 1979 $UMULL r9,r6,r8 1980 $UMULH r10,r6,r8 1981 $LDU r11,$BNSZ(r3) 1982 addc r9,r9,r11 1983 addze r10,r10 1984 addc r9,r9,r12 1985 addze r12,r10 1986 $ST r9,0(r3) 1987 1988 bdz Lppcasm_maw_adios 1989 #mul_add(rp[2],ap[2],w,c1); 1990 $LDU r8,$BNSZ(r4) 1991 $UMULL r9,r6,r8 1992 $UMULH r10,r6,r8 1993 $LDU r11,$BNSZ(r3) 1994 addc r9,r9,r11 1995 addze r10,r10 1996 addc r9,r9,r12 1997 addze r12,r10 1998 $ST r9,0(r3) 1999 2000Lppcasm_maw_adios: 2001 addi r3,r12,0 2002 blr 2003 .long 0 2004 .byte 0,12,0x14,0,0,0,4,0 2005 .long 0 2006.size .bn_mul_add_words,.-.bn_mul_add_words 2007 .align 4 2008EOF 2009$data =~ s/\`([^\`]*)\`/eval $1/gem; 2010print $data; 2011close STDOUT; 2012
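# Note on the post-processing above: the substitution evaluates every
# backticked Perl expression embedded in the assembly text before it is
# piped through ppc-xlate.pl.  For example, with a 32-bit flavour ($BNSZ is
# 4 and $LD is "lwz"), a source line such as
#
#	$LD	r5,`1*$BNSZ`(r4)
#
# is emitted as
#
#	lwz	r5,4(r4)
#
# The script is normally invoked as "perl ppc.pl <flavour> <output.s>"; the
# exact flavour strings (e.g. linux32, aix64) come from the surrounding
# OpenSSL build system and are only mentioned here as an illustration.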