#! /usr/bin/env perl
# Copyright 2004-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#       MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
#       The following is the performance of 32-bit compiler
#       generated code:
#
#       OpenSSL 0.9.6c 21 dec 2001
#       built on: Tue Jun 11 11:06:51 EDT 2002
#       options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#       Number of operations increases by almost 75%
#
#       Here are performance numbers for 64-bit compiler
#       generated code:
#
#       OpenSSL 0.9.6g [engine] 9 Aug 2002
#       built on: Fri Apr 18 16:59:20 EDT 2003
#       options:bn(64,64) ...
#       compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#       Again, performance increases by about 75%
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#       Performance increase of ~60%
#       Based on submission from Suresh N. Chari of IBM

$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)	Andy Polyakov
##
#	1. Initial version	10/20/02		Suresh Chari
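#
#	As a rough guide to the comba routines that follow: each
#	mul_add_c(a,b,c1,c2,c3) step quoted in the comments multiplies
#	one pair of words and folds the double-width product into a
#	three-word column accumulator.  Illustrative C only, assuming
#	a double-width BN_ULLONG type; this sketch is not part of the
#	generated code:
#
#		t  = (BN_ULLONG)a * b;
#		lo = (BN_ULONG)t;  hi = (BN_ULONG)(t >> BN_BITS2);
#		c1 += lo;  hi += (c1 < lo);	/* carry out of low word   */
#		c2 += hi;  c3 += (c2 < hi);	/* ripple into c2, then c3 */
#
#	sqr_add_c2() covers the symmetric cross terms of a square,
#	i.e. the product is accumulated twice.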
#
#
#	The following file works for the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines
#
#		bn_sqr_comba4
#		bn_sqr_comba8
#		bn_mul_comba4
#		bn_mul_comba8
#		bn_sub_words
#		bn_add_words
#		bn_div_words
#		bn_sqr_words
#		bn_mul_words
#		bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"
	.text

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
# Here's the assembly
#
#
	xor		r0,r0,r0		# set r0 = 0. Used in the addze
						# instructions below

						#sqr_add_c(a,0,c1,c2,c3)
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5
	$UMULH		r10,r5,r5		#in first iteration. No need
						#to add since c1=c2=c3=0.
						# Note c3(r11) is NOT set to 0
						# but will be.

	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde		r8,r8,r8
	addze		r9,r0			# catch carry if any.
						# r9= r0(=0) and carry

	addc		r10,r7,r10		# now add to temp result.
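						# the carry from the addc above
						# ripples into c3 (r11, seeded
						# below from the doubled high
						# half) and then back into c1 (r9)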
	addze		r11,r8			# r11 (c3) = r8 + carry
	addze		r9,r9

	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
						#sqr_add_c(a,1,c3,c1,c2)
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r0
						#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r10

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
						#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r0

	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
						#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD		r5,`1*$BNSZ`(r4)
	$LD		r6,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r11,r11
	addc		r9,r7,r9
	adde		r10,r8,r10
	addze		r11,r11
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
						#sqr_add_c(a,2,c2,c3,c1);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r0
						#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD		r6,`3*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r9,r9

	addc		r10,r7,r10
	adde		r11,r8,r11
	addze		r9,r9
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
						#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD		r5,`2*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6
	addc		r7,r7,r7
	adde		r8,r8,r8
	addze		r10,r0

	addc		r11,r7,r11
	adde		r9,r8,r9
	addze		r10,r10
	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
						#sqr_add_c(a,3,c1,c2,c3);
	$UMULL		r7,r6,r6
	$UMULH		r8,r6,r6
	addc		r9,r7,r9
	adde		r10,r8,r10

	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	.bn_sqr_comba4,.-.bn_sqr_comba4

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#

	xor		r0,r0,r0		#set r0 = 0. Used in addze
						#instructions below.

						#sqr_add_c(a,0,c1,c2,c3);
	$LD		r5,`0*$BNSZ`(r4)
	$UMULL		r9,r5,r5		#1st iteration:	no carries.
	$UMULH		r10,r5,r5
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
						#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD		r6,`1*$BNSZ`(r4)
	$UMULL		r7,r5,r6
	$UMULH		r8,r5,r6

	addc		r10,r7,r10		#add the two register number
	adde		r11,r8,r0		# (r8,r7) to the three register
	addze		r9,r0			# number (r9,r11,r10). NOTE: r0=0

	addc		r10,r7,r10		#add the two register number
	adde		r11,r8,r11		# (r8,r7) to the three register
	addze		r9,r9			# number (r9,r11,r10).
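						# note: unlike bn_sqr_comba4,
						# this routine does not double the
						# cross product first; it simply
						# accumulates (r7,r8) twice, as above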
449 450 $ST r10,`1*$BNSZ`(r3) # r[1]=c2 451 452 #sqr_add_c(a,1,c3,c1,c2); 453 $UMULL r7,r6,r6 454 $UMULH r8,r6,r6 455 addc r11,r7,r11 456 adde r9,r8,r9 457 addze r10,r0 458 #sqr_add_c2(a,2,0,c3,c1,c2); 459 $LD r6,`2*$BNSZ`(r4) 460 $UMULL r7,r5,r6 461 $UMULH r8,r5,r6 462 463 addc r11,r7,r11 464 adde r9,r8,r9 465 addze r10,r10 466 467 addc r11,r7,r11 468 adde r9,r8,r9 469 addze r10,r10 470 471 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 472 #sqr_add_c2(a,3,0,c1,c2,c3); 473 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 474 $UMULL r7,r5,r6 475 $UMULH r8,r5,r6 476 477 addc r9,r7,r9 478 adde r10,r8,r10 479 addze r11,r0 480 481 addc r9,r7,r9 482 adde r10,r8,r10 483 addze r11,r11 484 #sqr_add_c2(a,2,1,c1,c2,c3); 485 $LD r5,`1*$BNSZ`(r4) 486 $LD r6,`2*$BNSZ`(r4) 487 $UMULL r7,r5,r6 488 $UMULH r8,r5,r6 489 490 addc r9,r7,r9 491 adde r10,r8,r10 492 addze r11,r11 493 494 addc r9,r7,r9 495 adde r10,r8,r10 496 addze r11,r11 497 498 $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 499 #sqr_add_c(a,2,c2,c3,c1); 500 $UMULL r7,r6,r6 501 $UMULH r8,r6,r6 502 503 addc r10,r7,r10 504 adde r11,r8,r11 505 addze r9,r0 506 #sqr_add_c2(a,3,1,c2,c3,c1); 507 $LD r6,`3*$BNSZ`(r4) 508 $UMULL r7,r5,r6 509 $UMULH r8,r5,r6 510 511 addc r10,r7,r10 512 adde r11,r8,r11 513 addze r9,r9 514 515 addc r10,r7,r10 516 adde r11,r8,r11 517 addze r9,r9 518 #sqr_add_c2(a,4,0,c2,c3,c1); 519 $LD r5,`0*$BNSZ`(r4) 520 $LD r6,`4*$BNSZ`(r4) 521 $UMULL r7,r5,r6 522 $UMULH r8,r5,r6 523 524 addc r10,r7,r10 525 adde r11,r8,r11 526 addze r9,r9 527 528 addc r10,r7,r10 529 adde r11,r8,r11 530 addze r9,r9 531 $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 532 #sqr_add_c2(a,5,0,c3,c1,c2); 533 $LD r6,`5*$BNSZ`(r4) 534 $UMULL r7,r5,r6 535 $UMULH r8,r5,r6 536 537 addc r11,r7,r11 538 adde r9,r8,r9 539 addze r10,r0 540 541 addc r11,r7,r11 542 adde r9,r8,r9 543 addze r10,r10 544 #sqr_add_c2(a,4,1,c3,c1,c2); 545 $LD r5,`1*$BNSZ`(r4) 546 $LD r6,`4*$BNSZ`(r4) 547 $UMULL r7,r5,r6 548 $UMULH r8,r5,r6 549 550 addc r11,r7,r11 551 adde r9,r8,r9 552 addze r10,r10 553 554 addc r11,r7,r11 555 adde r9,r8,r9 556 addze r10,r10 557 #sqr_add_c2(a,3,2,c3,c1,c2); 558 $LD r5,`2*$BNSZ`(r4) 559 $LD r6,`3*$BNSZ`(r4) 560 $UMULL r7,r5,r6 561 $UMULH r8,r5,r6 562 563 addc r11,r7,r11 564 adde r9,r8,r9 565 addze r10,r10 566 567 addc r11,r7,r11 568 adde r9,r8,r9 569 addze r10,r10 570 $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 571 #sqr_add_c(a,3,c1,c2,c3); 572 $UMULL r7,r6,r6 573 $UMULH r8,r6,r6 574 addc r9,r7,r9 575 adde r10,r8,r10 576 addze r11,r0 577 #sqr_add_c2(a,4,2,c1,c2,c3); 578 $LD r6,`4*$BNSZ`(r4) 579 $UMULL r7,r5,r6 580 $UMULH r8,r5,r6 581 582 addc r9,r7,r9 583 adde r10,r8,r10 584 addze r11,r11 585 586 addc r9,r7,r9 587 adde r10,r8,r10 588 addze r11,r11 589 #sqr_add_c2(a,5,1,c1,c2,c3); 590 $LD r5,`1*$BNSZ`(r4) 591 $LD r6,`5*$BNSZ`(r4) 592 $UMULL r7,r5,r6 593 $UMULH r8,r5,r6 594 595 addc r9,r7,r9 596 adde r10,r8,r10 597 addze r11,r11 598 599 addc r9,r7,r9 600 adde r10,r8,r10 601 addze r11,r11 602 #sqr_add_c2(a,6,0,c1,c2,c3); 603 $LD r5,`0*$BNSZ`(r4) 604 $LD r6,`6*$BNSZ`(r4) 605 $UMULL r7,r5,r6 606 $UMULH r8,r5,r6 607 addc r9,r7,r9 608 adde r10,r8,r10 609 addze r11,r11 610 addc r9,r7,r9 611 adde r10,r8,r10 612 addze r11,r11 613 $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 614 #sqr_add_c2(a,7,0,c2,c3,c1); 615 $LD r6,`7*$BNSZ`(r4) 616 $UMULL r7,r5,r6 617 $UMULH r8,r5,r6 618 619 addc r10,r7,r10 620 adde r11,r8,r11 621 addze r9,r0 622 addc r10,r7,r10 623 adde r11,r8,r11 624 addze r9,r9 625 #sqr_add_c2(a,6,1,c2,c3,c1); 626 $LD r5,`1*$BNSZ`(r4) 627 $LD r6,`6*$BNSZ`(r4) 628 $UMULL r7,r5,r6 629 $UMULH r8,r5,r6 630 631 addc r10,r7,r10 632 adde 
r11,r8,r11 633 addze r9,r9 634 addc r10,r7,r10 635 adde r11,r8,r11 636 addze r9,r9 637 #sqr_add_c2(a,5,2,c2,c3,c1); 638 $LD r5,`2*$BNSZ`(r4) 639 $LD r6,`5*$BNSZ`(r4) 640 $UMULL r7,r5,r6 641 $UMULH r8,r5,r6 642 addc r10,r7,r10 643 adde r11,r8,r11 644 addze r9,r9 645 addc r10,r7,r10 646 adde r11,r8,r11 647 addze r9,r9 648 #sqr_add_c2(a,4,3,c2,c3,c1); 649 $LD r5,`3*$BNSZ`(r4) 650 $LD r6,`4*$BNSZ`(r4) 651 $UMULL r7,r5,r6 652 $UMULH r8,r5,r6 653 654 addc r10,r7,r10 655 adde r11,r8,r11 656 addze r9,r9 657 addc r10,r7,r10 658 adde r11,r8,r11 659 addze r9,r9 660 $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 661 #sqr_add_c(a,4,c3,c1,c2); 662 $UMULL r7,r6,r6 663 $UMULH r8,r6,r6 664 addc r11,r7,r11 665 adde r9,r8,r9 666 addze r10,r0 667 #sqr_add_c2(a,5,3,c3,c1,c2); 668 $LD r6,`5*$BNSZ`(r4) 669 $UMULL r7,r5,r6 670 $UMULH r8,r5,r6 671 addc r11,r7,r11 672 adde r9,r8,r9 673 addze r10,r10 674 addc r11,r7,r11 675 adde r9,r8,r9 676 addze r10,r10 677 #sqr_add_c2(a,6,2,c3,c1,c2); 678 $LD r5,`2*$BNSZ`(r4) 679 $LD r6,`6*$BNSZ`(r4) 680 $UMULL r7,r5,r6 681 $UMULH r8,r5,r6 682 addc r11,r7,r11 683 adde r9,r8,r9 684 addze r10,r10 685 686 addc r11,r7,r11 687 adde r9,r8,r9 688 addze r10,r10 689 #sqr_add_c2(a,7,1,c3,c1,c2); 690 $LD r5,`1*$BNSZ`(r4) 691 $LD r6,`7*$BNSZ`(r4) 692 $UMULL r7,r5,r6 693 $UMULH r8,r5,r6 694 addc r11,r7,r11 695 adde r9,r8,r9 696 addze r10,r10 697 addc r11,r7,r11 698 adde r9,r8,r9 699 addze r10,r10 700 $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 701 #sqr_add_c2(a,7,2,c1,c2,c3); 702 $LD r5,`2*$BNSZ`(r4) 703 $UMULL r7,r5,r6 704 $UMULH r8,r5,r6 705 706 addc r9,r7,r9 707 adde r10,r8,r10 708 addze r11,r0 709 addc r9,r7,r9 710 adde r10,r8,r10 711 addze r11,r11 712 #sqr_add_c2(a,6,3,c1,c2,c3); 713 $LD r5,`3*$BNSZ`(r4) 714 $LD r6,`6*$BNSZ`(r4) 715 $UMULL r7,r5,r6 716 $UMULH r8,r5,r6 717 addc r9,r7,r9 718 adde r10,r8,r10 719 addze r11,r11 720 addc r9,r7,r9 721 adde r10,r8,r10 722 addze r11,r11 723 #sqr_add_c2(a,5,4,c1,c2,c3); 724 $LD r5,`4*$BNSZ`(r4) 725 $LD r6,`5*$BNSZ`(r4) 726 $UMULL r7,r5,r6 727 $UMULH r8,r5,r6 728 addc r9,r7,r9 729 adde r10,r8,r10 730 addze r11,r11 731 addc r9,r7,r9 732 adde r10,r8,r10 733 addze r11,r11 734 $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 735 #sqr_add_c(a,5,c2,c3,c1); 736 $UMULL r7,r6,r6 737 $UMULH r8,r6,r6 738 addc r10,r7,r10 739 adde r11,r8,r11 740 addze r9,r0 741 #sqr_add_c2(a,6,4,c2,c3,c1); 742 $LD r6,`6*$BNSZ`(r4) 743 $UMULL r7,r5,r6 744 $UMULH r8,r5,r6 745 addc r10,r7,r10 746 adde r11,r8,r11 747 addze r9,r9 748 addc r10,r7,r10 749 adde r11,r8,r11 750 addze r9,r9 751 #sqr_add_c2(a,7,3,c2,c3,c1); 752 $LD r5,`3*$BNSZ`(r4) 753 $LD r6,`7*$BNSZ`(r4) 754 $UMULL r7,r5,r6 755 $UMULH r8,r5,r6 756 addc r10,r7,r10 757 adde r11,r8,r11 758 addze r9,r9 759 addc r10,r7,r10 760 adde r11,r8,r11 761 addze r9,r9 762 $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 763 #sqr_add_c2(a,7,4,c3,c1,c2); 764 $LD r5,`4*$BNSZ`(r4) 765 $UMULL r7,r5,r6 766 $UMULH r8,r5,r6 767 addc r11,r7,r11 768 adde r9,r8,r9 769 addze r10,r0 770 addc r11,r7,r11 771 adde r9,r8,r9 772 addze r10,r10 773 #sqr_add_c2(a,6,5,c3,c1,c2); 774 $LD r5,`5*$BNSZ`(r4) 775 $LD r6,`6*$BNSZ`(r4) 776 $UMULL r7,r5,r6 777 $UMULH r8,r5,r6 778 addc r11,r7,r11 779 adde r9,r8,r9 780 addze r10,r10 781 addc r11,r7,r11 782 adde r9,r8,r9 783 addze r10,r10 784 $ST r11,`11*$BNSZ`(r3) #r[11]=c3; 785 #sqr_add_c(a,6,c1,c2,c3); 786 $UMULL r7,r6,r6 787 $UMULH r8,r6,r6 788 addc r9,r7,r9 789 adde r10,r8,r10 790 addze r11,r0 791 #sqr_add_c2(a,7,5,c1,c2,c3) 792 $LD r6,`7*$BNSZ`(r4) 793 $UMULL r7,r5,r6 794 $UMULH r8,r5,r6 795 addc r9,r7,r9 796 adde r10,r8,r10 797 addze r11,r11 798 addc r9,r7,r9 799 adde 
r10,r8,r10 800 addze r11,r11 801 $ST r9,`12*$BNSZ`(r3) #r[12]=c1; 802 803 #sqr_add_c2(a,7,6,c2,c3,c1) 804 $LD r5,`6*$BNSZ`(r4) 805 $UMULL r7,r5,r6 806 $UMULH r8,r5,r6 807 addc r10,r7,r10 808 adde r11,r8,r11 809 addze r9,r0 810 addc r10,r7,r10 811 adde r11,r8,r11 812 addze r9,r9 813 $ST r10,`13*$BNSZ`(r3) #r[13]=c2; 814 #sqr_add_c(a,7,c3,c1,c2); 815 $UMULL r7,r6,r6 816 $UMULH r8,r6,r6 817 addc r11,r7,r11 818 adde r9,r8,r9 819 $ST r11,`14*$BNSZ`(r3) #r[14]=c3; 820 $ST r9, `15*$BNSZ`(r3) #r[15]=c1; 821 822 823 blr 824 .long 0 825 .byte 0,12,0x14,0,0,0,2,0 826 .long 0 827.size .bn_sqr_comba8,.-.bn_sqr_comba8 828 829# 830# NOTE: The following label name should be changed to 831# "bn_mul_comba4" i.e. remove the first dot 832# for the gcc compiler. This should be automatically 833# done in the build 834# 835 836.align 4 837.bn_mul_comba4: 838# 839# This is an optimized version of the bn_mul_comba4 routine. 840# 841# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 842# r3 contains r 843# r4 contains a 844# r5 contains b 845# r6, r7 are the 2 BN_ULONGs being multiplied. 846# r8, r9 are the results of the 32x32 giving 64 multiply. 847# r10, r11, r12 are the equivalents of c1, c2, and c3. 848# 849 xor r0,r0,r0 #r0=0. Used in addze below. 850 #mul_add_c(a[0],b[0],c1,c2,c3); 851 $LD r6,`0*$BNSZ`(r4) 852 $LD r7,`0*$BNSZ`(r5) 853 $UMULL r10,r6,r7 854 $UMULH r11,r6,r7 855 $ST r10,`0*$BNSZ`(r3) #r[0]=c1 856 #mul_add_c(a[0],b[1],c2,c3,c1); 857 $LD r7,`1*$BNSZ`(r5) 858 $UMULL r8,r6,r7 859 $UMULH r9,r6,r7 860 addc r11,r8,r11 861 adde r12,r9,r0 862 addze r10,r0 863 #mul_add_c(a[1],b[0],c2,c3,c1); 864 $LD r6, `1*$BNSZ`(r4) 865 $LD r7, `0*$BNSZ`(r5) 866 $UMULL r8,r6,r7 867 $UMULH r9,r6,r7 868 addc r11,r8,r11 869 adde r12,r9,r12 870 addze r10,r10 871 $ST r11,`1*$BNSZ`(r3) #r[1]=c2 872 #mul_add_c(a[2],b[0],c3,c1,c2); 873 $LD r6,`2*$BNSZ`(r4) 874 $UMULL r8,r6,r7 875 $UMULH r9,r6,r7 876 addc r12,r8,r12 877 adde r10,r9,r10 878 addze r11,r0 879 #mul_add_c(a[1],b[1],c3,c1,c2); 880 $LD r6,`1*$BNSZ`(r4) 881 $LD r7,`1*$BNSZ`(r5) 882 $UMULL r8,r6,r7 883 $UMULH r9,r6,r7 884 addc r12,r8,r12 885 adde r10,r9,r10 886 addze r11,r11 887 #mul_add_c(a[0],b[2],c3,c1,c2); 888 $LD r6,`0*$BNSZ`(r4) 889 $LD r7,`2*$BNSZ`(r5) 890 $UMULL r8,r6,r7 891 $UMULH r9,r6,r7 892 addc r12,r8,r12 893 adde r10,r9,r10 894 addze r11,r11 895 $ST r12,`2*$BNSZ`(r3) #r[2]=c3 896 #mul_add_c(a[0],b[3],c1,c2,c3); 897 $LD r7,`3*$BNSZ`(r5) 898 $UMULL r8,r6,r7 899 $UMULH r9,r6,r7 900 addc r10,r8,r10 901 adde r11,r9,r11 902 addze r12,r0 903 #mul_add_c(a[1],b[2],c1,c2,c3); 904 $LD r6,`1*$BNSZ`(r4) 905 $LD r7,`2*$BNSZ`(r5) 906 $UMULL r8,r6,r7 907 $UMULH r9,r6,r7 908 addc r10,r8,r10 909 adde r11,r9,r11 910 addze r12,r12 911 #mul_add_c(a[2],b[1],c1,c2,c3); 912 $LD r6,`2*$BNSZ`(r4) 913 $LD r7,`1*$BNSZ`(r5) 914 $UMULL r8,r6,r7 915 $UMULH r9,r6,r7 916 addc r10,r8,r10 917 adde r11,r9,r11 918 addze r12,r12 919 #mul_add_c(a[3],b[0],c1,c2,c3); 920 $LD r6,`3*$BNSZ`(r4) 921 $LD r7,`0*$BNSZ`(r5) 922 $UMULL r8,r6,r7 923 $UMULH r9,r6,r7 924 addc r10,r8,r10 925 adde r11,r9,r11 926 addze r12,r12 927 $ST r10,`3*$BNSZ`(r3) #r[3]=c1 928 #mul_add_c(a[3],b[1],c2,c3,c1); 929 $LD r7,`1*$BNSZ`(r5) 930 $UMULL r8,r6,r7 931 $UMULH r9,r6,r7 932 addc r11,r8,r11 933 adde r12,r9,r12 934 addze r10,r0 935 #mul_add_c(a[2],b[2],c2,c3,c1); 936 $LD r6,`2*$BNSZ`(r4) 937 $LD r7,`2*$BNSZ`(r5) 938 $UMULL r8,r6,r7 939 $UMULH r9,r6,r7 940 addc r11,r8,r11 941 adde r12,r9,r12 942 addze r10,r10 943 #mul_add_c(a[1],b[3],c2,c3,c1); 944 $LD r6,`1*$BNSZ`(r4) 945 $LD r7,`3*$BNSZ`(r5) 946 $UMULL r8,r6,r7 
947 $UMULH r9,r6,r7 948 addc r11,r8,r11 949 adde r12,r9,r12 950 addze r10,r10 951 $ST r11,`4*$BNSZ`(r3) #r[4]=c2 952 #mul_add_c(a[2],b[3],c3,c1,c2); 953 $LD r6,`2*$BNSZ`(r4) 954 $UMULL r8,r6,r7 955 $UMULH r9,r6,r7 956 addc r12,r8,r12 957 adde r10,r9,r10 958 addze r11,r0 959 #mul_add_c(a[3],b[2],c3,c1,c2); 960 $LD r6,`3*$BNSZ`(r4) 961 $LD r7,`2*$BNSZ`(r5) 962 $UMULL r8,r6,r7 963 $UMULH r9,r6,r7 964 addc r12,r8,r12 965 adde r10,r9,r10 966 addze r11,r11 967 $ST r12,`5*$BNSZ`(r3) #r[5]=c3 968 #mul_add_c(a[3],b[3],c1,c2,c3); 969 $LD r7,`3*$BNSZ`(r5) 970 $UMULL r8,r6,r7 971 $UMULH r9,r6,r7 972 addc r10,r8,r10 973 adde r11,r9,r11 974 975 $ST r10,`6*$BNSZ`(r3) #r[6]=c1 976 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 977 blr 978 .long 0 979 .byte 0,12,0x14,0,0,0,3,0 980 .long 0 981.size .bn_mul_comba4,.-.bn_mul_comba4 982 983# 984# NOTE: The following label name should be changed to 985# "bn_mul_comba8" i.e. remove the first dot 986# for the gcc compiler. This should be automatically 987# done in the build 988# 989 990.align 4 991.bn_mul_comba8: 992# 993# Optimized version of the bn_mul_comba8 routine. 994# 995# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 996# r3 contains r 997# r4 contains a 998# r5 contains b 999# r6, r7 are the 2 BN_ULONGs being multiplied. 1000# r8, r9 are the results of the 32x32 giving 64 multiply. 1001# r10, r11, r12 are the equivalents of c1, c2, and c3. 1002# 1003 xor r0,r0,r0 #r0=0. Used in addze below. 1004 1005 #mul_add_c(a[0],b[0],c1,c2,c3); 1006 $LD r6,`0*$BNSZ`(r4) #a[0] 1007 $LD r7,`0*$BNSZ`(r5) #b[0] 1008 $UMULL r10,r6,r7 1009 $UMULH r11,r6,r7 1010 $ST r10,`0*$BNSZ`(r3) #r[0]=c1; 1011 #mul_add_c(a[0],b[1],c2,c3,c1); 1012 $LD r7,`1*$BNSZ`(r5) 1013 $UMULL r8,r6,r7 1014 $UMULH r9,r6,r7 1015 addc r11,r11,r8 1016 addze r12,r9 # since we didn't set r12 to zero before. 
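					# (the high word of a product is never
					# all ones, so the addze above cannot
					# carry; the addze below merely clears
					# r10, i.e. c1)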
1017 addze r10,r0 1018 #mul_add_c(a[1],b[0],c2,c3,c1); 1019 $LD r6,`1*$BNSZ`(r4) 1020 $LD r7,`0*$BNSZ`(r5) 1021 $UMULL r8,r6,r7 1022 $UMULH r9,r6,r7 1023 addc r11,r11,r8 1024 adde r12,r12,r9 1025 addze r10,r10 1026 $ST r11,`1*$BNSZ`(r3) #r[1]=c2; 1027 #mul_add_c(a[2],b[0],c3,c1,c2); 1028 $LD r6,`2*$BNSZ`(r4) 1029 $UMULL r8,r6,r7 1030 $UMULH r9,r6,r7 1031 addc r12,r12,r8 1032 adde r10,r10,r9 1033 addze r11,r0 1034 #mul_add_c(a[1],b[1],c3,c1,c2); 1035 $LD r6,`1*$BNSZ`(r4) 1036 $LD r7,`1*$BNSZ`(r5) 1037 $UMULL r8,r6,r7 1038 $UMULH r9,r6,r7 1039 addc r12,r12,r8 1040 adde r10,r10,r9 1041 addze r11,r11 1042 #mul_add_c(a[0],b[2],c3,c1,c2); 1043 $LD r6,`0*$BNSZ`(r4) 1044 $LD r7,`2*$BNSZ`(r5) 1045 $UMULL r8,r6,r7 1046 $UMULH r9,r6,r7 1047 addc r12,r12,r8 1048 adde r10,r10,r9 1049 addze r11,r11 1050 $ST r12,`2*$BNSZ`(r3) #r[2]=c3; 1051 #mul_add_c(a[0],b[3],c1,c2,c3); 1052 $LD r7,`3*$BNSZ`(r5) 1053 $UMULL r8,r6,r7 1054 $UMULH r9,r6,r7 1055 addc r10,r10,r8 1056 adde r11,r11,r9 1057 addze r12,r0 1058 #mul_add_c(a[1],b[2],c1,c2,c3); 1059 $LD r6,`1*$BNSZ`(r4) 1060 $LD r7,`2*$BNSZ`(r5) 1061 $UMULL r8,r6,r7 1062 $UMULH r9,r6,r7 1063 addc r10,r10,r8 1064 adde r11,r11,r9 1065 addze r12,r12 1066 1067 #mul_add_c(a[2],b[1],c1,c2,c3); 1068 $LD r6,`2*$BNSZ`(r4) 1069 $LD r7,`1*$BNSZ`(r5) 1070 $UMULL r8,r6,r7 1071 $UMULH r9,r6,r7 1072 addc r10,r10,r8 1073 adde r11,r11,r9 1074 addze r12,r12 1075 #mul_add_c(a[3],b[0],c1,c2,c3); 1076 $LD r6,`3*$BNSZ`(r4) 1077 $LD r7,`0*$BNSZ`(r5) 1078 $UMULL r8,r6,r7 1079 $UMULH r9,r6,r7 1080 addc r10,r10,r8 1081 adde r11,r11,r9 1082 addze r12,r12 1083 $ST r10,`3*$BNSZ`(r3) #r[3]=c1; 1084 #mul_add_c(a[4],b[0],c2,c3,c1); 1085 $LD r6,`4*$BNSZ`(r4) 1086 $UMULL r8,r6,r7 1087 $UMULH r9,r6,r7 1088 addc r11,r11,r8 1089 adde r12,r12,r9 1090 addze r10,r0 1091 #mul_add_c(a[3],b[1],c2,c3,c1); 1092 $LD r6,`3*$BNSZ`(r4) 1093 $LD r7,`1*$BNSZ`(r5) 1094 $UMULL r8,r6,r7 1095 $UMULH r9,r6,r7 1096 addc r11,r11,r8 1097 adde r12,r12,r9 1098 addze r10,r10 1099 #mul_add_c(a[2],b[2],c2,c3,c1); 1100 $LD r6,`2*$BNSZ`(r4) 1101 $LD r7,`2*$BNSZ`(r5) 1102 $UMULL r8,r6,r7 1103 $UMULH r9,r6,r7 1104 addc r11,r11,r8 1105 adde r12,r12,r9 1106 addze r10,r10 1107 #mul_add_c(a[1],b[3],c2,c3,c1); 1108 $LD r6,`1*$BNSZ`(r4) 1109 $LD r7,`3*$BNSZ`(r5) 1110 $UMULL r8,r6,r7 1111 $UMULH r9,r6,r7 1112 addc r11,r11,r8 1113 adde r12,r12,r9 1114 addze r10,r10 1115 #mul_add_c(a[0],b[4],c2,c3,c1); 1116 $LD r6,`0*$BNSZ`(r4) 1117 $LD r7,`4*$BNSZ`(r5) 1118 $UMULL r8,r6,r7 1119 $UMULH r9,r6,r7 1120 addc r11,r11,r8 1121 adde r12,r12,r9 1122 addze r10,r10 1123 $ST r11,`4*$BNSZ`(r3) #r[4]=c2; 1124 #mul_add_c(a[0],b[5],c3,c1,c2); 1125 $LD r7,`5*$BNSZ`(r5) 1126 $UMULL r8,r6,r7 1127 $UMULH r9,r6,r7 1128 addc r12,r12,r8 1129 adde r10,r10,r9 1130 addze r11,r0 1131 #mul_add_c(a[1],b[4],c3,c1,c2); 1132 $LD r6,`1*$BNSZ`(r4) 1133 $LD r7,`4*$BNSZ`(r5) 1134 $UMULL r8,r6,r7 1135 $UMULH r9,r6,r7 1136 addc r12,r12,r8 1137 adde r10,r10,r9 1138 addze r11,r11 1139 #mul_add_c(a[2],b[3],c3,c1,c2); 1140 $LD r6,`2*$BNSZ`(r4) 1141 $LD r7,`3*$BNSZ`(r5) 1142 $UMULL r8,r6,r7 1143 $UMULH r9,r6,r7 1144 addc r12,r12,r8 1145 adde r10,r10,r9 1146 addze r11,r11 1147 #mul_add_c(a[3],b[2],c3,c1,c2); 1148 $LD r6,`3*$BNSZ`(r4) 1149 $LD r7,`2*$BNSZ`(r5) 1150 $UMULL r8,r6,r7 1151 $UMULH r9,r6,r7 1152 addc r12,r12,r8 1153 adde r10,r10,r9 1154 addze r11,r11 1155 #mul_add_c(a[4],b[1],c3,c1,c2); 1156 $LD r6,`4*$BNSZ`(r4) 1157 $LD r7,`1*$BNSZ`(r5) 1158 $UMULL r8,r6,r7 1159 $UMULH r9,r6,r7 1160 addc r12,r12,r8 1161 adde r10,r10,r9 1162 addze r11,r11 1163 
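					# (still the r[5] column: every a[i]*b[j]
					# with i+j==5 is folded into the same
					# c3,c1,c2 accumulator before the store)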
#mul_add_c(a[5],b[0],c3,c1,c2); 1164 $LD r6,`5*$BNSZ`(r4) 1165 $LD r7,`0*$BNSZ`(r5) 1166 $UMULL r8,r6,r7 1167 $UMULH r9,r6,r7 1168 addc r12,r12,r8 1169 adde r10,r10,r9 1170 addze r11,r11 1171 $ST r12,`5*$BNSZ`(r3) #r[5]=c3; 1172 #mul_add_c(a[6],b[0],c1,c2,c3); 1173 $LD r6,`6*$BNSZ`(r4) 1174 $UMULL r8,r6,r7 1175 $UMULH r9,r6,r7 1176 addc r10,r10,r8 1177 adde r11,r11,r9 1178 addze r12,r0 1179 #mul_add_c(a[5],b[1],c1,c2,c3); 1180 $LD r6,`5*$BNSZ`(r4) 1181 $LD r7,`1*$BNSZ`(r5) 1182 $UMULL r8,r6,r7 1183 $UMULH r9,r6,r7 1184 addc r10,r10,r8 1185 adde r11,r11,r9 1186 addze r12,r12 1187 #mul_add_c(a[4],b[2],c1,c2,c3); 1188 $LD r6,`4*$BNSZ`(r4) 1189 $LD r7,`2*$BNSZ`(r5) 1190 $UMULL r8,r6,r7 1191 $UMULH r9,r6,r7 1192 addc r10,r10,r8 1193 adde r11,r11,r9 1194 addze r12,r12 1195 #mul_add_c(a[3],b[3],c1,c2,c3); 1196 $LD r6,`3*$BNSZ`(r4) 1197 $LD r7,`3*$BNSZ`(r5) 1198 $UMULL r8,r6,r7 1199 $UMULH r9,r6,r7 1200 addc r10,r10,r8 1201 adde r11,r11,r9 1202 addze r12,r12 1203 #mul_add_c(a[2],b[4],c1,c2,c3); 1204 $LD r6,`2*$BNSZ`(r4) 1205 $LD r7,`4*$BNSZ`(r5) 1206 $UMULL r8,r6,r7 1207 $UMULH r9,r6,r7 1208 addc r10,r10,r8 1209 adde r11,r11,r9 1210 addze r12,r12 1211 #mul_add_c(a[1],b[5],c1,c2,c3); 1212 $LD r6,`1*$BNSZ`(r4) 1213 $LD r7,`5*$BNSZ`(r5) 1214 $UMULL r8,r6,r7 1215 $UMULH r9,r6,r7 1216 addc r10,r10,r8 1217 adde r11,r11,r9 1218 addze r12,r12 1219 #mul_add_c(a[0],b[6],c1,c2,c3); 1220 $LD r6,`0*$BNSZ`(r4) 1221 $LD r7,`6*$BNSZ`(r5) 1222 $UMULL r8,r6,r7 1223 $UMULH r9,r6,r7 1224 addc r10,r10,r8 1225 adde r11,r11,r9 1226 addze r12,r12 1227 $ST r10,`6*$BNSZ`(r3) #r[6]=c1; 1228 #mul_add_c(a[0],b[7],c2,c3,c1); 1229 $LD r7,`7*$BNSZ`(r5) 1230 $UMULL r8,r6,r7 1231 $UMULH r9,r6,r7 1232 addc r11,r11,r8 1233 adde r12,r12,r9 1234 addze r10,r0 1235 #mul_add_c(a[1],b[6],c2,c3,c1); 1236 $LD r6,`1*$BNSZ`(r4) 1237 $LD r7,`6*$BNSZ`(r5) 1238 $UMULL r8,r6,r7 1239 $UMULH r9,r6,r7 1240 addc r11,r11,r8 1241 adde r12,r12,r9 1242 addze r10,r10 1243 #mul_add_c(a[2],b[5],c2,c3,c1); 1244 $LD r6,`2*$BNSZ`(r4) 1245 $LD r7,`5*$BNSZ`(r5) 1246 $UMULL r8,r6,r7 1247 $UMULH r9,r6,r7 1248 addc r11,r11,r8 1249 adde r12,r12,r9 1250 addze r10,r10 1251 #mul_add_c(a[3],b[4],c2,c3,c1); 1252 $LD r6,`3*$BNSZ`(r4) 1253 $LD r7,`4*$BNSZ`(r5) 1254 $UMULL r8,r6,r7 1255 $UMULH r9,r6,r7 1256 addc r11,r11,r8 1257 adde r12,r12,r9 1258 addze r10,r10 1259 #mul_add_c(a[4],b[3],c2,c3,c1); 1260 $LD r6,`4*$BNSZ`(r4) 1261 $LD r7,`3*$BNSZ`(r5) 1262 $UMULL r8,r6,r7 1263 $UMULH r9,r6,r7 1264 addc r11,r11,r8 1265 adde r12,r12,r9 1266 addze r10,r10 1267 #mul_add_c(a[5],b[2],c2,c3,c1); 1268 $LD r6,`5*$BNSZ`(r4) 1269 $LD r7,`2*$BNSZ`(r5) 1270 $UMULL r8,r6,r7 1271 $UMULH r9,r6,r7 1272 addc r11,r11,r8 1273 adde r12,r12,r9 1274 addze r10,r10 1275 #mul_add_c(a[6],b[1],c2,c3,c1); 1276 $LD r6,`6*$BNSZ`(r4) 1277 $LD r7,`1*$BNSZ`(r5) 1278 $UMULL r8,r6,r7 1279 $UMULH r9,r6,r7 1280 addc r11,r11,r8 1281 adde r12,r12,r9 1282 addze r10,r10 1283 #mul_add_c(a[7],b[0],c2,c3,c1); 1284 $LD r6,`7*$BNSZ`(r4) 1285 $LD r7,`0*$BNSZ`(r5) 1286 $UMULL r8,r6,r7 1287 $UMULH r9,r6,r7 1288 addc r11,r11,r8 1289 adde r12,r12,r9 1290 addze r10,r10 1291 $ST r11,`7*$BNSZ`(r3) #r[7]=c2; 1292 #mul_add_c(a[7],b[1],c3,c1,c2); 1293 $LD r7,`1*$BNSZ`(r5) 1294 $UMULL r8,r6,r7 1295 $UMULH r9,r6,r7 1296 addc r12,r12,r8 1297 adde r10,r10,r9 1298 addze r11,r0 1299 #mul_add_c(a[6],b[2],c3,c1,c2); 1300 $LD r6,`6*$BNSZ`(r4) 1301 $LD r7,`2*$BNSZ`(r5) 1302 $UMULL r8,r6,r7 1303 $UMULH r9,r6,r7 1304 addc r12,r12,r8 1305 adde r10,r10,r9 1306 addze r11,r11 1307 #mul_add_c(a[5],b[3],c3,c1,c2); 1308 $LD r6,`5*$BNSZ`(r4) 1309 $LD 
r7,`3*$BNSZ`(r5) 1310 $UMULL r8,r6,r7 1311 $UMULH r9,r6,r7 1312 addc r12,r12,r8 1313 adde r10,r10,r9 1314 addze r11,r11 1315 #mul_add_c(a[4],b[4],c3,c1,c2); 1316 $LD r6,`4*$BNSZ`(r4) 1317 $LD r7,`4*$BNSZ`(r5) 1318 $UMULL r8,r6,r7 1319 $UMULH r9,r6,r7 1320 addc r12,r12,r8 1321 adde r10,r10,r9 1322 addze r11,r11 1323 #mul_add_c(a[3],b[5],c3,c1,c2); 1324 $LD r6,`3*$BNSZ`(r4) 1325 $LD r7,`5*$BNSZ`(r5) 1326 $UMULL r8,r6,r7 1327 $UMULH r9,r6,r7 1328 addc r12,r12,r8 1329 adde r10,r10,r9 1330 addze r11,r11 1331 #mul_add_c(a[2],b[6],c3,c1,c2); 1332 $LD r6,`2*$BNSZ`(r4) 1333 $LD r7,`6*$BNSZ`(r5) 1334 $UMULL r8,r6,r7 1335 $UMULH r9,r6,r7 1336 addc r12,r12,r8 1337 adde r10,r10,r9 1338 addze r11,r11 1339 #mul_add_c(a[1],b[7],c3,c1,c2); 1340 $LD r6,`1*$BNSZ`(r4) 1341 $LD r7,`7*$BNSZ`(r5) 1342 $UMULL r8,r6,r7 1343 $UMULH r9,r6,r7 1344 addc r12,r12,r8 1345 adde r10,r10,r9 1346 addze r11,r11 1347 $ST r12,`8*$BNSZ`(r3) #r[8]=c3; 1348 #mul_add_c(a[2],b[7],c1,c2,c3); 1349 $LD r6,`2*$BNSZ`(r4) 1350 $UMULL r8,r6,r7 1351 $UMULH r9,r6,r7 1352 addc r10,r10,r8 1353 adde r11,r11,r9 1354 addze r12,r0 1355 #mul_add_c(a[3],b[6],c1,c2,c3); 1356 $LD r6,`3*$BNSZ`(r4) 1357 $LD r7,`6*$BNSZ`(r5) 1358 $UMULL r8,r6,r7 1359 $UMULH r9,r6,r7 1360 addc r10,r10,r8 1361 adde r11,r11,r9 1362 addze r12,r12 1363 #mul_add_c(a[4],b[5],c1,c2,c3); 1364 $LD r6,`4*$BNSZ`(r4) 1365 $LD r7,`5*$BNSZ`(r5) 1366 $UMULL r8,r6,r7 1367 $UMULH r9,r6,r7 1368 addc r10,r10,r8 1369 adde r11,r11,r9 1370 addze r12,r12 1371 #mul_add_c(a[5],b[4],c1,c2,c3); 1372 $LD r6,`5*$BNSZ`(r4) 1373 $LD r7,`4*$BNSZ`(r5) 1374 $UMULL r8,r6,r7 1375 $UMULH r9,r6,r7 1376 addc r10,r10,r8 1377 adde r11,r11,r9 1378 addze r12,r12 1379 #mul_add_c(a[6],b[3],c1,c2,c3); 1380 $LD r6,`6*$BNSZ`(r4) 1381 $LD r7,`3*$BNSZ`(r5) 1382 $UMULL r8,r6,r7 1383 $UMULH r9,r6,r7 1384 addc r10,r10,r8 1385 adde r11,r11,r9 1386 addze r12,r12 1387 #mul_add_c(a[7],b[2],c1,c2,c3); 1388 $LD r6,`7*$BNSZ`(r4) 1389 $LD r7,`2*$BNSZ`(r5) 1390 $UMULL r8,r6,r7 1391 $UMULH r9,r6,r7 1392 addc r10,r10,r8 1393 adde r11,r11,r9 1394 addze r12,r12 1395 $ST r10,`9*$BNSZ`(r3) #r[9]=c1; 1396 #mul_add_c(a[7],b[3],c2,c3,c1); 1397 $LD r7,`3*$BNSZ`(r5) 1398 $UMULL r8,r6,r7 1399 $UMULH r9,r6,r7 1400 addc r11,r11,r8 1401 adde r12,r12,r9 1402 addze r10,r0 1403 #mul_add_c(a[6],b[4],c2,c3,c1); 1404 $LD r6,`6*$BNSZ`(r4) 1405 $LD r7,`4*$BNSZ`(r5) 1406 $UMULL r8,r6,r7 1407 $UMULH r9,r6,r7 1408 addc r11,r11,r8 1409 adde r12,r12,r9 1410 addze r10,r10 1411 #mul_add_c(a[5],b[5],c2,c3,c1); 1412 $LD r6,`5*$BNSZ`(r4) 1413 $LD r7,`5*$BNSZ`(r5) 1414 $UMULL r8,r6,r7 1415 $UMULH r9,r6,r7 1416 addc r11,r11,r8 1417 adde r12,r12,r9 1418 addze r10,r10 1419 #mul_add_c(a[4],b[6],c2,c3,c1); 1420 $LD r6,`4*$BNSZ`(r4) 1421 $LD r7,`6*$BNSZ`(r5) 1422 $UMULL r8,r6,r7 1423 $UMULH r9,r6,r7 1424 addc r11,r11,r8 1425 adde r12,r12,r9 1426 addze r10,r10 1427 #mul_add_c(a[3],b[7],c2,c3,c1); 1428 $LD r6,`3*$BNSZ`(r4) 1429 $LD r7,`7*$BNSZ`(r5) 1430 $UMULL r8,r6,r7 1431 $UMULH r9,r6,r7 1432 addc r11,r11,r8 1433 adde r12,r12,r9 1434 addze r10,r10 1435 $ST r11,`10*$BNSZ`(r3) #r[10]=c2; 1436 #mul_add_c(a[4],b[7],c3,c1,c2); 1437 $LD r6,`4*$BNSZ`(r4) 1438 $UMULL r8,r6,r7 1439 $UMULH r9,r6,r7 1440 addc r12,r12,r8 1441 adde r10,r10,r9 1442 addze r11,r0 1443 #mul_add_c(a[5],b[6],c3,c1,c2); 1444 $LD r6,`5*$BNSZ`(r4) 1445 $LD r7,`6*$BNSZ`(r5) 1446 $UMULL r8,r6,r7 1447 $UMULH r9,r6,r7 1448 addc r12,r12,r8 1449 adde r10,r10,r9 1450 addze r11,r11 1451 #mul_add_c(a[6],b[5],c3,c1,c2); 1452 $LD r6,`6*$BNSZ`(r4) 1453 $LD r7,`5*$BNSZ`(r5) 1454 $UMULL r8,r6,r7 1455 $UMULH r9,r6,r7 1456 
addc r12,r12,r8 1457 adde r10,r10,r9 1458 addze r11,r11 1459 #mul_add_c(a[7],b[4],c3,c1,c2); 1460 $LD r6,`7*$BNSZ`(r4) 1461 $LD r7,`4*$BNSZ`(r5) 1462 $UMULL r8,r6,r7 1463 $UMULH r9,r6,r7 1464 addc r12,r12,r8 1465 adde r10,r10,r9 1466 addze r11,r11 1467 $ST r12,`11*$BNSZ`(r3) #r[11]=c3; 1468 #mul_add_c(a[7],b[5],c1,c2,c3); 1469 $LD r7,`5*$BNSZ`(r5) 1470 $UMULL r8,r6,r7 1471 $UMULH r9,r6,r7 1472 addc r10,r10,r8 1473 adde r11,r11,r9 1474 addze r12,r0 1475 #mul_add_c(a[6],b[6],c1,c2,c3); 1476 $LD r6,`6*$BNSZ`(r4) 1477 $LD r7,`6*$BNSZ`(r5) 1478 $UMULL r8,r6,r7 1479 $UMULH r9,r6,r7 1480 addc r10,r10,r8 1481 adde r11,r11,r9 1482 addze r12,r12 1483 #mul_add_c(a[5],b[7],c1,c2,c3); 1484 $LD r6,`5*$BNSZ`(r4) 1485 $LD r7,`7*$BNSZ`(r5) 1486 $UMULL r8,r6,r7 1487 $UMULH r9,r6,r7 1488 addc r10,r10,r8 1489 adde r11,r11,r9 1490 addze r12,r12 1491 $ST r10,`12*$BNSZ`(r3) #r[12]=c1; 1492 #mul_add_c(a[6],b[7],c2,c3,c1); 1493 $LD r6,`6*$BNSZ`(r4) 1494 $UMULL r8,r6,r7 1495 $UMULH r9,r6,r7 1496 addc r11,r11,r8 1497 adde r12,r12,r9 1498 addze r10,r0 1499 #mul_add_c(a[7],b[6],c2,c3,c1); 1500 $LD r6,`7*$BNSZ`(r4) 1501 $LD r7,`6*$BNSZ`(r5) 1502 $UMULL r8,r6,r7 1503 $UMULH r9,r6,r7 1504 addc r11,r11,r8 1505 adde r12,r12,r9 1506 addze r10,r10 1507 $ST r11,`13*$BNSZ`(r3) #r[13]=c2; 1508 #mul_add_c(a[7],b[7],c3,c1,c2); 1509 $LD r7,`7*$BNSZ`(r5) 1510 $UMULL r8,r6,r7 1511 $UMULH r9,r6,r7 1512 addc r12,r12,r8 1513 adde r10,r10,r9 1514 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; 1515 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; 1516 blr 1517 .long 0 1518 .byte 0,12,0x14,0,0,0,3,0 1519 .long 0 1520.size .bn_mul_comba8,.-.bn_mul_comba8 1521 1522# 1523# NOTE: The following label name should be changed to 1524# "bn_sub_words" i.e. remove the first dot 1525# for the gcc compiler. This should be automatically 1526# done in the build 1527# 1528# 1529.align 4 1530.bn_sub_words: 1531# 1532# Handcoded version of bn_sub_words 1533# 1534#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1535# 1536# r3 = r 1537# r4 = a 1538# r5 = b 1539# r6 = n 1540# 1541# Note: No loop unrolling done since this is not a performance 1542# critical loop. 1543 1544 xor r0,r0,r0 #set r0 = 0 1545# 1546# check for r6 = 0 AND set carry bit. 1547# 1548 subfc. r7,r0,r6 # If r6 is 0 then result is 0. 1549 # if r6 > 0 then result !=0 1550 # In either case carry bit is set. 1551 beq Lppcasm_sub_adios 1552 addi r4,r4,-$BNSZ 1553 addi r3,r3,-$BNSZ 1554 addi r5,r5,-$BNSZ 1555 mtctr r6 1556Lppcasm_sub_mainloop: 1557 $LDU r7,$BNSZ(r4) 1558 $LDU r8,$BNSZ(r5) 1559 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) 1560 # if carry = 1 this is r7-r8. Else it 1561 # is r7-r8 -1 as we need. 1562 $STU r6,$BNSZ(r3) 1563 bdnz Lppcasm_sub_mainloop 1564Lppcasm_sub_adios: 1565 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 1566 andi. r3,r3,1 # keep only last bit. 1567 blr 1568 .long 0 1569 .byte 0,12,0x14,0,0,0,4,0 1570 .long 0 1571.size .bn_sub_words,.-.bn_sub_words 1572 1573# 1574# NOTE: The following label name should be changed to 1575# "bn_add_words" i.e. remove the first dot 1576# for the gcc compiler. This should be automatically 1577# done in the build 1578# 1579 1580.align 4 1581.bn_add_words: 1582# 1583# Handcoded version of bn_add_words 1584# 1585#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 1586# 1587# r3 = r 1588# r4 = a 1589# r5 = b 1590# r6 = n 1591# 1592# Note: No loop unrolling done since this is not a performance 1593# critical loop. 1594 1595 xor r0,r0,r0 1596# 1597# check for r6 = 0. Is this needed? 1598# 1599 addic. 
r6,r6,0 #test r6 and clear carry bit. 1600 beq Lppcasm_add_adios 1601 addi r4,r4,-$BNSZ 1602 addi r3,r3,-$BNSZ 1603 addi r5,r5,-$BNSZ 1604 mtctr r6 1605Lppcasm_add_mainloop: 1606 $LDU r7,$BNSZ(r4) 1607 $LDU r8,$BNSZ(r5) 1608 adde r8,r7,r8 1609 $STU r8,$BNSZ(r3) 1610 bdnz Lppcasm_add_mainloop 1611Lppcasm_add_adios: 1612 addze r3,r0 #return carry bit. 1613 blr 1614 .long 0 1615 .byte 0,12,0x14,0,0,0,4,0 1616 .long 0 1617.size .bn_add_words,.-.bn_add_words 1618 1619# 1620# NOTE: The following label name should be changed to 1621# "bn_div_words" i.e. remove the first dot 1622# for the gcc compiler. This should be automatically 1623# done in the build 1624# 1625 1626.align 4 1627.bn_div_words: 1628# 1629# This is a cleaned up version of code generated by 1630# the AIX compiler. The only optimization is to use 1631# the PPC instruction to count leading zeros instead 1632# of call to num_bits_word. Since this was compiled 1633# only at level -O2 we can possibly squeeze it more? 1634# 1635# r3 = h 1636# r4 = l 1637# r5 = d 1638 1639 $UCMPI 0,r5,0 # compare r5 and 0 1640 bne Lppcasm_div1 # proceed if d!=0 1641 li r3,-1 # d=0 return -1 1642 blr 1643Lppcasm_div1: 1644 xor r0,r0,r0 #r0=0 1645 li r8,$BITS 1646 $CNTLZ. r7,r5 #r7 = num leading 0s in d. 1647 beq Lppcasm_div2 #proceed if no leading zeros 1648 subf r8,r7,r8 #r8 = BN_num_bits_word(d) 1649 $SHR. r9,r3,r8 #are there any bits above r8'th? 1650 $TR 16,r9,r0 #if there're, signal to dump core... 1651Lppcasm_div2: 1652 $UCMP 0,r3,r5 #h>=d? 1653 blt Lppcasm_div3 #goto Lppcasm_div3 if not 1654 subf r3,r5,r3 #h-=d ; 1655Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i 1656 cmpi 0,0,r7,0 # is (i == 0)? 1657 beq Lppcasm_div4 1658 $SHL r3,r3,r7 # h = (h<< i) 1659 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) 1660 $SHL r5,r5,r7 # d<<=i 1661 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) 1662 $SHL r4,r4,r7 # l <<=i 1663Lppcasm_div4: 1664 $SHRI r9,r5,`$BITS/2` # r9 = dh 1665 # dl will be computed when needed 1666 # as it saves registers. 1667 li r6,2 #r6=2 1668 mtctr r6 #counter will be in count. 1669Lppcasm_divouterloop: 1670 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) 1671 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 1672 # compute here for innerloop. 1673 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh 1674 bne Lppcasm_div5 # goto Lppcasm_div5 if not 1675 1676 li r8,-1 1677 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l 1678 b Lppcasm_div6 1679Lppcasm_div5: 1680 $UDIV r8,r3,r9 #q = h/dh 1681Lppcasm_div6: 1682 $UMULL r12,r9,r8 #th = q*dh 1683 $CLRU r10,r5,`$BITS/2` #r10=dl 1684 $UMULL r6,r8,r10 #tl = q*dl 1685 1686Lppcasm_divinnerloop: 1687 subf r10,r12,r3 #t = h -th 1688 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... 1689 addic. r7,r7,0 #test if r7 == 0. used below. 1690 # now want to compute 1691 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) 1692 # the following 2 instructions do that 1693 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) 1694 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) 1695 $UCMP cr1,r6,r7 # compare (tl <= r7) 1696 bne Lppcasm_divinnerexit 1697 ble cr1,Lppcasm_divinnerexit 1698 addi r8,r8,-1 #q-- 1699 subf r12,r9,r12 #th -=dh 1700 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. 
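					# the quotient digit estimate was too
					# large: q and th have been backed off
					# above, tl is reduced by dl below, and
					# the test is then retried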
1701 subf r6,r10,r6 #tl -=dl 1702 b Lppcasm_divinnerloop 1703Lppcasm_divinnerexit: 1704 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) 1705 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; 1706 $UCMP cr1,r4,r11 # compare l and tl 1707 add r12,r12,r10 # th+=t 1708 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 1709 addi r12,r12,1 # th++ 1710Lppcasm_div7: 1711 subf r11,r11,r4 #r11=l-tl 1712 $UCMP cr1,r3,r12 #compare h and th 1713 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 1714 addi r8,r8,-1 # q-- 1715 add r3,r5,r3 # h+=d 1716Lppcasm_div8: 1717 subf r12,r12,r3 #r12 = h-th 1718 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 1719 # want to compute 1720 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 1721 # the following 2 instructions will do this. 1722 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. 1723 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 1724 bdz Lppcasm_div9 #if (count==0) break ; 1725 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 1726 b Lppcasm_divouterloop 1727Lppcasm_div9: 1728 or r3,r8,r0 1729 blr 1730 .long 0 1731 .byte 0,12,0x14,0,0,0,3,0 1732 .long 0 1733.size .bn_div_words,.-.bn_div_words 1734 1735# 1736# NOTE: The following label name should be changed to 1737# "bn_sqr_words" i.e. remove the first dot 1738# for the gcc compiler. This should be automatically 1739# done in the build 1740# 1741.align 4 1742.bn_sqr_words: 1743# 1744# Optimized version of bn_sqr_words 1745# 1746# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) 1747# 1748# r3 = r 1749# r4 = a 1750# r5 = n 1751# 1752# r6 = a[i]. 1753# r7,r8 = product. 1754# 1755# No unrolling done here. Not performance critical. 1756 1757 addic. r5,r5,0 #test r5. 1758 beq Lppcasm_sqr_adios 1759 addi r4,r4,-$BNSZ 1760 addi r3,r3,-$BNSZ 1761 mtctr r5 1762Lppcasm_sqr_mainloop: 1763 #sqr(r[0],r[1],a[0]); 1764 $LDU r6,$BNSZ(r4) 1765 $UMULL r7,r6,r6 1766 $UMULH r8,r6,r6 1767 $STU r7,$BNSZ(r3) 1768 $STU r8,$BNSZ(r3) 1769 bdnz Lppcasm_sqr_mainloop 1770Lppcasm_sqr_adios: 1771 blr 1772 .long 0 1773 .byte 0,12,0x14,0,0,0,3,0 1774 .long 0 1775.size .bn_sqr_words,.-.bn_sqr_words 1776 1777# 1778# NOTE: The following label name should be changed to 1779# "bn_mul_words" i.e. remove the first dot 1780# for the gcc compiler. This should be automatically 1781# done in the build 1782# 1783 1784.align 4 1785.bn_mul_words: 1786# 1787# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1788# 1789# r3 = rp 1790# r4 = ap 1791# r5 = num 1792# r6 = w 1793 xor r0,r0,r0 1794 xor r12,r12,r12 # used for carry 1795 rlwinm. r7,r5,30,2,31 # num >> 2 1796 beq Lppcasm_mw_REM 1797 mtctr r7 1798Lppcasm_mw_LOOP: 1799 #mul(rp[0],ap[0],w,c1); 1800 $LD r8,`0*$BNSZ`(r4) 1801 $UMULL r9,r6,r8 1802 $UMULH r10,r6,r8 1803 addc r9,r9,r12 1804 #addze r10,r10 #carry is NOT ignored. 1805 #will be taken care of 1806 #in second spin below 1807 #using adde. 
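					#in C terms each spin computes roughly
					#  t = w*ap[i] + c; rp[i] = low(t); c = high(t);
					#with c carried between spins in r10/r12
					#plus the CA bit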
1808 $ST r9,`0*$BNSZ`(r3) 1809 #mul(rp[1],ap[1],w,c1); 1810 $LD r8,`1*$BNSZ`(r4) 1811 $UMULL r11,r6,r8 1812 $UMULH r12,r6,r8 1813 adde r11,r11,r10 1814 #addze r12,r12 1815 $ST r11,`1*$BNSZ`(r3) 1816 #mul(rp[2],ap[2],w,c1); 1817 $LD r8,`2*$BNSZ`(r4) 1818 $UMULL r9,r6,r8 1819 $UMULH r10,r6,r8 1820 adde r9,r9,r12 1821 #addze r10,r10 1822 $ST r9,`2*$BNSZ`(r3) 1823 #mul_add(rp[3],ap[3],w,c1); 1824 $LD r8,`3*$BNSZ`(r4) 1825 $UMULL r11,r6,r8 1826 $UMULH r12,r6,r8 1827 adde r11,r11,r10 1828 addze r12,r12 #this spin we collect carry into 1829 #r12 1830 $ST r11,`3*$BNSZ`(r3) 1831 1832 addi r3,r3,`4*$BNSZ` 1833 addi r4,r4,`4*$BNSZ` 1834 bdnz Lppcasm_mw_LOOP 1835 1836Lppcasm_mw_REM: 1837 andi. r5,r5,0x3 1838 beq Lppcasm_mw_OVER 1839 #mul(rp[0],ap[0],w,c1); 1840 $LD r8,`0*$BNSZ`(r4) 1841 $UMULL r9,r6,r8 1842 $UMULH r10,r6,r8 1843 addc r9,r9,r12 1844 addze r10,r10 1845 $ST r9,`0*$BNSZ`(r3) 1846 addi r12,r10,0 1847 1848 addi r5,r5,-1 1849 cmpli 0,0,r5,0 1850 beq Lppcasm_mw_OVER 1851 1852 1853 #mul(rp[1],ap[1],w,c1); 1854 $LD r8,`1*$BNSZ`(r4) 1855 $UMULL r9,r6,r8 1856 $UMULH r10,r6,r8 1857 addc r9,r9,r12 1858 addze r10,r10 1859 $ST r9,`1*$BNSZ`(r3) 1860 addi r12,r10,0 1861 1862 addi r5,r5,-1 1863 cmpli 0,0,r5,0 1864 beq Lppcasm_mw_OVER 1865 1866 #mul_add(rp[2],ap[2],w,c1); 1867 $LD r8,`2*$BNSZ`(r4) 1868 $UMULL r9,r6,r8 1869 $UMULH r10,r6,r8 1870 addc r9,r9,r12 1871 addze r10,r10 1872 $ST r9,`2*$BNSZ`(r3) 1873 addi r12,r10,0 1874 1875Lppcasm_mw_OVER: 1876 addi r3,r12,0 1877 blr 1878 .long 0 1879 .byte 0,12,0x14,0,0,0,4,0 1880 .long 0 1881.size .bn_mul_words,.-.bn_mul_words 1882 1883# 1884# NOTE: The following label name should be changed to 1885# "bn_mul_add_words" i.e. remove the first dot 1886# for the gcc compiler. This should be automatically 1887# done in the build 1888# 1889 1890.align 4 1891.bn_mul_add_words: 1892# 1893# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 1894# 1895# r3 = rp 1896# r4 = ap 1897# r5 = num 1898# r6 = w 1899# 1900# empirical evidence suggests that unrolled version performs best!! 1901# 1902 xor r0,r0,r0 #r0 = 0 1903 xor r12,r12,r12 #r12 = 0 . used for carry 1904 rlwinm. r7,r5,30,2,31 # num >> 2 1905 beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover 1906 mtctr r7 1907Lppcasm_maw_mainloop: 1908 #mul_add(rp[0],ap[0],w,c1); 1909 $LD r8,`0*$BNSZ`(r4) 1910 $LD r11,`0*$BNSZ`(r3) 1911 $UMULL r9,r6,r8 1912 $UMULH r10,r6,r8 1913 addc r9,r9,r12 #r12 is carry. 1914 addze r10,r10 1915 addc r9,r9,r11 1916 #addze r10,r10 1917 #the above instruction addze 1918 #is NOT needed. Carry will NOT 1919 #be ignored. It's not affected 1920 #by multiply and will be collected 1921 #in the next spin 1922 $ST r9,`0*$BNSZ`(r3) 1923 1924 #mul_add(rp[1],ap[1],w,c1); 1925 $LD r8,`1*$BNSZ`(r4) 1926 $LD r9,`1*$BNSZ`(r3) 1927 $UMULL r11,r6,r8 1928 $UMULH r12,r6,r8 1929 adde r11,r11,r10 #r10 is carry. 
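					#same scheme as above: rp[i] gets
					#low(w*ap[i] + rp[i] + c) and the new
					#carry alternates between r10 and r12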
	addze		r12,r12
	addc		r11,r11,r9
	#addze		r12,r12
	$ST		r11,`1*$BNSZ`(r3)

					#mul_add(rp[2],ap[2],w,c1);
	$LD		r8,`2*$BNSZ`(r4)
	$UMULL		r9,r6,r8
	$LD		r11,`2*$BNSZ`(r3)
	$UMULH		r10,r6,r8
	adde		r9,r9,r12
	addze		r10,r10
	addc		r9,r9,r11
	#addze		r10,r10
	$ST		r9,`2*$BNSZ`(r3)

					#mul_add(rp[3],ap[3],w,c1);
	$LD		r8,`3*$BNSZ`(r4)
	$UMULL		r11,r6,r8
	$LD		r9,`3*$BNSZ`(r3)
	$UMULH		r12,r6,r8
	adde		r11,r11,r10
	addze		r12,r12
	addc		r11,r11,r9
	addze		r12,r12
	$ST		r11,`3*$BNSZ`(r3)
	addi		r3,r3,`4*$BNSZ`
	addi		r4,r4,`4*$BNSZ`
	bdnz		Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.		r5,r5,0x3
	beq		Lppcasm_maw_adios
	addi		r3,r3,-$BNSZ
	addi		r4,r4,-$BNSZ
					#mul_add(rp[0],ap[0],w,c1);
	mtctr		r5
	$LDU		r8,$BNSZ(r4)
	$UMULL		r9,r6,r8
	$UMULH		r10,r6,r8
	$LDU		r11,$BNSZ(r3)
	addc		r9,r9,r11
	addze		r10,r10
	addc		r9,r9,r12
	addze		r12,r10
	$ST		r9,0(r3)

	bdz		Lppcasm_maw_adios
					#mul_add(rp[1],ap[1],w,c1);
	$LDU		r8,$BNSZ(r4)
	$UMULL		r9,r6,r8
	$UMULH		r10,r6,r8
	$LDU		r11,$BNSZ(r3)
	addc		r9,r9,r11
	addze		r10,r10
	addc		r9,r9,r12
	addze		r12,r10
	$ST		r9,0(r3)

	bdz		Lppcasm_maw_adios
					#mul_add(rp[2],ap[2],w,c1);
	$LDU		r8,$BNSZ(r4)
	$UMULL		r9,r6,r8
	$UMULH		r10,r6,r8
	$LDU		r11,$BNSZ(r3)
	addc		r9,r9,r11
	addze		r10,r10
	addc		r9,r9,r12
	addze		r12,r10
	$ST		r9,0(r3)

Lppcasm_maw_adios:
	addi		r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_add_words,.-.bn_mul_add_words
	.align	4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;