#! /usr/bin/env perl
# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#	AIX performance
#
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#			sign	verify	sign/s	verify/s
#rsa  512 bits	0.0098s	0.0009s	  102.0	 1170.6
#rsa 1024 bits	0.0507s	0.0026s	   19.7	  387.5
#rsa 2048 bits	0.3036s	0.0085s	    3.3	  117.1
#rsa 4096 bits	2.0040s	0.0299s	    0.5	   33.4
#dsa  512 bits	0.0087s	0.0106s	  114.3	   94.5
#dsa 1024 bits	0.0256s	0.0313s	   39.0	   32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits	0.0056s	0.0005s	  178.6	 2049.2
#rsa 1024 bits	0.0283s	0.0015s	   35.3	  674.1
#rsa 2048 bits	0.1744s	0.0050s	    5.7	  201.2
#rsa 4096 bits	1.1644s	0.0179s	    0.9	   55.7
#dsa  512 bits	0.0052s	0.0062s	  191.6	  162.0
#dsa 1024 bits	0.0149s	0.0180s	   67.0	   55.5
#
#	Number of operations increases by almost 75%
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#			sign	verify	sign/s	verify/s
#rsa  512 bits	0.0028s	0.0003s	  357.1	 3844.4
#rsa 1024 bits	0.0148s	0.0008s	   67.5	 1239.7
#rsa 2048 bits	0.0963s	0.0028s	   10.4	  353.0
#rsa 4096 bits	0.6538s	0.0102s	    1.5	   98.1
#dsa  512 bits	0.0026s	0.0032s	  382.5	  313.7
#dsa 1024 bits	0.0081s	0.0099s	  122.8	  100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits	0.0020s	0.0002s	  510.4	 6273.7
#rsa 1024 bits	0.0088s	0.0005s	  114.1	 2128.3
#rsa 2048 bits	0.0540s	0.0016s	   18.5	  622.5
#rsa 4096 bits	0.3700s	0.0058s	    2.7	  171.0
#dsa  512 bits	0.0016s	0.0020s	  610.7	  507.1
#dsa 1024 bits	0.0047s	0.0058s	  212.5	  173.2
#
#	Again, performance increases by about 75%
#
#	Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#	OpenSSL 0.9.7c 30 Sep 2003
#
#	Original code.
#
#rsa  512 bits	0.0011s	0.0001s	  906.1	11012.5
#rsa 1024 bits	0.0060s	0.0003s	  166.6	 3363.1
#rsa 2048 bits	0.0370s	0.0010s	   27.1	  982.4
#rsa 4096 bits	0.2426s	0.0036s	    4.1	  280.4
#dsa  512 bits	0.0010s	0.0012s	 1038.1	  841.5
#dsa 1024 bits	0.0030s	0.0037s	  329.6	  269.7
#dsa 2048 bits	0.0101s	0.0127s	   98.9	   78.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits	0.0007s	0.0001s	 1416.2	16645.9
#rsa 1024 bits	0.0036s	0.0002s	  274.4	 5380.6
#rsa 2048 bits	0.0222s	0.0006s	   45.1	 1589.5
#rsa 4096 bits	0.1469s	0.0022s	    6.8	  449.6
#dsa  512 bits	0.0006s	0.0007s	 1664.2	 1376.2
#dsa 1024 bits	0.0018s	0.0023s	  545.0	  442.2
#dsa 2048 bits	0.0061s	0.0075s	  163.5	  132.8
#
#	Performance increase of ~60%
#	Based on submission from Suresh N. Chari of IBM

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)	Andy Polyakov
##
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works for the xlc,cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand optimized assembly code for the following routines
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve code send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"
	.text

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
# Here's the assembly
#
#
	xor	r0,r0,r0		# set r0 = 0. Used in the addze
					# instructions below

	#sqr_add_c(a,0,c1,c2,c3)
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5
	$UMULH	r10,r5,r5		#in first iteration. No need
					#to add since c1=c2=c3=0.
					# Note c3(r11) is NOT set to 0
					# but will be.

	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
					# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0			# catch carry if any.
					# r9= r0(=0) and carry

	addc	r10,r7,r10		# now add to temp result.
	addze	r11,r8			# r8 added to r11 which is 0
	addze	r9,r9

	$ST	r10,`1*$BNSZ`(r3)	#r[1]=c2;
	#sqr_add_c(a,1,c3,c1,c2)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5] = c3
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10

	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	.bn_sqr_comba4,.-.bn_sqr_comba4

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#

	xor	r0,r0,r0		#set r0 = 0.Used in addze
					#instructions below.

	#sqr_add_c(a,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5		#1st iteration:	no carries.
	$UMULH	r10,r5,r5
	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
	#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10		#add the two register number
	adde	r11,r8,r0		# (r8,r7) to the three register
	addze	r9,r0			# number (r9,r11,r10).NOTE:r0=0

	addc	r10,r7,r10		#add the two register number
	adde	r11,r8,r11		# (r8,r7) to the three register
	addze	r9,r9			# number (r9,r11,r10).
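	# (Clarifying note, comments only.)  Unlike bn_sqr_comba4 above,
	# which doubles the partial product first (addc r7,r7,r7;
	# adde r8,r8,r8), this routine gets the factor of 2 required by
	# sqr_add_c2(a,i,j,...) by accumulating the same (r8,r7) product
	# twice, as in the two add groups above.  Roughly, in C terms
	# (a sketch of the generic bn_asm.c-style macro, names approximate):
	#	sqr_add_c2(a,i,j,c0,c1,c2):
	#	  t = (double-width) a[i]*a[j];
	#	  c0 += lo(t); c1 += hi(t)+carry; c2 += carry;
	#	  c0 += lo(t); c1 += hi(t)+carry; c2 += carry;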

	$ST	r10,`1*$BNSZ`(r3)	# r[1]=c2

	#sqr_add_c(a,1,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1;
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2;
	#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5]=c3;
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1;
	#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2;
	#sqr_add_c(a,4,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`8*$BNSZ`(r3)	#r[8]=c3;
	#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD	r5,`4*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`9*$BNSZ`(r3)	#r[9]=c1;
	#sqr_add_c(a,5,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`10*$BNSZ`(r3)	#r[10]=c2;
	#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD	r5,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD	r5,`5*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`11*$BNSZ`(r3)	#r[11]=c3;
	#sqr_add_c(a,6,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;

	#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
	#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9,`15*$BNSZ`(r3)	#r[15]=c1;


	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	.bn_sqr_comba8,.-.bn_sqr_comba8

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_mul_comba4,.-.bn_mul_comba4

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.

	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
	#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
	#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
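	# Reference note (comment only): each #mul_add_c(a[i],b[j],c0,c1,c2)
	# group in this routine follows the pattern of the generic
	# bn_asm.c-style macro, approximately:
	#	t = (double-width) a[i]*b[j];
	#	c0 += lo(t); c1 += hi(t) + carry; c2 += carry;
	# with (c0,c1,c2) rotating through (r10,r11,r12) as the result
	# index advances.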
	#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
	#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
	#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
	#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
	#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
	#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
	#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
	#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
	#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
	#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_mul_comba8,.-.bn_mul_comba8

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8 -1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1         # keep only last bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_sub_words,.-.bn_sub_words

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_add_words,.-.bn_add_words

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of call to num_bits_word. Since this was compiled
#	only at level -O2 we can possibly squeeze it more?
#
#	r3 = h
#	r4 = l
#	r5 = d

	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there're, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<< i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l <<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_div_words,.-.bn_div_words

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	No unrolling done here. Not performance critical.

	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
	#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_sqr_words,.-.bn_sqr_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
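					#(added note) the unrolled body
					#alternates the (r9,r10) and
					#(r11,r12) pairs, so the high
					#word of each product is folded
					#into the next low word with
					#adde; only the 4th word needs
					#addze to collect the carry
					#into r12 for the next spin.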
	$ST	r9,`0*$BNSZ`(r3)
	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_words,.-.bn_mul_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# empirical evidence suggests that unrolled version performs best!!
#
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12  		#r12 = 0 . used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
	#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

	#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
	#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_add_words,.-.bn_mul_add_words
	.align	4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT or die "error closing STDOUT: $!";
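
# Usage note (added; not part of the original header): the OpenSSL build
# normally drives this wrapper through the perlasm framework, passing a
# flavour and an output file, along the lines of
#
#	perl ppc.pl linux64 ppc64.s
#
# The flavour/output names above are illustrative; the exact values come
# from the Configure target.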