#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to the powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter/gather can be tuned without
# modifying bn_exp.c.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# the branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is $bp[2^5+idx], etc.
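# For reference, the cache-neutral gather implemented by the code below can
# be sketched in C roughly as follows (an illustrative sketch only, not part
# of this module; the helper name and types are made up). The point is that
# every one of the 2^5 table entries is read and masked, so the memory
# access pattern is independent of the secret index:
#
#	/* b[i] is stored at bp[2^5*i + idx], see the layout comment above */
#	BN_ULONG gather5(const BN_ULONG *bp, int idx, int i)
#	{
#		BN_ULONG acc = 0;
#		for (int k = 0; k < 32; k++) {
#			BN_ULONG mask = 0 - (BN_ULONG)(k == idx);	/* all-ones iff k==idx */
#			acc |= bp[32*i + k] & mask;
#		}
#		return acc;	/* == b[i], fetched with a fixed access pattern */
#	}
#
# The SSE2 code below does the same thing 16 bytes at a time: it broadcasts
# the index, compares it against 0..31 with pcmpeqd to build the masks on
# the stack, and then pand/por-s all table lines into the selected element.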
72$lo0="%r10"; 73$hi0="%r11"; 74$hi1="%r13"; 75$i="%r14"; 76$j="%r15"; 77$m0="%rbx"; 78$m1="%rbp"; 79 80$code=<<___; 81.text 82 83.extern OPENSSL_ia32cap_P 84 85.globl bn_mul_mont_gather5 86.type bn_mul_mont_gather5,\@function,6 87.align 64 88bn_mul_mont_gather5: 89 test \$7,${num}d 90 jnz .Lmul_enter 91___ 92$code.=<<___ if ($addx); 93 mov OPENSSL_ia32cap_P+8(%rip),%r11d 94___ 95$code.=<<___; 96 jmp .Lmul4x_enter 97 98.align 16 99.Lmul_enter: 100 mov ${num}d,${num}d 101 mov %rsp,%rax 102 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument 103 lea .Linc(%rip),%r10 104 push %rbx 105 push %rbp 106 push %r12 107 push %r13 108 push %r14 109 push %r15 110 111 lea 2($num),%r11 112 neg %r11 113 lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) 114 and \$-1024,%rsp # minimize TLB usage 115 116 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 117.Lmul_body: 118 # Some OSes, *cough*-dows, insist on stack being "wired" to 119 # physical memory in strictly sequential manner, i.e. if stack 120 # allocation spans two pages, then reference to farmost one can 121 # be punishable by SEGV. But page walking can do good even on 122 # other OSes, because it guarantees that villain thread hits 123 # the guard page before it can make damage to innocent one... 124 sub %rsp,%rax 125 and \$-4096,%rax 126.Lmul_page_walk: 127 mov (%rsp,%rax),%r11 128 sub \$4096,%rax 129 .byte 0x2e # predict non-taken 130 jnc .Lmul_page_walk 131 132 lea 128($bp),%r12 # reassign $bp (+size optimization) 133___ 134 $bp="%r12"; 135 $STRIDE=2**5*8; # 5 is "window size" 136 $N=$STRIDE/4; # should match cache line size 137$code.=<<___; 138 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 139 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 140 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) 141 and \$-16,%r10 142 143 pshufd \$0,%xmm5,%xmm5 # broadcast index 144 movdqa %xmm1,%xmm4 145 movdqa %xmm1,%xmm2 146___ 147######################################################################## 148# calculate mask by comparing 0..31 to index and save result to stack 149# 150$code.=<<___; 151 paddd %xmm0,%xmm1 152 pcmpeqd %xmm5,%xmm0 # compare to 1,0 153 .byte 0x67 154 movdqa %xmm4,%xmm3 155___ 156for($k=0;$k<$STRIDE/16-4;$k+=4) { 157$code.=<<___; 158 paddd %xmm1,%xmm2 159 pcmpeqd %xmm5,%xmm1 # compare to 3,2 160 movdqa %xmm0,`16*($k+0)+112`(%r10) 161 movdqa %xmm4,%xmm0 162 163 paddd %xmm2,%xmm3 164 pcmpeqd %xmm5,%xmm2 # compare to 5,4 165 movdqa %xmm1,`16*($k+1)+112`(%r10) 166 movdqa %xmm4,%xmm1 167 168 paddd %xmm3,%xmm0 169 pcmpeqd %xmm5,%xmm3 # compare to 7,6 170 movdqa %xmm2,`16*($k+2)+112`(%r10) 171 movdqa %xmm4,%xmm2 172 173 paddd %xmm0,%xmm1 174 pcmpeqd %xmm5,%xmm0 175 movdqa %xmm3,`16*($k+3)+112`(%r10) 176 movdqa %xmm4,%xmm3 177___ 178} 179$code.=<<___; # last iteration can be optimized 180 paddd %xmm1,%xmm2 181 pcmpeqd %xmm5,%xmm1 182 movdqa %xmm0,`16*($k+0)+112`(%r10) 183 184 paddd %xmm2,%xmm3 185 .byte 0x67 186 pcmpeqd %xmm5,%xmm2 187 movdqa %xmm1,`16*($k+1)+112`(%r10) 188 189 pcmpeqd %xmm5,%xmm3 190 movdqa %xmm2,`16*($k+2)+112`(%r10) 191 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register 192 193 pand `16*($k+1)-128`($bp),%xmm1 194 pand `16*($k+2)-128`($bp),%xmm2 195 movdqa %xmm3,`16*($k+3)+112`(%r10) 196 pand `16*($k+3)-128`($bp),%xmm3 197 por %xmm2,%xmm0 198 por %xmm3,%xmm1 199___ 200for($k=0;$k<$STRIDE/16-4;$k+=4) { 201$code.=<<___; 202 movdqa `16*($k+0)-128`($bp),%xmm4 203 movdqa `16*($k+1)-128`($bp),%xmm5 204 movdqa `16*($k+2)-128`($bp),%xmm2 205 pand 
`16*($k+0)+112`(%r10),%xmm4 206 movdqa `16*($k+3)-128`($bp),%xmm3 207 pand `16*($k+1)+112`(%r10),%xmm5 208 por %xmm4,%xmm0 209 pand `16*($k+2)+112`(%r10),%xmm2 210 por %xmm5,%xmm1 211 pand `16*($k+3)+112`(%r10),%xmm3 212 por %xmm2,%xmm0 213 por %xmm3,%xmm1 214___ 215} 216$code.=<<___; 217 por %xmm1,%xmm0 218 pshufd \$0x4e,%xmm0,%xmm1 219 por %xmm1,%xmm0 220 lea $STRIDE($bp),$bp 221 movq %xmm0,$m0 # m0=bp[0] 222 223 mov ($n0),$n0 # pull n0[0] value 224 mov ($ap),%rax 225 226 xor $i,$i # i=0 227 xor $j,$j # j=0 228 229 mov $n0,$m1 230 mulq $m0 # ap[0]*bp[0] 231 mov %rax,$lo0 232 mov ($np),%rax 233 234 imulq $lo0,$m1 # "tp[0]"*n0 235 mov %rdx,$hi0 236 237 mulq $m1 # np[0]*m1 238 add %rax,$lo0 # discarded 239 mov 8($ap),%rax 240 adc \$0,%rdx 241 mov %rdx,$hi1 242 243 lea 1($j),$j # j++ 244 jmp .L1st_enter 245 246.align 16 247.L1st: 248 add %rax,$hi1 249 mov ($ap,$j,8),%rax 250 adc \$0,%rdx 251 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 252 mov $lo0,$hi0 253 adc \$0,%rdx 254 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 255 mov %rdx,$hi1 256 257.L1st_enter: 258 mulq $m0 # ap[j]*bp[0] 259 add %rax,$hi0 260 mov ($np,$j,8),%rax 261 adc \$0,%rdx 262 lea 1($j),$j # j++ 263 mov %rdx,$lo0 264 265 mulq $m1 # np[j]*m1 266 cmp $num,$j 267 jne .L1st # note that upon exit $j==$num, so 268 # they can be used interchangeably 269 270 add %rax,$hi1 271 adc \$0,%rdx 272 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 273 adc \$0,%rdx 274 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 275 mov %rdx,$hi1 276 mov $lo0,$hi0 277 278 xor %rdx,%rdx 279 add $hi0,$hi1 280 adc \$0,%rdx 281 mov $hi1,-8(%rsp,$num,8) 282 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 283 284 lea 1($i),$i # i++ 285 jmp .Louter 286.align 16 287.Louter: 288 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 289 and \$-16,%rdx 290 pxor %xmm4,%xmm4 291 pxor %xmm5,%xmm5 292___ 293for($k=0;$k<$STRIDE/16;$k+=4) { 294$code.=<<___; 295 movdqa `16*($k+0)-128`($bp),%xmm0 296 movdqa `16*($k+1)-128`($bp),%xmm1 297 movdqa `16*($k+2)-128`($bp),%xmm2 298 movdqa `16*($k+3)-128`($bp),%xmm3 299 pand `16*($k+0)-128`(%rdx),%xmm0 300 pand `16*($k+1)-128`(%rdx),%xmm1 301 por %xmm0,%xmm4 302 pand `16*($k+2)-128`(%rdx),%xmm2 303 por %xmm1,%xmm5 304 pand `16*($k+3)-128`(%rdx),%xmm3 305 por %xmm2,%xmm4 306 por %xmm3,%xmm5 307___ 308} 309$code.=<<___; 310 por %xmm5,%xmm4 311 pshufd \$0x4e,%xmm4,%xmm0 312 por %xmm4,%xmm0 313 lea $STRIDE($bp),$bp 314 315 mov ($ap),%rax # ap[0] 316 movq %xmm0,$m0 # m0=bp[i] 317 318 xor $j,$j # j=0 319 mov $n0,$m1 320 mov (%rsp),$lo0 321 322 mulq $m0 # ap[0]*bp[i] 323 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 324 mov ($np),%rax 325 adc \$0,%rdx 326 327 imulq $lo0,$m1 # tp[0]*n0 328 mov %rdx,$hi0 329 330 mulq $m1 # np[0]*m1 331 add %rax,$lo0 # discarded 332 mov 8($ap),%rax 333 adc \$0,%rdx 334 mov 8(%rsp),$lo0 # tp[1] 335 mov %rdx,$hi1 336 337 lea 1($j),$j # j++ 338 jmp .Linner_enter 339 340.align 16 341.Linner: 342 add %rax,$hi1 343 mov ($ap,$j,8),%rax 344 adc \$0,%rdx 345 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 346 mov (%rsp,$j,8),$lo0 347 adc \$0,%rdx 348 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 349 mov %rdx,$hi1 350 351.Linner_enter: 352 mulq $m0 # ap[j]*bp[i] 353 add %rax,$hi0 354 mov ($np,$j,8),%rax 355 adc \$0,%rdx 356 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 357 mov %rdx,$hi0 358 adc \$0,$hi0 359 lea 1($j),$j # j++ 360 361 mulq $m1 # np[j]*m1 362 cmp $num,$j 363 jne .Linner # note that upon exit $j==$num, so 364 # they can be used interchangeably 365 add %rax,$hi1 366 adc \$0,%rdx 367 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 368 mov (%rsp,$num,8),$lo0 369 adc 
\$0,%rdx 370 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 371 mov %rdx,$hi1 372 373 xor %rdx,%rdx 374 add $hi0,$hi1 375 adc \$0,%rdx 376 add $lo0,$hi1 # pull upmost overflow bit 377 adc \$0,%rdx 378 mov $hi1,-8(%rsp,$num,8) 379 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 380 381 lea 1($i),$i # i++ 382 cmp $num,$i 383 jb .Louter 384 385 xor $i,$i # i=0 and clear CF! 386 mov (%rsp),%rax # tp[0] 387 lea (%rsp),$ap # borrow ap for tp 388 mov $num,$j # j=num 389 jmp .Lsub 390.align 16 391.Lsub: sbb ($np,$i,8),%rax 392 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 393 mov 8($ap,$i,8),%rax # tp[i+1] 394 lea 1($i),$i # i++ 395 dec $j # doesnn't affect CF! 396 jnz .Lsub 397 398 sbb \$0,%rax # handle upmost overflow bit 399 xor $i,$i 400 and %rax,$ap 401 not %rax 402 mov $rp,$np 403 and %rax,$np 404 mov $num,$j # j=num 405 or $np,$ap # ap=borrow?tp:rp 406.align 16 407.Lcopy: # copy or in-place refresh 408 mov ($ap,$i,8),%rax 409 mov $i,(%rsp,$i,8) # zap temporary vector 410 mov %rax,($rp,$i,8) # rp[i]=tp[i] 411 lea 1($i),$i 412 sub \$1,$j 413 jnz .Lcopy 414 415 mov 8(%rsp,$num,8),%rsi # restore %rsp 416 mov \$1,%rax 417 418 mov -48(%rsi),%r15 419 mov -40(%rsi),%r14 420 mov -32(%rsi),%r13 421 mov -24(%rsi),%r12 422 mov -16(%rsi),%rbp 423 mov -8(%rsi),%rbx 424 lea (%rsi),%rsp 425.Lmul_epilogue: 426 ret 427.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 428___ 429{{{ 430my @A=("%r10","%r11"); 431my @N=("%r13","%rdi"); 432$code.=<<___; 433.type bn_mul4x_mont_gather5,\@function,6 434.align 32 435bn_mul4x_mont_gather5: 436.Lmul4x_enter: 437___ 438$code.=<<___ if ($addx); 439 and \$0x80108,%r11d 440 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 441 je .Lmulx4x_enter 442___ 443$code.=<<___; 444 .byte 0x67 445 mov %rsp,%rax 446 push %rbx 447 push %rbp 448 push %r12 449 push %r13 450 push %r14 451 push %r15 452 453 .byte 0x67 454 shl \$3,${num}d # convert $num to bytes 455 lea ($num,$num,2),%r10 # 3*$num in bytes 456 neg $num # -$num 457 458 ############################################################## 459 # Ensure that stack frame doesn't alias with $rptr+3*$num 460 # modulo 4096, which covers ret[num], am[num] and n[num] 461 # (see bn_exp.c). This is done to allow memory disambiguation 462 # logic do its magic. [Extra [num] is allocated in order 463 # to align with bn_power5's frame, which is cleansed after 464 # completing exponentiation. Extra 256 bytes is for power mask 465 # calculated from 7th argument, the index.] 
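	# [A note on the "magic": on many x86 cores the memory-disambiguation
	# logic decides whether a load may bypass an earlier store by looking
	# only at the low 12 address bits, so buffers whose addresses coincide
	# modulo 4096 can trigger false dependencies ("4K aliasing") and
	# costly replays. The check below measures, modulo 4096, where the
	# prospective frame would land relative to $rp and adjusts %rsp so
	# that accesses to the temporary area and to the caller's vectors
	# avoid such collisions.]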
466 # 467 lea -320(%rsp,$num,2),%r11 468 sub $rp,%r11 469 and \$4095,%r11 470 cmp %r11,%r10 471 jb .Lmul4xsp_alt 472 sub %r11,%rsp # align with $rp 473 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) 474 jmp .Lmul4xsp_done 475 476.align 32 477.Lmul4xsp_alt: 478 lea 4096-320(,$num,2),%r10 479 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) 480 sub %r10,%r11 481 mov \$0,%r10 482 cmovc %r10,%r11 483 sub %r11,%rsp 484.Lmul4xsp_done: 485 and \$-64,%rsp 486 mov %rax,%r11 487 sub %rsp,%r11 488 and \$-4096,%r11 489.Lmul4x_page_walk: 490 mov (%rsp,%r11),%r10 491 sub \$4096,%r11 492 .byte 0x2e # predict non-taken 493 jnc .Lmul4x_page_walk 494 495 neg $num 496 497 mov %rax,40(%rsp) 498.Lmul4x_body: 499 500 call mul4x_internal 501 502 mov 40(%rsp),%rsi # restore %rsp 503 mov \$1,%rax 504 505 mov -48(%rsi),%r15 506 mov -40(%rsi),%r14 507 mov -32(%rsi),%r13 508 mov -24(%rsi),%r12 509 mov -16(%rsi),%rbp 510 mov -8(%rsi),%rbx 511 lea (%rsi),%rsp 512.Lmul4x_epilogue: 513 ret 514.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 515 516.type mul4x_internal,\@abi-omnipotent 517.align 32 518mul4x_internal: 519 shl \$5,$num # $num was in bytes 520 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 521 lea .Linc(%rip),%rax 522 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 523 shr \$5,$num # restore $num 524___ 525 $bp="%r12"; 526 $STRIDE=2**5*8; # 5 is "window size" 527 $N=$STRIDE/4; # should match cache line size 528 $tp=$i; 529$code.=<<___; 530 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 531 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 532 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 533 lea 128(%rdx),$bp # size optimization 534 535 pshufd \$0,%xmm5,%xmm5 # broadcast index 536 movdqa %xmm1,%xmm4 537 .byte 0x67,0x67 538 movdqa %xmm1,%xmm2 539___ 540######################################################################## 541# calculate mask by comparing 0..31 to index and save result to stack 542# 543$code.=<<___; 544 paddd %xmm0,%xmm1 545 pcmpeqd %xmm5,%xmm0 # compare to 1,0 546 .byte 0x67 547 movdqa %xmm4,%xmm3 548___ 549for($i=0;$i<$STRIDE/16-4;$i+=4) { 550$code.=<<___; 551 paddd %xmm1,%xmm2 552 pcmpeqd %xmm5,%xmm1 # compare to 3,2 553 movdqa %xmm0,`16*($i+0)+112`(%r10) 554 movdqa %xmm4,%xmm0 555 556 paddd %xmm2,%xmm3 557 pcmpeqd %xmm5,%xmm2 # compare to 5,4 558 movdqa %xmm1,`16*($i+1)+112`(%r10) 559 movdqa %xmm4,%xmm1 560 561 paddd %xmm3,%xmm0 562 pcmpeqd %xmm5,%xmm3 # compare to 7,6 563 movdqa %xmm2,`16*($i+2)+112`(%r10) 564 movdqa %xmm4,%xmm2 565 566 paddd %xmm0,%xmm1 567 pcmpeqd %xmm5,%xmm0 568 movdqa %xmm3,`16*($i+3)+112`(%r10) 569 movdqa %xmm4,%xmm3 570___ 571} 572$code.=<<___; # last iteration can be optimized 573 paddd %xmm1,%xmm2 574 pcmpeqd %xmm5,%xmm1 575 movdqa %xmm0,`16*($i+0)+112`(%r10) 576 577 paddd %xmm2,%xmm3 578 .byte 0x67 579 pcmpeqd %xmm5,%xmm2 580 movdqa %xmm1,`16*($i+1)+112`(%r10) 581 582 pcmpeqd %xmm5,%xmm3 583 movdqa %xmm2,`16*($i+2)+112`(%r10) 584 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 585 586 pand `16*($i+1)-128`($bp),%xmm1 587 pand `16*($i+2)-128`($bp),%xmm2 588 movdqa %xmm3,`16*($i+3)+112`(%r10) 589 pand `16*($i+3)-128`($bp),%xmm3 590 por %xmm2,%xmm0 591 por %xmm3,%xmm1 592___ 593for($i=0;$i<$STRIDE/16-4;$i+=4) { 594$code.=<<___; 595 movdqa `16*($i+0)-128`($bp),%xmm4 596 movdqa `16*($i+1)-128`($bp),%xmm5 597 movdqa `16*($i+2)-128`($bp),%xmm2 598 pand `16*($i+0)+112`(%r10),%xmm4 599 movdqa `16*($i+3)-128`($bp),%xmm3 600 pand `16*($i+1)+112`(%r10),%xmm5 
601 por %xmm4,%xmm0 602 pand `16*($i+2)+112`(%r10),%xmm2 603 por %xmm5,%xmm1 604 pand `16*($i+3)+112`(%r10),%xmm3 605 por %xmm2,%xmm0 606 por %xmm3,%xmm1 607___ 608} 609$code.=<<___; 610 por %xmm1,%xmm0 611 pshufd \$0x4e,%xmm0,%xmm1 612 por %xmm1,%xmm0 613 lea $STRIDE($bp),$bp 614 movq %xmm0,$m0 # m0=bp[0] 615 616 mov %r13,16+8(%rsp) # save end of b[num] 617 mov $rp, 56+8(%rsp) # save $rp 618 619 mov ($n0),$n0 # pull n0[0] value 620 mov ($ap),%rax 621 lea ($ap,$num),$ap # end of a[num] 622 neg $num 623 624 mov $n0,$m1 625 mulq $m0 # ap[0]*bp[0] 626 mov %rax,$A[0] 627 mov ($np),%rax 628 629 imulq $A[0],$m1 # "tp[0]"*n0 630 lea 64+8(%rsp),$tp 631 mov %rdx,$A[1] 632 633 mulq $m1 # np[0]*m1 634 add %rax,$A[0] # discarded 635 mov 8($ap,$num),%rax 636 adc \$0,%rdx 637 mov %rdx,$N[1] 638 639 mulq $m0 640 add %rax,$A[1] 641 mov 8*1($np),%rax 642 adc \$0,%rdx 643 mov %rdx,$A[0] 644 645 mulq $m1 646 add %rax,$N[1] 647 mov 16($ap,$num),%rax 648 adc \$0,%rdx 649 add $A[1],$N[1] 650 lea 4*8($num),$j # j=4 651 lea 8*4($np),$np 652 adc \$0,%rdx 653 mov $N[1],($tp) 654 mov %rdx,$N[0] 655 jmp .L1st4x 656 657.align 32 658.L1st4x: 659 mulq $m0 # ap[j]*bp[0] 660 add %rax,$A[0] 661 mov -8*2($np),%rax 662 lea 32($tp),$tp 663 adc \$0,%rdx 664 mov %rdx,$A[1] 665 666 mulq $m1 # np[j]*m1 667 add %rax,$N[0] 668 mov -8($ap,$j),%rax 669 adc \$0,%rdx 670 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 671 adc \$0,%rdx 672 mov $N[0],-24($tp) # tp[j-1] 673 mov %rdx,$N[1] 674 675 mulq $m0 # ap[j]*bp[0] 676 add %rax,$A[1] 677 mov -8*1($np),%rax 678 adc \$0,%rdx 679 mov %rdx,$A[0] 680 681 mulq $m1 # np[j]*m1 682 add %rax,$N[1] 683 mov ($ap,$j),%rax 684 adc \$0,%rdx 685 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 686 adc \$0,%rdx 687 mov $N[1],-16($tp) # tp[j-1] 688 mov %rdx,$N[0] 689 690 mulq $m0 # ap[j]*bp[0] 691 add %rax,$A[0] 692 mov 8*0($np),%rax 693 adc \$0,%rdx 694 mov %rdx,$A[1] 695 696 mulq $m1 # np[j]*m1 697 add %rax,$N[0] 698 mov 8($ap,$j),%rax 699 adc \$0,%rdx 700 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 701 adc \$0,%rdx 702 mov $N[0],-8($tp) # tp[j-1] 703 mov %rdx,$N[1] 704 705 mulq $m0 # ap[j]*bp[0] 706 add %rax,$A[1] 707 mov 8*1($np),%rax 708 adc \$0,%rdx 709 mov %rdx,$A[0] 710 711 mulq $m1 # np[j]*m1 712 add %rax,$N[1] 713 mov 16($ap,$j),%rax 714 adc \$0,%rdx 715 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 716 lea 8*4($np),$np 717 adc \$0,%rdx 718 mov $N[1],($tp) # tp[j-1] 719 mov %rdx,$N[0] 720 721 add \$32,$j # j+=4 722 jnz .L1st4x 723 724 mulq $m0 # ap[j]*bp[0] 725 add %rax,$A[0] 726 mov -8*2($np),%rax 727 lea 32($tp),$tp 728 adc \$0,%rdx 729 mov %rdx,$A[1] 730 731 mulq $m1 # np[j]*m1 732 add %rax,$N[0] 733 mov -8($ap),%rax 734 adc \$0,%rdx 735 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 736 adc \$0,%rdx 737 mov $N[0],-24($tp) # tp[j-1] 738 mov %rdx,$N[1] 739 740 mulq $m0 # ap[j]*bp[0] 741 add %rax,$A[1] 742 mov -8*1($np),%rax 743 adc \$0,%rdx 744 mov %rdx,$A[0] 745 746 mulq $m1 # np[j]*m1 747 add %rax,$N[1] 748 mov ($ap,$num),%rax # ap[0] 749 adc \$0,%rdx 750 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 751 adc \$0,%rdx 752 mov $N[1],-16($tp) # tp[j-1] 753 mov %rdx,$N[0] 754 755 lea ($np,$num),$np # rewind $np 756 757 xor $N[1],$N[1] 758 add $A[0],$N[0] 759 adc \$0,$N[1] 760 mov $N[0],-8($tp) 761 762 jmp .Louter4x 763 764.align 32 765.Louter4x: 766 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 767 pxor %xmm4,%xmm4 768 pxor %xmm5,%xmm5 769___ 770for($i=0;$i<$STRIDE/16;$i+=4) { 771$code.=<<___; 772 movdqa `16*($i+0)-128`($bp),%xmm0 773 movdqa `16*($i+1)-128`($bp),%xmm1 774 movdqa `16*($i+2)-128`($bp),%xmm2 775 movdqa 
`16*($i+3)-128`($bp),%xmm3 776 pand `16*($i+0)-128`(%rdx),%xmm0 777 pand `16*($i+1)-128`(%rdx),%xmm1 778 por %xmm0,%xmm4 779 pand `16*($i+2)-128`(%rdx),%xmm2 780 por %xmm1,%xmm5 781 pand `16*($i+3)-128`(%rdx),%xmm3 782 por %xmm2,%xmm4 783 por %xmm3,%xmm5 784___ 785} 786$code.=<<___; 787 por %xmm5,%xmm4 788 pshufd \$0x4e,%xmm4,%xmm0 789 por %xmm4,%xmm0 790 lea $STRIDE($bp),$bp 791 movq %xmm0,$m0 # m0=bp[i] 792 793 mov ($tp,$num),$A[0] 794 mov $n0,$m1 795 mulq $m0 # ap[0]*bp[i] 796 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 797 mov ($np),%rax 798 adc \$0,%rdx 799 800 imulq $A[0],$m1 # tp[0]*n0 801 mov %rdx,$A[1] 802 mov $N[1],($tp) # store upmost overflow bit 803 804 lea ($tp,$num),$tp # rewind $tp 805 806 mulq $m1 # np[0]*m1 807 add %rax,$A[0] # "$N[0]", discarded 808 mov 8($ap,$num),%rax 809 adc \$0,%rdx 810 mov %rdx,$N[1] 811 812 mulq $m0 # ap[j]*bp[i] 813 add %rax,$A[1] 814 mov 8*1($np),%rax 815 adc \$0,%rdx 816 add 8($tp),$A[1] # +tp[1] 817 adc \$0,%rdx 818 mov %rdx,$A[0] 819 820 mulq $m1 # np[j]*m1 821 add %rax,$N[1] 822 mov 16($ap,$num),%rax 823 adc \$0,%rdx 824 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 825 lea 4*8($num),$j # j=4 826 lea 8*4($np),$np 827 adc \$0,%rdx 828 mov %rdx,$N[0] 829 jmp .Linner4x 830 831.align 32 832.Linner4x: 833 mulq $m0 # ap[j]*bp[i] 834 add %rax,$A[0] 835 mov -8*2($np),%rax 836 adc \$0,%rdx 837 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 838 lea 32($tp),$tp 839 adc \$0,%rdx 840 mov %rdx,$A[1] 841 842 mulq $m1 # np[j]*m1 843 add %rax,$N[0] 844 mov -8($ap,$j),%rax 845 adc \$0,%rdx 846 add $A[0],$N[0] 847 adc \$0,%rdx 848 mov $N[1],-32($tp) # tp[j-1] 849 mov %rdx,$N[1] 850 851 mulq $m0 # ap[j]*bp[i] 852 add %rax,$A[1] 853 mov -8*1($np),%rax 854 adc \$0,%rdx 855 add -8($tp),$A[1] 856 adc \$0,%rdx 857 mov %rdx,$A[0] 858 859 mulq $m1 # np[j]*m1 860 add %rax,$N[1] 861 mov ($ap,$j),%rax 862 adc \$0,%rdx 863 add $A[1],$N[1] 864 adc \$0,%rdx 865 mov $N[0],-24($tp) # tp[j-1] 866 mov %rdx,$N[0] 867 868 mulq $m0 # ap[j]*bp[i] 869 add %rax,$A[0] 870 mov 8*0($np),%rax 871 adc \$0,%rdx 872 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 873 adc \$0,%rdx 874 mov %rdx,$A[1] 875 876 mulq $m1 # np[j]*m1 877 add %rax,$N[0] 878 mov 8($ap,$j),%rax 879 adc \$0,%rdx 880 add $A[0],$N[0] 881 adc \$0,%rdx 882 mov $N[1],-16($tp) # tp[j-1] 883 mov %rdx,$N[1] 884 885 mulq $m0 # ap[j]*bp[i] 886 add %rax,$A[1] 887 mov 8*1($np),%rax 888 adc \$0,%rdx 889 add 8($tp),$A[1] 890 adc \$0,%rdx 891 mov %rdx,$A[0] 892 893 mulq $m1 # np[j]*m1 894 add %rax,$N[1] 895 mov 16($ap,$j),%rax 896 adc \$0,%rdx 897 add $A[1],$N[1] 898 lea 8*4($np),$np 899 adc \$0,%rdx 900 mov $N[0],-8($tp) # tp[j-1] 901 mov %rdx,$N[0] 902 903 add \$32,$j # j+=4 904 jnz .Linner4x 905 906 mulq $m0 # ap[j]*bp[i] 907 add %rax,$A[0] 908 mov -8*2($np),%rax 909 adc \$0,%rdx 910 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 911 lea 32($tp),$tp 912 adc \$0,%rdx 913 mov %rdx,$A[1] 914 915 mulq $m1 # np[j]*m1 916 add %rax,$N[0] 917 mov -8($ap),%rax 918 adc \$0,%rdx 919 add $A[0],$N[0] 920 adc \$0,%rdx 921 mov $N[1],-32($tp) # tp[j-1] 922 mov %rdx,$N[1] 923 924 mulq $m0 # ap[j]*bp[i] 925 add %rax,$A[1] 926 mov $m1,%rax 927 mov -8*1($np),$m1 928 adc \$0,%rdx 929 add -8($tp),$A[1] 930 adc \$0,%rdx 931 mov %rdx,$A[0] 932 933 mulq $m1 # np[j]*m1 934 add %rax,$N[1] 935 mov ($ap,$num),%rax # ap[0] 936 adc \$0,%rdx 937 add $A[1],$N[1] 938 adc \$0,%rdx 939 mov $N[0],-24($tp) # tp[j-1] 940 mov %rdx,$N[0] 941 942 mov $N[1],-16($tp) # tp[j-1] 943 lea ($np,$num),$np # rewind $np 944 945 xor $N[1],$N[1] 946 add $A[0],$N[0] 947 adc \$0,$N[1] 948 add ($tp),$N[0] # pull upmost 
overflow bit 949 adc \$0,$N[1] # upmost overflow bit 950 mov $N[0],-8($tp) 951 952 cmp 16+8(%rsp),$bp 953 jb .Louter4x 954___ 955if (1) { 956$code.=<<___; 957 xor %rax,%rax 958 sub $N[0],$m1 # compare top-most words 959 adc $j,$j # $j is zero 960 or $j,$N[1] 961 sub $N[1],%rax # %rax=-$N[1] 962 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 963 mov ($np),%r12 964 lea ($np),%rbp # nptr in .sqr4x_sub 965 mov %r9,%rcx 966 sar \$3+2,%rcx 967 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 968 dec %r12 # so that after 'not' we get -n[0] 969 xor %r10,%r10 970 mov 8*1(%rbp),%r13 971 mov 8*2(%rbp),%r14 972 mov 8*3(%rbp),%r15 973 jmp .Lsqr4x_sub_entry 974___ 975} else { 976my @ri=("%rax",$bp,$m0,$m1); 977my $rp="%rdx"; 978$code.=<<___ 979 xor \$1,$N[1] 980 lea ($tp,$num),$tp # rewind $tp 981 sar \$5,$num # cf=0 982 lea ($np,$N[1],8),$np 983 mov 56+8(%rsp),$rp # restore $rp 984 jmp .Lsub4x 985 986.align 32 987.Lsub4x: 988 .byte 0x66 989 mov 8*0($tp),@ri[0] 990 mov 8*1($tp),@ri[1] 991 .byte 0x66 992 sbb 16*0($np),@ri[0] 993 mov 8*2($tp),@ri[2] 994 sbb 16*1($np),@ri[1] 995 mov 3*8($tp),@ri[3] 996 lea 4*8($tp),$tp 997 sbb 16*2($np),@ri[2] 998 mov @ri[0],8*0($rp) 999 sbb 16*3($np),@ri[3] 1000 lea 16*4($np),$np 1001 mov @ri[1],8*1($rp) 1002 mov @ri[2],8*2($rp) 1003 mov @ri[3],8*3($rp) 1004 lea 8*4($rp),$rp 1005 1006 inc $num 1007 jnz .Lsub4x 1008 1009 ret 1010___ 1011} 1012$code.=<<___; 1013.size mul4x_internal,.-mul4x_internal 1014___ 1015}}} 1016{{{ 1017###################################################################### 1018# void bn_power5( 1019my $rptr="%rdi"; # BN_ULONG *rptr, 1020my $aptr="%rsi"; # const BN_ULONG *aptr, 1021my $bptr="%rdx"; # const void *table, 1022my $nptr="%rcx"; # const BN_ULONG *nptr, 1023my $n0 ="%r8"; # const BN_ULONG *n0); 1024my $num ="%r9"; # int num, has to be divisible by 8 1025 # int pwr 1026 1027my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1028my @A0=("%r10","%r11"); 1029my @A1=("%r12","%r13"); 1030my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1031 1032$code.=<<___; 1033.globl bn_power5 1034.type bn_power5,\@function,6 1035.align 32 1036bn_power5: 1037___ 1038$code.=<<___ if ($addx); 1039 mov OPENSSL_ia32cap_P+8(%rip),%r11d 1040 and \$0x80108,%r11d 1041 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1042 je .Lpowerx5_enter 1043___ 1044$code.=<<___; 1045 mov %rsp,%rax 1046 push %rbx 1047 push %rbp 1048 push %r12 1049 push %r13 1050 push %r14 1051 push %r15 1052 1053 shl \$3,${num}d # convert $num to bytes 1054 lea ($num,$num,2),%r10d # 3*$num 1055 neg $num 1056 mov ($n0),$n0 # *n0 1057 1058 ############################################################## 1059 # Ensure that stack frame doesn't alias with $rptr+3*$num 1060 # modulo 4096, which covers ret[num], am[num] and n[num] 1061 # (see bn_exp.c). This is done to allow memory disambiguation 1062 # logic do its magic. [Extra 256 bytes is for power mask 1063 # calculated from 7th argument, the index.] 
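	# [For context: as used by bn_exp.c, this routine performs one 5-bit
	# window step of the fixed-window exponentiation, conceptually
	#
	#	for (k = 0; k < 5; k++) tmp = mont_sqr(tmp, n, n0);
	#	tmp = mont_mul(tmp, gather(table, pwr), n, n0);
	#
	# where gather() is the cache-neutral table lookup described at the
	# top of this file. This is a sketch of the intent, not literal code.]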
1064 # 1065 lea -320(%rsp,$num,2),%r11 1066 sub $rptr,%r11 1067 and \$4095,%r11 1068 cmp %r11,%r10 1069 jb .Lpwr_sp_alt 1070 sub %r11,%rsp # align with $aptr 1071 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) 1072 jmp .Lpwr_sp_done 1073 1074.align 32 1075.Lpwr_sp_alt: 1076 lea 4096-320(,$num,2),%r10 1077 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) 1078 sub %r10,%r11 1079 mov \$0,%r10 1080 cmovc %r10,%r11 1081 sub %r11,%rsp 1082.Lpwr_sp_done: 1083 and \$-64,%rsp 1084 mov %rax,%r11 1085 sub %rsp,%r11 1086 and \$-4096,%r11 1087.Lpwr_page_walk: 1088 mov (%rsp,%r11),%r10 1089 sub \$4096,%r11 1090 .byte 0x2e # predict non-taken 1091 jnc .Lpwr_page_walk 1092 1093 mov $num,%r10 1094 neg $num 1095 1096 ############################################################## 1097 # Stack layout 1098 # 1099 # +0 saved $num, used in reduction section 1100 # +8 &t[2*$num], used in reduction section 1101 # +32 saved *n0 1102 # +40 saved %rsp 1103 # +48 t[2*$num] 1104 # 1105 mov $n0, 32(%rsp) 1106 mov %rax, 40(%rsp) # save original %rsp 1107.Lpower5_body: 1108 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1109 movq $nptr,%xmm2 # save $nptr 1110 movq %r10, %xmm3 # -$num, used in sqr8x 1111 movq $bptr,%xmm4 1112 1113 call __bn_sqr8x_internal 1114 call __bn_post4x_internal 1115 call __bn_sqr8x_internal 1116 call __bn_post4x_internal 1117 call __bn_sqr8x_internal 1118 call __bn_post4x_internal 1119 call __bn_sqr8x_internal 1120 call __bn_post4x_internal 1121 call __bn_sqr8x_internal 1122 call __bn_post4x_internal 1123 1124 movq %xmm2,$nptr 1125 movq %xmm4,$bptr 1126 mov $aptr,$rptr 1127 mov 40(%rsp),%rax 1128 lea 32(%rsp),$n0 1129 1130 call mul4x_internal 1131 1132 mov 40(%rsp),%rsi # restore %rsp 1133 mov \$1,%rax 1134 mov -48(%rsi),%r15 1135 mov -40(%rsi),%r14 1136 mov -32(%rsi),%r13 1137 mov -24(%rsi),%r12 1138 mov -16(%rsi),%rbp 1139 mov -8(%rsi),%rbx 1140 lea (%rsi),%rsp 1141.Lpower5_epilogue: 1142 ret 1143.size bn_power5,.-bn_power5 1144 1145.globl bn_sqr8x_internal 1146.hidden bn_sqr8x_internal 1147.type bn_sqr8x_internal,\@abi-omnipotent 1148.align 32 1149bn_sqr8x_internal: 1150__bn_sqr8x_internal: 1151 ############################################################## 1152 # Squaring part: 1153 # 1154 # a) multiply-n-add everything but a[i]*a[i]; 1155 # b) shift result of a) by 1 to the left and accumulate 1156 # a[i]*a[i] products; 1157 # 1158 ############################################################## 1159 # a[1]a[0] 1160 # a[2]a[0] 1161 # a[3]a[0] 1162 # a[2]a[1] 1163 # a[4]a[0] 1164 # a[3]a[1] 1165 # a[5]a[0] 1166 # a[4]a[1] 1167 # a[3]a[2] 1168 # a[6]a[0] 1169 # a[5]a[1] 1170 # a[4]a[2] 1171 # a[7]a[0] 1172 # a[6]a[1] 1173 # a[5]a[2] 1174 # a[4]a[3] 1175 # a[7]a[1] 1176 # a[6]a[2] 1177 # a[5]a[3] 1178 # a[7]a[2] 1179 # a[6]a[3] 1180 # a[5]a[4] 1181 # a[7]a[3] 1182 # a[6]a[4] 1183 # a[7]a[4] 1184 # a[6]a[5] 1185 # a[7]a[5] 1186 # a[7]a[6] 1187 # a[1]a[0] 1188 # a[2]a[0] 1189 # a[3]a[0] 1190 # a[4]a[0] 1191 # a[5]a[0] 1192 # a[6]a[0] 1193 # a[7]a[0] 1194 # a[2]a[1] 1195 # a[3]a[1] 1196 # a[4]a[1] 1197 # a[5]a[1] 1198 # a[6]a[1] 1199 # a[7]a[1] 1200 # a[3]a[2] 1201 # a[4]a[2] 1202 # a[5]a[2] 1203 # a[6]a[2] 1204 # a[7]a[2] 1205 # a[4]a[3] 1206 # a[5]a[3] 1207 # a[6]a[3] 1208 # a[7]a[3] 1209 # a[5]a[4] 1210 # a[6]a[4] 1211 # a[7]a[4] 1212 # a[6]a[5] 1213 # a[7]a[5] 1214 # a[7]a[6] 1215 # a[0]a[0] 1216 # a[1]a[1] 1217 # a[2]a[2] 1218 # a[3]a[3] 1219 # a[4]a[4] 1220 # a[5]a[5] 1221 # a[6]a[6] 1222 # a[7]a[7] 1223 1224 lea 32(%r10),$i # $i=-($num-32) 1225 lea ($aptr,$num),$aptr # end of a[] 
buffer, ($aptr,$i)=&ap[2] 1226 1227 mov $num,$j # $j=$num 1228 1229 # comments apply to $num==8 case 1230 mov -32($aptr,$i),$a0 # a[0] 1231 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1232 mov -24($aptr,$i),%rax # a[1] 1233 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1234 mov -16($aptr,$i),$ai # a[2] 1235 mov %rax,$a1 1236 1237 mul $a0 # a[1]*a[0] 1238 mov %rax,$A0[0] # a[1]*a[0] 1239 mov $ai,%rax # a[2] 1240 mov %rdx,$A0[1] 1241 mov $A0[0],-24($tptr,$i) # t[1] 1242 1243 mul $a0 # a[2]*a[0] 1244 add %rax,$A0[1] 1245 mov $ai,%rax 1246 adc \$0,%rdx 1247 mov $A0[1],-16($tptr,$i) # t[2] 1248 mov %rdx,$A0[0] 1249 1250 1251 mov -8($aptr,$i),$ai # a[3] 1252 mul $a1 # a[2]*a[1] 1253 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1254 mov $ai,%rax 1255 mov %rdx,$A1[1] 1256 1257 lea ($i),$j 1258 mul $a0 # a[3]*a[0] 1259 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1260 mov $ai,%rax 1261 mov %rdx,$A0[1] 1262 adc \$0,$A0[1] 1263 add $A1[0],$A0[0] 1264 adc \$0,$A0[1] 1265 mov $A0[0],-8($tptr,$j) # t[3] 1266 jmp .Lsqr4x_1st 1267 1268.align 32 1269.Lsqr4x_1st: 1270 mov ($aptr,$j),$ai # a[4] 1271 mul $a1 # a[3]*a[1] 1272 add %rax,$A1[1] # a[3]*a[1]+t[4] 1273 mov $ai,%rax 1274 mov %rdx,$A1[0] 1275 adc \$0,$A1[0] 1276 1277 mul $a0 # a[4]*a[0] 1278 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1279 mov $ai,%rax # a[3] 1280 mov 8($aptr,$j),$ai # a[5] 1281 mov %rdx,$A0[0] 1282 adc \$0,$A0[0] 1283 add $A1[1],$A0[1] 1284 adc \$0,$A0[0] 1285 1286 1287 mul $a1 # a[4]*a[3] 1288 add %rax,$A1[0] # a[4]*a[3]+t[5] 1289 mov $ai,%rax 1290 mov $A0[1],($tptr,$j) # t[4] 1291 mov %rdx,$A1[1] 1292 adc \$0,$A1[1] 1293 1294 mul $a0 # a[5]*a[2] 1295 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1296 mov $ai,%rax 1297 mov 16($aptr,$j),$ai # a[6] 1298 mov %rdx,$A0[1] 1299 adc \$0,$A0[1] 1300 add $A1[0],$A0[0] 1301 adc \$0,$A0[1] 1302 1303 mul $a1 # a[5]*a[3] 1304 add %rax,$A1[1] # a[5]*a[3]+t[6] 1305 mov $ai,%rax 1306 mov $A0[0],8($tptr,$j) # t[5] 1307 mov %rdx,$A1[0] 1308 adc \$0,$A1[0] 1309 1310 mul $a0 # a[6]*a[2] 1311 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1312 mov $ai,%rax # a[3] 1313 mov 24($aptr,$j),$ai # a[7] 1314 mov %rdx,$A0[0] 1315 adc \$0,$A0[0] 1316 add $A1[1],$A0[1] 1317 adc \$0,$A0[0] 1318 1319 1320 mul $a1 # a[6]*a[5] 1321 add %rax,$A1[0] # a[6]*a[5]+t[7] 1322 mov $ai,%rax 1323 mov $A0[1],16($tptr,$j) # t[6] 1324 mov %rdx,$A1[1] 1325 adc \$0,$A1[1] 1326 lea 32($j),$j 1327 1328 mul $a0 # a[7]*a[4] 1329 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1330 mov $ai,%rax 1331 mov %rdx,$A0[1] 1332 adc \$0,$A0[1] 1333 add $A1[0],$A0[0] 1334 adc \$0,$A0[1] 1335 mov $A0[0],-8($tptr,$j) # t[7] 1336 1337 cmp \$0,$j 1338 jne .Lsqr4x_1st 1339 1340 mul $a1 # a[7]*a[5] 1341 add %rax,$A1[1] 1342 lea 16($i),$i 1343 adc \$0,%rdx 1344 add $A0[1],$A1[1] 1345 adc \$0,%rdx 1346 1347 mov $A1[1],($tptr) # t[8] 1348 mov %rdx,$A1[0] 1349 mov %rdx,8($tptr) # t[9] 1350 jmp .Lsqr4x_outer 1351 1352.align 32 1353.Lsqr4x_outer: # comments apply to $num==6 case 1354 mov -32($aptr,$i),$a0 # a[0] 1355 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1356 mov -24($aptr,$i),%rax # a[1] 1357 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1358 mov -16($aptr,$i),$ai # a[2] 1359 mov %rax,$a1 1360 1361 mul $a0 # a[1]*a[0] 1362 mov -24($tptr,$i),$A0[0] # t[1] 1363 add %rax,$A0[0] # a[1]*a[0]+t[1] 1364 mov $ai,%rax # a[2] 1365 adc \$0,%rdx 1366 mov $A0[0],-24($tptr,$i) # t[1] 1367 mov %rdx,$A0[1] 1368 1369 mul $a0 # a[2]*a[0] 1370 add %rax,$A0[1] 1371 mov $ai,%rax 1372 adc \$0,%rdx 1373 add -16($tptr,$i),$A0[1] # 
a[2]*a[0]+t[2] 1374 mov %rdx,$A0[0] 1375 adc \$0,$A0[0] 1376 mov $A0[1],-16($tptr,$i) # t[2] 1377 1378 xor $A1[0],$A1[0] 1379 1380 mov -8($aptr,$i),$ai # a[3] 1381 mul $a1 # a[2]*a[1] 1382 add %rax,$A1[0] # a[2]*a[1]+t[3] 1383 mov $ai,%rax 1384 adc \$0,%rdx 1385 add -8($tptr,$i),$A1[0] 1386 mov %rdx,$A1[1] 1387 adc \$0,$A1[1] 1388 1389 mul $a0 # a[3]*a[0] 1390 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1391 mov $ai,%rax 1392 adc \$0,%rdx 1393 add $A1[0],$A0[0] 1394 mov %rdx,$A0[1] 1395 adc \$0,$A0[1] 1396 mov $A0[0],-8($tptr,$i) # t[3] 1397 1398 lea ($i),$j 1399 jmp .Lsqr4x_inner 1400 1401.align 32 1402.Lsqr4x_inner: 1403 mov ($aptr,$j),$ai # a[4] 1404 mul $a1 # a[3]*a[1] 1405 add %rax,$A1[1] # a[3]*a[1]+t[4] 1406 mov $ai,%rax 1407 mov %rdx,$A1[0] 1408 adc \$0,$A1[0] 1409 add ($tptr,$j),$A1[1] 1410 adc \$0,$A1[0] 1411 1412 .byte 0x67 1413 mul $a0 # a[4]*a[0] 1414 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1415 mov $ai,%rax # a[3] 1416 mov 8($aptr,$j),$ai # a[5] 1417 mov %rdx,$A0[0] 1418 adc \$0,$A0[0] 1419 add $A1[1],$A0[1] 1420 adc \$0,$A0[0] 1421 1422 mul $a1 # a[4]*a[3] 1423 add %rax,$A1[0] # a[4]*a[3]+t[5] 1424 mov $A0[1],($tptr,$j) # t[4] 1425 mov $ai,%rax 1426 mov %rdx,$A1[1] 1427 adc \$0,$A1[1] 1428 add 8($tptr,$j),$A1[0] 1429 lea 16($j),$j # j++ 1430 adc \$0,$A1[1] 1431 1432 mul $a0 # a[5]*a[2] 1433 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1434 mov $ai,%rax 1435 adc \$0,%rdx 1436 add $A1[0],$A0[0] 1437 mov %rdx,$A0[1] 1438 adc \$0,$A0[1] 1439 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1440 1441 cmp \$0,$j 1442 jne .Lsqr4x_inner 1443 1444 .byte 0x67 1445 mul $a1 # a[5]*a[3] 1446 add %rax,$A1[1] 1447 adc \$0,%rdx 1448 add $A0[1],$A1[1] 1449 adc \$0,%rdx 1450 1451 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1452 mov %rdx,$A1[0] 1453 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1454 1455 add \$16,$i 1456 jnz .Lsqr4x_outer 1457 1458 # comments apply to $num==4 case 1459 mov -32($aptr),$a0 # a[0] 1460 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1461 mov -24($aptr),%rax # a[1] 1462 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1463 mov -16($aptr),$ai # a[2] 1464 mov %rax,$a1 1465 1466 mul $a0 # a[1]*a[0] 1467 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1468 mov $ai,%rax # a[2] 1469 mov %rdx,$A0[1] 1470 adc \$0,$A0[1] 1471 1472 mul $a0 # a[2]*a[0] 1473 add %rax,$A0[1] 1474 mov $ai,%rax 1475 mov $A0[0],-24($tptr) # t[1] 1476 mov %rdx,$A0[0] 1477 adc \$0,$A0[0] 1478 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1479 mov -8($aptr),$ai # a[3] 1480 adc \$0,$A0[0] 1481 1482 mul $a1 # a[2]*a[1] 1483 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1484 mov $ai,%rax 1485 mov $A0[1],-16($tptr) # t[2] 1486 mov %rdx,$A1[1] 1487 adc \$0,$A1[1] 1488 1489 mul $a0 # a[3]*a[0] 1490 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1491 mov $ai,%rax 1492 mov %rdx,$A0[1] 1493 adc \$0,$A0[1] 1494 add $A1[0],$A0[0] 1495 adc \$0,$A0[1] 1496 mov $A0[0],-8($tptr) # t[3] 1497 1498 mul $a1 # a[3]*a[1] 1499 add %rax,$A1[1] 1500 mov -16($aptr),%rax # a[2] 1501 adc \$0,%rdx 1502 add $A0[1],$A1[1] 1503 adc \$0,%rdx 1504 1505 mov $A1[1],($tptr) # t[4] 1506 mov %rdx,$A1[0] 1507 mov %rdx,8($tptr) # t[5] 1508 1509 mul $ai # a[2]*a[3] 1510___ 1511{ 1512my ($shift,$carry)=($a0,$a1); 1513my @S=(@A1,$ai,$n0); 1514$code.=<<___; 1515 add \$16,$i 1516 xor $shift,$shift 1517 sub $num,$i # $i=16-$num 1518 xor $carry,$carry 1519 1520 add $A1[0],%rax # t[5] 1521 adc \$0,%rdx 1522 mov %rax,8($tptr) # t[5] 1523 mov %rdx,16($tptr) # t[6] 1524 mov $carry,24($tptr) # t[7] 1525 
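	# [This is step b) of the plan described above: the accumulated cross
	# products a[i]*a[j] (i>j) in t[] are doubled, the squares a[i]*a[i]
	# are added in, and the bit shifted out of each word is carried into
	# the next one. Roughly (sketch, not literal code):
	#
	#	for (i = 0; i < num; i++) {
	#		lo = t[2*i]; hi = t[2*i+1];
	#		t[2*i]   = (lo << 1) | shift;
	#		t[2*i+1] = (hi << 1) | (lo >> 63);
	#		shift    = hi >> 63;
	#		(t[2*i+1]:t[2*i]) += a[i]*a[i] + carry;	/* 128-bit add */
	#		carry = carry_out;
	#	}
	# ]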
1526 mov -16($aptr,$i),%rax # a[0] 1527 lea 48+8(%rsp),$tptr 1528 xor $A0[0],$A0[0] # t[0] 1529 mov 8($tptr),$A0[1] # t[1] 1530 1531 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1532 shr \$63,$A0[0] 1533 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1534 shr \$63,$A0[1] 1535 or $A0[0],$S[1] # | t[2*i]>>63 1536 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1537 mov $A0[1],$shift # shift=t[2*i+1]>>63 1538 mul %rax # a[i]*a[i] 1539 neg $carry # mov $carry,cf 1540 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1541 adc %rax,$S[0] 1542 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1543 mov $S[0],($tptr) 1544 adc %rdx,$S[1] 1545 1546 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1547 mov $S[1],8($tptr) 1548 sbb $carry,$carry # mov cf,$carry 1549 shr \$63,$A0[0] 1550 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1551 shr \$63,$A0[1] 1552 or $A0[0],$S[3] # | t[2*i]>>63 1553 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1554 mov $A0[1],$shift # shift=t[2*i+1]>>63 1555 mul %rax # a[i]*a[i] 1556 neg $carry # mov $carry,cf 1557 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1558 adc %rax,$S[2] 1559 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1560 mov $S[2],16($tptr) 1561 adc %rdx,$S[3] 1562 lea 16($i),$i 1563 mov $S[3],24($tptr) 1564 sbb $carry,$carry # mov cf,$carry 1565 lea 64($tptr),$tptr 1566 jmp .Lsqr4x_shift_n_add 1567 1568.align 32 1569.Lsqr4x_shift_n_add: 1570 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1571 shr \$63,$A0[0] 1572 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1573 shr \$63,$A0[1] 1574 or $A0[0],$S[1] # | t[2*i]>>63 1575 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1576 mov $A0[1],$shift # shift=t[2*i+1]>>63 1577 mul %rax # a[i]*a[i] 1578 neg $carry # mov $carry,cf 1579 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1580 adc %rax,$S[0] 1581 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1582 mov $S[0],-32($tptr) 1583 adc %rdx,$S[1] 1584 1585 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1586 mov $S[1],-24($tptr) 1587 sbb $carry,$carry # mov cf,$carry 1588 shr \$63,$A0[0] 1589 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1590 shr \$63,$A0[1] 1591 or $A0[0],$S[3] # | t[2*i]>>63 1592 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1593 mov $A0[1],$shift # shift=t[2*i+1]>>63 1594 mul %rax # a[i]*a[i] 1595 neg $carry # mov $carry,cf 1596 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1597 adc %rax,$S[2] 1598 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1599 mov $S[2],-16($tptr) 1600 adc %rdx,$S[3] 1601 1602 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1603 mov $S[3],-8($tptr) 1604 sbb $carry,$carry # mov cf,$carry 1605 shr \$63,$A0[0] 1606 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1607 shr \$63,$A0[1] 1608 or $A0[0],$S[1] # | t[2*i]>>63 1609 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1610 mov $A0[1],$shift # shift=t[2*i+1]>>63 1611 mul %rax # a[i]*a[i] 1612 neg $carry # mov $carry,cf 1613 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1614 adc %rax,$S[0] 1615 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1616 mov $S[0],0($tptr) 1617 adc %rdx,$S[1] 1618 1619 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1620 mov $S[1],8($tptr) 1621 sbb $carry,$carry # mov cf,$carry 1622 shr \$63,$A0[0] 1623 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1624 shr \$63,$A0[1] 1625 or $A0[0],$S[3] # | t[2*i]>>63 1626 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1627 mov $A0[1],$shift # shift=t[2*i+1]>>63 1628 mul %rax # a[i]*a[i] 1629 neg $carry # mov $carry,cf 1630 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1631 adc %rax,$S[2] 1632 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1633 mov $S[2],16($tptr) 1634 adc %rdx,$S[3] 1635 mov $S[3],24($tptr) 1636 sbb $carry,$carry 
# mov cf,$carry 1637 lea 64($tptr),$tptr 1638 add \$32,$i 1639 jnz .Lsqr4x_shift_n_add 1640 1641 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1642 .byte 0x67 1643 shr \$63,$A0[0] 1644 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1645 shr \$63,$A0[1] 1646 or $A0[0],$S[1] # | t[2*i]>>63 1647 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1648 mov $A0[1],$shift # shift=t[2*i+1]>>63 1649 mul %rax # a[i]*a[i] 1650 neg $carry # mov $carry,cf 1651 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1652 adc %rax,$S[0] 1653 mov -8($aptr),%rax # a[i+1] # prefetch 1654 mov $S[0],-32($tptr) 1655 adc %rdx,$S[1] 1656 1657 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1658 mov $S[1],-24($tptr) 1659 sbb $carry,$carry # mov cf,$carry 1660 shr \$63,$A0[0] 1661 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1662 shr \$63,$A0[1] 1663 or $A0[0],$S[3] # | t[2*i]>>63 1664 mul %rax # a[i]*a[i] 1665 neg $carry # mov $carry,cf 1666 adc %rax,$S[2] 1667 adc %rdx,$S[3] 1668 mov $S[2],-16($tptr) 1669 mov $S[3],-8($tptr) 1670___ 1671} 1672###################################################################### 1673# Montgomery reduction part, "word-by-word" algorithm. 1674# 1675# This new path is inspired by multiple submissions from Intel, by 1676# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1677# Vinodh Gopal... 1678{ 1679my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1680 1681$code.=<<___; 1682 movq %xmm2,$nptr 1683__bn_sqr8x_reduction: 1684 xor %rax,%rax 1685 lea ($nptr,$num),%rcx # end of n[] 1686 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1687 mov %rcx,0+8(%rsp) 1688 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1689 mov %rdx,8+8(%rsp) 1690 neg $num 1691 jmp .L8x_reduction_loop 1692 1693.align 32 1694.L8x_reduction_loop: 1695 lea ($tptr,$num),$tptr # start of current t[] window 1696 .byte 0x66 1697 mov 8*0($tptr),$m0 1698 mov 8*1($tptr),%r9 1699 mov 8*2($tptr),%r10 1700 mov 8*3($tptr),%r11 1701 mov 8*4($tptr),%r12 1702 mov 8*5($tptr),%r13 1703 mov 8*6($tptr),%r14 1704 mov 8*7($tptr),%r15 1705 mov %rax,(%rdx) # store top-most carry bit 1706 lea 8*8($tptr),$tptr 1707 1708 .byte 0x67 1709 mov $m0,%r8 1710 imulq 32+8(%rsp),$m0 # n0*a[0] 1711 mov 8*0($nptr),%rax # n[0] 1712 mov \$8,%ecx 1713 jmp .L8x_reduce 1714 1715.align 32 1716.L8x_reduce: 1717 mulq $m0 1718 mov 8*1($nptr),%rax # n[1] 1719 neg %r8 1720 mov %rdx,%r8 1721 adc \$0,%r8 1722 1723 mulq $m0 1724 add %rax,%r9 1725 mov 8*2($nptr),%rax 1726 adc \$0,%rdx 1727 add %r9,%r8 1728 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1729 mov %rdx,%r9 1730 adc \$0,%r9 1731 1732 mulq $m0 1733 add %rax,%r10 1734 mov 8*3($nptr),%rax 1735 adc \$0,%rdx 1736 add %r10,%r9 1737 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1738 mov %rdx,%r10 1739 adc \$0,%r10 1740 1741 mulq $m0 1742 add %rax,%r11 1743 mov 8*4($nptr),%rax 1744 adc \$0,%rdx 1745 imulq %r8,$carry # modulo-scheduled 1746 add %r11,%r10 1747 mov %rdx,%r11 1748 adc \$0,%r11 1749 1750 mulq $m0 1751 add %rax,%r12 1752 mov 8*5($nptr),%rax 1753 adc \$0,%rdx 1754 add %r12,%r11 1755 mov %rdx,%r12 1756 adc \$0,%r12 1757 1758 mulq $m0 1759 add %rax,%r13 1760 mov 8*6($nptr),%rax 1761 adc \$0,%rdx 1762 add %r13,%r12 1763 mov %rdx,%r13 1764 adc \$0,%r13 1765 1766 mulq $m0 1767 add %rax,%r14 1768 mov 8*7($nptr),%rax 1769 adc \$0,%rdx 1770 add %r14,%r13 1771 mov %rdx,%r14 1772 adc \$0,%r14 1773 1774 mulq $m0 1775 mov $carry,$m0 # n0*a[i] 1776 add %rax,%r15 1777 mov 8*0($nptr),%rax # n[0] 1778 adc \$0,%rdx 1779 add %r15,%r14 1780 mov %rdx,%r15 1781 adc \$0,%r15 1782 1783 dec %ecx 1784 jnz .L8x_reduce 1785 1786 lea 
8*8($nptr),$nptr 1787 xor %rax,%rax 1788 mov 8+8(%rsp),%rdx # pull end of t[] 1789 cmp 0+8(%rsp),$nptr # end of n[]? 1790 jae .L8x_no_tail 1791 1792 .byte 0x66 1793 add 8*0($tptr),%r8 1794 adc 8*1($tptr),%r9 1795 adc 8*2($tptr),%r10 1796 adc 8*3($tptr),%r11 1797 adc 8*4($tptr),%r12 1798 adc 8*5($tptr),%r13 1799 adc 8*6($tptr),%r14 1800 adc 8*7($tptr),%r15 1801 sbb $carry,$carry # top carry 1802 1803 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1804 mov \$8,%ecx 1805 mov 8*0($nptr),%rax 1806 jmp .L8x_tail 1807 1808.align 32 1809.L8x_tail: 1810 mulq $m0 1811 add %rax,%r8 1812 mov 8*1($nptr),%rax 1813 mov %r8,($tptr) # save result 1814 mov %rdx,%r8 1815 adc \$0,%r8 1816 1817 mulq $m0 1818 add %rax,%r9 1819 mov 8*2($nptr),%rax 1820 adc \$0,%rdx 1821 add %r9,%r8 1822 lea 8($tptr),$tptr # $tptr++ 1823 mov %rdx,%r9 1824 adc \$0,%r9 1825 1826 mulq $m0 1827 add %rax,%r10 1828 mov 8*3($nptr),%rax 1829 adc \$0,%rdx 1830 add %r10,%r9 1831 mov %rdx,%r10 1832 adc \$0,%r10 1833 1834 mulq $m0 1835 add %rax,%r11 1836 mov 8*4($nptr),%rax 1837 adc \$0,%rdx 1838 add %r11,%r10 1839 mov %rdx,%r11 1840 adc \$0,%r11 1841 1842 mulq $m0 1843 add %rax,%r12 1844 mov 8*5($nptr),%rax 1845 adc \$0,%rdx 1846 add %r12,%r11 1847 mov %rdx,%r12 1848 adc \$0,%r12 1849 1850 mulq $m0 1851 add %rax,%r13 1852 mov 8*6($nptr),%rax 1853 adc \$0,%rdx 1854 add %r13,%r12 1855 mov %rdx,%r13 1856 adc \$0,%r13 1857 1858 mulq $m0 1859 add %rax,%r14 1860 mov 8*7($nptr),%rax 1861 adc \$0,%rdx 1862 add %r14,%r13 1863 mov %rdx,%r14 1864 adc \$0,%r14 1865 1866 mulq $m0 1867 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1868 add %rax,%r15 1869 adc \$0,%rdx 1870 add %r15,%r14 1871 mov 8*0($nptr),%rax # pull n[0] 1872 mov %rdx,%r15 1873 adc \$0,%r15 1874 1875 dec %ecx 1876 jnz .L8x_tail 1877 1878 lea 8*8($nptr),$nptr 1879 mov 8+8(%rsp),%rdx # pull end of t[] 1880 cmp 0+8(%rsp),$nptr # end of n[]? 1881 jae .L8x_tail_done # break out of loop 1882 1883 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1884 neg $carry 1885 mov 8*0($nptr),%rax # pull n[0] 1886 adc 8*0($tptr),%r8 1887 adc 8*1($tptr),%r9 1888 adc 8*2($tptr),%r10 1889 adc 8*3($tptr),%r11 1890 adc 8*4($tptr),%r12 1891 adc 8*5($tptr),%r13 1892 adc 8*6($tptr),%r14 1893 adc 8*7($tptr),%r15 1894 sbb $carry,$carry # top carry 1895 1896 mov \$8,%ecx 1897 jmp .L8x_tail 1898 1899.align 32 1900.L8x_tail_done: 1901 add (%rdx),%r8 # can this overflow? 1902 adc \$0,%r9 1903 adc \$0,%r10 1904 adc \$0,%r11 1905 adc \$0,%r12 1906 adc \$0,%r13 1907 adc \$0,%r14 1908 adc \$0,%r15 # can't overflow, because we 1909 # started with "overhung" part 1910 # of multiplication 1911 xor %rax,%rax 1912 1913 neg $carry 1914.L8x_no_tail: 1915 adc 8*0($tptr),%r8 1916 adc 8*1($tptr),%r9 1917 adc 8*2($tptr),%r10 1918 adc 8*3($tptr),%r11 1919 adc 8*4($tptr),%r12 1920 adc 8*5($tptr),%r13 1921 adc 8*6($tptr),%r14 1922 adc 8*7($tptr),%r15 1923 adc \$0,%rax # top-most carry 1924 mov -8($nptr),%rcx # np[num-1] 1925 xor $carry,$carry 1926 1927 movq %xmm2,$nptr # restore $nptr 1928 1929 mov %r8,8*0($tptr) # store top 512 bits 1930 mov %r9,8*1($tptr) 1931 movq %xmm3,$num # $num is %r9, can't be moved upwards 1932 mov %r10,8*2($tptr) 1933 mov %r11,8*3($tptr) 1934 mov %r12,8*4($tptr) 1935 mov %r13,8*5($tptr) 1936 mov %r14,8*6($tptr) 1937 mov %r15,8*7($tptr) 1938 lea 8*8($tptr),$tptr 1939 1940 cmp %rdx,$tptr # end of t[]? 
1941 jb .L8x_reduction_loop 1942 ret 1943.size bn_sqr8x_internal,.-bn_sqr8x_internal 1944___ 1945} 1946############################################################## 1947# Post-condition, 4x unrolled 1948# 1949{ 1950my ($tptr,$nptr)=("%rbx","%rbp"); 1951$code.=<<___; 1952.type __bn_post4x_internal,\@abi-omnipotent 1953.align 32 1954__bn_post4x_internal: 1955 mov 8*0($nptr),%r12 1956 lea (%rdi,$num),$tptr # %rdi was $tptr above 1957 mov $num,%rcx 1958 movq %xmm1,$rptr # restore $rptr 1959 neg %rax 1960 movq %xmm1,$aptr # prepare for back-to-back call 1961 sar \$3+2,%rcx 1962 dec %r12 # so that after 'not' we get -n[0] 1963 xor %r10,%r10 1964 mov 8*1($nptr),%r13 1965 mov 8*2($nptr),%r14 1966 mov 8*3($nptr),%r15 1967 jmp .Lsqr4x_sub_entry 1968 1969.align 16 1970.Lsqr4x_sub: 1971 mov 8*0($nptr),%r12 1972 mov 8*1($nptr),%r13 1973 mov 8*2($nptr),%r14 1974 mov 8*3($nptr),%r15 1975.Lsqr4x_sub_entry: 1976 lea 8*4($nptr),$nptr 1977 not %r12 1978 not %r13 1979 not %r14 1980 not %r15 1981 and %rax,%r12 1982 and %rax,%r13 1983 and %rax,%r14 1984 and %rax,%r15 1985 1986 neg %r10 # mov %r10,%cf 1987 adc 8*0($tptr),%r12 1988 adc 8*1($tptr),%r13 1989 adc 8*2($tptr),%r14 1990 adc 8*3($tptr),%r15 1991 mov %r12,8*0($rptr) 1992 lea 8*4($tptr),$tptr 1993 mov %r13,8*1($rptr) 1994 sbb %r10,%r10 # mov %cf,%r10 1995 mov %r14,8*2($rptr) 1996 mov %r15,8*3($rptr) 1997 lea 8*4($rptr),$rptr 1998 1999 inc %rcx # pass %cf 2000 jnz .Lsqr4x_sub 2001 2002 mov $num,%r10 # prepare for back-to-back call 2003 neg $num # restore $num 2004 ret 2005.size __bn_post4x_internal,.-__bn_post4x_internal 2006___ 2007} 2008{ 2009$code.=<<___; 2010.globl bn_from_montgomery 2011.type bn_from_montgomery,\@abi-omnipotent 2012.align 32 2013bn_from_montgomery: 2014 testl \$7,`($win64?"48(%rsp)":"%r9d")` 2015 jz bn_from_mont8x 2016 xor %eax,%eax 2017 ret 2018.size bn_from_montgomery,.-bn_from_montgomery 2019 2020.type bn_from_mont8x,\@function,6 2021.align 32 2022bn_from_mont8x: 2023 .byte 0x67 2024 mov %rsp,%rax 2025 push %rbx 2026 push %rbp 2027 push %r12 2028 push %r13 2029 push %r14 2030 push %r15 2031 2032 shl \$3,${num}d # convert $num to bytes 2033 lea ($num,$num,2),%r10 # 3*$num in bytes 2034 neg $num 2035 mov ($n0),$n0 # *n0 2036 2037 ############################################################## 2038 # Ensure that stack frame doesn't alias with $rptr+3*$num 2039 # modulo 4096, which covers ret[num], am[num] and n[num] 2040 # (see bn_exp.c). The stack is allocated to aligned with 2041 # bn_power5's frame, and as bn_from_montgomery happens to be 2042 # last operation, we use the opportunity to cleanse it. 
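	# [For context: bn_from_mont8x converts its input out of the Montgomery
	# domain, i.e. it returns a*R^-1 mod n with R=2^(64*num). The
	# .Lmul_by_1 loop below simply copies a[] into the lower half of t[]
	# and zeroes the upper half, after which a single reduction pass is
	# equivalent to a Montgomery multiplication by 1. Sketch, not literal
	# code:
	#
	#	t[0..num-1]    = a[0..num-1];
	#	t[num..2num-1] = 0;
	#	r = mont_reduce(t, n, n0);	/* == a*R^-1 mod n */
	# ]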
2043 # 2044 lea -320(%rsp,$num,2),%r11 2045 sub $rptr,%r11 2046 and \$4095,%r11 2047 cmp %r11,%r10 2048 jb .Lfrom_sp_alt 2049 sub %r11,%rsp # align with $aptr 2050 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) 2051 jmp .Lfrom_sp_done 2052 2053.align 32 2054.Lfrom_sp_alt: 2055 lea 4096-320(,$num,2),%r10 2056 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) 2057 sub %r10,%r11 2058 mov \$0,%r10 2059 cmovc %r10,%r11 2060 sub %r11,%rsp 2061.Lfrom_sp_done: 2062 and \$-64,%rsp 2063 mov %rax,%r11 2064 sub %rsp,%r11 2065 and \$-4096,%r11 2066.Lfrom_page_walk: 2067 mov (%rsp,%r11),%r10 2068 sub \$4096,%r11 2069 .byte 0x2e # predict non-taken 2070 jnc .Lfrom_page_walk 2071 2072 mov $num,%r10 2073 neg $num 2074 2075 ############################################################## 2076 # Stack layout 2077 # 2078 # +0 saved $num, used in reduction section 2079 # +8 &t[2*$num], used in reduction section 2080 # +32 saved *n0 2081 # +40 saved %rsp 2082 # +48 t[2*$num] 2083 # 2084 mov $n0, 32(%rsp) 2085 mov %rax, 40(%rsp) # save original %rsp 2086.Lfrom_body: 2087 mov $num,%r11 2088 lea 48(%rsp),%rax 2089 pxor %xmm0,%xmm0 2090 jmp .Lmul_by_1 2091 2092.align 32 2093.Lmul_by_1: 2094 movdqu ($aptr),%xmm1 2095 movdqu 16($aptr),%xmm2 2096 movdqu 32($aptr),%xmm3 2097 movdqa %xmm0,(%rax,$num) 2098 movdqu 48($aptr),%xmm4 2099 movdqa %xmm0,16(%rax,$num) 2100 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 2101 movdqa %xmm1,(%rax) 2102 movdqa %xmm0,32(%rax,$num) 2103 movdqa %xmm2,16(%rax) 2104 movdqa %xmm0,48(%rax,$num) 2105 movdqa %xmm3,32(%rax) 2106 movdqa %xmm4,48(%rax) 2107 lea 64(%rax),%rax 2108 sub \$64,%r11 2109 jnz .Lmul_by_1 2110 2111 movq $rptr,%xmm1 2112 movq $nptr,%xmm2 2113 .byte 0x67 2114 mov $nptr,%rbp 2115 movq %r10, %xmm3 # -num 2116___ 2117$code.=<<___ if ($addx); 2118 mov OPENSSL_ia32cap_P+8(%rip),%r11d 2119 and \$0x80108,%r11d 2120 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 2121 jne .Lfrom_mont_nox 2122 2123 lea (%rax,$num),$rptr 2124 call __bn_sqrx8x_reduction 2125 call __bn_postx4x_internal 2126 2127 pxor %xmm0,%xmm0 2128 lea 48(%rsp),%rax 2129 mov 40(%rsp),%rsi # restore %rsp 2130 jmp .Lfrom_mont_zero 2131 2132.align 32 2133.Lfrom_mont_nox: 2134___ 2135$code.=<<___; 2136 call __bn_sqr8x_reduction 2137 call __bn_post4x_internal 2138 2139 pxor %xmm0,%xmm0 2140 lea 48(%rsp),%rax 2141 mov 40(%rsp),%rsi # restore %rsp 2142 jmp .Lfrom_mont_zero 2143 2144.align 32 2145.Lfrom_mont_zero: 2146 movdqa %xmm0,16*0(%rax) 2147 movdqa %xmm0,16*1(%rax) 2148 movdqa %xmm0,16*2(%rax) 2149 movdqa %xmm0,16*3(%rax) 2150 lea 16*4(%rax),%rax 2151 sub \$32,$num 2152 jnz .Lfrom_mont_zero 2153 2154 mov \$1,%rax 2155 mov -48(%rsi),%r15 2156 mov -40(%rsi),%r14 2157 mov -32(%rsi),%r13 2158 mov -24(%rsi),%r12 2159 mov -16(%rsi),%rbp 2160 mov -8(%rsi),%rbx 2161 lea (%rsi),%rsp 2162.Lfrom_epilogue: 2163 ret 2164.size bn_from_mont8x,.-bn_from_mont8x 2165___ 2166} 2167}}} 2168 2169if ($addx) {{{ 2170my $bp="%rdx"; # restore original value 2171 2172$code.=<<___; 2173.type bn_mulx4x_mont_gather5,\@function,6 2174.align 32 2175bn_mulx4x_mont_gather5: 2176.Lmulx4x_enter: 2177 mov %rsp,%rax 2178 push %rbx 2179 push %rbp 2180 push %r12 2181 push %r13 2182 push %r14 2183 push %r15 2184 2185 shl \$3,${num}d # convert $num to bytes 2186 lea ($num,$num,2),%r10 # 3*$num in bytes 2187 neg $num # -$num 2188 mov ($n0),$n0 # *n0 2189 2190 ############################################################## 2191 # Ensure that stack frame doesn't alias with $rptr+3*$num 2192 # modulo 4096, which covers ret[num], 
am[num] and n[num] 2193 # (see bn_exp.c). This is done to allow memory disambiguation 2194 # logic do its magic. [Extra [num] is allocated in order 2195 # to align with bn_power5's frame, which is cleansed after 2196 # completing exponentiation. Extra 256 bytes is for power mask 2197 # calculated from 7th argument, the index.] 2198 # 2199 lea -320(%rsp,$num,2),%r11 2200 sub $rp,%r11 2201 and \$4095,%r11 2202 cmp %r11,%r10 2203 jb .Lmulx4xsp_alt 2204 sub %r11,%rsp # align with $aptr 2205 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) 2206 jmp .Lmulx4xsp_done 2207 2208.Lmulx4xsp_alt: 2209 lea 4096-320(,$num,2),%r10 2210 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) 2211 sub %r10,%r11 2212 mov \$0,%r10 2213 cmovc %r10,%r11 2214 sub %r11,%rsp 2215.Lmulx4xsp_done: 2216 and \$-64,%rsp # ensure alignment 2217 mov %rax,%r11 2218 sub %rsp,%r11 2219 and \$-4096,%r11 2220.Lmulx4x_page_walk: 2221 mov (%rsp,%r11),%r10 2222 sub \$4096,%r11 2223 .byte 0x2e # predict non-taken 2224 jnc .Lmulx4x_page_walk 2225 2226 ############################################################## 2227 # Stack layout 2228 # +0 -num 2229 # +8 off-loaded &b[i] 2230 # +16 end of b[num] 2231 # +24 inner counter 2232 # +32 saved n0 2233 # +40 saved %rsp 2234 # +48 2235 # +56 saved rp 2236 # +64 tmp[num+1] 2237 # 2238 mov $n0, 32(%rsp) # save *n0 2239 mov %rax,40(%rsp) # save original %rsp 2240.Lmulx4x_body: 2241 call mulx4x_internal 2242 2243 mov 40(%rsp),%rsi # restore %rsp 2244 mov \$1,%rax 2245 2246 mov -48(%rsi),%r15 2247 mov -40(%rsi),%r14 2248 mov -32(%rsi),%r13 2249 mov -24(%rsi),%r12 2250 mov -16(%rsi),%rbp 2251 mov -8(%rsi),%rbx 2252 lea (%rsi),%rsp 2253.Lmulx4x_epilogue: 2254 ret 2255.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2256 2257.type mulx4x_internal,\@abi-omnipotent 2258.align 32 2259mulx4x_internal: 2260 mov $num,8(%rsp) # save -$num (it was in bytes) 2261 mov $num,%r10 2262 neg $num # restore $num 2263 shl \$5,$num 2264 neg %r10 # restore $num 2265 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2266 shr \$5+5,$num 2267 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2268 sub \$1,$num 2269 lea .Linc(%rip),%rax 2270 mov %r13,16+8(%rsp) # end of b[num] 2271 mov $num,24+8(%rsp) # inner counter 2272 mov $rp, 56+8(%rsp) # save $rp 2273___ 2274my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2275 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2276my $rptr=$bptr; 2277my $STRIDE=2**5*8; # 5 is "window size" 2278my $N=$STRIDE/4; # should match cache line size 2279$code.=<<___; 2280 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 2281 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2282 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) 2283 lea 128($bp),$bptr # size optimization 2284 2285 pshufd \$0,%xmm5,%xmm5 # broadcast index 2286 movdqa %xmm1,%xmm4 2287 .byte 0x67 2288 movdqa %xmm1,%xmm2 2289___ 2290######################################################################## 2291# calculate mask by comparing 0..31 to index and save result to stack 2292# 2293$code.=<<___; 2294 .byte 0x67 2295 paddd %xmm0,%xmm1 2296 pcmpeqd %xmm5,%xmm0 # compare to 1,0 2297 movdqa %xmm4,%xmm3 2298___ 2299for($i=0;$i<$STRIDE/16-4;$i+=4) { 2300$code.=<<___; 2301 paddd %xmm1,%xmm2 2302 pcmpeqd %xmm5,%xmm1 # compare to 3,2 2303 movdqa %xmm0,`16*($i+0)+112`(%r10) 2304 movdqa %xmm4,%xmm0 2305 2306 paddd %xmm2,%xmm3 2307 pcmpeqd %xmm5,%xmm2 # compare to 5,4 2308 movdqa %xmm1,`16*($i+1)+112`(%r10) 2309 movdqa %xmm4,%xmm1 2310 2311 
paddd %xmm3,%xmm0 2312 pcmpeqd %xmm5,%xmm3 # compare to 7,6 2313 movdqa %xmm2,`16*($i+2)+112`(%r10) 2314 movdqa %xmm4,%xmm2 2315 2316 paddd %xmm0,%xmm1 2317 pcmpeqd %xmm5,%xmm0 2318 movdqa %xmm3,`16*($i+3)+112`(%r10) 2319 movdqa %xmm4,%xmm3 2320___ 2321} 2322$code.=<<___; # last iteration can be optimized 2323 .byte 0x67 2324 paddd %xmm1,%xmm2 2325 pcmpeqd %xmm5,%xmm1 2326 movdqa %xmm0,`16*($i+0)+112`(%r10) 2327 2328 paddd %xmm2,%xmm3 2329 pcmpeqd %xmm5,%xmm2 2330 movdqa %xmm1,`16*($i+1)+112`(%r10) 2331 2332 pcmpeqd %xmm5,%xmm3 2333 movdqa %xmm2,`16*($i+2)+112`(%r10) 2334 2335 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register 2336 pand `16*($i+1)-128`($bptr),%xmm1 2337 pand `16*($i+2)-128`($bptr),%xmm2 2338 movdqa %xmm3,`16*($i+3)+112`(%r10) 2339 pand `16*($i+3)-128`($bptr),%xmm3 2340 por %xmm2,%xmm0 2341 por %xmm3,%xmm1 2342___ 2343for($i=0;$i<$STRIDE/16-4;$i+=4) { 2344$code.=<<___; 2345 movdqa `16*($i+0)-128`($bptr),%xmm4 2346 movdqa `16*($i+1)-128`($bptr),%xmm5 2347 movdqa `16*($i+2)-128`($bptr),%xmm2 2348 pand `16*($i+0)+112`(%r10),%xmm4 2349 movdqa `16*($i+3)-128`($bptr),%xmm3 2350 pand `16*($i+1)+112`(%r10),%xmm5 2351 por %xmm4,%xmm0 2352 pand `16*($i+2)+112`(%r10),%xmm2 2353 por %xmm5,%xmm1 2354 pand `16*($i+3)+112`(%r10),%xmm3 2355 por %xmm2,%xmm0 2356 por %xmm3,%xmm1 2357___ 2358} 2359$code.=<<___; 2360 pxor %xmm1,%xmm0 2361 pshufd \$0x4e,%xmm0,%xmm1 2362 por %xmm1,%xmm0 2363 lea $STRIDE($bptr),$bptr 2364 movq %xmm0,%rdx # bp[0] 2365 lea 64+8*4+8(%rsp),$tptr 2366 2367 mov %rdx,$bi 2368 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2369 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2370 add %rax,%r11 2371 mulx 2*8($aptr),%rax,%r13 # ... 2372 adc %rax,%r12 2373 adc \$0,%r13 2374 mulx 3*8($aptr),%rax,%r14 2375 2376 mov $mi,%r15 2377 imulq 32+8(%rsp),$mi # "t[0]"*n0 2378 xor $zero,$zero # cf=0, of=0 2379 mov $mi,%rdx 2380 2381 mov $bptr,8+8(%rsp) # off-load &b[i] 2382 2383 lea 4*8($aptr),$aptr 2384 adcx %rax,%r13 2385 adcx $zero,%r14 # cf=0 2386 2387 mulx 0*8($nptr),%rax,%r10 2388 adcx %rax,%r15 # discarded 2389 adox %r11,%r10 2390 mulx 1*8($nptr),%rax,%r11 2391 adcx %rax,%r10 2392 adox %r12,%r11 2393 mulx 2*8($nptr),%rax,%r12 2394 mov 24+8(%rsp),$bptr # counter value 2395 mov %r10,-8*4($tptr) 2396 adcx %rax,%r11 2397 adox %r13,%r12 2398 mulx 3*8($nptr),%rax,%r15 2399 mov $bi,%rdx 2400 mov %r11,-8*3($tptr) 2401 adcx %rax,%r12 2402 adox $zero,%r15 # of=0 2403 lea 4*8($nptr),$nptr 2404 mov %r12,-8*2($tptr) 2405 jmp .Lmulx4x_1st 2406 2407.align 32 2408.Lmulx4x_1st: 2409 adcx $zero,%r15 # cf=0, modulo-scheduled 2410 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2411 adcx %r14,%r10 2412 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2413 adcx %rax,%r11 2414 mulx 2*8($aptr),%r12,%rax # ... 
2415 adcx %r14,%r12 2416 mulx 3*8($aptr),%r13,%r14 2417 .byte 0x67,0x67 2418 mov $mi,%rdx 2419 adcx %rax,%r13 2420 adcx $zero,%r14 # cf=0 2421 lea 4*8($aptr),$aptr 2422 lea 4*8($tptr),$tptr 2423 2424 adox %r15,%r10 2425 mulx 0*8($nptr),%rax,%r15 2426 adcx %rax,%r10 2427 adox %r15,%r11 2428 mulx 1*8($nptr),%rax,%r15 2429 adcx %rax,%r11 2430 adox %r15,%r12 2431 mulx 2*8($nptr),%rax,%r15 2432 mov %r10,-5*8($tptr) 2433 adcx %rax,%r12 2434 mov %r11,-4*8($tptr) 2435 adox %r15,%r13 2436 mulx 3*8($nptr),%rax,%r15 2437 mov $bi,%rdx 2438 mov %r12,-3*8($tptr) 2439 adcx %rax,%r13 2440 adox $zero,%r15 2441 lea 4*8($nptr),$nptr 2442 mov %r13,-2*8($tptr) 2443 2444 dec $bptr # of=0, pass cf 2445 jnz .Lmulx4x_1st 2446 2447 mov 8(%rsp),$num # load -num 2448 adc $zero,%r15 # modulo-scheduled 2449 lea ($aptr,$num),$aptr # rewind $aptr 2450 add %r15,%r14 2451 mov 8+8(%rsp),$bptr # re-load &b[i] 2452 adc $zero,$zero # top-most carry 2453 mov %r14,-1*8($tptr) 2454 jmp .Lmulx4x_outer 2455 2456.align 32 2457.Lmulx4x_outer: 2458 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) 2459 pxor %xmm4,%xmm4 2460 .byte 0x67,0x67 2461 pxor %xmm5,%xmm5 2462___ 2463for($i=0;$i<$STRIDE/16;$i+=4) { 2464$code.=<<___; 2465 movdqa `16*($i+0)-128`($bptr),%xmm0 2466 movdqa `16*($i+1)-128`($bptr),%xmm1 2467 movdqa `16*($i+2)-128`($bptr),%xmm2 2468 pand `16*($i+0)+256`(%r10),%xmm0 2469 movdqa `16*($i+3)-128`($bptr),%xmm3 2470 pand `16*($i+1)+256`(%r10),%xmm1 2471 por %xmm0,%xmm4 2472 pand `16*($i+2)+256`(%r10),%xmm2 2473 por %xmm1,%xmm5 2474 pand `16*($i+3)+256`(%r10),%xmm3 2475 por %xmm2,%xmm4 2476 por %xmm3,%xmm5 2477___ 2478} 2479$code.=<<___; 2480 por %xmm5,%xmm4 2481 pshufd \$0x4e,%xmm4,%xmm0 2482 por %xmm4,%xmm0 2483 lea $STRIDE($bptr),$bptr 2484 movq %xmm0,%rdx # m0=bp[i] 2485 2486 mov $zero,($tptr) # save top-most carry 2487 lea 4*8($tptr,$num),$tptr # rewind $tptr 2488 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2489 xor $zero,$zero # cf=0, of=0 2490 mov %rdx,$bi 2491 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2492 adox -4*8($tptr),$mi # +t[0] 2493 adcx %r14,%r11 2494 mulx 2*8($aptr),%r15,%r13 # ... 2495 adox -3*8($tptr),%r11 2496 adcx %r15,%r12 2497 mulx 3*8($aptr),%rdx,%r14 2498 adox -2*8($tptr),%r12 2499 adcx %rdx,%r13 2500 lea ($nptr,$num),$nptr # rewind $nptr 2501 lea 4*8($aptr),$aptr 2502 adox -1*8($tptr),%r13 2503 adcx $zero,%r14 2504 adox $zero,%r14 2505 2506 mov $mi,%r15 2507 imulq 32+8(%rsp),$mi # "t[0]"*n0 2508 2509 mov $mi,%rdx 2510 xor $zero,$zero # cf=0, of=0 2511 mov $bptr,8+8(%rsp) # off-load &b[i] 2512 2513 mulx 0*8($nptr),%rax,%r10 2514 adcx %rax,%r15 # discarded 2515 adox %r11,%r10 2516 mulx 1*8($nptr),%rax,%r11 2517 adcx %rax,%r10 2518 adox %r12,%r11 2519 mulx 2*8($nptr),%rax,%r12 2520 adcx %rax,%r11 2521 adox %r13,%r12 2522 mulx 3*8($nptr),%rax,%r15 2523 mov $bi,%rdx 2524 mov 24+8(%rsp),$bptr # counter value 2525 mov %r10,-8*4($tptr) 2526 adcx %rax,%r12 2527 mov %r11,-8*3($tptr) 2528 adox $zero,%r15 # of=0 2529 mov %r12,-8*2($tptr) 2530 lea 4*8($nptr),$nptr 2531 jmp .Lmulx4x_inner 2532 2533.align 32 2534.Lmulx4x_inner: 2535 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2536 adcx $zero,%r15 # cf=0, modulo-scheduled 2537 adox %r14,%r10 2538 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2539 adcx 0*8($tptr),%r10 2540 adox %rax,%r11 2541 mulx 2*8($aptr),%r12,%rax # ... 
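#
# Both .Lmulx4x_1st above and .Lmulx4x_inner here walk t[] four limbs at
# a time, carrying the a[]*b[i] products in CF (adcx) and the m*n[]
# reduction products in OF (adox). Stripped of the unrolling and the
# register scheduling, one outer iteration amounts to the classic CIOS
# step below (illustrative C only, not part of the build; u128 relies on
# the gcc/clang __int128 extension, n0 is -1/n[0] mod 2^64):
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128  u128;
#
#	/* t = (t + a*b_i + m*n) / 2^64, with m chosen so that the low
#	 * limb cancels; t[] has num+1 limbs, t[num] being the top word */
#	static void mont_word(u64 *t, const u64 *a, const u64 *n,
#			      int num, u64 b_i, u64 n0)
#	{
#		u64  m = (t[0] + a[0]*b_i)*n0;	/* low 64 bits only */
#		u128 ca = 0, cm = 0;		/* the two carry chains */
#		for (int j = 0; j < num; j++) {
#			ca += (u128)a[j]*b_i + t[j];
#			cm += (u128)m*n[j] + (u64)ca;
#			if (j) t[j-1] = (u64)cm;	/* limb 0 drops out */
#			ca >>= 64;
#			cm >>= 64;
#		}
#		cm += ca + t[num];		/* fold both carries */
#		t[num-1] = (u64)cm;
#		t[num]   = (u64)(cm >> 64);
#	}
#
# Keeping the two chains in separate flags is what lets MULX/ADCX/ADOX
# run them back to back without saving and restoring carries.
#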
2542 adcx 1*8($tptr),%r11 2543 adox %r14,%r12 2544 mulx 3*8($aptr),%r13,%r14 2545 mov $mi,%rdx 2546 adcx 2*8($tptr),%r12 2547 adox %rax,%r13 2548 adcx 3*8($tptr),%r13 2549 adox $zero,%r14 # of=0 2550 lea 4*8($aptr),$aptr 2551 lea 4*8($tptr),$tptr 2552 adcx $zero,%r14 # cf=0 2553 2554 adox %r15,%r10 2555 mulx 0*8($nptr),%rax,%r15 2556 adcx %rax,%r10 2557 adox %r15,%r11 2558 mulx 1*8($nptr),%rax,%r15 2559 adcx %rax,%r11 2560 adox %r15,%r12 2561 mulx 2*8($nptr),%rax,%r15 2562 mov %r10,-5*8($tptr) 2563 adcx %rax,%r12 2564 adox %r15,%r13 2565 mov %r11,-4*8($tptr) 2566 mulx 3*8($nptr),%rax,%r15 2567 mov $bi,%rdx 2568 lea 4*8($nptr),$nptr 2569 mov %r12,-3*8($tptr) 2570 adcx %rax,%r13 2571 adox $zero,%r15 2572 mov %r13,-2*8($tptr) 2573 2574 dec $bptr # of=0, pass cf 2575 jnz .Lmulx4x_inner 2576 2577 mov 0+8(%rsp),$num # load -num 2578 adc $zero,%r15 # modulo-scheduled 2579 sub 0*8($tptr),$bptr # pull top-most carry to %cf 2580 mov 8+8(%rsp),$bptr # re-load &b[i] 2581 mov 16+8(%rsp),%r10 2582 adc %r15,%r14 2583 lea ($aptr,$num),$aptr # rewind $aptr 2584 adc $zero,$zero # top-most carry 2585 mov %r14,-1*8($tptr) 2586 2587 cmp %r10,$bptr 2588 jb .Lmulx4x_outer 2589 2590 mov -8($nptr),%r10 2591 mov $zero,%r8 2592 mov ($nptr,$num),%r12 2593 lea ($nptr,$num),%rbp # rewind $nptr 2594 mov $num,%rcx 2595 lea ($tptr,$num),%rdi # rewind $tptr 2596 xor %eax,%eax 2597 xor %r15,%r15 2598 sub %r14,%r10 # compare top-most words 2599 adc %r15,%r15 2600 or %r15,%r8 2601 sar \$3+2,%rcx 2602 sub %r8,%rax # %rax=-%r8 2603 mov 56+8(%rsp),%rdx # restore rp 2604 dec %r12 # so that after 'not' we get -n[0] 2605 mov 8*1(%rbp),%r13 2606 xor %r8,%r8 2607 mov 8*2(%rbp),%r14 2608 mov 8*3(%rbp),%r15 2609 jmp .Lsqrx4x_sub_entry # common post-condition 2610.size mulx4x_internal,.-mulx4x_internal 2611___ 2612}{ 2613###################################################################### 2614# void bn_power5( 2615my $rptr="%rdi"; # BN_ULONG *rptr, 2616my $aptr="%rsi"; # const BN_ULONG *aptr, 2617my $bptr="%rdx"; # const void *table, 2618my $nptr="%rcx"; # const BN_ULONG *nptr, 2619my $n0 ="%r8"; # const BN_ULONG *n0); 2620my $num ="%r9"; # int num, has to be divisible by 8 2621 # int pwr); 2622 2623my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 2624my @A0=("%r10","%r11"); 2625my @A1=("%r12","%r13"); 2626my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 2627 2628$code.=<<___; 2629.type bn_powerx5,\@function,6 2630.align 32 2631bn_powerx5: 2632.Lpowerx5_enter: 2633 mov %rsp,%rax 2634 push %rbx 2635 push %rbp 2636 push %r12 2637 push %r13 2638 push %r14 2639 push %r15 2640 2641 shl \$3,${num}d # convert $num to bytes 2642 lea ($num,$num,2),%r10 # 3*$num in bytes 2643 neg $num 2644 mov ($n0),$n0 # *n0 2645 2646 ############################################################## 2647 # Ensure that stack frame doesn't alias with $rptr+3*$num 2648 # modulo 4096, which covers ret[num], am[num] and n[num] 2649 # (see bn_exp.c). This is done to allow memory disambiguation 2650 # logic do its magic. [Extra 256 bytes is for power mask 2651 # calculated from 7th argument, the index.] 
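#
# For orientation: the body below performs one 5-bit window step of the
# constant-time exponentiation, i.e. five Montgomery squarings followed
# by one Montgomery multiplication with the entry gathered from the
# scattered powers table. In rough C (helper names are illustrative
# only, everything stays in the Montgomery domain):
#
#	mont_sqr(r, a, np, n0, num);		/* r = a^2 *R^-1 mod n */
#	for (int k = 1; k < 5; k++)
#		mont_sqr(r, r, np, n0, num);	/* r = r^2 *R^-1 mod n */
#	gather32(b, table, num, pwr);		/* constant-time lookup */
#	mont_mul(r, r, b, np, n0, num);		/* r = r*b *R^-1 mod n */
#
# which is why the code below issues __bn_sqrx8x_internal and
# __bn_postx4x_internal five times back to back before the final
# mulx4x_internal call.
#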
2652 # 2653 lea -320(%rsp,$num,2),%r11 2654 sub $rptr,%r11 2655 and \$4095,%r11 2656 cmp %r11,%r10 2657 jb .Lpwrx_sp_alt 2658 sub %r11,%rsp # align with $aptr 2659 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) 2660 jmp .Lpwrx_sp_done 2661 2662.align 32 2663.Lpwrx_sp_alt: 2664 lea 4096-320(,$num,2),%r10 2665 lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) 2666 sub %r10,%r11 2667 mov \$0,%r10 2668 cmovc %r10,%r11 2669 sub %r11,%rsp 2670.Lpwrx_sp_done: 2671 and \$-64,%rsp 2672 mov %rax,%r11 2673 sub %rsp,%r11 2674 and \$-4096,%r11 2675.Lpwrx_page_walk: 2676 mov (%rsp,%r11),%r10 2677 sub \$4096,%r11 2678 .byte 0x2e # predict non-taken 2679 jnc .Lpwrx_page_walk 2680 2681 mov $num,%r10 2682 neg $num 2683 2684 ############################################################## 2685 # Stack layout 2686 # 2687 # +0 saved $num, used in reduction section 2688 # +8 &t[2*$num], used in reduction section 2689 # +16 intermediate carry bit 2690 # +24 top-most carry bit, used in reduction section 2691 # +32 saved *n0 2692 # +40 saved %rsp 2693 # +48 t[2*$num] 2694 # 2695 pxor %xmm0,%xmm0 2696 movq $rptr,%xmm1 # save $rptr 2697 movq $nptr,%xmm2 # save $nptr 2698 movq %r10, %xmm3 # -$num 2699 movq $bptr,%xmm4 2700 mov $n0, 32(%rsp) 2701 mov %rax, 40(%rsp) # save original %rsp 2702.Lpowerx5_body: 2703 2704 call __bn_sqrx8x_internal 2705 call __bn_postx4x_internal 2706 call __bn_sqrx8x_internal 2707 call __bn_postx4x_internal 2708 call __bn_sqrx8x_internal 2709 call __bn_postx4x_internal 2710 call __bn_sqrx8x_internal 2711 call __bn_postx4x_internal 2712 call __bn_sqrx8x_internal 2713 call __bn_postx4x_internal 2714 2715 mov %r10,$num # -num 2716 mov $aptr,$rptr 2717 movq %xmm2,$nptr 2718 movq %xmm4,$bptr 2719 mov 40(%rsp),%rax 2720 2721 call mulx4x_internal 2722 2723 mov 40(%rsp),%rsi # restore %rsp 2724 mov \$1,%rax 2725 2726 mov -48(%rsi),%r15 2727 mov -40(%rsi),%r14 2728 mov -32(%rsi),%r13 2729 mov -24(%rsi),%r12 2730 mov -16(%rsi),%rbp 2731 mov -8(%rsi),%rbx 2732 lea (%rsi),%rsp 2733.Lpowerx5_epilogue: 2734 ret 2735.size bn_powerx5,.-bn_powerx5 2736 2737.globl bn_sqrx8x_internal 2738.hidden bn_sqrx8x_internal 2739.type bn_sqrx8x_internal,\@abi-omnipotent 2740.align 32 2741bn_sqrx8x_internal: 2742__bn_sqrx8x_internal: 2743 ################################################################## 2744 # Squaring part: 2745 # 2746 # a) multiply-n-add everything but a[i]*a[i]; 2747 # b) shift result of a) by 1 to the left and accumulate 2748 # a[i]*a[i] products; 2749 # 2750 ################################################################## 2751 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2752 # a[1]a[0] 2753 # a[2]a[0] 2754 # a[3]a[0] 2755 # a[2]a[1] 2756 # a[3]a[1] 2757 # a[3]a[2] 2758 # 2759 # a[4]a[0] 2760 # a[5]a[0] 2761 # a[6]a[0] 2762 # a[7]a[0] 2763 # a[4]a[1] 2764 # a[5]a[1] 2765 # a[6]a[1] 2766 # a[7]a[1] 2767 # a[4]a[2] 2768 # a[5]a[2] 2769 # a[6]a[2] 2770 # a[7]a[2] 2771 # a[4]a[3] 2772 # a[5]a[3] 2773 # a[6]a[3] 2774 # a[7]a[3] 2775 # 2776 # a[5]a[4] 2777 # a[6]a[4] 2778 # a[7]a[4] 2779 # a[6]a[5] 2780 # a[7]a[5] 2781 # a[7]a[6] 2782 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2783___ 2784{ 2785my ($zero,$carry)=("%rbp","%rcx"); 2786my $aaptr=$zero; 2787$code.=<<___; 2788 lea 48+8(%rsp),$tptr 2789 lea ($aptr,$num),$aaptr 2790 mov $num,0+8(%rsp) # save $num 2791 mov $aaptr,8+8(%rsp) # save end of $aptr 2792 jmp .Lsqr8x_zero_start 2793 2794.align 32 2795.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2796.Lsqrx8x_zero: 2797 .byte 
0x3e 2798 movdqa %xmm0,0*8($tptr) 2799 movdqa %xmm0,2*8($tptr) 2800 movdqa %xmm0,4*8($tptr) 2801 movdqa %xmm0,6*8($tptr) 2802.Lsqr8x_zero_start: # aligned at 32 2803 movdqa %xmm0,8*8($tptr) 2804 movdqa %xmm0,10*8($tptr) 2805 movdqa %xmm0,12*8($tptr) 2806 movdqa %xmm0,14*8($tptr) 2807 lea 16*8($tptr),$tptr 2808 sub \$64,$num 2809 jnz .Lsqrx8x_zero 2810 2811 mov 0*8($aptr),%rdx # a[0], modulo-scheduled 2812 #xor %r9,%r9 # t[1], ex-$num, zero already 2813 xor %r10,%r10 2814 xor %r11,%r11 2815 xor %r12,%r12 2816 xor %r13,%r13 2817 xor %r14,%r14 2818 xor %r15,%r15 2819 lea 48+8(%rsp),$tptr 2820 xor $zero,$zero # cf=0, cf=0 2821 jmp .Lsqrx8x_outer_loop 2822 2823.align 32 2824.Lsqrx8x_outer_loop: 2825 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 2826 adcx %r9,%r8 # a[1]*a[0]+=t[1] 2827 adox %rax,%r10 2828 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 2829 adcx %r10,%r9 2830 adox %rax,%r11 2831 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 2832 adcx %r11,%r10 2833 adox %rax,%r12 2834 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 2835 adcx %r12,%r11 2836 adox %rax,%r13 2837 mulx 5*8($aptr),%r12,%rax 2838 adcx %r13,%r12 2839 adox %rax,%r14 2840 mulx 6*8($aptr),%r13,%rax 2841 adcx %r14,%r13 2842 adox %r15,%rax 2843 mulx 7*8($aptr),%r14,%r15 2844 mov 1*8($aptr),%rdx # a[1] 2845 adcx %rax,%r14 2846 adox $zero,%r15 2847 adc 8*8($tptr),%r15 2848 mov %r8,1*8($tptr) # t[1] 2849 mov %r9,2*8($tptr) # t[2] 2850 sbb $carry,$carry # mov %cf,$carry 2851 xor $zero,$zero # cf=0, of=0 2852 2853 2854 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 2855 mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 2856 adcx %r10,%r8 2857 adox %rbx,%r9 2858 mulx 4*8($aptr),%r10,%rbx # ... 2859 adcx %r11,%r9 2860 adox %rax,%r10 2861 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 2862 adcx %r12,%r10 2863 adox %rbx,%r11 2864 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 2865 adcx %r13,%r11 2866 adox %r14,%r12 2867 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 2868 mov 2*8($aptr),%rdx # a[2] 2869 adcx %rax,%r12 2870 adox %rbx,%r13 2871 adcx %r15,%r13 2872 adox $zero,%r14 # of=0 2873 adcx $zero,%r14 # cf=0 2874 2875 mov %r8,3*8($tptr) # t[3] 2876 mov %r9,4*8($tptr) # t[4] 2877 2878 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 2879 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 2880 adcx %r10,%r8 2881 adox %rbx,%r9 2882 mulx 5*8($aptr),%r10,%rbx # ... 2883 adcx %r11,%r9 2884 adox %rax,%r10 2885 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 2886 adcx %r12,%r10 2887 adox %r13,%r11 2888 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 2889 .byte 0x3e 2890 mov 3*8($aptr),%rdx # a[3] 2891 adcx %rbx,%r11 2892 adox %rax,%r12 2893 adcx %r14,%r12 2894 mov %r8,5*8($tptr) # t[5] 2895 mov %r9,6*8($tptr) # t[6] 2896 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 2897 adox $zero,%r13 # of=0 2898 adcx $zero,%r13 # cf=0 2899 2900 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 2901 adcx %r10,%r8 2902 adox %rax,%r9 2903 mulx 6*8($aptr),%r10,%rax # ... 
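#
# The comment block at the top of bn_sqrx8x_internal describes the plan;
# in plain C the two phases look roughly like this (illustrative only,
# u64/u128 as in the earlier sketch, t[] is 2*num limbs and pre-zeroed):
#
#	static void sqr_schoolbook(u64 *t, const u64 *a, int num)
#	{
#		/* a) every cross product a[i]*a[j], i < j */
#		for (int i = 0; i < num; i++) {
#			u128 c = 0;
#			for (int j = i + 1; j < num; j++) {
#				c += (u128)a[i]*a[j] + t[i+j];
#				t[i+j] = (u64)c;
#				c >>= 64;
#			}
#			t[i+num] = (u64)c;
#		}
#		/* b) double the whole thing ... */
#		u64 bit = 0;
#		for (int k = 0; k < 2*num; k++) {
#			u64 v = t[k];
#			t[k] = (v << 1) | bit;
#			bit = v >> 63;
#		}
#		/* ... and fold in the diagonal a[i]^2 */
#		u128 c = 0;
#		for (int i = 0; i < num; i++) {
#			c += (u128)a[i]*a[i] + t[2*i];
#			t[2*i]   = (u64)c;	c >>= 64;
#			c += t[2*i+1];
#			t[2*i+1] = (u64)c;	c >>= 64;
#		}
#	}
#
# The assembly fuses the two passes of phase b) into the single
# .Lsqrx4x_shift_n_add loop further down, doubling the t[] words with
# adox and folding them into the a[i]*a[i] halves with adcx.
#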
2904 adcx %r11,%r9 2905 adox %r12,%r10 2906 mulx 7*8($aptr),%r11,%r12 2907 mov 4*8($aptr),%rdx # a[4] 2908 mov 5*8($aptr),%r14 # a[5] 2909 adcx %rbx,%r10 2910 adox %rax,%r11 2911 mov 6*8($aptr),%r15 # a[6] 2912 adcx %r13,%r11 2913 adox $zero,%r12 # of=0 2914 adcx $zero,%r12 # cf=0 2915 2916 mov %r8,7*8($tptr) # t[7] 2917 mov %r9,8*8($tptr) # t[8] 2918 2919 mulx %r14,%r9,%rax # a[5]*a[4] 2920 mov 7*8($aptr),%r8 # a[7] 2921 adcx %r10,%r9 2922 mulx %r15,%r10,%rbx # a[6]*a[4] 2923 adox %rax,%r10 2924 adcx %r11,%r10 2925 mulx %r8,%r11,%rax # a[7]*a[4] 2926 mov %r14,%rdx # a[5] 2927 adox %rbx,%r11 2928 adcx %r12,%r11 2929 #adox $zero,%rax # of=0 2930 adcx $zero,%rax # cf=0 2931 2932 mulx %r15,%r14,%rbx # a[6]*a[5] 2933 mulx %r8,%r12,%r13 # a[7]*a[5] 2934 mov %r15,%rdx # a[6] 2935 lea 8*8($aptr),$aptr 2936 adcx %r14,%r11 2937 adox %rbx,%r12 2938 adcx %rax,%r12 2939 adox $zero,%r13 2940 2941 .byte 0x67,0x67 2942 mulx %r8,%r8,%r14 # a[7]*a[6] 2943 adcx %r8,%r13 2944 adcx $zero,%r14 2945 2946 cmp 8+8(%rsp),$aptr 2947 je .Lsqrx8x_outer_break 2948 2949 neg $carry # mov $carry,%cf 2950 mov \$-8,%rcx 2951 mov $zero,%r15 2952 mov 8*8($tptr),%r8 2953 adcx 9*8($tptr),%r9 # +=t[9] 2954 adcx 10*8($tptr),%r10 # ... 2955 adcx 11*8($tptr),%r11 2956 adc 12*8($tptr),%r12 2957 adc 13*8($tptr),%r13 2958 adc 14*8($tptr),%r14 2959 adc 15*8($tptr),%r15 2960 lea ($aptr),$aaptr 2961 lea 2*64($tptr),$tptr 2962 sbb %rax,%rax # mov %cf,$carry 2963 2964 mov -64($aptr),%rdx # a[0] 2965 mov %rax,16+8(%rsp) # offload $carry 2966 mov $tptr,24+8(%rsp) 2967 2968 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 2969 xor %eax,%eax # cf=0, of=0 2970 jmp .Lsqrx8x_loop 2971 2972.align 32 2973.Lsqrx8x_loop: 2974 mov %r8,%rbx 2975 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 2976 adcx %rax,%rbx # +=t[8] 2977 adox %r9,%r8 2978 2979 mulx 1*8($aaptr),%rax,%r9 # ... 2980 adcx %rax,%r8 2981 adox %r10,%r9 2982 2983 mulx 2*8($aaptr),%rax,%r10 2984 adcx %rax,%r9 2985 adox %r11,%r10 2986 2987 mulx 3*8($aaptr),%rax,%r11 2988 adcx %rax,%r10 2989 adox %r12,%r11 2990 2991 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 2992 adcx %rax,%r11 2993 adox %r13,%r12 2994 2995 mulx 5*8($aaptr),%rax,%r13 2996 adcx %rax,%r12 2997 adox %r14,%r13 2998 2999 mulx 6*8($aaptr),%rax,%r14 3000 mov %rbx,($tptr,%rcx,8) # store t[8+i] 3001 mov \$0,%ebx 3002 adcx %rax,%r13 3003 adox %r15,%r14 3004 3005 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 3006 mov 8($aptr,%rcx,8),%rdx # a[i] 3007 adcx %rax,%r14 3008 adox %rbx,%r15 # %rbx is 0, of=0 3009 adcx %rbx,%r15 # cf=0 3010 3011 .byte 0x67 3012 inc %rcx # of=0 3013 jnz .Lsqrx8x_loop 3014 3015 lea 8*8($aaptr),$aaptr 3016 mov \$-8,%rcx 3017 cmp 8+8(%rsp),$aaptr # done? 
3018 je .Lsqrx8x_break 3019 3020 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3021 .byte 0x66 3022 mov -64($aptr),%rdx 3023 adcx 0*8($tptr),%r8 3024 adcx 1*8($tptr),%r9 3025 adc 2*8($tptr),%r10 3026 adc 3*8($tptr),%r11 3027 adc 4*8($tptr),%r12 3028 adc 5*8($tptr),%r13 3029 adc 6*8($tptr),%r14 3030 adc 7*8($tptr),%r15 3031 lea 8*8($tptr),$tptr 3032 .byte 0x67 3033 sbb %rax,%rax # mov %cf,%rax 3034 xor %ebx,%ebx # cf=0, of=0 3035 mov %rax,16+8(%rsp) # offload carry 3036 jmp .Lsqrx8x_loop 3037 3038.align 32 3039.Lsqrx8x_break: 3040 sub 16+8(%rsp),%r8 # consume last carry 3041 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 3042 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 3043 xor %ebp,%ebp # xor $zero,$zero 3044 mov %r8,0*8($tptr) 3045 cmp $carry,$tptr # cf=0, of=0 3046 je .Lsqrx8x_outer_loop 3047 3048 mov %r9,1*8($tptr) 3049 mov 1*8($carry),%r9 3050 mov %r10,2*8($tptr) 3051 mov 2*8($carry),%r10 3052 mov %r11,3*8($tptr) 3053 mov 3*8($carry),%r11 3054 mov %r12,4*8($tptr) 3055 mov 4*8($carry),%r12 3056 mov %r13,5*8($tptr) 3057 mov 5*8($carry),%r13 3058 mov %r14,6*8($tptr) 3059 mov 6*8($carry),%r14 3060 mov %r15,7*8($tptr) 3061 mov 7*8($carry),%r15 3062 mov $carry,$tptr 3063 jmp .Lsqrx8x_outer_loop 3064 3065.align 32 3066.Lsqrx8x_outer_break: 3067 mov %r9,9*8($tptr) # t[9] 3068 movq %xmm3,%rcx # -$num 3069 mov %r10,10*8($tptr) # ... 3070 mov %r11,11*8($tptr) 3071 mov %r12,12*8($tptr) 3072 mov %r13,13*8($tptr) 3073 mov %r14,14*8($tptr) 3074___ 3075}{ 3076my $i="%rcx"; 3077$code.=<<___; 3078 lea 48+8(%rsp),$tptr 3079 mov ($aptr,$i),%rdx # a[0] 3080 3081 mov 8($tptr),$A0[1] # t[1] 3082 xor $A0[0],$A0[0] # t[0], of=0, cf=0 3083 mov 0+8(%rsp),$num # restore $num 3084 adox $A0[1],$A0[1] 3085 mov 16($tptr),$A1[0] # t[2] # prefetch 3086 mov 24($tptr),$A1[1] # t[3] # prefetch 3087 #jmp .Lsqrx4x_shift_n_add # happens to be aligned 3088 3089.align 32 3090.Lsqrx4x_shift_n_add: 3091 mulx %rdx,%rax,%rbx 3092 adox $A1[0],$A1[0] 3093 adcx $A0[0],%rax 3094 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 3095 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch 3096 adox $A1[1],$A1[1] 3097 adcx $A0[1],%rbx 3098 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 3099 mov %rax,0($tptr) 3100 mov %rbx,8($tptr) 3101 3102 mulx %rdx,%rax,%rbx 3103 adox $A0[0],$A0[0] 3104 adcx $A1[0],%rax 3105 mov 16($aptr,$i),%rdx # a[i+2] # prefetch 3106 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 3107 adox $A0[1],$A0[1] 3108 adcx $A1[1],%rbx 3109 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 3110 mov %rax,16($tptr) 3111 mov %rbx,24($tptr) 3112 3113 mulx %rdx,%rax,%rbx 3114 adox $A1[0],$A1[0] 3115 adcx $A0[0],%rax 3116 mov 24($aptr,$i),%rdx # a[i+3] # prefetch 3117 lea 32($i),$i 3118 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 3119 adox $A1[1],$A1[1] 3120 adcx $A0[1],%rbx 3121 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 3122 mov %rax,32($tptr) 3123 mov %rbx,40($tptr) 3124 3125 mulx %rdx,%rax,%rbx 3126 adox $A0[0],$A0[0] 3127 adcx $A1[0],%rax 3128 jrcxz .Lsqrx4x_shift_n_add_break 3129 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 3130 adox $A0[1],$A0[1] 3131 adcx $A1[1],%rbx 3132 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 3133 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 3134 mov %rax,48($tptr) 3135 mov %rbx,56($tptr) 3136 lea 64($tptr),$tptr 3137 nop 3138 jmp .Lsqrx4x_shift_n_add 3139 3140.align 32 3141.Lsqrx4x_shift_n_add_break: 3142 adcx $A1[1],%rbx 3143 mov %rax,48($tptr) 3144 mov %rbx,56($tptr) 3145 lea 64($tptr),$tptr # 
end of t[] buffer 3146___ 3147} 3148###################################################################### 3149# Montgomery reduction part, "word-by-word" algorithm. 3150# 3151# This new path is inspired by multiple submissions from Intel, by 3152# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 3153# Vinodh Gopal... 3154{ 3155my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 3156 3157$code.=<<___; 3158 movq %xmm2,$nptr 3159__bn_sqrx8x_reduction: 3160 xor %eax,%eax # initial top-most carry bit 3161 mov 32+8(%rsp),%rbx # n0 3162 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 3163 lea -8*8($nptr,$num),%rcx # end of n[] 3164 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 3165 mov %rcx, 0+8(%rsp) # save end of n[] 3166 mov $tptr,8+8(%rsp) # save end of t[] 3167 3168 lea 48+8(%rsp),$tptr # initial t[] window 3169 jmp .Lsqrx8x_reduction_loop 3170 3171.align 32 3172.Lsqrx8x_reduction_loop: 3173 mov 8*1($tptr),%r9 3174 mov 8*2($tptr),%r10 3175 mov 8*3($tptr),%r11 3176 mov 8*4($tptr),%r12 3177 mov %rdx,%r8 3178 imulq %rbx,%rdx # n0*a[i] 3179 mov 8*5($tptr),%r13 3180 mov 8*6($tptr),%r14 3181 mov 8*7($tptr),%r15 3182 mov %rax,24+8(%rsp) # store top-most carry bit 3183 3184 lea 8*8($tptr),$tptr 3185 xor $carry,$carry # cf=0,of=0 3186 mov \$-8,%rcx 3187 jmp .Lsqrx8x_reduce 3188 3189.align 32 3190.Lsqrx8x_reduce: 3191 mov %r8, %rbx 3192 mulx 8*0($nptr),%rax,%r8 # n[0] 3193 adcx %rbx,%rax # discarded 3194 adox %r9,%r8 3195 3196 mulx 8*1($nptr),%rbx,%r9 # n[1] 3197 adcx %rbx,%r8 3198 adox %r10,%r9 3199 3200 mulx 8*2($nptr),%rbx,%r10 3201 adcx %rbx,%r9 3202 adox %r11,%r10 3203 3204 mulx 8*3($nptr),%rbx,%r11 3205 adcx %rbx,%r10 3206 adox %r12,%r11 3207 3208 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 3209 mov %rdx,%rax 3210 mov %r8,%rdx 3211 adcx %rbx,%r11 3212 adox %r13,%r12 3213 3214 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3215 mov %rax,%rdx 3216 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3217 3218 mulx 8*5($nptr),%rax,%r13 3219 adcx %rax,%r12 3220 adox %r14,%r13 3221 3222 mulx 8*6($nptr),%rax,%r14 3223 adcx %rax,%r13 3224 adox %r15,%r14 3225 3226 mulx 8*7($nptr),%rax,%r15 3227 mov %rbx,%rdx 3228 adcx %rax,%r14 3229 adox $carry,%r15 # $carry is 0 3230 adcx $carry,%r15 # cf=0 3231 3232 .byte 0x67,0x67,0x67 3233 inc %rcx # of=0 3234 jnz .Lsqrx8x_reduce 3235 3236 mov $carry,%rax # xor %rax,%rax 3237 cmp 0+8(%rsp),$nptr # end of n[]? 
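#
# In C terms, the word-by-word reduction in .Lsqrx8x_reduce and its tail
# below boils down to the following (illustrative only; u64/u128 as in
# the earlier sketch, t[] holds the 2*num-limb square plus one extra top
# limb, n0 as in the multiplication sketch):
#
#	static void mont_reduce(u64 *t, const u64 *n, int num, u64 n0)
#	{
#		for (int i = 0; i < num; i++) {
#			u64  m = t[i]*n0;	/* low 64 bits only */
#			u128 c = 0;
#			for (int j = 0; j < num; j++) {
#				c += (u128)m*n[j] + t[i+j];
#				t[i+j] = (u64)c;	/* t[i] becomes 0 */
#				c >>= 64;
#			}
#			for (int k = i + num; c != 0; k++) {
#				c += t[k];	/* ripple into upper half */
#				t[k] = (u64)c;
#				c >>= 64;
#			}
#		}
#		/* result is t[num..2*num]; at most one final subtraction
#		 * of n is still needed, which __bn_postx4x_internal does
#		 * branch-free with andn masks */
#	}
#
# The assembly batches the n0*t[i] factors eight at a time (the
# .Lsqrx8x_reduce loop puts them aside on the stack) so that mulx can
# stream through n[] with the two flag-separated carry chains, exactly
# as in the multiplication loops.
#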
3238 jae .Lsqrx8x_no_tail 3239 3240 mov 48+8(%rsp),%rdx # pull n0*a[0] 3241 add 8*0($tptr),%r8 3242 lea 8*8($nptr),$nptr 3243 mov \$-8,%rcx 3244 adcx 8*1($tptr),%r9 3245 adcx 8*2($tptr),%r10 3246 adc 8*3($tptr),%r11 3247 adc 8*4($tptr),%r12 3248 adc 8*5($tptr),%r13 3249 adc 8*6($tptr),%r14 3250 adc 8*7($tptr),%r15 3251 lea 8*8($tptr),$tptr 3252 sbb %rax,%rax # top carry 3253 3254 xor $carry,$carry # of=0, cf=0 3255 mov %rax,16+8(%rsp) 3256 jmp .Lsqrx8x_tail 3257 3258.align 32 3259.Lsqrx8x_tail: 3260 mov %r8,%rbx 3261 mulx 8*0($nptr),%rax,%r8 3262 adcx %rax,%rbx 3263 adox %r9,%r8 3264 3265 mulx 8*1($nptr),%rax,%r9 3266 adcx %rax,%r8 3267 adox %r10,%r9 3268 3269 mulx 8*2($nptr),%rax,%r10 3270 adcx %rax,%r9 3271 adox %r11,%r10 3272 3273 mulx 8*3($nptr),%rax,%r11 3274 adcx %rax,%r10 3275 adox %r12,%r11 3276 3277 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 3278 adcx %rax,%r11 3279 adox %r13,%r12 3280 3281 mulx 8*5($nptr),%rax,%r13 3282 adcx %rax,%r12 3283 adox %r14,%r13 3284 3285 mulx 8*6($nptr),%rax,%r14 3286 adcx %rax,%r13 3287 adox %r15,%r14 3288 3289 mulx 8*7($nptr),%rax,%r15 3290 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3291 adcx %rax,%r14 3292 adox $carry,%r15 3293 mov %rbx,($tptr,%rcx,8) # save result 3294 mov %r8,%rbx 3295 adcx $carry,%r15 # cf=0 3296 3297 inc %rcx # of=0 3298 jnz .Lsqrx8x_tail 3299 3300 cmp 0+8(%rsp),$nptr # end of n[]? 3301 jae .Lsqrx8x_tail_done # break out of loop 3302 3303 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3304 mov 48+8(%rsp),%rdx # pull n0*a[0] 3305 lea 8*8($nptr),$nptr 3306 adc 8*0($tptr),%r8 3307 adc 8*1($tptr),%r9 3308 adc 8*2($tptr),%r10 3309 adc 8*3($tptr),%r11 3310 adc 8*4($tptr),%r12 3311 adc 8*5($tptr),%r13 3312 adc 8*6($tptr),%r14 3313 adc 8*7($tptr),%r15 3314 lea 8*8($tptr),$tptr 3315 sbb %rax,%rax 3316 sub \$8,%rcx # mov \$-8,%rcx 3317 3318 xor $carry,$carry # of=0, cf=0 3319 mov %rax,16+8(%rsp) 3320 jmp .Lsqrx8x_tail 3321 3322.align 32 3323.Lsqrx8x_tail_done: 3324 add 24+8(%rsp),%r8 # can this overflow? 3325 adc \$0,%r9 3326 adc \$0,%r10 3327 adc \$0,%r11 3328 adc \$0,%r12 3329 adc \$0,%r13 3330 adc \$0,%r14 3331 adc \$0,%r15 # can't overflow, because we 3332 # started with "overhung" part 3333 # of multiplication 3334 mov $carry,%rax # xor %rax,%rax 3335 3336 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3337.Lsqrx8x_no_tail: # %cf is 0 if jumped here 3338 adc 8*0($tptr),%r8 3339 movq %xmm3,%rcx 3340 adc 8*1($tptr),%r9 3341 mov 8*7($nptr),$carry 3342 movq %xmm2,$nptr # restore $nptr 3343 adc 8*2($tptr),%r10 3344 adc 8*3($tptr),%r11 3345 adc 8*4($tptr),%r12 3346 adc 8*5($tptr),%r13 3347 adc 8*6($tptr),%r14 3348 adc 8*7($tptr),%r15 3349 adc %rax,%rax # top-most carry 3350 3351 mov 32+8(%rsp),%rbx # n0 3352 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3353 3354 mov %r8,8*0($tptr) # store top 512 bits 3355 lea 8*8($tptr),%r8 # borrow %r8 3356 mov %r9,8*1($tptr) 3357 mov %r10,8*2($tptr) 3358 mov %r11,8*3($tptr) 3359 mov %r12,8*4($tptr) 3360 mov %r13,8*5($tptr) 3361 mov %r14,8*6($tptr) 3362 mov %r15,8*7($tptr) 3363 3364 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3365 cmp 8+8(%rsp),%r8 # end of t[]? 
3366 jb .Lsqrx8x_reduction_loop 3367 ret 3368.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3369___ 3370} 3371############################################################## 3372# Post-condition, 4x unrolled 3373# 3374{ 3375my ($rptr,$nptr)=("%rdx","%rbp"); 3376$code.=<<___; 3377.align 32 3378__bn_postx4x_internal: 3379 mov 8*0($nptr),%r12 3380 mov %rcx,%r10 # -$num 3381 mov %rcx,%r9 # -$num 3382 neg %rax 3383 sar \$3+2,%rcx 3384 #lea 48+8(%rsp,%r9),$tptr 3385 movq %xmm1,$rptr # restore $rptr 3386 movq %xmm1,$aptr # prepare for back-to-back call 3387 dec %r12 # so that after 'not' we get -n[0] 3388 mov 8*1($nptr),%r13 3389 xor %r8,%r8 3390 mov 8*2($nptr),%r14 3391 mov 8*3($nptr),%r15 3392 jmp .Lsqrx4x_sub_entry 3393 3394.align 16 3395.Lsqrx4x_sub: 3396 mov 8*0($nptr),%r12 3397 mov 8*1($nptr),%r13 3398 mov 8*2($nptr),%r14 3399 mov 8*3($nptr),%r15 3400.Lsqrx4x_sub_entry: 3401 andn %rax,%r12,%r12 3402 lea 8*4($nptr),$nptr 3403 andn %rax,%r13,%r13 3404 andn %rax,%r14,%r14 3405 andn %rax,%r15,%r15 3406 3407 neg %r8 # mov %r8,%cf 3408 adc 8*0($tptr),%r12 3409 adc 8*1($tptr),%r13 3410 adc 8*2($tptr),%r14 3411 adc 8*3($tptr),%r15 3412 mov %r12,8*0($rptr) 3413 lea 8*4($tptr),$tptr 3414 mov %r13,8*1($rptr) 3415 sbb %r8,%r8 # mov %cf,%r8 3416 mov %r14,8*2($rptr) 3417 mov %r15,8*3($rptr) 3418 lea 8*4($rptr),$rptr 3419 3420 inc %rcx 3421 jnz .Lsqrx4x_sub 3422 3423 neg %r9 # restore $num 3424 3425 ret 3426.size __bn_postx4x_internal,.-__bn_postx4x_internal 3427___ 3428} 3429}}} 3430{ 3431my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order 3432 ("%rdi","%esi","%rdx","%ecx"); # Unix order 3433my $out=$inp; 3434my $STRIDE=2**5*8; 3435my $N=$STRIDE/4; 3436 3437$code.=<<___; 3438.globl bn_get_bits5 3439.type bn_get_bits5,\@abi-omnipotent 3440.align 16 3441bn_get_bits5: 3442 lea 0($inp),%r10 3443 lea 1($inp),%r11 3444 mov $num,%ecx 3445 shr \$4,$num 3446 and \$15,%ecx 3447 lea -8(%ecx),%eax 3448 cmp \$11,%ecx 3449 cmova %r11,%r10 3450 cmova %eax,%ecx 3451 movzw (%r10,$num,2),%eax 3452 shrl %cl,%eax 3453 and \$31,%eax 3454 ret 3455.size bn_get_bits5,.-bn_get_bits5 3456 3457.globl bn_scatter5 3458.type bn_scatter5,\@abi-omnipotent 3459.align 16 3460bn_scatter5: 3461 cmp \$0, $num 3462 jz .Lscatter_epilogue 3463 lea ($tbl,$idx,8),$tbl 3464.Lscatter: 3465 mov ($inp),%rax 3466 lea 8($inp),$inp 3467 mov %rax,($tbl) 3468 lea 32*8($tbl),$tbl 3469 sub \$1,$num 3470 jnz .Lscatter 3471.Lscatter_epilogue: 3472 ret 3473.size bn_scatter5,.-bn_scatter5 3474 3475.globl bn_gather5 3476.type bn_gather5,\@abi-omnipotent 3477.align 32 3478bn_gather5: 3479.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases 3480 # I can't trust assembler to use specific encoding:-( 3481 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 3482 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp 3483 lea .Linc(%rip),%rax 3484 and \$-16,%rsp # shouldn't be formally required 3485 3486 movd $idx,%xmm5 3487 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 3488 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 3489 lea 128($tbl),%r11 # size optimization 3490 lea 128(%rsp),%rax # size optimization 3491 3492 pshufd \$0,%xmm5,%xmm5 # broadcast $idx 3493 movdqa %xmm1,%xmm4 3494 movdqa %xmm1,%xmm2 3495___ 3496######################################################################## 3497# calculate mask by comparing 0..31 to $idx and save result to stack 3498# 3499for($i=0;$i<$STRIDE/16;$i+=4) { 3500$code.=<<___; 3501 paddd %xmm0,%xmm1 3502 pcmpeqd %xmm5,%xmm0 # compare to 1,0 3503___ 3504$code.=<<___ if ($i); 
3505 movdqa %xmm3,`16*($i-1)-128`(%rax) 3506___ 3507$code.=<<___; 3508 movdqa %xmm4,%xmm3 3509 3510 paddd %xmm1,%xmm2 3511 pcmpeqd %xmm5,%xmm1 # compare to 3,2 3512 movdqa %xmm0,`16*($i+0)-128`(%rax) 3513 movdqa %xmm4,%xmm0 3514 3515 paddd %xmm2,%xmm3 3516 pcmpeqd %xmm5,%xmm2 # compare to 5,4 3517 movdqa %xmm1,`16*($i+1)-128`(%rax) 3518 movdqa %xmm4,%xmm1 3519 3520 paddd %xmm3,%xmm0 3521 pcmpeqd %xmm5,%xmm3 # compare to 7,6 3522 movdqa %xmm2,`16*($i+2)-128`(%rax) 3523 movdqa %xmm4,%xmm2 3524___ 3525} 3526$code.=<<___; 3527 movdqa %xmm3,`16*($i-1)-128`(%rax) 3528 jmp .Lgather 3529 3530.align 32 3531.Lgather: 3532 pxor %xmm4,%xmm4 3533 pxor %xmm5,%xmm5 3534___ 3535for($i=0;$i<$STRIDE/16;$i+=4) { 3536$code.=<<___; 3537 movdqa `16*($i+0)-128`(%r11),%xmm0 3538 movdqa `16*($i+1)-128`(%r11),%xmm1 3539 movdqa `16*($i+2)-128`(%r11),%xmm2 3540 pand `16*($i+0)-128`(%rax),%xmm0 3541 movdqa `16*($i+3)-128`(%r11),%xmm3 3542 pand `16*($i+1)-128`(%rax),%xmm1 3543 por %xmm0,%xmm4 3544 pand `16*($i+2)-128`(%rax),%xmm2 3545 por %xmm1,%xmm5 3546 pand `16*($i+3)-128`(%rax),%xmm3 3547 por %xmm2,%xmm4 3548 por %xmm3,%xmm5 3549___ 3550} 3551$code.=<<___; 3552 por %xmm5,%xmm4 3553 lea $STRIDE(%r11),%r11 3554 pshufd \$0x4e,%xmm4,%xmm0 3555 por %xmm4,%xmm0 3556 movq %xmm0,($out) # m0=bp[0] 3557 lea 8($out),$out 3558 sub \$1,$num 3559 jnz .Lgather 3560 3561 lea (%r10),%rsp 3562 ret 3563.LSEH_end_bn_gather5: 3564.size bn_gather5,.-bn_gather5 3565___ 3566} 3567$code.=<<___; 3568.align 64 3569.Linc: 3570 .long 0,0, 1,1 3571 .long 2,2, 2,2 3572.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3573___ 3574 3575# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3576# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3577if ($win64) { 3578$rec="%rcx"; 3579$frame="%rdx"; 3580$context="%r8"; 3581$disp="%r9"; 3582 3583$code.=<<___; 3584.extern __imp_RtlVirtualUnwind 3585.type mul_handler,\@abi-omnipotent 3586.align 16 3587mul_handler: 3588 push %rsi 3589 push %rdi 3590 push %rbx 3591 push %rbp 3592 push %r12 3593 push %r13 3594 push %r14 3595 push %r15 3596 pushfq 3597 sub \$64,%rsp 3598 3599 mov 120($context),%rax # pull context->Rax 3600 mov 248($context),%rbx # pull context->Rip 3601 3602 mov 8($disp),%rsi # disp->ImageBase 3603 mov 56($disp),%r11 # disp->HandlerData 3604 3605 mov 0(%r11),%r10d # HandlerData[0] 3606 lea (%rsi,%r10),%r10 # end of prologue label 3607 cmp %r10,%rbx # context->Rip<end of prologue label 3608 jb .Lcommon_seh_tail 3609 3610 mov 152($context),%rax # pull context->Rsp 3611 3612 mov 4(%r11),%r10d # HandlerData[1] 3613 lea (%rsi,%r10),%r10 # epilogue label 3614 cmp %r10,%rbx # context->Rip>=epilogue label 3615 jae .Lcommon_seh_tail 3616 3617 lea .Lmul_epilogue(%rip),%r10 3618 cmp %r10,%rbx 3619 ja .Lbody_40 3620 3621 mov 192($context),%r10 # pull $num 3622 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 3623 3624 jmp .Lbody_proceed 3625 3626.Lbody_40: 3627 mov 40(%rax),%rax # pull saved stack pointer 3628.Lbody_proceed: 3629 mov -8(%rax),%rbx 3630 mov -16(%rax),%rbp 3631 mov -24(%rax),%r12 3632 mov -32(%rax),%r13 3633 mov -40(%rax),%r14 3634 mov -48(%rax),%r15 3635 mov %rbx,144($context) # restore context->Rbx 3636 mov %rbp,160($context) # restore context->Rbp 3637 mov %r12,216($context) # restore context->R12 3638 mov %r13,224($context) # restore context->R13 3639 mov %r14,232($context) # restore context->R14 3640 mov %r15,240($context) # restore context->R15 3641 3642.Lcommon_seh_tail: 3643 mov 8(%rax),%rdi 3644 mov 16(%rax),%rsi 
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_power5
	.rva	.LSEH_end_bn_power5
	.rva	.LSEH_info_bn_power5

	.rva	.LSEH_begin_bn_from_mont8x
	.rva	.LSEH_end_bn_from_mont8x
	.rva	.LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
	.rva	.LSEH_end_bn_mulx4x_mont_gather5
	.rva	.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;
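#
# Footnote on bn_get_bits5() defined earlier in this file: it returns the
# 5-bit exponent window starting at an arbitrary bit offset, using a
# 16-bit load whose address is nudged forward by one byte whenever the
# window would otherwise straddle the loaded word. A byte-oriented C
# equivalent (illustrative only, same result for in-range offsets):
#
#	static unsigned int get_bits5(const unsigned char *exp, int bitpos)
#	{
#		unsigned int w = exp[bitpos/8] | (exp[bitpos/8 + 1] << 8);
#		return (w >> (bitpos%8)) & 31;
#	}
#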