#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
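# Illustrative reference model of the layout just described (an assumption
# for documentation purposes only; this sub is never called by the script).
# It assumes the powers table is passed as a flat array reference of
# $num*32 word-sized values, so word $k of the power selected by $idx sits
# at $k*32+$idx.  A gather reads all 32 slots of a line and keeps one via a
# mask; the SSE2 code below builds that mask branchlessly with pcmpeqd,
# whereas the ternary here is only for clarity.
sub gather_word_ref {
	my ($table, $k, $idx) = @_;
	my $word = 0;
	for (my $slot = 0; $slot < 32; $slot++) {
		my $mask = ($slot == $idx) ? ~0 : 0;	# all-ones only for the wanted slot
		$word |= $table->[$k*32 + $slot] & $mask;	# every slot is read regardless
	}
	return $word;
}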
72$lo0="%r10"; 73$hi0="%r11"; 74$hi1="%r13"; 75$i="%r14"; 76$j="%r15"; 77$m0="%rbx"; 78$m1="%rbp"; 79 80$code=<<___; 81.text 82 83.extern OPENSSL_ia32cap_P 84 85.globl bn_mul_mont_gather5 86.type bn_mul_mont_gather5,\@function,6 87.align 64 88bn_mul_mont_gather5: 89 mov ${num}d,${num}d 90 mov %rsp,%rax 91 test \$7,${num}d 92 jnz .Lmul_enter 93___ 94$code.=<<___ if ($addx); 95 mov OPENSSL_ia32cap_P+8(%rip),%r11d 96___ 97$code.=<<___; 98 jmp .Lmul4x_enter 99 100.align 16 101.Lmul_enter: 102 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument 103 push %rbx 104 push %rbp 105 push %r12 106 push %r13 107 push %r14 108 push %r15 109 110 neg $num 111 mov %rsp,%r11 112 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) 113 neg $num # restore $num 114 and \$-1024,%r10 # minimize TLB usage 115 116 # Some OSes, *cough*-dows, insist on stack being "wired" to 117 # physical memory in strictly sequential manner, i.e. if stack 118 # allocation spans two pages, then reference to farmost one can 119 # be punishable by SEGV. But page walking can do good even on 120 # other OSes, because it guarantees that villain thread hits 121 # the guard page before it can make damage to innocent one... 122 sub %r10,%r11 123 and \$-4096,%r11 124 lea (%r10,%r11),%rsp 125 mov (%rsp),%r11 126 cmp %r10,%rsp 127 ja .Lmul_page_walk 128 jmp .Lmul_page_walk_done 129 130.Lmul_page_walk: 131 lea -4096(%rsp),%rsp 132 mov (%rsp),%r11 133 cmp %r10,%rsp 134 ja .Lmul_page_walk 135.Lmul_page_walk_done: 136 137 lea .Linc(%rip),%r10 138 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 139.Lmul_body: 140 141 lea 128($bp),%r12 # reassign $bp (+size optimization) 142___ 143 $bp="%r12"; 144 $STRIDE=2**5*8; # 5 is "window size" 145 $N=$STRIDE/4; # should match cache line size 146$code.=<<___; 147 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 148 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 149 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) 150 and \$-16,%r10 151 152 pshufd \$0,%xmm5,%xmm5 # broadcast index 153 movdqa %xmm1,%xmm4 154 movdqa %xmm1,%xmm2 155___ 156######################################################################## 157# calculate mask by comparing 0..31 to index and save result to stack 158# 159$code.=<<___; 160 paddd %xmm0,%xmm1 161 pcmpeqd %xmm5,%xmm0 # compare to 1,0 162 .byte 0x67 163 movdqa %xmm4,%xmm3 164___ 165for($k=0;$k<$STRIDE/16-4;$k+=4) { 166$code.=<<___; 167 paddd %xmm1,%xmm2 168 pcmpeqd %xmm5,%xmm1 # compare to 3,2 169 movdqa %xmm0,`16*($k+0)+112`(%r10) 170 movdqa %xmm4,%xmm0 171 172 paddd %xmm2,%xmm3 173 pcmpeqd %xmm5,%xmm2 # compare to 5,4 174 movdqa %xmm1,`16*($k+1)+112`(%r10) 175 movdqa %xmm4,%xmm1 176 177 paddd %xmm3,%xmm0 178 pcmpeqd %xmm5,%xmm3 # compare to 7,6 179 movdqa %xmm2,`16*($k+2)+112`(%r10) 180 movdqa %xmm4,%xmm2 181 182 paddd %xmm0,%xmm1 183 pcmpeqd %xmm5,%xmm0 184 movdqa %xmm3,`16*($k+3)+112`(%r10) 185 movdqa %xmm4,%xmm3 186___ 187} 188$code.=<<___; # last iteration can be optimized 189 paddd %xmm1,%xmm2 190 pcmpeqd %xmm5,%xmm1 191 movdqa %xmm0,`16*($k+0)+112`(%r10) 192 193 paddd %xmm2,%xmm3 194 .byte 0x67 195 pcmpeqd %xmm5,%xmm2 196 movdqa %xmm1,`16*($k+1)+112`(%r10) 197 198 pcmpeqd %xmm5,%xmm3 199 movdqa %xmm2,`16*($k+2)+112`(%r10) 200 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register 201 202 pand `16*($k+1)-128`($bp),%xmm1 203 pand `16*($k+2)-128`($bp),%xmm2 204 movdqa %xmm3,`16*($k+3)+112`(%r10) 205 pand `16*($k+3)-128`($bp),%xmm3 206 por %xmm2,%xmm0 207 por %xmm3,%xmm1 208___ 209for($k=0;$k<$STRIDE/16-4;$k+=4) { 
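    # The first four table lines were folded in above while their masks were
    # still in registers; this loop gathers the rest: each 16-byte line of
    # the powers table is ANDed with its mask saved on the stack and ORed
    # into %xmm0/%xmm1, so all 32 slots are touched independently of the
    # secret index.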
210$code.=<<___; 211 movdqa `16*($k+0)-128`($bp),%xmm4 212 movdqa `16*($k+1)-128`($bp),%xmm5 213 movdqa `16*($k+2)-128`($bp),%xmm2 214 pand `16*($k+0)+112`(%r10),%xmm4 215 movdqa `16*($k+3)-128`($bp),%xmm3 216 pand `16*($k+1)+112`(%r10),%xmm5 217 por %xmm4,%xmm0 218 pand `16*($k+2)+112`(%r10),%xmm2 219 por %xmm5,%xmm1 220 pand `16*($k+3)+112`(%r10),%xmm3 221 por %xmm2,%xmm0 222 por %xmm3,%xmm1 223___ 224} 225$code.=<<___; 226 por %xmm1,%xmm0 227 pshufd \$0x4e,%xmm0,%xmm1 228 por %xmm1,%xmm0 229 lea $STRIDE($bp),$bp 230 movq %xmm0,$m0 # m0=bp[0] 231 232 mov ($n0),$n0 # pull n0[0] value 233 mov ($ap),%rax 234 235 xor $i,$i # i=0 236 xor $j,$j # j=0 237 238 mov $n0,$m1 239 mulq $m0 # ap[0]*bp[0] 240 mov %rax,$lo0 241 mov ($np),%rax 242 243 imulq $lo0,$m1 # "tp[0]"*n0 244 mov %rdx,$hi0 245 246 mulq $m1 # np[0]*m1 247 add %rax,$lo0 # discarded 248 mov 8($ap),%rax 249 adc \$0,%rdx 250 mov %rdx,$hi1 251 252 lea 1($j),$j # j++ 253 jmp .L1st_enter 254 255.align 16 256.L1st: 257 add %rax,$hi1 258 mov ($ap,$j,8),%rax 259 adc \$0,%rdx 260 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 261 mov $lo0,$hi0 262 adc \$0,%rdx 263 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 264 mov %rdx,$hi1 265 266.L1st_enter: 267 mulq $m0 # ap[j]*bp[0] 268 add %rax,$hi0 269 mov ($np,$j,8),%rax 270 adc \$0,%rdx 271 lea 1($j),$j # j++ 272 mov %rdx,$lo0 273 274 mulq $m1 # np[j]*m1 275 cmp $num,$j 276 jne .L1st # note that upon exit $j==$num, so 277 # they can be used interchangeably 278 279 add %rax,$hi1 280 adc \$0,%rdx 281 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 282 adc \$0,%rdx 283 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 284 mov %rdx,$hi1 285 mov $lo0,$hi0 286 287 xor %rdx,%rdx 288 add $hi0,$hi1 289 adc \$0,%rdx 290 mov $hi1,-8(%rsp,$num,8) 291 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 292 293 lea 1($i),$i # i++ 294 jmp .Louter 295.align 16 296.Louter: 297 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 298 and \$-16,%rdx 299 pxor %xmm4,%xmm4 300 pxor %xmm5,%xmm5 301___ 302for($k=0;$k<$STRIDE/16;$k+=4) { 303$code.=<<___; 304 movdqa `16*($k+0)-128`($bp),%xmm0 305 movdqa `16*($k+1)-128`($bp),%xmm1 306 movdqa `16*($k+2)-128`($bp),%xmm2 307 movdqa `16*($k+3)-128`($bp),%xmm3 308 pand `16*($k+0)-128`(%rdx),%xmm0 309 pand `16*($k+1)-128`(%rdx),%xmm1 310 por %xmm0,%xmm4 311 pand `16*($k+2)-128`(%rdx),%xmm2 312 por %xmm1,%xmm5 313 pand `16*($k+3)-128`(%rdx),%xmm3 314 por %xmm2,%xmm4 315 por %xmm3,%xmm5 316___ 317} 318$code.=<<___; 319 por %xmm5,%xmm4 320 pshufd \$0x4e,%xmm4,%xmm0 321 por %xmm4,%xmm0 322 lea $STRIDE($bp),$bp 323 324 mov ($ap),%rax # ap[0] 325 movq %xmm0,$m0 # m0=bp[i] 326 327 xor $j,$j # j=0 328 mov $n0,$m1 329 mov (%rsp),$lo0 330 331 mulq $m0 # ap[0]*bp[i] 332 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 333 mov ($np),%rax 334 adc \$0,%rdx 335 336 imulq $lo0,$m1 # tp[0]*n0 337 mov %rdx,$hi0 338 339 mulq $m1 # np[0]*m1 340 add %rax,$lo0 # discarded 341 mov 8($ap),%rax 342 adc \$0,%rdx 343 mov 8(%rsp),$lo0 # tp[1] 344 mov %rdx,$hi1 345 346 lea 1($j),$j # j++ 347 jmp .Linner_enter 348 349.align 16 350.Linner: 351 add %rax,$hi1 352 mov ($ap,$j,8),%rax 353 adc \$0,%rdx 354 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 355 mov (%rsp,$j,8),$lo0 356 adc \$0,%rdx 357 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 358 mov %rdx,$hi1 359 360.Linner_enter: 361 mulq $m0 # ap[j]*bp[i] 362 add %rax,$hi0 363 mov ($np,$j,8),%rax 364 adc \$0,%rdx 365 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 366 mov %rdx,$hi0 367 adc \$0,$hi0 368 lea 1($j),$j # j++ 369 370 mulq $m1 # np[j]*m1 371 cmp $num,$j 372 jne .Linner # note that upon exit $j==$num, so 373 # they can 
					# be used interchangeably
	add	%rax,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$num,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	32
bn_mul4x_mont_gather5:
	.byte	0x67
	mov	%rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80108,%r11d
	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lmul4x_prologue:

	.byte	0x67
	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
	neg	$num			# -$num

	##############################################################
	# Ensure that stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow memory disambiguation
	# logic to do its magic. [Extra [num] is allocated in order
	# to align with bn_power5's frame, which is cleansed after
	# completing exponentiation. Extra 256 bytes is for power mask
	# calculated from 7th argument, the index.]
476 # 477 lea -320(%rsp,$num,2),%r11 478 mov %rsp,%rbp 479 sub $rp,%r11 480 and \$4095,%r11 481 cmp %r11,%r10 482 jb .Lmul4xsp_alt 483 sub %r11,%rbp # align with $rp 484 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 485 jmp .Lmul4xsp_done 486 487.align 32 488.Lmul4xsp_alt: 489 lea 4096-320(,$num,2),%r10 490 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 491 sub %r10,%r11 492 mov \$0,%r10 493 cmovc %r10,%r11 494 sub %r11,%rbp 495.Lmul4xsp_done: 496 and \$-64,%rbp 497 mov %rsp,%r11 498 sub %rbp,%r11 499 and \$-4096,%r11 500 lea (%rbp,%r11),%rsp 501 mov (%rsp),%r10 502 cmp %rbp,%rsp 503 ja .Lmul4x_page_walk 504 jmp .Lmul4x_page_walk_done 505 506.Lmul4x_page_walk: 507 lea -4096(%rsp),%rsp 508 mov (%rsp),%r10 509 cmp %rbp,%rsp 510 ja .Lmul4x_page_walk 511.Lmul4x_page_walk_done: 512 513 neg $num 514 515 mov %rax,40(%rsp) 516.Lmul4x_body: 517 518 call mul4x_internal 519 520 mov 40(%rsp),%rsi # restore %rsp 521 mov \$1,%rax 522 523 mov -48(%rsi),%r15 524 mov -40(%rsi),%r14 525 mov -32(%rsi),%r13 526 mov -24(%rsi),%r12 527 mov -16(%rsi),%rbp 528 mov -8(%rsi),%rbx 529 lea (%rsi),%rsp 530.Lmul4x_epilogue: 531 ret 532.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 533 534.type mul4x_internal,\@abi-omnipotent 535.align 32 536mul4x_internal: 537 shl \$5,$num # $num was in bytes 538 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 539 lea .Linc(%rip),%rax 540 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 541 shr \$5,$num # restore $num 542___ 543 $bp="%r12"; 544 $STRIDE=2**5*8; # 5 is "window size" 545 $N=$STRIDE/4; # should match cache line size 546 $tp=$i; 547$code.=<<___; 548 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 549 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 550 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 551 lea 128(%rdx),$bp # size optimization 552 553 pshufd \$0,%xmm5,%xmm5 # broadcast index 554 movdqa %xmm1,%xmm4 555 .byte 0x67,0x67 556 movdqa %xmm1,%xmm2 557___ 558######################################################################## 559# calculate mask by comparing 0..31 to index and save result to stack 560# 561$code.=<<___; 562 paddd %xmm0,%xmm1 563 pcmpeqd %xmm5,%xmm0 # compare to 1,0 564 .byte 0x67 565 movdqa %xmm4,%xmm3 566___ 567for($i=0;$i<$STRIDE/16-4;$i+=4) { 568$code.=<<___; 569 paddd %xmm1,%xmm2 570 pcmpeqd %xmm5,%xmm1 # compare to 3,2 571 movdqa %xmm0,`16*($i+0)+112`(%r10) 572 movdqa %xmm4,%xmm0 573 574 paddd %xmm2,%xmm3 575 pcmpeqd %xmm5,%xmm2 # compare to 5,4 576 movdqa %xmm1,`16*($i+1)+112`(%r10) 577 movdqa %xmm4,%xmm1 578 579 paddd %xmm3,%xmm0 580 pcmpeqd %xmm5,%xmm3 # compare to 7,6 581 movdqa %xmm2,`16*($i+2)+112`(%r10) 582 movdqa %xmm4,%xmm2 583 584 paddd %xmm0,%xmm1 585 pcmpeqd %xmm5,%xmm0 586 movdqa %xmm3,`16*($i+3)+112`(%r10) 587 movdqa %xmm4,%xmm3 588___ 589} 590$code.=<<___; # last iteration can be optimized 591 paddd %xmm1,%xmm2 592 pcmpeqd %xmm5,%xmm1 593 movdqa %xmm0,`16*($i+0)+112`(%r10) 594 595 paddd %xmm2,%xmm3 596 .byte 0x67 597 pcmpeqd %xmm5,%xmm2 598 movdqa %xmm1,`16*($i+1)+112`(%r10) 599 600 pcmpeqd %xmm5,%xmm3 601 movdqa %xmm2,`16*($i+2)+112`(%r10) 602 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 603 604 pand `16*($i+1)-128`($bp),%xmm1 605 pand `16*($i+2)-128`($bp),%xmm2 606 movdqa %xmm3,`16*($i+3)+112`(%r10) 607 pand `16*($i+3)-128`($bp),%xmm3 608 por %xmm2,%xmm0 609 por %xmm3,%xmm1 610___ 611for($i=0;$i<$STRIDE/16-4;$i+=4) { 612$code.=<<___; 613 movdqa `16*($i+0)-128`($bp),%xmm4 614 movdqa 
`16*($i+1)-128`($bp),%xmm5 615 movdqa `16*($i+2)-128`($bp),%xmm2 616 pand `16*($i+0)+112`(%r10),%xmm4 617 movdqa `16*($i+3)-128`($bp),%xmm3 618 pand `16*($i+1)+112`(%r10),%xmm5 619 por %xmm4,%xmm0 620 pand `16*($i+2)+112`(%r10),%xmm2 621 por %xmm5,%xmm1 622 pand `16*($i+3)+112`(%r10),%xmm3 623 por %xmm2,%xmm0 624 por %xmm3,%xmm1 625___ 626} 627$code.=<<___; 628 por %xmm1,%xmm0 629 pshufd \$0x4e,%xmm0,%xmm1 630 por %xmm1,%xmm0 631 lea $STRIDE($bp),$bp 632 movq %xmm0,$m0 # m0=bp[0] 633 634 mov %r13,16+8(%rsp) # save end of b[num] 635 mov $rp, 56+8(%rsp) # save $rp 636 637 mov ($n0),$n0 # pull n0[0] value 638 mov ($ap),%rax 639 lea ($ap,$num),$ap # end of a[num] 640 neg $num 641 642 mov $n0,$m1 643 mulq $m0 # ap[0]*bp[0] 644 mov %rax,$A[0] 645 mov ($np),%rax 646 647 imulq $A[0],$m1 # "tp[0]"*n0 648 lea 64+8(%rsp),$tp 649 mov %rdx,$A[1] 650 651 mulq $m1 # np[0]*m1 652 add %rax,$A[0] # discarded 653 mov 8($ap,$num),%rax 654 adc \$0,%rdx 655 mov %rdx,$N[1] 656 657 mulq $m0 658 add %rax,$A[1] 659 mov 8*1($np),%rax 660 adc \$0,%rdx 661 mov %rdx,$A[0] 662 663 mulq $m1 664 add %rax,$N[1] 665 mov 16($ap,$num),%rax 666 adc \$0,%rdx 667 add $A[1],$N[1] 668 lea 4*8($num),$j # j=4 669 lea 8*4($np),$np 670 adc \$0,%rdx 671 mov $N[1],($tp) 672 mov %rdx,$N[0] 673 jmp .L1st4x 674 675.align 32 676.L1st4x: 677 mulq $m0 # ap[j]*bp[0] 678 add %rax,$A[0] 679 mov -8*2($np),%rax 680 lea 32($tp),$tp 681 adc \$0,%rdx 682 mov %rdx,$A[1] 683 684 mulq $m1 # np[j]*m1 685 add %rax,$N[0] 686 mov -8($ap,$j),%rax 687 adc \$0,%rdx 688 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 689 adc \$0,%rdx 690 mov $N[0],-24($tp) # tp[j-1] 691 mov %rdx,$N[1] 692 693 mulq $m0 # ap[j]*bp[0] 694 add %rax,$A[1] 695 mov -8*1($np),%rax 696 adc \$0,%rdx 697 mov %rdx,$A[0] 698 699 mulq $m1 # np[j]*m1 700 add %rax,$N[1] 701 mov ($ap,$j),%rax 702 adc \$0,%rdx 703 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 704 adc \$0,%rdx 705 mov $N[1],-16($tp) # tp[j-1] 706 mov %rdx,$N[0] 707 708 mulq $m0 # ap[j]*bp[0] 709 add %rax,$A[0] 710 mov 8*0($np),%rax 711 adc \$0,%rdx 712 mov %rdx,$A[1] 713 714 mulq $m1 # np[j]*m1 715 add %rax,$N[0] 716 mov 8($ap,$j),%rax 717 adc \$0,%rdx 718 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 719 adc \$0,%rdx 720 mov $N[0],-8($tp) # tp[j-1] 721 mov %rdx,$N[1] 722 723 mulq $m0 # ap[j]*bp[0] 724 add %rax,$A[1] 725 mov 8*1($np),%rax 726 adc \$0,%rdx 727 mov %rdx,$A[0] 728 729 mulq $m1 # np[j]*m1 730 add %rax,$N[1] 731 mov 16($ap,$j),%rax 732 adc \$0,%rdx 733 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 734 lea 8*4($np),$np 735 adc \$0,%rdx 736 mov $N[1],($tp) # tp[j-1] 737 mov %rdx,$N[0] 738 739 add \$32,$j # j+=4 740 jnz .L1st4x 741 742 mulq $m0 # ap[j]*bp[0] 743 add %rax,$A[0] 744 mov -8*2($np),%rax 745 lea 32($tp),$tp 746 adc \$0,%rdx 747 mov %rdx,$A[1] 748 749 mulq $m1 # np[j]*m1 750 add %rax,$N[0] 751 mov -8($ap),%rax 752 adc \$0,%rdx 753 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 754 adc \$0,%rdx 755 mov $N[0],-24($tp) # tp[j-1] 756 mov %rdx,$N[1] 757 758 mulq $m0 # ap[j]*bp[0] 759 add %rax,$A[1] 760 mov -8*1($np),%rax 761 adc \$0,%rdx 762 mov %rdx,$A[0] 763 764 mulq $m1 # np[j]*m1 765 add %rax,$N[1] 766 mov ($ap,$num),%rax # ap[0] 767 adc \$0,%rdx 768 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 769 adc \$0,%rdx 770 mov $N[1],-16($tp) # tp[j-1] 771 mov %rdx,$N[0] 772 773 lea ($np,$num),$np # rewind $np 774 775 xor $N[1],$N[1] 776 add $A[0],$N[0] 777 adc \$0,$N[1] 778 mov $N[0],-8($tp) 779 780 jmp .Louter4x 781 782.align 32 783.Louter4x: 784 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 785 pxor %xmm4,%xmm4 786 pxor %xmm5,%xmm5 787___ 
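# A minimal reference model of what a bn_mul_mont-style pass in this file
# computes once b[i] has been gathered: r = a*b*2^(-64*num) mod n, using
# word-by-word Montgomery reduction with n0 = -n^(-1) mod 2^64.  This is an
# illustrative sketch only (assumed Math::BigInt operands, never called by
# this script); the hand-scheduled code keeps everything in 64-bit limbs.
sub mont_mul_ref {
	use Math::BigInt;
	my ($a,$b,$n,$n0,$num) = @_;			# all Math::BigInt objects
	my $w = Math::BigInt->new(1)->blsft(64);	# 2^64, one limb
	my $t = $a->copy()->bmul($b);
	for (1..$num) {
		my $m = $t->copy()->bmod($w)->bmul($n0)->bmod($w);	# m = t*n0 mod 2^64
		$t->badd($m->bmul($n))->brsft(64);			# t = (t+m*n)/2^64
	}
	$t->bsub($n) if ($t->bcmp($n) >= 0);		# at most one final subtraction
	return $t;
}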
788for($i=0;$i<$STRIDE/16;$i+=4) { 789$code.=<<___; 790 movdqa `16*($i+0)-128`($bp),%xmm0 791 movdqa `16*($i+1)-128`($bp),%xmm1 792 movdqa `16*($i+2)-128`($bp),%xmm2 793 movdqa `16*($i+3)-128`($bp),%xmm3 794 pand `16*($i+0)-128`(%rdx),%xmm0 795 pand `16*($i+1)-128`(%rdx),%xmm1 796 por %xmm0,%xmm4 797 pand `16*($i+2)-128`(%rdx),%xmm2 798 por %xmm1,%xmm5 799 pand `16*($i+3)-128`(%rdx),%xmm3 800 por %xmm2,%xmm4 801 por %xmm3,%xmm5 802___ 803} 804$code.=<<___; 805 por %xmm5,%xmm4 806 pshufd \$0x4e,%xmm4,%xmm0 807 por %xmm4,%xmm0 808 lea $STRIDE($bp),$bp 809 movq %xmm0,$m0 # m0=bp[i] 810 811 mov ($tp,$num),$A[0] 812 mov $n0,$m1 813 mulq $m0 # ap[0]*bp[i] 814 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 815 mov ($np),%rax 816 adc \$0,%rdx 817 818 imulq $A[0],$m1 # tp[0]*n0 819 mov %rdx,$A[1] 820 mov $N[1],($tp) # store upmost overflow bit 821 822 lea ($tp,$num),$tp # rewind $tp 823 824 mulq $m1 # np[0]*m1 825 add %rax,$A[0] # "$N[0]", discarded 826 mov 8($ap,$num),%rax 827 adc \$0,%rdx 828 mov %rdx,$N[1] 829 830 mulq $m0 # ap[j]*bp[i] 831 add %rax,$A[1] 832 mov 8*1($np),%rax 833 adc \$0,%rdx 834 add 8($tp),$A[1] # +tp[1] 835 adc \$0,%rdx 836 mov %rdx,$A[0] 837 838 mulq $m1 # np[j]*m1 839 add %rax,$N[1] 840 mov 16($ap,$num),%rax 841 adc \$0,%rdx 842 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 843 lea 4*8($num),$j # j=4 844 lea 8*4($np),$np 845 adc \$0,%rdx 846 mov %rdx,$N[0] 847 jmp .Linner4x 848 849.align 32 850.Linner4x: 851 mulq $m0 # ap[j]*bp[i] 852 add %rax,$A[0] 853 mov -8*2($np),%rax 854 adc \$0,%rdx 855 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 856 lea 32($tp),$tp 857 adc \$0,%rdx 858 mov %rdx,$A[1] 859 860 mulq $m1 # np[j]*m1 861 add %rax,$N[0] 862 mov -8($ap,$j),%rax 863 adc \$0,%rdx 864 add $A[0],$N[0] 865 adc \$0,%rdx 866 mov $N[1],-32($tp) # tp[j-1] 867 mov %rdx,$N[1] 868 869 mulq $m0 # ap[j]*bp[i] 870 add %rax,$A[1] 871 mov -8*1($np),%rax 872 adc \$0,%rdx 873 add -8($tp),$A[1] 874 adc \$0,%rdx 875 mov %rdx,$A[0] 876 877 mulq $m1 # np[j]*m1 878 add %rax,$N[1] 879 mov ($ap,$j),%rax 880 adc \$0,%rdx 881 add $A[1],$N[1] 882 adc \$0,%rdx 883 mov $N[0],-24($tp) # tp[j-1] 884 mov %rdx,$N[0] 885 886 mulq $m0 # ap[j]*bp[i] 887 add %rax,$A[0] 888 mov 8*0($np),%rax 889 adc \$0,%rdx 890 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 891 adc \$0,%rdx 892 mov %rdx,$A[1] 893 894 mulq $m1 # np[j]*m1 895 add %rax,$N[0] 896 mov 8($ap,$j),%rax 897 adc \$0,%rdx 898 add $A[0],$N[0] 899 adc \$0,%rdx 900 mov $N[1],-16($tp) # tp[j-1] 901 mov %rdx,$N[1] 902 903 mulq $m0 # ap[j]*bp[i] 904 add %rax,$A[1] 905 mov 8*1($np),%rax 906 adc \$0,%rdx 907 add 8($tp),$A[1] 908 adc \$0,%rdx 909 mov %rdx,$A[0] 910 911 mulq $m1 # np[j]*m1 912 add %rax,$N[1] 913 mov 16($ap,$j),%rax 914 adc \$0,%rdx 915 add $A[1],$N[1] 916 lea 8*4($np),$np 917 adc \$0,%rdx 918 mov $N[0],-8($tp) # tp[j-1] 919 mov %rdx,$N[0] 920 921 add \$32,$j # j+=4 922 jnz .Linner4x 923 924 mulq $m0 # ap[j]*bp[i] 925 add %rax,$A[0] 926 mov -8*2($np),%rax 927 adc \$0,%rdx 928 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 929 lea 32($tp),$tp 930 adc \$0,%rdx 931 mov %rdx,$A[1] 932 933 mulq $m1 # np[j]*m1 934 add %rax,$N[0] 935 mov -8($ap),%rax 936 adc \$0,%rdx 937 add $A[0],$N[0] 938 adc \$0,%rdx 939 mov $N[1],-32($tp) # tp[j-1] 940 mov %rdx,$N[1] 941 942 mulq $m0 # ap[j]*bp[i] 943 add %rax,$A[1] 944 mov $m1,%rax 945 mov -8*1($np),$m1 946 adc \$0,%rdx 947 add -8($tp),$A[1] 948 adc \$0,%rdx 949 mov %rdx,$A[0] 950 951 mulq $m1 # np[j]*m1 952 add %rax,$N[1] 953 mov ($ap,$num),%rax # ap[0] 954 adc \$0,%rdx 955 add $A[1],$N[1] 956 adc \$0,%rdx 957 mov $N[0],-24($tp) # tp[j-1] 958 mov %rdx,$N[0] 
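	# Tail of the current outer-loop pass: store the last two limbs,
	# rewind $np, and fold the previously saved top carry back in
	# before checking whether another b[i] remains to be processed.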
959 960 mov $N[1],-16($tp) # tp[j-1] 961 lea ($np,$num),$np # rewind $np 962 963 xor $N[1],$N[1] 964 add $A[0],$N[0] 965 adc \$0,$N[1] 966 add ($tp),$N[0] # pull upmost overflow bit 967 adc \$0,$N[1] # upmost overflow bit 968 mov $N[0],-8($tp) 969 970 cmp 16+8(%rsp),$bp 971 jb .Louter4x 972___ 973if (1) { 974$code.=<<___; 975 xor %rax,%rax 976 sub $N[0],$m1 # compare top-most words 977 adc $j,$j # $j is zero 978 or $j,$N[1] 979 sub $N[1],%rax # %rax=-$N[1] 980 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 981 mov ($np),%r12 982 lea ($np),%rbp # nptr in .sqr4x_sub 983 mov %r9,%rcx 984 sar \$3+2,%rcx 985 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 986 dec %r12 # so that after 'not' we get -n[0] 987 xor %r10,%r10 988 mov 8*1(%rbp),%r13 989 mov 8*2(%rbp),%r14 990 mov 8*3(%rbp),%r15 991 jmp .Lsqr4x_sub_entry 992___ 993} else { 994my @ri=("%rax",$bp,$m0,$m1); 995my $rp="%rdx"; 996$code.=<<___ 997 xor \$1,$N[1] 998 lea ($tp,$num),$tp # rewind $tp 999 sar \$5,$num # cf=0 1000 lea ($np,$N[1],8),$np 1001 mov 56+8(%rsp),$rp # restore $rp 1002 jmp .Lsub4x 1003 1004.align 32 1005.Lsub4x: 1006 .byte 0x66 1007 mov 8*0($tp),@ri[0] 1008 mov 8*1($tp),@ri[1] 1009 .byte 0x66 1010 sbb 16*0($np),@ri[0] 1011 mov 8*2($tp),@ri[2] 1012 sbb 16*1($np),@ri[1] 1013 mov 3*8($tp),@ri[3] 1014 lea 4*8($tp),$tp 1015 sbb 16*2($np),@ri[2] 1016 mov @ri[0],8*0($rp) 1017 sbb 16*3($np),@ri[3] 1018 lea 16*4($np),$np 1019 mov @ri[1],8*1($rp) 1020 mov @ri[2],8*2($rp) 1021 mov @ri[3],8*3($rp) 1022 lea 8*4($rp),$rp 1023 1024 inc $num 1025 jnz .Lsub4x 1026 1027 ret 1028___ 1029} 1030$code.=<<___; 1031.size mul4x_internal,.-mul4x_internal 1032___ 1033}}} 1034{{{ 1035###################################################################### 1036# void bn_power5( 1037my $rptr="%rdi"; # BN_ULONG *rptr, 1038my $aptr="%rsi"; # const BN_ULONG *aptr, 1039my $bptr="%rdx"; # const void *table, 1040my $nptr="%rcx"; # const BN_ULONG *nptr, 1041my $n0 ="%r8"; # const BN_ULONG *n0); 1042my $num ="%r9"; # int num, has to be divisible by 8 1043 # int pwr 1044 1045my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1046my @A0=("%r10","%r11"); 1047my @A1=("%r12","%r13"); 1048my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1049 1050$code.=<<___; 1051.globl bn_power5 1052.type bn_power5,\@function,6 1053.align 32 1054bn_power5: 1055 mov %rsp,%rax 1056___ 1057$code.=<<___ if ($addx); 1058 mov OPENSSL_ia32cap_P+8(%rip),%r11d 1059 and \$0x80108,%r11d 1060 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1061 je .Lpowerx5_enter 1062___ 1063$code.=<<___; 1064 push %rbx 1065 push %rbp 1066 push %r12 1067 push %r13 1068 push %r14 1069 push %r15 1070.Lpower5_prologue: 1071 1072 shl \$3,${num}d # convert $num to bytes 1073 lea ($num,$num,2),%r10d # 3*$num 1074 neg $num 1075 mov ($n0),$n0 # *n0 1076 1077 ############################################################## 1078 # Ensure that stack frame doesn't alias with $rptr+3*$num 1079 # modulo 4096, which covers ret[num], am[num] and n[num] 1080 # (see bn_exp.c). This is done to allow memory disambiguation 1081 # logic do its magic. [Extra 256 bytes is for power mask 1082 # calculated from 7th argument, the index.] 
1083 # 1084 lea -320(%rsp,$num,2),%r11 1085 mov %rsp,%rbp 1086 sub $rptr,%r11 1087 and \$4095,%r11 1088 cmp %r11,%r10 1089 jb .Lpwr_sp_alt 1090 sub %r11,%rbp # align with $aptr 1091 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1092 jmp .Lpwr_sp_done 1093 1094.align 32 1095.Lpwr_sp_alt: 1096 lea 4096-320(,$num,2),%r10 1097 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1098 sub %r10,%r11 1099 mov \$0,%r10 1100 cmovc %r10,%r11 1101 sub %r11,%rbp 1102.Lpwr_sp_done: 1103 and \$-64,%rbp 1104 mov %rsp,%r11 1105 sub %rbp,%r11 1106 and \$-4096,%r11 1107 lea (%rbp,%r11),%rsp 1108 mov (%rsp),%r10 1109 cmp %rbp,%rsp 1110 ja .Lpwr_page_walk 1111 jmp .Lpwr_page_walk_done 1112 1113.Lpwr_page_walk: 1114 lea -4096(%rsp),%rsp 1115 mov (%rsp),%r10 1116 cmp %rbp,%rsp 1117 ja .Lpwr_page_walk 1118.Lpwr_page_walk_done: 1119 1120 mov $num,%r10 1121 neg $num 1122 1123 ############################################################## 1124 # Stack layout 1125 # 1126 # +0 saved $num, used in reduction section 1127 # +8 &t[2*$num], used in reduction section 1128 # +32 saved *n0 1129 # +40 saved %rsp 1130 # +48 t[2*$num] 1131 # 1132 mov $n0, 32(%rsp) 1133 mov %rax, 40(%rsp) # save original %rsp 1134.Lpower5_body: 1135 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1136 movq $nptr,%xmm2 # save $nptr 1137 movq %r10, %xmm3 # -$num, used in sqr8x 1138 movq $bptr,%xmm4 1139 1140 call __bn_sqr8x_internal 1141 call __bn_post4x_internal 1142 call __bn_sqr8x_internal 1143 call __bn_post4x_internal 1144 call __bn_sqr8x_internal 1145 call __bn_post4x_internal 1146 call __bn_sqr8x_internal 1147 call __bn_post4x_internal 1148 call __bn_sqr8x_internal 1149 call __bn_post4x_internal 1150 1151 movq %xmm2,$nptr 1152 movq %xmm4,$bptr 1153 mov $aptr,$rptr 1154 mov 40(%rsp),%rax 1155 lea 32(%rsp),$n0 1156 1157 call mul4x_internal 1158 1159 mov 40(%rsp),%rsi # restore %rsp 1160 mov \$1,%rax 1161 mov -48(%rsi),%r15 1162 mov -40(%rsi),%r14 1163 mov -32(%rsi),%r13 1164 mov -24(%rsi),%r12 1165 mov -16(%rsi),%rbp 1166 mov -8(%rsi),%rbx 1167 lea (%rsi),%rsp 1168.Lpower5_epilogue: 1169 ret 1170.size bn_power5,.-bn_power5 1171 1172.globl bn_sqr8x_internal 1173.hidden bn_sqr8x_internal 1174.type bn_sqr8x_internal,\@abi-omnipotent 1175.align 32 1176bn_sqr8x_internal: 1177__bn_sqr8x_internal: 1178 ############################################################## 1179 # Squaring part: 1180 # 1181 # a) multiply-n-add everything but a[i]*a[i]; 1182 # b) shift result of a) by 1 to the left and accumulate 1183 # a[i]*a[i] products; 1184 # 1185 ############################################################## 1186 # a[1]a[0] 1187 # a[2]a[0] 1188 # a[3]a[0] 1189 # a[2]a[1] 1190 # a[4]a[0] 1191 # a[3]a[1] 1192 # a[5]a[0] 1193 # a[4]a[1] 1194 # a[3]a[2] 1195 # a[6]a[0] 1196 # a[5]a[1] 1197 # a[4]a[2] 1198 # a[7]a[0] 1199 # a[6]a[1] 1200 # a[5]a[2] 1201 # a[4]a[3] 1202 # a[7]a[1] 1203 # a[6]a[2] 1204 # a[5]a[3] 1205 # a[7]a[2] 1206 # a[6]a[3] 1207 # a[5]a[4] 1208 # a[7]a[3] 1209 # a[6]a[4] 1210 # a[7]a[4] 1211 # a[6]a[5] 1212 # a[7]a[5] 1213 # a[7]a[6] 1214 # a[1]a[0] 1215 # a[2]a[0] 1216 # a[3]a[0] 1217 # a[4]a[0] 1218 # a[5]a[0] 1219 # a[6]a[0] 1220 # a[7]a[0] 1221 # a[2]a[1] 1222 # a[3]a[1] 1223 # a[4]a[1] 1224 # a[5]a[1] 1225 # a[6]a[1] 1226 # a[7]a[1] 1227 # a[3]a[2] 1228 # a[4]a[2] 1229 # a[5]a[2] 1230 # a[6]a[2] 1231 # a[7]a[2] 1232 # a[4]a[3] 1233 # a[5]a[3] 1234 # a[6]a[3] 1235 # a[7]a[3] 1236 # a[5]a[4] 1237 # a[6]a[4] 1238 # a[7]a[4] 1239 # a[6]a[5] 1240 # a[7]a[5] 1241 # a[7]a[6] 1242 # a[0]a[0] 1243 # a[1]a[1] 1244 # a[2]a[2] 
1245 # a[3]a[3] 1246 # a[4]a[4] 1247 # a[5]a[5] 1248 # a[6]a[6] 1249 # a[7]a[7] 1250 1251 lea 32(%r10),$i # $i=-($num-32) 1252 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1253 1254 mov $num,$j # $j=$num 1255 1256 # comments apply to $num==8 case 1257 mov -32($aptr,$i),$a0 # a[0] 1258 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1259 mov -24($aptr,$i),%rax # a[1] 1260 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1261 mov -16($aptr,$i),$ai # a[2] 1262 mov %rax,$a1 1263 1264 mul $a0 # a[1]*a[0] 1265 mov %rax,$A0[0] # a[1]*a[0] 1266 mov $ai,%rax # a[2] 1267 mov %rdx,$A0[1] 1268 mov $A0[0],-24($tptr,$i) # t[1] 1269 1270 mul $a0 # a[2]*a[0] 1271 add %rax,$A0[1] 1272 mov $ai,%rax 1273 adc \$0,%rdx 1274 mov $A0[1],-16($tptr,$i) # t[2] 1275 mov %rdx,$A0[0] 1276 1277 1278 mov -8($aptr,$i),$ai # a[3] 1279 mul $a1 # a[2]*a[1] 1280 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1281 mov $ai,%rax 1282 mov %rdx,$A1[1] 1283 1284 lea ($i),$j 1285 mul $a0 # a[3]*a[0] 1286 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1287 mov $ai,%rax 1288 mov %rdx,$A0[1] 1289 adc \$0,$A0[1] 1290 add $A1[0],$A0[0] 1291 adc \$0,$A0[1] 1292 mov $A0[0],-8($tptr,$j) # t[3] 1293 jmp .Lsqr4x_1st 1294 1295.align 32 1296.Lsqr4x_1st: 1297 mov ($aptr,$j),$ai # a[4] 1298 mul $a1 # a[3]*a[1] 1299 add %rax,$A1[1] # a[3]*a[1]+t[4] 1300 mov $ai,%rax 1301 mov %rdx,$A1[0] 1302 adc \$0,$A1[0] 1303 1304 mul $a0 # a[4]*a[0] 1305 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1306 mov $ai,%rax # a[3] 1307 mov 8($aptr,$j),$ai # a[5] 1308 mov %rdx,$A0[0] 1309 adc \$0,$A0[0] 1310 add $A1[1],$A0[1] 1311 adc \$0,$A0[0] 1312 1313 1314 mul $a1 # a[4]*a[3] 1315 add %rax,$A1[0] # a[4]*a[3]+t[5] 1316 mov $ai,%rax 1317 mov $A0[1],($tptr,$j) # t[4] 1318 mov %rdx,$A1[1] 1319 adc \$0,$A1[1] 1320 1321 mul $a0 # a[5]*a[2] 1322 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1323 mov $ai,%rax 1324 mov 16($aptr,$j),$ai # a[6] 1325 mov %rdx,$A0[1] 1326 adc \$0,$A0[1] 1327 add $A1[0],$A0[0] 1328 adc \$0,$A0[1] 1329 1330 mul $a1 # a[5]*a[3] 1331 add %rax,$A1[1] # a[5]*a[3]+t[6] 1332 mov $ai,%rax 1333 mov $A0[0],8($tptr,$j) # t[5] 1334 mov %rdx,$A1[0] 1335 adc \$0,$A1[0] 1336 1337 mul $a0 # a[6]*a[2] 1338 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1339 mov $ai,%rax # a[3] 1340 mov 24($aptr,$j),$ai # a[7] 1341 mov %rdx,$A0[0] 1342 adc \$0,$A0[0] 1343 add $A1[1],$A0[1] 1344 adc \$0,$A0[0] 1345 1346 1347 mul $a1 # a[6]*a[5] 1348 add %rax,$A1[0] # a[6]*a[5]+t[7] 1349 mov $ai,%rax 1350 mov $A0[1],16($tptr,$j) # t[6] 1351 mov %rdx,$A1[1] 1352 adc \$0,$A1[1] 1353 lea 32($j),$j 1354 1355 mul $a0 # a[7]*a[4] 1356 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1357 mov $ai,%rax 1358 mov %rdx,$A0[1] 1359 adc \$0,$A0[1] 1360 add $A1[0],$A0[0] 1361 adc \$0,$A0[1] 1362 mov $A0[0],-8($tptr,$j) # t[7] 1363 1364 cmp \$0,$j 1365 jne .Lsqr4x_1st 1366 1367 mul $a1 # a[7]*a[5] 1368 add %rax,$A1[1] 1369 lea 16($i),$i 1370 adc \$0,%rdx 1371 add $A0[1],$A1[1] 1372 adc \$0,%rdx 1373 1374 mov $A1[1],($tptr) # t[8] 1375 mov %rdx,$A1[0] 1376 mov %rdx,8($tptr) # t[9] 1377 jmp .Lsqr4x_outer 1378 1379.align 32 1380.Lsqr4x_outer: # comments apply to $num==6 case 1381 mov -32($aptr,$i),$a0 # a[0] 1382 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1383 mov -24($aptr,$i),%rax # a[1] 1384 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1385 mov -16($aptr,$i),$ai # a[2] 1386 mov %rax,$a1 1387 1388 mul $a0 # a[1]*a[0] 1389 mov -24($tptr,$i),$A0[0] # t[1] 1390 add %rax,$A0[0] # a[1]*a[0]+t[1] 1391 mov $ai,%rax # a[2] 1392 adc \$0,%rdx 1393 mov 
$A0[0],-24($tptr,$i) # t[1] 1394 mov %rdx,$A0[1] 1395 1396 mul $a0 # a[2]*a[0] 1397 add %rax,$A0[1] 1398 mov $ai,%rax 1399 adc \$0,%rdx 1400 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1401 mov %rdx,$A0[0] 1402 adc \$0,$A0[0] 1403 mov $A0[1],-16($tptr,$i) # t[2] 1404 1405 xor $A1[0],$A1[0] 1406 1407 mov -8($aptr,$i),$ai # a[3] 1408 mul $a1 # a[2]*a[1] 1409 add %rax,$A1[0] # a[2]*a[1]+t[3] 1410 mov $ai,%rax 1411 adc \$0,%rdx 1412 add -8($tptr,$i),$A1[0] 1413 mov %rdx,$A1[1] 1414 adc \$0,$A1[1] 1415 1416 mul $a0 # a[3]*a[0] 1417 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1418 mov $ai,%rax 1419 adc \$0,%rdx 1420 add $A1[0],$A0[0] 1421 mov %rdx,$A0[1] 1422 adc \$0,$A0[1] 1423 mov $A0[0],-8($tptr,$i) # t[3] 1424 1425 lea ($i),$j 1426 jmp .Lsqr4x_inner 1427 1428.align 32 1429.Lsqr4x_inner: 1430 mov ($aptr,$j),$ai # a[4] 1431 mul $a1 # a[3]*a[1] 1432 add %rax,$A1[1] # a[3]*a[1]+t[4] 1433 mov $ai,%rax 1434 mov %rdx,$A1[0] 1435 adc \$0,$A1[0] 1436 add ($tptr,$j),$A1[1] 1437 adc \$0,$A1[0] 1438 1439 .byte 0x67 1440 mul $a0 # a[4]*a[0] 1441 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1442 mov $ai,%rax # a[3] 1443 mov 8($aptr,$j),$ai # a[5] 1444 mov %rdx,$A0[0] 1445 adc \$0,$A0[0] 1446 add $A1[1],$A0[1] 1447 adc \$0,$A0[0] 1448 1449 mul $a1 # a[4]*a[3] 1450 add %rax,$A1[0] # a[4]*a[3]+t[5] 1451 mov $A0[1],($tptr,$j) # t[4] 1452 mov $ai,%rax 1453 mov %rdx,$A1[1] 1454 adc \$0,$A1[1] 1455 add 8($tptr,$j),$A1[0] 1456 lea 16($j),$j # j++ 1457 adc \$0,$A1[1] 1458 1459 mul $a0 # a[5]*a[2] 1460 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1461 mov $ai,%rax 1462 adc \$0,%rdx 1463 add $A1[0],$A0[0] 1464 mov %rdx,$A0[1] 1465 adc \$0,$A0[1] 1466 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1467 1468 cmp \$0,$j 1469 jne .Lsqr4x_inner 1470 1471 .byte 0x67 1472 mul $a1 # a[5]*a[3] 1473 add %rax,$A1[1] 1474 adc \$0,%rdx 1475 add $A0[1],$A1[1] 1476 adc \$0,%rdx 1477 1478 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1479 mov %rdx,$A1[0] 1480 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1481 1482 add \$16,$i 1483 jnz .Lsqr4x_outer 1484 1485 # comments apply to $num==4 case 1486 mov -32($aptr),$a0 # a[0] 1487 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1488 mov -24($aptr),%rax # a[1] 1489 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1490 mov -16($aptr),$ai # a[2] 1491 mov %rax,$a1 1492 1493 mul $a0 # a[1]*a[0] 1494 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1495 mov $ai,%rax # a[2] 1496 mov %rdx,$A0[1] 1497 adc \$0,$A0[1] 1498 1499 mul $a0 # a[2]*a[0] 1500 add %rax,$A0[1] 1501 mov $ai,%rax 1502 mov $A0[0],-24($tptr) # t[1] 1503 mov %rdx,$A0[0] 1504 adc \$0,$A0[0] 1505 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1506 mov -8($aptr),$ai # a[3] 1507 adc \$0,$A0[0] 1508 1509 mul $a1 # a[2]*a[1] 1510 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1511 mov $ai,%rax 1512 mov $A0[1],-16($tptr) # t[2] 1513 mov %rdx,$A1[1] 1514 adc \$0,$A1[1] 1515 1516 mul $a0 # a[3]*a[0] 1517 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1518 mov $ai,%rax 1519 mov %rdx,$A0[1] 1520 adc \$0,$A0[1] 1521 add $A1[0],$A0[0] 1522 adc \$0,$A0[1] 1523 mov $A0[0],-8($tptr) # t[3] 1524 1525 mul $a1 # a[3]*a[1] 1526 add %rax,$A1[1] 1527 mov -16($aptr),%rax # a[2] 1528 adc \$0,%rdx 1529 add $A0[1],$A1[1] 1530 adc \$0,%rdx 1531 1532 mov $A1[1],($tptr) # t[4] 1533 mov %rdx,$A1[0] 1534 mov %rdx,8($tptr) # t[5] 1535 1536 mul $ai # a[2]*a[3] 1537___ 1538{ 1539my ($shift,$carry)=($a0,$a1); 1540my @S=(@A1,$ai,$n0); 1541$code.=<<___; 1542 add \$16,$i 1543 xor $shift,$shift 1544 sub $num,$i # $i=16-$num 1545 
xor $carry,$carry 1546 1547 add $A1[0],%rax # t[5] 1548 adc \$0,%rdx 1549 mov %rax,8($tptr) # t[5] 1550 mov %rdx,16($tptr) # t[6] 1551 mov $carry,24($tptr) # t[7] 1552 1553 mov -16($aptr,$i),%rax # a[0] 1554 lea 48+8(%rsp),$tptr 1555 xor $A0[0],$A0[0] # t[0] 1556 mov 8($tptr),$A0[1] # t[1] 1557 1558 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1559 shr \$63,$A0[0] 1560 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1561 shr \$63,$A0[1] 1562 or $A0[0],$S[1] # | t[2*i]>>63 1563 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1564 mov $A0[1],$shift # shift=t[2*i+1]>>63 1565 mul %rax # a[i]*a[i] 1566 neg $carry # mov $carry,cf 1567 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1568 adc %rax,$S[0] 1569 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1570 mov $S[0],($tptr) 1571 adc %rdx,$S[1] 1572 1573 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1574 mov $S[1],8($tptr) 1575 sbb $carry,$carry # mov cf,$carry 1576 shr \$63,$A0[0] 1577 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1578 shr \$63,$A0[1] 1579 or $A0[0],$S[3] # | t[2*i]>>63 1580 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1581 mov $A0[1],$shift # shift=t[2*i+1]>>63 1582 mul %rax # a[i]*a[i] 1583 neg $carry # mov $carry,cf 1584 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1585 adc %rax,$S[2] 1586 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1587 mov $S[2],16($tptr) 1588 adc %rdx,$S[3] 1589 lea 16($i),$i 1590 mov $S[3],24($tptr) 1591 sbb $carry,$carry # mov cf,$carry 1592 lea 64($tptr),$tptr 1593 jmp .Lsqr4x_shift_n_add 1594 1595.align 32 1596.Lsqr4x_shift_n_add: 1597 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1598 shr \$63,$A0[0] 1599 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1600 shr \$63,$A0[1] 1601 or $A0[0],$S[1] # | t[2*i]>>63 1602 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1603 mov $A0[1],$shift # shift=t[2*i+1]>>63 1604 mul %rax # a[i]*a[i] 1605 neg $carry # mov $carry,cf 1606 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1607 adc %rax,$S[0] 1608 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1609 mov $S[0],-32($tptr) 1610 adc %rdx,$S[1] 1611 1612 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1613 mov $S[1],-24($tptr) 1614 sbb $carry,$carry # mov cf,$carry 1615 shr \$63,$A0[0] 1616 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1617 shr \$63,$A0[1] 1618 or $A0[0],$S[3] # | t[2*i]>>63 1619 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1620 mov $A0[1],$shift # shift=t[2*i+1]>>63 1621 mul %rax # a[i]*a[i] 1622 neg $carry # mov $carry,cf 1623 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1624 adc %rax,$S[2] 1625 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1626 mov $S[2],-16($tptr) 1627 adc %rdx,$S[3] 1628 1629 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1630 mov $S[3],-8($tptr) 1631 sbb $carry,$carry # mov cf,$carry 1632 shr \$63,$A0[0] 1633 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1634 shr \$63,$A0[1] 1635 or $A0[0],$S[1] # | t[2*i]>>63 1636 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1637 mov $A0[1],$shift # shift=t[2*i+1]>>63 1638 mul %rax # a[i]*a[i] 1639 neg $carry # mov $carry,cf 1640 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1641 adc %rax,$S[0] 1642 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1643 mov $S[0],0($tptr) 1644 adc %rdx,$S[1] 1645 1646 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1647 mov $S[1],8($tptr) 1648 sbb $carry,$carry # mov cf,$carry 1649 shr \$63,$A0[0] 1650 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1651 shr \$63,$A0[1] 1652 or $A0[0],$S[3] # | t[2*i]>>63 1653 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1654 mov $A0[1],$shift # shift=t[2*i+1]>>63 1655 mul %rax # a[i]*a[i] 1656 neg $carry # mov $carry,cf 1657 mov 40($tptr),$A0[1] # t[2*i+2+1] # 
prefetch 1658 adc %rax,$S[2] 1659 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1660 mov $S[2],16($tptr) 1661 adc %rdx,$S[3] 1662 mov $S[3],24($tptr) 1663 sbb $carry,$carry # mov cf,$carry 1664 lea 64($tptr),$tptr 1665 add \$32,$i 1666 jnz .Lsqr4x_shift_n_add 1667 1668 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1669 .byte 0x67 1670 shr \$63,$A0[0] 1671 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1672 shr \$63,$A0[1] 1673 or $A0[0],$S[1] # | t[2*i]>>63 1674 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1675 mov $A0[1],$shift # shift=t[2*i+1]>>63 1676 mul %rax # a[i]*a[i] 1677 neg $carry # mov $carry,cf 1678 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1679 adc %rax,$S[0] 1680 mov -8($aptr),%rax # a[i+1] # prefetch 1681 mov $S[0],-32($tptr) 1682 adc %rdx,$S[1] 1683 1684 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1685 mov $S[1],-24($tptr) 1686 sbb $carry,$carry # mov cf,$carry 1687 shr \$63,$A0[0] 1688 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1689 shr \$63,$A0[1] 1690 or $A0[0],$S[3] # | t[2*i]>>63 1691 mul %rax # a[i]*a[i] 1692 neg $carry # mov $carry,cf 1693 adc %rax,$S[2] 1694 adc %rdx,$S[3] 1695 mov $S[2],-16($tptr) 1696 mov $S[3],-8($tptr) 1697___ 1698} 1699###################################################################### 1700# Montgomery reduction part, "word-by-word" algorithm. 1701# 1702# This new path is inspired by multiple submissions from Intel, by 1703# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1704# Vinodh Gopal... 1705{ 1706my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1707 1708$code.=<<___; 1709 movq %xmm2,$nptr 1710__bn_sqr8x_reduction: 1711 xor %rax,%rax 1712 lea ($nptr,$num),%rcx # end of n[] 1713 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1714 mov %rcx,0+8(%rsp) 1715 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1716 mov %rdx,8+8(%rsp) 1717 neg $num 1718 jmp .L8x_reduction_loop 1719 1720.align 32 1721.L8x_reduction_loop: 1722 lea ($tptr,$num),$tptr # start of current t[] window 1723 .byte 0x66 1724 mov 8*0($tptr),$m0 1725 mov 8*1($tptr),%r9 1726 mov 8*2($tptr),%r10 1727 mov 8*3($tptr),%r11 1728 mov 8*4($tptr),%r12 1729 mov 8*5($tptr),%r13 1730 mov 8*6($tptr),%r14 1731 mov 8*7($tptr),%r15 1732 mov %rax,(%rdx) # store top-most carry bit 1733 lea 8*8($tptr),$tptr 1734 1735 .byte 0x67 1736 mov $m0,%r8 1737 imulq 32+8(%rsp),$m0 # n0*a[0] 1738 mov 8*0($nptr),%rax # n[0] 1739 mov \$8,%ecx 1740 jmp .L8x_reduce 1741 1742.align 32 1743.L8x_reduce: 1744 mulq $m0 1745 mov 8*1($nptr),%rax # n[1] 1746 neg %r8 1747 mov %rdx,%r8 1748 adc \$0,%r8 1749 1750 mulq $m0 1751 add %rax,%r9 1752 mov 8*2($nptr),%rax 1753 adc \$0,%rdx 1754 add %r9,%r8 1755 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1756 mov %rdx,%r9 1757 adc \$0,%r9 1758 1759 mulq $m0 1760 add %rax,%r10 1761 mov 8*3($nptr),%rax 1762 adc \$0,%rdx 1763 add %r10,%r9 1764 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1765 mov %rdx,%r10 1766 adc \$0,%r10 1767 1768 mulq $m0 1769 add %rax,%r11 1770 mov 8*4($nptr),%rax 1771 adc \$0,%rdx 1772 imulq %r8,$carry # modulo-scheduled 1773 add %r11,%r10 1774 mov %rdx,%r11 1775 adc \$0,%r11 1776 1777 mulq $m0 1778 add %rax,%r12 1779 mov 8*5($nptr),%rax 1780 adc \$0,%rdx 1781 add %r12,%r11 1782 mov %rdx,%r12 1783 adc \$0,%r12 1784 1785 mulq $m0 1786 add %rax,%r13 1787 mov 8*6($nptr),%rax 1788 adc \$0,%rdx 1789 add %r13,%r12 1790 mov %rdx,%r13 1791 adc \$0,%r13 1792 1793 mulq $m0 1794 add %rax,%r14 1795 mov 8*7($nptr),%rax 1796 adc \$0,%rdx 1797 add %r14,%r13 1798 mov %rdx,%r14 1799 adc \$0,%r14 1800 1801 mulq $m0 1802 mov $carry,$m0 # n0*a[i] 1803 add 
%rax,%r15 1804 mov 8*0($nptr),%rax # n[0] 1805 adc \$0,%rdx 1806 add %r15,%r14 1807 mov %rdx,%r15 1808 adc \$0,%r15 1809 1810 dec %ecx 1811 jnz .L8x_reduce 1812 1813 lea 8*8($nptr),$nptr 1814 xor %rax,%rax 1815 mov 8+8(%rsp),%rdx # pull end of t[] 1816 cmp 0+8(%rsp),$nptr # end of n[]? 1817 jae .L8x_no_tail 1818 1819 .byte 0x66 1820 add 8*0($tptr),%r8 1821 adc 8*1($tptr),%r9 1822 adc 8*2($tptr),%r10 1823 adc 8*3($tptr),%r11 1824 adc 8*4($tptr),%r12 1825 adc 8*5($tptr),%r13 1826 adc 8*6($tptr),%r14 1827 adc 8*7($tptr),%r15 1828 sbb $carry,$carry # top carry 1829 1830 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1831 mov \$8,%ecx 1832 mov 8*0($nptr),%rax 1833 jmp .L8x_tail 1834 1835.align 32 1836.L8x_tail: 1837 mulq $m0 1838 add %rax,%r8 1839 mov 8*1($nptr),%rax 1840 mov %r8,($tptr) # save result 1841 mov %rdx,%r8 1842 adc \$0,%r8 1843 1844 mulq $m0 1845 add %rax,%r9 1846 mov 8*2($nptr),%rax 1847 adc \$0,%rdx 1848 add %r9,%r8 1849 lea 8($tptr),$tptr # $tptr++ 1850 mov %rdx,%r9 1851 adc \$0,%r9 1852 1853 mulq $m0 1854 add %rax,%r10 1855 mov 8*3($nptr),%rax 1856 adc \$0,%rdx 1857 add %r10,%r9 1858 mov %rdx,%r10 1859 adc \$0,%r10 1860 1861 mulq $m0 1862 add %rax,%r11 1863 mov 8*4($nptr),%rax 1864 adc \$0,%rdx 1865 add %r11,%r10 1866 mov %rdx,%r11 1867 adc \$0,%r11 1868 1869 mulq $m0 1870 add %rax,%r12 1871 mov 8*5($nptr),%rax 1872 adc \$0,%rdx 1873 add %r12,%r11 1874 mov %rdx,%r12 1875 adc \$0,%r12 1876 1877 mulq $m0 1878 add %rax,%r13 1879 mov 8*6($nptr),%rax 1880 adc \$0,%rdx 1881 add %r13,%r12 1882 mov %rdx,%r13 1883 adc \$0,%r13 1884 1885 mulq $m0 1886 add %rax,%r14 1887 mov 8*7($nptr),%rax 1888 adc \$0,%rdx 1889 add %r14,%r13 1890 mov %rdx,%r14 1891 adc \$0,%r14 1892 1893 mulq $m0 1894 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1895 add %rax,%r15 1896 adc \$0,%rdx 1897 add %r15,%r14 1898 mov 8*0($nptr),%rax # pull n[0] 1899 mov %rdx,%r15 1900 adc \$0,%r15 1901 1902 dec %ecx 1903 jnz .L8x_tail 1904 1905 lea 8*8($nptr),$nptr 1906 mov 8+8(%rsp),%rdx # pull end of t[] 1907 cmp 0+8(%rsp),$nptr # end of n[]? 1908 jae .L8x_tail_done # break out of loop 1909 1910 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1911 neg $carry 1912 mov 8*0($nptr),%rax # pull n[0] 1913 adc 8*0($tptr),%r8 1914 adc 8*1($tptr),%r9 1915 adc 8*2($tptr),%r10 1916 adc 8*3($tptr),%r11 1917 adc 8*4($tptr),%r12 1918 adc 8*5($tptr),%r13 1919 adc 8*6($tptr),%r14 1920 adc 8*7($tptr),%r15 1921 sbb $carry,$carry # top carry 1922 1923 mov \$8,%ecx 1924 jmp .L8x_tail 1925 1926.align 32 1927.L8x_tail_done: 1928 xor %rax,%rax 1929 add (%rdx),%r8 # can this overflow? 1930 adc \$0,%r9 1931 adc \$0,%r10 1932 adc \$0,%r11 1933 adc \$0,%r12 1934 adc \$0,%r13 1935 adc \$0,%r14 1936 adc \$0,%r15 1937 adc \$0,%rax 1938 1939 neg $carry 1940.L8x_no_tail: 1941 adc 8*0($tptr),%r8 1942 adc 8*1($tptr),%r9 1943 adc 8*2($tptr),%r10 1944 adc 8*3($tptr),%r11 1945 adc 8*4($tptr),%r12 1946 adc 8*5($tptr),%r13 1947 adc 8*6($tptr),%r14 1948 adc 8*7($tptr),%r15 1949 adc \$0,%rax # top-most carry 1950 mov -8($nptr),%rcx # np[num-1] 1951 xor $carry,$carry 1952 1953 movq %xmm2,$nptr # restore $nptr 1954 1955 mov %r8,8*0($tptr) # store top 512 bits 1956 mov %r9,8*1($tptr) 1957 movq %xmm3,$num # $num is %r9, can't be moved upwards 1958 mov %r10,8*2($tptr) 1959 mov %r11,8*3($tptr) 1960 mov %r12,8*4($tptr) 1961 mov %r13,8*5($tptr) 1962 mov %r14,8*6($tptr) 1963 mov %r15,8*7($tptr) 1964 lea 8*8($tptr),$tptr 1965 1966 cmp %rdx,$tptr # end of t[]? 
1967 jb .L8x_reduction_loop 1968 ret 1969.size bn_sqr8x_internal,.-bn_sqr8x_internal 1970___ 1971} 1972############################################################## 1973# Post-condition, 4x unrolled 1974# 1975{ 1976my ($tptr,$nptr)=("%rbx","%rbp"); 1977$code.=<<___; 1978.type __bn_post4x_internal,\@abi-omnipotent 1979.align 32 1980__bn_post4x_internal: 1981 mov 8*0($nptr),%r12 1982 lea (%rdi,$num),$tptr # %rdi was $tptr above 1983 mov $num,%rcx 1984 movq %xmm1,$rptr # restore $rptr 1985 neg %rax 1986 movq %xmm1,$aptr # prepare for back-to-back call 1987 sar \$3+2,%rcx 1988 dec %r12 # so that after 'not' we get -n[0] 1989 xor %r10,%r10 1990 mov 8*1($nptr),%r13 1991 mov 8*2($nptr),%r14 1992 mov 8*3($nptr),%r15 1993 jmp .Lsqr4x_sub_entry 1994 1995.align 16 1996.Lsqr4x_sub: 1997 mov 8*0($nptr),%r12 1998 mov 8*1($nptr),%r13 1999 mov 8*2($nptr),%r14 2000 mov 8*3($nptr),%r15 2001.Lsqr4x_sub_entry: 2002 lea 8*4($nptr),$nptr 2003 not %r12 2004 not %r13 2005 not %r14 2006 not %r15 2007 and %rax,%r12 2008 and %rax,%r13 2009 and %rax,%r14 2010 and %rax,%r15 2011 2012 neg %r10 # mov %r10,%cf 2013 adc 8*0($tptr),%r12 2014 adc 8*1($tptr),%r13 2015 adc 8*2($tptr),%r14 2016 adc 8*3($tptr),%r15 2017 mov %r12,8*0($rptr) 2018 lea 8*4($tptr),$tptr 2019 mov %r13,8*1($rptr) 2020 sbb %r10,%r10 # mov %cf,%r10 2021 mov %r14,8*2($rptr) 2022 mov %r15,8*3($rptr) 2023 lea 8*4($rptr),$rptr 2024 2025 inc %rcx # pass %cf 2026 jnz .Lsqr4x_sub 2027 2028 mov $num,%r10 # prepare for back-to-back call 2029 neg $num # restore $num 2030 ret 2031.size __bn_post4x_internal,.-__bn_post4x_internal 2032___ 2033} 2034{ 2035$code.=<<___; 2036.globl bn_from_montgomery 2037.type bn_from_montgomery,\@abi-omnipotent 2038.align 32 2039bn_from_montgomery: 2040 testl \$7,`($win64?"48(%rsp)":"%r9d")` 2041 jz bn_from_mont8x 2042 xor %eax,%eax 2043 ret 2044.size bn_from_montgomery,.-bn_from_montgomery 2045 2046.type bn_from_mont8x,\@function,6 2047.align 32 2048bn_from_mont8x: 2049 .byte 0x67 2050 mov %rsp,%rax 2051 push %rbx 2052 push %rbp 2053 push %r12 2054 push %r13 2055 push %r14 2056 push %r15 2057.Lfrom_prologue: 2058 2059 shl \$3,${num}d # convert $num to bytes 2060 lea ($num,$num,2),%r10 # 3*$num in bytes 2061 neg $num 2062 mov ($n0),$n0 # *n0 2063 2064 ############################################################## 2065 # Ensure that stack frame doesn't alias with $rptr+3*$num 2066 # modulo 4096, which covers ret[num], am[num] and n[num] 2067 # (see bn_exp.c). The stack is allocated to aligned with 2068 # bn_power5's frame, and as bn_from_montgomery happens to be 2069 # last operation, we use the opportunity to cleanse it. 
2070 # 2071 lea -320(%rsp,$num,2),%r11 2072 mov %rsp,%rbp 2073 sub $rptr,%r11 2074 and \$4095,%r11 2075 cmp %r11,%r10 2076 jb .Lfrom_sp_alt 2077 sub %r11,%rbp # align with $aptr 2078 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2079 jmp .Lfrom_sp_done 2080 2081.align 32 2082.Lfrom_sp_alt: 2083 lea 4096-320(,$num,2),%r10 2084 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2085 sub %r10,%r11 2086 mov \$0,%r10 2087 cmovc %r10,%r11 2088 sub %r11,%rbp 2089.Lfrom_sp_done: 2090 and \$-64,%rbp 2091 mov %rsp,%r11 2092 sub %rbp,%r11 2093 and \$-4096,%r11 2094 lea (%rbp,%r11),%rsp 2095 mov (%rsp),%r10 2096 cmp %rbp,%rsp 2097 ja .Lfrom_page_walk 2098 jmp .Lfrom_page_walk_done 2099 2100.Lfrom_page_walk: 2101 lea -4096(%rsp),%rsp 2102 mov (%rsp),%r10 2103 cmp %rbp,%rsp 2104 ja .Lfrom_page_walk 2105.Lfrom_page_walk_done: 2106 2107 mov $num,%r10 2108 neg $num 2109 2110 ############################################################## 2111 # Stack layout 2112 # 2113 # +0 saved $num, used in reduction section 2114 # +8 &t[2*$num], used in reduction section 2115 # +32 saved *n0 2116 # +40 saved %rsp 2117 # +48 t[2*$num] 2118 # 2119 mov $n0, 32(%rsp) 2120 mov %rax, 40(%rsp) # save original %rsp 2121.Lfrom_body: 2122 mov $num,%r11 2123 lea 48(%rsp),%rax 2124 pxor %xmm0,%xmm0 2125 jmp .Lmul_by_1 2126 2127.align 32 2128.Lmul_by_1: 2129 movdqu ($aptr),%xmm1 2130 movdqu 16($aptr),%xmm2 2131 movdqu 32($aptr),%xmm3 2132 movdqa %xmm0,(%rax,$num) 2133 movdqu 48($aptr),%xmm4 2134 movdqa %xmm0,16(%rax,$num) 2135 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 2136 movdqa %xmm1,(%rax) 2137 movdqa %xmm0,32(%rax,$num) 2138 movdqa %xmm2,16(%rax) 2139 movdqa %xmm0,48(%rax,$num) 2140 movdqa %xmm3,32(%rax) 2141 movdqa %xmm4,48(%rax) 2142 lea 64(%rax),%rax 2143 sub \$64,%r11 2144 jnz .Lmul_by_1 2145 2146 movq $rptr,%xmm1 2147 movq $nptr,%xmm2 2148 .byte 0x67 2149 mov $nptr,%rbp 2150 movq %r10, %xmm3 # -num 2151___ 2152$code.=<<___ if ($addx); 2153 mov OPENSSL_ia32cap_P+8(%rip),%r11d 2154 and \$0x80108,%r11d 2155 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 2156 jne .Lfrom_mont_nox 2157 2158 lea (%rax,$num),$rptr 2159 call __bn_sqrx8x_reduction 2160 call __bn_postx4x_internal 2161 2162 pxor %xmm0,%xmm0 2163 lea 48(%rsp),%rax 2164 mov 40(%rsp),%rsi # restore %rsp 2165 jmp .Lfrom_mont_zero 2166 2167.align 32 2168.Lfrom_mont_nox: 2169___ 2170$code.=<<___; 2171 call __bn_sqr8x_reduction 2172 call __bn_post4x_internal 2173 2174 pxor %xmm0,%xmm0 2175 lea 48(%rsp),%rax 2176 mov 40(%rsp),%rsi # restore %rsp 2177 jmp .Lfrom_mont_zero 2178 2179.align 32 2180.Lfrom_mont_zero: 2181 movdqa %xmm0,16*0(%rax) 2182 movdqa %xmm0,16*1(%rax) 2183 movdqa %xmm0,16*2(%rax) 2184 movdqa %xmm0,16*3(%rax) 2185 lea 16*4(%rax),%rax 2186 sub \$32,$num 2187 jnz .Lfrom_mont_zero 2188 2189 mov \$1,%rax 2190 mov -48(%rsi),%r15 2191 mov -40(%rsi),%r14 2192 mov -32(%rsi),%r13 2193 mov -24(%rsi),%r12 2194 mov -16(%rsi),%rbp 2195 mov -8(%rsi),%rbx 2196 lea (%rsi),%rsp 2197.Lfrom_epilogue: 2198 ret 2199.size bn_from_mont8x,.-bn_from_mont8x 2200___ 2201} 2202}}} 2203 2204if ($addx) {{{ 2205my $bp="%rdx"; # restore original value 2206 2207$code.=<<___; 2208.type bn_mulx4x_mont_gather5,\@function,6 2209.align 32 2210bn_mulx4x_mont_gather5: 2211 mov %rsp,%rax 2212.Lmulx4x_enter: 2213 push %rbx 2214 push %rbp 2215 push %r12 2216 push %r13 2217 push %r14 2218 push %r15 2219.Lmulx4x_prologue: 2220 2221 shl \$3,${num}d # convert $num to bytes 2222 lea ($num,$num,2),%r10 # 3*$num in bytes 2223 neg $num # -$num 2224 mov ($n0),$n0 
# *n0 2225 2226 ############################################################## 2227 # Ensure that stack frame doesn't alias with $rptr+3*$num 2228 # modulo 4096, which covers ret[num], am[num] and n[num] 2229 # (see bn_exp.c). This is done to allow memory disambiguation 2230 # logic do its magic. [Extra [num] is allocated in order 2231 # to align with bn_power5's frame, which is cleansed after 2232 # completing exponentiation. Extra 256 bytes is for power mask 2233 # calculated from 7th argument, the index.] 2234 # 2235 lea -320(%rsp,$num,2),%r11 2236 mov %rsp,%rbp 2237 sub $rp,%r11 2238 and \$4095,%r11 2239 cmp %r11,%r10 2240 jb .Lmulx4xsp_alt 2241 sub %r11,%rbp # align with $aptr 2242 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2243 jmp .Lmulx4xsp_done 2244 2245.Lmulx4xsp_alt: 2246 lea 4096-320(,$num,2),%r10 2247 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2248 sub %r10,%r11 2249 mov \$0,%r10 2250 cmovc %r10,%r11 2251 sub %r11,%rbp 2252.Lmulx4xsp_done: 2253 and \$-64,%rbp # ensure alignment 2254 mov %rsp,%r11 2255 sub %rbp,%r11 2256 and \$-4096,%r11 2257 lea (%rbp,%r11),%rsp 2258 mov (%rsp),%r10 2259 cmp %rbp,%rsp 2260 ja .Lmulx4x_page_walk 2261 jmp .Lmulx4x_page_walk_done 2262 2263.Lmulx4x_page_walk: 2264 lea -4096(%rsp),%rsp 2265 mov (%rsp),%r10 2266 cmp %rbp,%rsp 2267 ja .Lmulx4x_page_walk 2268.Lmulx4x_page_walk_done: 2269 2270 ############################################################## 2271 # Stack layout 2272 # +0 -num 2273 # +8 off-loaded &b[i] 2274 # +16 end of b[num] 2275 # +24 inner counter 2276 # +32 saved n0 2277 # +40 saved %rsp 2278 # +48 2279 # +56 saved rp 2280 # +64 tmp[num+1] 2281 # 2282 mov $n0, 32(%rsp) # save *n0 2283 mov %rax,40(%rsp) # save original %rsp 2284.Lmulx4x_body: 2285 call mulx4x_internal 2286 2287 mov 40(%rsp),%rsi # restore %rsp 2288 mov \$1,%rax 2289 2290 mov -48(%rsi),%r15 2291 mov -40(%rsi),%r14 2292 mov -32(%rsi),%r13 2293 mov -24(%rsi),%r12 2294 mov -16(%rsi),%rbp 2295 mov -8(%rsi),%rbx 2296 lea (%rsi),%rsp 2297.Lmulx4x_epilogue: 2298 ret 2299.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2300 2301.type mulx4x_internal,\@abi-omnipotent 2302.align 32 2303mulx4x_internal: 2304 mov $num,8(%rsp) # save -$num (it was in bytes) 2305 mov $num,%r10 2306 neg $num # restore $num 2307 shl \$5,$num 2308 neg %r10 # restore $num 2309 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2310 shr \$5+5,$num 2311 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2312 sub \$1,$num 2313 lea .Linc(%rip),%rax 2314 mov %r13,16+8(%rsp) # end of b[num] 2315 mov $num,24+8(%rsp) # inner counter 2316 mov $rp, 56+8(%rsp) # save $rp 2317___ 2318my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2319 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2320my $rptr=$bptr; 2321my $STRIDE=2**5*8; # 5 is "window size" 2322my $N=$STRIDE/4; # should match cache line size 2323$code.=<<___; 2324 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 2325 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2326 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) 2327 lea 128($bp),$bptr # size optimization 2328 2329 pshufd \$0,%xmm5,%xmm5 # broadcast index 2330 movdqa %xmm1,%xmm4 2331 .byte 0x67 2332 movdqa %xmm1,%xmm2 2333___ 2334######################################################################## 2335# calculate mask by comparing 0..31 to index and save result to stack 2336# 2337$code.=<<___; 2338 .byte 0x67 2339 paddd %xmm0,%xmm1 2340 pcmpeqd %xmm5,%xmm0 # compare to 1,0 
$code.=<<___;
	.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	movdqa	%xmm4,%xmm3
___
for($i=0;$i<$STRIDE/16-4;$i+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($i+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;	# last iteration can be optimized
	.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($i+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($i+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($i+2)+112`(%r10)

	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
	pand	`16*($i+1)-128`($bptr),%xmm1
	pand	`16*($i+2)-128`($bptr),%xmm2
	movdqa	%xmm3,`16*($i+3)+112`(%r10)
	pand	`16*($i+3)-128`($bptr),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($i=0;$i<$STRIDE/16-4;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`($bptr),%xmm4
	movdqa	`16*($i+1)-128`($bptr),%xmm5
	movdqa	`16*($i+2)-128`($bptr),%xmm2
	pand	`16*($i+0)+112`(%r10),%xmm4
	movdqa	`16*($i+3)-128`($bptr),%xmm3
	pand	`16*($i+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($i+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($i+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	pxor	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bptr),$bptr
	movq	%xmm0,%rdx		# bp[0]
	lea	64+8*4+8(%rsp),$tptr

	mov	%rdx,$bi
	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
	add	%rax,%r11
	mulx	2*8($aptr),%rax,%r13	# ...
	adc	%rax,%r12
	adc	\$0,%r13
	mulx	3*8($aptr),%rax,%r14

	mov	$mi,%r15
	imulq	32+8(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0
	mov	$mi,%rdx

	mov	$bptr,8+8(%rsp)		# off-load &b[i]

	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	24+8(%rsp),$bptr	# counter value
	mov	%r10,-8*4($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-8*3($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-8*2($tptr)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	8(%rsp),$num		# load -num
	adc	$zero,%r15		# modulo-scheduled
	lea	($aptr,$num),$aptr	# rewind $aptr
	add	%r15,%r14
	mov	8+8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,$zero		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
	pxor	%xmm4,%xmm4
	.byte	0x67,0x67
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`($bptr),%xmm0
	movdqa	`16*($i+1)-128`($bptr),%xmm1
	movdqa	`16*($i+2)-128`($bptr),%xmm2
	pand	`16*($i+0)+256`(%r10),%xmm0
	movdqa	`16*($i+3)-128`($bptr),%xmm3
	pand	`16*($i+1)+256`(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)+256`(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)+256`(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bptr),$bptr
	movq	%xmm0,%rdx		# m0=bp[i]

	mov	$zero,($tptr)		# save top-most carry
	lea	4*8($tptr,$num),$tptr	# rewind $tptr
	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	$zero,$zero		# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi		# +t[0]
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	mulx	3*8($aptr),%rdx,%r14
	adox	-2*8($tptr),%r12
	adcx	%rdx,%r13
	lea	($nptr,$num),$nptr	# rewind $nptr
	lea	4*8($aptr),$aptr
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	adox	$zero,%r14

	mov	$mi,%r15
	imulq	32+8(%rsp),$mi		# "t[0]"*n0

	mov	$mi,%rdx
	xor	$zero,$zero		# cf=0, of=0
	mov	$bptr,8+8(%rsp)		# off-load &b[i]

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	24+8(%rsp),$bptr	# counter value
	mov	%r10,-8*4($tptr)
	adcx	%rax,%r12
	mov	%r11,-8*3($tptr)
	adox	$zero,%r15		# of=0
	mov	%r12,-8*2($tptr)
	lea	4*8($nptr),$nptr
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mov	%r11,-4*8($tptr)
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	lea	4*8($nptr),$nptr
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0+8(%rsp),$num		# load -num
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
	mov	8+8(%rsp),$bptr		# re-load &b[i]
	mov	16+8(%rsp),%r10
	adc	%r15,%r14
	lea	($aptr,$num),$aptr	# rewind $aptr
	adc	$zero,$zero		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	%r10,$bptr
	jb	.Lmulx4x_outer

	mov	-8($nptr),%r10
	mov	$zero,%r8
	mov	($nptr,$num),%r12
	lea	($nptr,$num),%rbp	# rewind $nptr
	mov	$num,%rcx
	lea	($tptr,$num),%rdi	# rewind $tptr
	xor	%eax,%eax
	xor	%r15,%r15
	sub	%r14,%r10		# compare top-most words
	adc	%r15,%r15
	or	%r15,%r8
	sar	\$3+2,%rcx
	sub	%r8,%rax		# %rax=-%r8
	mov	56+8(%rsp),%rdx		# restore rp
	dec	%r12			# so that after 'not' we get -n[0]
	mov	8*1(%rbp),%r13
	xor	%r8,%r8
	mov	8*2(%rbp),%r14
	mov	8*3(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry	# common post-condition
.size	mulx4x_internal,.-mulx4x_internal
___
}{
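######################################################################
# For reference, a plain-C sketch of the Montgomery multiplication that
# mulx4x_internal performs four limbs at a time above (word-by-word,
# CIOS-style).  The helper and its name are illustrative only, not part
# of this module; the real code also gathers b[i] from the powers table
# and leaves the final conditional subtraction to the common
# .Lsqrx4x_sub post-condition.
#
#	/* t[] ends up holding ap[]*bp[]/2^(64*num) mod np[], still < 2*np */
#	static void mont_mul_sketch(uint64_t *t /* num+2 words, zeroed */,
#	                            const uint64_t *ap, const uint64_t *bp,
#	                            const uint64_t *np, uint64_t n0, int num)
#	{
#		for (int i = 0; i < num; i++) {
#			unsigned __int128 c = 0;
#			for (int j = 0; j < num; j++) {		/* t += ap[]*bp[i] */
#				c += (unsigned __int128)ap[j]*bp[i] + t[j];
#				t[j] = (uint64_t)c;  c >>= 64;
#			}
#			c += t[num];
#			t[num] = (uint64_t)c;  t[num+1] = (uint64_t)(c >> 64);
#
#			uint64_t m = t[0]*n0;			/* "t[0]"*n0 */
#			c = ((unsigned __int128)np[0]*m + t[0]) >> 64;	/* low word discarded */
#			for (int j = 1; j < num; j++) {		/* t = (t + np[]*m)/2^64 */
#				c += (unsigned __int128)np[j]*m + t[j];
#				t[j-1] = (uint64_t)c;  c >>= 64;
#			}
#			c += t[num];
#			t[num-1] = (uint64_t)c;
#			t[num] = t[num+1] + (uint64_t)(c >> 64);
#		}
#	}
#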
2697 # 2698 lea -320(%rsp,$num,2),%r11 2699 mov %rsp,%rbp 2700 sub $rptr,%r11 2701 and \$4095,%r11 2702 cmp %r11,%r10 2703 jb .Lpwrx_sp_alt 2704 sub %r11,%rbp # align with $aptr 2705 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2706 jmp .Lpwrx_sp_done 2707 2708.align 32 2709.Lpwrx_sp_alt: 2710 lea 4096-320(,$num,2),%r10 2711 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) 2712 sub %r10,%r11 2713 mov \$0,%r10 2714 cmovc %r10,%r11 2715 sub %r11,%rbp 2716.Lpwrx_sp_done: 2717 and \$-64,%rbp 2718 mov %rsp,%r11 2719 sub %rbp,%r11 2720 and \$-4096,%r11 2721 lea (%rbp,%r11),%rsp 2722 mov (%rsp),%r10 2723 cmp %rbp,%rsp 2724 ja .Lpwrx_page_walk 2725 jmp .Lpwrx_page_walk_done 2726 2727.Lpwrx_page_walk: 2728 lea -4096(%rsp),%rsp 2729 mov (%rsp),%r10 2730 cmp %rbp,%rsp 2731 ja .Lpwrx_page_walk 2732.Lpwrx_page_walk_done: 2733 2734 mov $num,%r10 2735 neg $num 2736 2737 ############################################################## 2738 # Stack layout 2739 # 2740 # +0 saved $num, used in reduction section 2741 # +8 &t[2*$num], used in reduction section 2742 # +16 intermediate carry bit 2743 # +24 top-most carry bit, used in reduction section 2744 # +32 saved *n0 2745 # +40 saved %rsp 2746 # +48 t[2*$num] 2747 # 2748 pxor %xmm0,%xmm0 2749 movq $rptr,%xmm1 # save $rptr 2750 movq $nptr,%xmm2 # save $nptr 2751 movq %r10, %xmm3 # -$num 2752 movq $bptr,%xmm4 2753 mov $n0, 32(%rsp) 2754 mov %rax, 40(%rsp) # save original %rsp 2755.Lpowerx5_body: 2756 2757 call __bn_sqrx8x_internal 2758 call __bn_postx4x_internal 2759 call __bn_sqrx8x_internal 2760 call __bn_postx4x_internal 2761 call __bn_sqrx8x_internal 2762 call __bn_postx4x_internal 2763 call __bn_sqrx8x_internal 2764 call __bn_postx4x_internal 2765 call __bn_sqrx8x_internal 2766 call __bn_postx4x_internal 2767 2768 mov %r10,$num # -num 2769 mov $aptr,$rptr 2770 movq %xmm2,$nptr 2771 movq %xmm4,$bptr 2772 mov 40(%rsp),%rax 2773 2774 call mulx4x_internal 2775 2776 mov 40(%rsp),%rsi # restore %rsp 2777 mov \$1,%rax 2778 2779 mov -48(%rsi),%r15 2780 mov -40(%rsi),%r14 2781 mov -32(%rsi),%r13 2782 mov -24(%rsi),%r12 2783 mov -16(%rsi),%rbp 2784 mov -8(%rsi),%rbx 2785 lea (%rsi),%rsp 2786.Lpowerx5_epilogue: 2787 ret 2788.size bn_powerx5,.-bn_powerx5 2789 2790.globl bn_sqrx8x_internal 2791.hidden bn_sqrx8x_internal 2792.type bn_sqrx8x_internal,\@abi-omnipotent 2793.align 32 2794bn_sqrx8x_internal: 2795__bn_sqrx8x_internal: 2796 ################################################################## 2797 # Squaring part: 2798 # 2799 # a) multiply-n-add everything but a[i]*a[i]; 2800 # b) shift result of a) by 1 to the left and accumulate 2801 # a[i]*a[i] products; 2802 # 2803 ################################################################## 2804 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2805 # a[1]a[0] 2806 # a[2]a[0] 2807 # a[3]a[0] 2808 # a[2]a[1] 2809 # a[3]a[1] 2810 # a[3]a[2] 2811 # 2812 # a[4]a[0] 2813 # a[5]a[0] 2814 # a[6]a[0] 2815 # a[7]a[0] 2816 # a[4]a[1] 2817 # a[5]a[1] 2818 # a[6]a[1] 2819 # a[7]a[1] 2820 # a[4]a[2] 2821 # a[5]a[2] 2822 # a[6]a[2] 2823 # a[7]a[2] 2824 # a[4]a[3] 2825 # a[5]a[3] 2826 # a[6]a[3] 2827 # a[7]a[3] 2828 # 2829 # a[5]a[4] 2830 # a[6]a[4] 2831 # a[7]a[4] 2832 # a[6]a[5] 2833 # a[7]a[5] 2834 # a[7]a[6] 2835 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2836___ 2837{ 2838my ($zero,$carry)=("%rbp","%rcx"); 2839my $aaptr=$zero; 2840$code.=<<___; 2841 lea 48+8(%rsp),$tptr 2842 lea ($aptr,$num),$aaptr 2843 mov $num,0+8(%rsp) # save $num 2844 mov $aaptr,8+8(%rsp) # 
.globl	bn_sqrx8x_internal
.hidden	bn_sqrx8x_internal
.type	bn_sqrx8x_internal,\@abi-omnipotent
.align	32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
	##################################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	##################################################################
	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
	#	a[1]a[0]
	#	a[2]a[0]
	#	a[3]a[0]
	#	a[2]a[1]
	#	a[3]a[1]
	#	a[3]a[2]
	#
	#	a[4]a[0]
	#	a[5]a[0]
	#	a[6]a[0]
	#	a[7]a[0]
	#	a[4]a[1]
	#	a[5]a[1]
	#	a[6]a[1]
	#	a[7]a[1]
	#	a[4]a[2]
	#	a[5]a[2]
	#	a[6]a[2]
	#	a[7]a[2]
	#	a[4]a[3]
	#	a[5]a[3]
	#	a[6]a[3]
	#	a[7]a[3]
	#
	#	a[5]a[4]
	#	a[6]a[4]
	#	a[7]a[4]
	#	a[6]a[5]
	#	a[7]a[5]
	#	a[7]a[6]
	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
___
{
my ($zero,$carry)=("%rbp","%rcx");
my $aaptr=$zero;
$code.=<<___;
	lea	48+8(%rsp),$tptr
	lea	($aptr,$num),$aaptr
	mov	$num,0+8(%rsp)		# save $num
	mov	$aaptr,8+8(%rsp)	# save end of $aptr
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
	.byte	0x3e
	movdqa	%xmm0,0*8($tptr)
	movdqa	%xmm0,2*8($tptr)
	movdqa	%xmm0,4*8($tptr)
	movdqa	%xmm0,6*8($tptr)
.Lsqr8x_zero_start:			# aligned at 32
	movdqa	%xmm0,8*8($tptr)
	movdqa	%xmm0,10*8($tptr)
	movdqa	%xmm0,12*8($tptr)
	movdqa	%xmm0,14*8($tptr)
	lea	16*8($tptr),$tptr
	sub	\$64,$num
	jnz	.Lsqrx8x_zero

	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
	#xor	%r9,%r9			# t[1], ex-$num, zero already
	xor	%r10,%r10
	xor	%r11,%r11
	xor	%r12,%r12
	xor	%r13,%r13
	xor	%r14,%r14
	xor	%r15,%r15
	lea	48+8(%rsp),$tptr
	xor	$zero,$zero		# cf=0, of=0
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_loop:
	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
	adox	%rax,%r10
	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
	adcx	%r10,%r9
	adox	%rax,%r11
	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
	adcx	%r11,%r10
	adox	%rax,%r12
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
	adcx	%r12,%r11
	adox	%rax,%r13
	mulx	5*8($aptr),%r12,%rax
	adcx	%r13,%r12
	adox	%rax,%r14
	mulx	6*8($aptr),%r13,%rax
	adcx	%r14,%r13
	adox	%r15,%rax
	mulx	7*8($aptr),%r14,%r15
	mov	1*8($aptr),%rdx		# a[1]
	adcx	%rax,%r14
	adox	$zero,%r15
	adc	8*8($tptr),%r15
	mov	%r8,1*8($tptr)		# t[1]
	mov	%r9,2*8($tptr)		# t[2]
	sbb	$carry,$carry		# mov %cf,$carry
	xor	$zero,$zero		# cf=0, of=0


	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
	adcx	%r10,%r8
	adox	%rbx,%r9
	mulx	4*8($aptr),%r10,%rbx	# ...
	adcx	%r11,%r9
	adox	%rax,%r10
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
	adcx	%r12,%r10
	adox	%rbx,%r11
	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
	adcx	%r13,%r11
	adox	%r14,%r12
	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
	mov	2*8($aptr),%rdx		# a[2]
	adcx	%rax,%r12
	adox	%rbx,%r13
	adcx	%r15,%r13
	adox	$zero,%r14		# of=0
	adcx	$zero,%r14		# cf=0

	mov	%r8,3*8($tptr)		# t[3]
	mov	%r9,4*8($tptr)		# t[4]

	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
	adcx	%r10,%r8
	adox	%rbx,%r9
	mulx	5*8($aptr),%r10,%rbx	# ...
	adcx	%r11,%r9
	adox	%rax,%r10
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
	adcx	%r12,%r10
	adox	%r13,%r11
	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
	.byte	0x3e
	mov	3*8($aptr),%rdx		# a[3]
	adcx	%rbx,%r11
	adox	%rax,%r12
	adcx	%r14,%r12
	mov	%r8,5*8($tptr)		# t[5]
	mov	%r9,6*8($tptr)		# t[6]
	mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
	adox	$zero,%r13		# of=0
	adcx	$zero,%r13		# cf=0

	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
	adcx	%r10,%r8
	adox	%rax,%r9
	mulx	6*8($aptr),%r10,%rax	# ...
	adcx	%r11,%r9
	adox	%r12,%r10
	mulx	7*8($aptr),%r11,%r12
	mov	4*8($aptr),%rdx		# a[4]
	mov	5*8($aptr),%r14		# a[5]
	adcx	%rbx,%r10
	adox	%rax,%r11
	mov	6*8($aptr),%r15		# a[6]
	adcx	%r13,%r11
	adox	$zero,%r12		# of=0
	adcx	$zero,%r12		# cf=0

	mov	%r8,7*8($tptr)		# t[7]
	mov	%r9,8*8($tptr)		# t[8]

	mulx	%r14,%r9,%rax		# a[5]*a[4]
	mov	7*8($aptr),%r8		# a[7]
	adcx	%r10,%r9
	mulx	%r15,%r10,%rbx		# a[6]*a[4]
	adox	%rax,%r10
	adcx	%r11,%r10
	mulx	%r8,%r11,%rax		# a[7]*a[4]
	mov	%r14,%rdx		# a[5]
	adox	%rbx,%r11
	adcx	%r12,%r11
	#adox	$zero,%rax		# of=0
	adcx	$zero,%rax		# cf=0

	mulx	%r15,%r14,%rbx		# a[6]*a[5]
	mulx	%r8,%r12,%r13		# a[7]*a[5]
	mov	%r15,%rdx		# a[6]
	lea	8*8($aptr),$aptr
	adcx	%r14,%r11
	adox	%rbx,%r12
	adcx	%rax,%r12
	adox	$zero,%r13

	.byte	0x67,0x67
	mulx	%r8,%r8,%r14		# a[7]*a[6]
	adcx	%r8,%r13
	adcx	$zero,%r14

	cmp	8+8(%rsp),$aptr
	je	.Lsqrx8x_outer_break

	neg	$carry			# mov $carry,%cf
	mov	\$-8,%rcx
	mov	$zero,%r15
	mov	8*8($tptr),%r8
	adcx	9*8($tptr),%r9		# +=t[9]
	adcx	10*8($tptr),%r10	# ...
	adcx	11*8($tptr),%r11
	adc	12*8($tptr),%r12
	adc	13*8($tptr),%r13
	adc	14*8($tptr),%r14
	adc	15*8($tptr),%r15
	lea	($aptr),$aaptr
	lea	2*64($tptr),$tptr
	sbb	%rax,%rax		# mov %cf,$carry

	mov	-64($aptr),%rdx		# a[0]
	mov	%rax,16+8(%rsp)		# offload $carry
	mov	$tptr,24+8(%rsp)

	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
	xor	%eax,%eax		# cf=0, of=0
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_loop:
	mov	%r8,%rbx
	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
	adcx	%rax,%rbx		# +=t[8]
	adox	%r9,%r8

	mulx	1*8($aaptr),%rax,%r9	# ...
	adcx	%rax,%r8
	adox	%r10,%r9

	mulx	2*8($aaptr),%rax,%r10
	adcx	%rax,%r9
	adox	%r11,%r10

	mulx	3*8($aaptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12

	mulx	5*8($aaptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	6*8($aaptr),%rax,%r14
	mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
	mov	\$0,%ebx
	adcx	%rax,%r13
	adox	%r15,%r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
	mov	8($aptr,%rcx,8),%rdx	# a[i]
	adcx	%rax,%r14
	adox	%rbx,%r15		# %rbx is 0, of=0
	adcx	%rbx,%r15		# cf=0

	.byte	0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_loop

	lea	8*8($aaptr),$aaptr
	mov	\$-8,%rcx
	cmp	8+8(%rsp),$aaptr	# done?
	je	.Lsqrx8x_break

	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
	.byte	0x66
	mov	-64($aptr),%rdx
	adcx	0*8($tptr),%r8
	adcx	1*8($tptr),%r9
	adc	2*8($tptr),%r10
	adc	3*8($tptr),%r11
	adc	4*8($tptr),%r12
	adc	5*8($tptr),%r13
	adc	6*8($tptr),%r14
	adc	7*8($tptr),%r15
	lea	8*8($tptr),$tptr
	.byte	0x67
	sbb	%rax,%rax		# mov %cf,%rax
	xor	%ebx,%ebx		# cf=0, of=0
	mov	%rax,16+8(%rsp)		# offload carry
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	xor	$zero,$zero
	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
	adcx	$zero,%r8
	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
	adcx	$zero,%r9
	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
	adc	\$0,%r10
	mov	%r8,0*8($tptr)
	adc	\$0,%r11
	adc	\$0,%r12
	adc	\$0,%r13
	adc	\$0,%r14
	adc	\$0,%r15
	cmp	$carry,$tptr		# cf=0, of=0
	je	.Lsqrx8x_outer_loop

	mov	%r9,1*8($tptr)
	mov	1*8($carry),%r9
	mov	%r10,2*8($tptr)
	mov	2*8($carry),%r10
	mov	%r11,3*8($tptr)
	mov	3*8($carry),%r11
	mov	%r12,4*8($tptr)
	mov	4*8($carry),%r12
	mov	%r13,5*8($tptr)
	mov	5*8($carry),%r13
	mov	%r14,6*8($tptr)
	mov	6*8($carry),%r14
	mov	%r15,7*8($tptr)
	mov	7*8($carry),%r15
	mov	$carry,$tptr
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_break:
	mov	%r9,9*8($tptr)		# t[9]
	movq	%xmm3,%rcx		# -$num
	mov	%r10,10*8($tptr)	# ...
	mov	%r11,11*8($tptr)
	mov	%r12,12*8($tptr)
	mov	%r13,13*8($tptr)
	mov	%r14,14*8($tptr)
___
}{
my $i="%rcx";
$code.=<<___;
	lea	48+8(%rsp),$tptr
	mov	($aptr,$i),%rdx		# a[0]

	mov	8($tptr),$A0[1]		# t[1]
	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
	mov	0+8(%rsp),$num		# restore $num
	adox	$A0[1],$A0[1]
	mov	16($tptr),$A1[0]	# t[2]	# prefetch
	mov	24($tptr),$A1[1]	# t[3]	# prefetch
	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned

.align	32
.Lsqrx4x_shift_n_add:
	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
	.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	40($tptr),$A0[1]	# t[2*i+4+1]	# prefetch
	mov	%rax,0($tptr)
	mov	%rbx,8($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
	mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
	mov	%rax,16($tptr)
	mov	%rbx,24($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
	lea	32($i),$i
	mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
	mov	%rax,32($tptr)
	mov	%rbx,40($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	jrcxz	.Lsqrx4x_shift_n_add_break
	.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
	mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcx	$A1[1],%rbx
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr		# end of t[] buffer
___
}
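######################################################################
# The squaring pass that just completed can be summarized in C as below
# (a sketch only; the helper name is illustrative).  Phase a) accumulates
# only the a[i]*a[j], i<j, cross products; phase b) is the shift-n-add
# loop above, which doubles that sum while folding in the a[i]^2 terms:
#
#	static void sqr_sketch(uint64_t *t /* 2*num words */,
#	                       const uint64_t *a, int num)
#	{
#		memset(t, 0, 2*num*sizeof(t[0]));
#		for (int i = 0; i < num; i++) {		/* a) cross products */
#			uint64_t c = 0;
#			for (int j = i+1; j < num; j++) {
#				unsigned __int128 s = (unsigned __int128)a[i]*a[j] + t[i+j] + c;
#				t[i+j] = (uint64_t)s;  c = (uint64_t)(s >> 64);
#			}
#			t[i+num] = c;
#		}
#		uint64_t c = 0;
#		for (int i = 0; i < num; i++) {		/* b) t = 2*t + a[i]^2 */
#			unsigned __int128 sq = (unsigned __int128)a[i]*a[i];
#			unsigned __int128 lo = ((unsigned __int128)t[2*i] << 1)
#			                     + (uint64_t)sq + c;
#			t[2*i]   = (uint64_t)lo;
#			unsigned __int128 hi = ((unsigned __int128)t[2*i+1] << 1)
#			                     + (uint64_t)(sq >> 64) + (uint64_t)(lo >> 64);
#			t[2*i+1] = (uint64_t)hi;
#			c = (uint64_t)(hi >> 64);
#		}
#	}
#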
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
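#
# As a reference for the MULX/ADX code below, the word-by-word reduction
# in plain C looks roughly like this (a sketch only; the helper name is
# illustrative).  n0 is -np[0]^-1 mod 2^64, t[] is the 2*num-word result
# of the squaring above, and the returned bit corresponds to the top-most
# carry that the code below tracks separately on the stack:
#
#	static uint64_t mont_reduce_sketch(uint64_t *t /* 2*num words */,
#	                                   const uint64_t *np, uint64_t n0, int num)
#	{
#		uint64_t top = 0;		/* carry deferred to t[i+num+1] */
#		for (int i = 0; i < num; i++) {
#			uint64_t m = t[i]*n0;	/* n0*a[i] in the comments below */
#			unsigned __int128 c = 0;
#			for (int j = 0; j < num; j++) {	/* t += m*np[] at position i */
#				c += (unsigned __int128)m*np[j] + t[i+j];
#				t[i+j] = (uint64_t)c;  c >>= 64;	/* t[i] becomes 0 */
#			}
#			c += (unsigned __int128)t[i+num] + top;
#			t[i+num] = (uint64_t)c;
#			top = (uint64_t)(c >> 64);
#		}
#		/* result is t[num..2*num-1] plus 'top'; it is < 2*np, so one
#		 * conditional subtraction (the post-condition) finishes it */
#		return top;
#	}
#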
{
my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");

$code.=<<___;
	movq	%xmm2,$nptr
__bn_sqrx8x_reduction:
	xor	%eax,%eax		# initial top-most carry bit
	mov	32+8(%rsp),%rbx		# n0
	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
	lea	-8*8($nptr,$num),%rcx	# end of n[]
	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
	mov	%rcx, 0+8(%rsp)		# save end of n[]
	mov	$tptr,8+8(%rsp)		# save end of t[]

	lea	48+8(%rsp),$tptr	# initial t[] window
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	mov	8*1($tptr),%r9
	mov	8*2($tptr),%r10
	mov	8*3($tptr),%r11
	mov	8*4($tptr),%r12
	mov	%rdx,%r8
	imulq	%rbx,%rdx		# n0*a[i]
	mov	8*5($tptr),%r13
	mov	8*6($tptr),%r14
	mov	8*7($tptr),%r15
	mov	%rax,24+8(%rsp)		# store top-most carry bit

	lea	8*8($tptr),$tptr
	xor	$carry,$carry		# cf=0,of=0
	mov	\$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	mov	%r8, %rbx
	mulx	8*0($nptr),%rax,%r8	# n[0]
	adcx	%rbx,%rax		# discarded
	adox	%r9,%r8

	mulx	8*1($nptr),%rbx,%r9	# n[1]
	adcx	%rbx,%r8
	adox	%r10,%r9

	mulx	8*2($nptr),%rbx,%r10
	adcx	%rbx,%r9
	adox	%r11,%r10

	mulx	8*3($nptr),%rbx,%r11
	adcx	%rbx,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
	mov	%rdx,%rax
	mov	%r8,%rdx
	adcx	%rbx,%r11
	adox	%r13,%r12

	mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
	mov	%rax,%rdx
	mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]

	mulx	8*5($nptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	8*6($nptr),%rax,%r14
	adcx	%rax,%r13
	adox	%r15,%r14

	mulx	8*7($nptr),%rax,%r15
	mov	%rbx,%rdx
	adcx	%rax,%r14
	adox	$carry,%r15		# $carry is 0
	adcx	$carry,%r15		# cf=0

	.byte	0x67,0x67,0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_reduce

	mov	$carry,%rax		# xor %rax,%rax
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_no_tail

	mov	48+8(%rsp),%rdx		# pull n0*a[0]
	add	8*0($tptr),%r8
	lea	8*8($nptr),$nptr
	mov	\$-8,%rcx
	adcx	8*1($tptr),%r9
	adcx	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	%rax,%rax		# top carry

	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	mov	%r8,%rbx
	mulx	8*0($nptr),%rax,%r8
	adcx	%rax,%rbx
	adox	%r9,%r8

	mulx	8*1($nptr),%rax,%r9
	adcx	%rax,%r8
	adox	%r10,%r9

	mulx	8*2($nptr),%rax,%r10
	adcx	%rax,%r9
	adox	%r11,%r10

	mulx	8*3($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12

	mulx	8*5($nptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	8*6($nptr),%rax,%r14
	adcx	%rax,%r13
	adox	%r15,%r14

	mulx	8*7($nptr),%rax,%r15
	mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
	adcx	%rax,%r14
	adox	$carry,%r15
	mov	%rbx,($tptr,%rcx,8)	# save result
	mov	%r8,%rbx
	adcx	$carry,%r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Lsqrx8x_tail

	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_tail_done	# break out of loop

	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
	mov	48+8(%rsp),%rdx		# pull n0*a[0]
	lea	8*8($nptr),$nptr
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	%rax,%rax
	sub	\$8,%rcx		# mov \$-8,%rcx

	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	xor	%rax,%rax
	add	24+8(%rsp),%r8		# can this overflow?
	adc	\$0,%r9
	adc	\$0,%r10
	adc	\$0,%r11
	adc	\$0,%r12
	adc	\$0,%r13
	adc	\$0,%r14
	adc	\$0,%r15
	adc	\$0,%rax

	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
	adc	8*0($tptr),%r8
	movq	%xmm3,%rcx
	adc	8*1($tptr),%r9
	mov	8*7($nptr),$carry
	movq	%xmm2,$nptr		# restore $nptr
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	adc	\$0,%rax		# top-most carry

	mov	32+8(%rsp),%rbx		# n0
	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"

	mov	%r8,8*0($tptr)		# store top 512 bits
	lea	8*8($tptr),%r8		# borrow %r8
	mov	%r9,8*1($tptr)
	mov	%r10,8*2($tptr)
	mov	%r11,8*3($tptr)
	mov	%r12,8*4($tptr)
	mov	%r13,8*5($tptr)
	mov	%r14,8*6($tptr)
	mov	%r15,8*7($tptr)

	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
	cmp	8+8(%rsp),%r8		# end of t[]?
	jb	.Lsqrx8x_reduction_loop
	ret
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
# Post-condition, 4x unrolled
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
$code.=<<___;
.align	32
__bn_postx4x_internal:
	mov	8*0($nptr),%r12
	mov	%rcx,%r10		# -$num
	mov	%rcx,%r9		# -$num
	neg	%rax
	sar	\$3+2,%rcx
	#lea	48+8(%rsp,%r9),$tptr
	movq	%xmm1,$rptr		# restore $rptr
	movq	%xmm1,$aptr		# prepare for back-to-back call
	dec	%r12			# so that after 'not' we get -n[0]
	mov	8*1($nptr),%r13
	xor	%r8,%r8
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	mov	8*0($nptr),%r12
	mov	8*1($nptr),%r13
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
.Lsqrx4x_sub_entry:
	andn	%rax,%r12,%r12
	lea	8*4($nptr),$nptr
	andn	%rax,%r13,%r13
	andn	%rax,%r14,%r14
	andn	%rax,%r15,%r15

	neg	%r8			# mov %r8,%cf
	adc	8*0($tptr),%r12
	adc	8*1($tptr),%r13
	adc	8*2($tptr),%r14
	adc	8*3($tptr),%r15
	mov	%r12,8*0($rptr)
	lea	8*4($tptr),$tptr
	mov	%r13,8*1($rptr)
	sbb	%r8,%r8			# mov %cf,%r8
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx
	jnz	.Lsqrx4x_sub

	neg	%r9			# restore $num

	ret
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
___
}
}}}
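######################################################################
# The post-condition above subtracts the modulus exactly when needed
# without branching on secret data: %rax is 0 or all-ones, and n[] is
# added in (masked) one's complement with an injected +1, which amounts
# to conditionally computing t - n.  A C sketch of the idea (the helper
# and its arguments are illustrative only; the code above gets the +1
# by pre-decrementing n[0] instead):
#
#	static void cond_sub_sketch(uint64_t *rp, const uint64_t *t,
#	                            const uint64_t *np, uint64_t mask, int num)
#	{
#		unsigned __int128 c = mask & 1;		/* +1 completes ~n -> -n */
#		for (int j = 0; j < num; j++) {
#			c += (unsigned __int128)t[j] + (~np[j] & mask);
#			rp[j] = (uint64_t)c;
#			c >>= 64;
#		}
#	}
#
# With mask = all-ones this stores t - n, with mask = 0 it stores t, and
# both n[] and t[] are read in full either way.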
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") :	# Win64 order
			       ("%rdi","%esi","%rdx","%ecx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_get_bits5
.type	bn_get_bits5,\@abi-omnipotent
.align	16
bn_get_bits5:
	lea	0($inp),%r10
	lea	1($inp),%r11
	mov	$num,%ecx
	shr	\$4,$num
	and	\$15,%ecx
	lea	-8(%ecx),%eax
	cmp	\$11,%ecx
	cmova	%r11,%r10
	cmova	%eax,%ecx
	movzw	(%r10,$num,2),%eax
	shrl	%cl,%eax
	and	\$31,%eax
	ret
.size	bn_get_bits5,.-bn_get_bits5
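##############################################################
# bn_get_bits5 above returns the 5 exponent bits that start at the given
# bit offset; the cmova pair merely picks whichever 16-bit load is
# guaranteed to contain the whole window.  In C terms, roughly (sketch
# only, treating the exponent as a little-endian byte array and assuming,
# like the code above, that reading one byte past the window is harmless
# within the BN_ULONG-aligned buffer):
#
#	static int get_bits5_sketch(const unsigned char *exp, int off)
#	{
#		unsigned int w = exp[off/8] | ((unsigned int)exp[off/8 + 1] << 8);
#		return (w >> (off%8)) & 31;
#	}
#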

.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5
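##############################################################
# bn_scatter5 above stores power number idx interleaved, one word every
# 2^5 slots, which is the layout both bn_gather5 below and the inline
# gather in the multiplication code expect: all 32 candidates for word j
# sit in one 256-byte group, so they can be read in full and masked.
# In C terms (sketch only, names are illustrative):
#
#	static void scatter5_sketch(const uint64_t *inp, int num,
#	                            uint64_t *tbl, unsigned int idx)
#	{
#		for (int j = 0; j < num; j++)
#			tbl[32*j + idx] = inp[j];	/* word j of power idx */
#	}
#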

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
	# I can't trust assembler to use specific encoding:-(
	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___	if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
}
$code.=<<___;
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	.Lmul_epilogue(%rip),%r10
	cmp	%r10,%rbx
	ja	.Lbody_40

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs

.Lbody_40:
	mov	40(%rax),%rax		# pull saved stack pointer
.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_power5
	.rva	.LSEH_end_bn_power5
	.rva	.LSEH_info_bn_power5

	.rva	.LSEH_begin_bn_from_mont8x
	.rva	.LSEH_end_bn_from_mont8x
	.rva	.LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
	.rva	.LSEH_end_bn_mulx4x_mont_gather5
	.rva	.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue	# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;