1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# August 2011. 11# 12# Companion to x86_64-mont.pl that optimizes cache-timing attack 13# countermeasures. The subroutines are produced by replacing bp[i] 14# references in their x86_64-mont.pl counterparts with cache-neutral 15# references to powers table computed in BN_mod_exp_mont_consttime. 16# In addition subroutine that scatters elements of the powers table 17# is implemented, so that scatter-/gathering can be tuned without 18# bn_exp.c modifications. 19 20# August 2013. 21# 22# Add MULX/AD*X code paths and additional interfaces to optimize for 23# branch prediction unit. For input lengths that are multiples of 8 24# the np argument is not just modulus value, but one interleaved 25# with 0. This is to optimize post-condition... 26 27$flavour = shift; 28$output = shift; 29if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 30 31$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 32 33$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 34( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 35( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 36die "can't locate x86_64-xlate.pl"; 37 38open OUT,"| \"$^X\" $xlate $flavour $output"; 39*STDOUT=*OUT; 40 41if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 42 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 43 $addx = ($1>=2.23); 44} 45 46if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 47 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 48 $addx = ($1>=2.10); 49} 50 51if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 52 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 53 $addx = ($1>=12); 54} 55 56if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { 57 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 58 $addx = ($ver>=3.03); 59} 60 61# int bn_mul_mont_gather5( 62$rp="%rdi"; # BN_ULONG *rp, 63$ap="%rsi"; # const BN_ULONG *ap, 64$bp="%rdx"; # const BN_ULONG *bp, 65$np="%rcx"; # const BN_ULONG *np, 66$n0="%r8"; # const BN_ULONG *n0, 67$num="%r9"; # int num, 68 # int idx); # 0 to 2^5-1, "index" in $bp holding 69 # pre-computed powers of a', interlaced 70 # in such manner that b[0] is $bp[idx], 71 # b[1] is [2^5+idx], etc. 72$lo0="%r10"; 73$hi0="%r11"; 74$hi1="%r13"; 75$i="%r14"; 76$j="%r15"; 77$m0="%rbx"; 78$m1="%rbp"; 79 80$code=<<___; 81.text 82 83.extern OPENSSL_ia32cap_P 84 85.globl bn_mul_mont_gather5 86.type bn_mul_mont_gather5,\@function,6 87.align 64 88bn_mul_mont_gather5: 89 mov ${num}d,${num}d 90 mov %rsp,%rax 91 test \$7,${num}d 92 jnz .Lmul_enter 93___ 94$code.=<<___ if ($addx); 95 mov OPENSSL_ia32cap_P+8(%rip),%r11d 96___ 97$code.=<<___; 98 jmp .Lmul4x_enter 99 100.align 16 101.Lmul_enter: 102 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument 103 push %rbx 104 push %rbp 105 push %r12 106 push %r13 107 push %r14 108 push %r15 109 110 neg $num 111 mov %rsp,%r11 112 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) 113 neg $num # restore $num 114 and \$-1024,%r10 # minimize TLB usage 115 116 # Some OSes, *cough*-dows, insist on stack being "wired" to 117 # physical memory in strictly sequential manner, i.e. if stack 118 # allocation spans two pages, then reference to farmost one can 119 # be punishable by SEGV. But page walking can do good even on 120 # other OSes, because it guarantees that villain thread hits 121 # the guard page before it can make damage to innocent one... 122 sub %r10,%r11 123 and \$-4096,%r11 124 lea (%r10,%r11),%rsp 125 mov (%rsp),%r11 126 cmp %r10,%rsp 127 ja .Lmul_page_walk 128 jmp .Lmul_page_walk_done 129 130.Lmul_page_walk: 131 lea -4096(%rsp),%rsp 132 mov (%rsp),%r11 133 cmp %r10,%rsp 134 ja .Lmul_page_walk 135.Lmul_page_walk_done: 136 137 lea .Linc(%rip),%r10 138 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 139.Lmul_body: 140 141 lea 128($bp),%r12 # reassign $bp (+size optimization) 142___ 143 $bp="%r12"; 144 $STRIDE=2**5*8; # 5 is "window size" 145 $N=$STRIDE/4; # should match cache line size 146$code.=<<___; 147 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 148 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 149 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) 150 and \$-16,%r10 151 152 pshufd \$0,%xmm5,%xmm5 # broadcast index 153 movdqa %xmm1,%xmm4 154 movdqa %xmm1,%xmm2 155___ 156######################################################################## 157# calculate mask by comparing 0..31 to index and save result to stack 158# 159$code.=<<___; 160 paddd %xmm0,%xmm1 161 pcmpeqd %xmm5,%xmm0 # compare to 1,0 162 .byte 0x67 163 movdqa %xmm4,%xmm3 164___ 165for($k=0;$k<$STRIDE/16-4;$k+=4) { 166$code.=<<___; 167 paddd %xmm1,%xmm2 168 pcmpeqd %xmm5,%xmm1 # compare to 3,2 169 movdqa %xmm0,`16*($k+0)+112`(%r10) 170 movdqa %xmm4,%xmm0 171 172 paddd %xmm2,%xmm3 173 pcmpeqd %xmm5,%xmm2 # compare to 5,4 174 movdqa %xmm1,`16*($k+1)+112`(%r10) 175 movdqa %xmm4,%xmm1 176 177 paddd %xmm3,%xmm0 178 pcmpeqd %xmm5,%xmm3 # compare to 7,6 179 movdqa %xmm2,`16*($k+2)+112`(%r10) 180 movdqa %xmm4,%xmm2 181 182 paddd %xmm0,%xmm1 183 pcmpeqd %xmm5,%xmm0 184 movdqa %xmm3,`16*($k+3)+112`(%r10) 185 movdqa %xmm4,%xmm3 186___ 187} 188$code.=<<___; # last iteration can be optimized 189 paddd %xmm1,%xmm2 190 pcmpeqd %xmm5,%xmm1 191 movdqa %xmm0,`16*($k+0)+112`(%r10) 192 193 paddd %xmm2,%xmm3 194 .byte 0x67 195 pcmpeqd %xmm5,%xmm2 196 movdqa %xmm1,`16*($k+1)+112`(%r10) 197 198 pcmpeqd %xmm5,%xmm3 199 movdqa %xmm2,`16*($k+2)+112`(%r10) 200 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register 201 202 pand `16*($k+1)-128`($bp),%xmm1 203 pand `16*($k+2)-128`($bp),%xmm2 204 movdqa %xmm3,`16*($k+3)+112`(%r10) 205 pand `16*($k+3)-128`($bp),%xmm3 206 por %xmm2,%xmm0 207 por %xmm3,%xmm1 208___ 209for($k=0;$k<$STRIDE/16-4;$k+=4) { 210$code.=<<___; 211 movdqa `16*($k+0)-128`($bp),%xmm4 212 movdqa `16*($k+1)-128`($bp),%xmm5 213 movdqa `16*($k+2)-128`($bp),%xmm2 214 pand `16*($k+0)+112`(%r10),%xmm4 215 movdqa `16*($k+3)-128`($bp),%xmm3 216 pand `16*($k+1)+112`(%r10),%xmm5 217 por %xmm4,%xmm0 218 pand `16*($k+2)+112`(%r10),%xmm2 219 por %xmm5,%xmm1 220 pand `16*($k+3)+112`(%r10),%xmm3 221 por %xmm2,%xmm0 222 por %xmm3,%xmm1 223___ 224} 225$code.=<<___; 226 por %xmm1,%xmm0 227 pshufd \$0x4e,%xmm0,%xmm1 228 por %xmm1,%xmm0 229 lea $STRIDE($bp),$bp 230 movq %xmm0,$m0 # m0=bp[0] 231 232 mov ($n0),$n0 # pull n0[0] value 233 mov ($ap),%rax 234 235 xor $i,$i # i=0 236 xor $j,$j # j=0 237 238 mov $n0,$m1 239 mulq $m0 # ap[0]*bp[0] 240 mov %rax,$lo0 241 mov ($np),%rax 242 243 imulq $lo0,$m1 # "tp[0]"*n0 244 mov %rdx,$hi0 245 246 mulq $m1 # np[0]*m1 247 add %rax,$lo0 # discarded 248 mov 8($ap),%rax 249 adc \$0,%rdx 250 mov %rdx,$hi1 251 252 lea 1($j),$j # j++ 253 jmp .L1st_enter 254 255.align 16 256.L1st: 257 add %rax,$hi1 258 mov ($ap,$j,8),%rax 259 adc \$0,%rdx 260 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 261 mov $lo0,$hi0 262 adc \$0,%rdx 263 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 264 mov %rdx,$hi1 265 266.L1st_enter: 267 mulq $m0 # ap[j]*bp[0] 268 add %rax,$hi0 269 mov ($np,$j,8),%rax 270 adc \$0,%rdx 271 lea 1($j),$j # j++ 272 mov %rdx,$lo0 273 274 mulq $m1 # np[j]*m1 275 cmp $num,$j 276 jne .L1st # note that upon exit $j==$num, so 277 # they can be used interchangeably 278 279 add %rax,$hi1 280 adc \$0,%rdx 281 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 282 adc \$0,%rdx 283 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 284 mov %rdx,$hi1 285 mov $lo0,$hi0 286 287 xor %rdx,%rdx 288 add $hi0,$hi1 289 adc \$0,%rdx 290 mov $hi1,-8(%rsp,$num,8) 291 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 292 293 lea 1($i),$i # i++ 294 jmp .Louter 295.align 16 296.Louter: 297 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 298 and \$-16,%rdx 299 pxor %xmm4,%xmm4 300 pxor %xmm5,%xmm5 301___ 302for($k=0;$k<$STRIDE/16;$k+=4) { 303$code.=<<___; 304 movdqa `16*($k+0)-128`($bp),%xmm0 305 movdqa `16*($k+1)-128`($bp),%xmm1 306 movdqa `16*($k+2)-128`($bp),%xmm2 307 movdqa `16*($k+3)-128`($bp),%xmm3 308 pand `16*($k+0)-128`(%rdx),%xmm0 309 pand `16*($k+1)-128`(%rdx),%xmm1 310 por %xmm0,%xmm4 311 pand `16*($k+2)-128`(%rdx),%xmm2 312 por %xmm1,%xmm5 313 pand `16*($k+3)-128`(%rdx),%xmm3 314 por %xmm2,%xmm4 315 por %xmm3,%xmm5 316___ 317} 318$code.=<<___; 319 por %xmm5,%xmm4 320 pshufd \$0x4e,%xmm4,%xmm0 321 por %xmm4,%xmm0 322 lea $STRIDE($bp),$bp 323 324 mov ($ap),%rax # ap[0] 325 movq %xmm0,$m0 # m0=bp[i] 326 327 xor $j,$j # j=0 328 mov $n0,$m1 329 mov (%rsp),$lo0 330 331 mulq $m0 # ap[0]*bp[i] 332 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 333 mov ($np),%rax 334 adc \$0,%rdx 335 336 imulq $lo0,$m1 # tp[0]*n0 337 mov %rdx,$hi0 338 339 mulq $m1 # np[0]*m1 340 add %rax,$lo0 # discarded 341 mov 8($ap),%rax 342 adc \$0,%rdx 343 mov 8(%rsp),$lo0 # tp[1] 344 mov %rdx,$hi1 345 346 lea 1($j),$j # j++ 347 jmp .Linner_enter 348 349.align 16 350.Linner: 351 add %rax,$hi1 352 mov ($ap,$j,8),%rax 353 adc \$0,%rdx 354 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 355 mov (%rsp,$j,8),$lo0 356 adc \$0,%rdx 357 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 358 mov %rdx,$hi1 359 360.Linner_enter: 361 mulq $m0 # ap[j]*bp[i] 362 add %rax,$hi0 363 mov ($np,$j,8),%rax 364 adc \$0,%rdx 365 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 366 mov %rdx,$hi0 367 adc \$0,$hi0 368 lea 1($j),$j # j++ 369 370 mulq $m1 # np[j]*m1 371 cmp $num,$j 372 jne .Linner # note that upon exit $j==$num, so 373 # they can be used interchangeably 374 add %rax,$hi1 375 adc \$0,%rdx 376 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 377 mov (%rsp,$num,8),$lo0 378 adc \$0,%rdx 379 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 380 mov %rdx,$hi1 381 382 xor %rdx,%rdx 383 add $hi0,$hi1 384 adc \$0,%rdx 385 add $lo0,$hi1 # pull upmost overflow bit 386 adc \$0,%rdx 387 mov $hi1,-8(%rsp,$num,8) 388 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 389 390 lea 1($i),$i # i++ 391 cmp $num,$i 392 jb .Louter 393 394 xor $i,$i # i=0 and clear CF! 395 mov (%rsp),%rax # tp[0] 396 lea (%rsp),$ap # borrow ap for tp 397 mov $num,$j # j=num 398 jmp .Lsub 399.align 16 400.Lsub: sbb ($np,$i,8),%rax 401 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 402 mov 8($ap,$i,8),%rax # tp[i+1] 403 lea 1($i),$i # i++ 404 dec $j # doesnn't affect CF! 405 jnz .Lsub 406 407 sbb \$0,%rax # handle upmost overflow bit 408 xor $i,$i 409 and %rax,$ap 410 not %rax 411 mov $rp,$np 412 and %rax,$np 413 mov $num,$j # j=num 414 or $np,$ap # ap=borrow?tp:rp 415.align 16 416.Lcopy: # copy or in-place refresh 417 mov ($ap,$i,8),%rax 418 mov $i,(%rsp,$i,8) # zap temporary vector 419 mov %rax,($rp,$i,8) # rp[i]=tp[i] 420 lea 1($i),$i 421 sub \$1,$j 422 jnz .Lcopy 423 424 mov 8(%rsp,$num,8),%rsi # restore %rsp 425 mov \$1,%rax 426 427 mov -48(%rsi),%r15 428 mov -40(%rsi),%r14 429 mov -32(%rsi),%r13 430 mov -24(%rsi),%r12 431 mov -16(%rsi),%rbp 432 mov -8(%rsi),%rbx 433 lea (%rsi),%rsp 434.Lmul_epilogue: 435 ret 436.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 437___ 438{{{ 439my @A=("%r10","%r11"); 440my @N=("%r13","%rdi"); 441$code.=<<___; 442.type bn_mul4x_mont_gather5,\@function,6 443.align 32 444bn_mul4x_mont_gather5: 445 .byte 0x67 446 mov %rsp,%rax 447.Lmul4x_enter: 448___ 449$code.=<<___ if ($addx); 450 and \$0x80108,%r11d 451 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 452 je .Lmulx4x_enter 453___ 454$code.=<<___; 455 push %rbx 456 push %rbp 457 push %r12 458 push %r13 459 push %r14 460 push %r15 461.Lmul4x_prologue: 462 463 .byte 0x67 464 shl \$3,${num}d # convert $num to bytes 465 lea ($num,$num,2),%r10 # 3*$num in bytes 466 neg $num # -$num 467 468 ############################################################## 469 # Ensure that stack frame doesn't alias with $rptr+3*$num 470 # modulo 4096, which covers ret[num], am[num] and n[num] 471 # (see bn_exp.c). This is done to allow memory disambiguation 472 # logic do its magic. [Extra [num] is allocated in order 473 # to align with bn_power5's frame, which is cleansed after 474 # completing exponentiation. Extra 256 bytes is for power mask 475 # calculated from 7th argument, the index.] 476 # 477 lea -320(%rsp,$num,2),%r11 478 mov %rsp,%rbp 479 sub $rp,%r11 480 and \$4095,%r11 481 cmp %r11,%r10 482 jb .Lmul4xsp_alt 483 sub %r11,%rbp # align with $rp 484 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 485 jmp .Lmul4xsp_done 486 487.align 32 488.Lmul4xsp_alt: 489 lea 4096-320(,$num,2),%r10 490 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 491 sub %r10,%r11 492 mov \$0,%r10 493 cmovc %r10,%r11 494 sub %r11,%rbp 495.Lmul4xsp_done: 496 and \$-64,%rbp 497 mov %rsp,%r11 498 sub %rbp,%r11 499 and \$-4096,%r11 500 lea (%rbp,%r11),%rsp 501 mov (%rsp),%r10 502 cmp %rbp,%rsp 503 ja .Lmul4x_page_walk 504 jmp .Lmul4x_page_walk_done 505 506.Lmul4x_page_walk: 507 lea -4096(%rsp),%rsp 508 mov (%rsp),%r10 509 cmp %rbp,%rsp 510 ja .Lmul4x_page_walk 511.Lmul4x_page_walk_done: 512 513 neg $num 514 515 mov %rax,40(%rsp) 516.Lmul4x_body: 517 518 call mul4x_internal 519 520 mov 40(%rsp),%rsi # restore %rsp 521 mov \$1,%rax 522 523 mov -48(%rsi),%r15 524 mov -40(%rsi),%r14 525 mov -32(%rsi),%r13 526 mov -24(%rsi),%r12 527 mov -16(%rsi),%rbp 528 mov -8(%rsi),%rbx 529 lea (%rsi),%rsp 530.Lmul4x_epilogue: 531 ret 532.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 533 534.type mul4x_internal,\@abi-omnipotent 535.align 32 536mul4x_internal: 537 shl \$5,$num # $num was in bytes 538 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 539 lea .Linc(%rip),%rax 540 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 541 shr \$5,$num # restore $num 542___ 543 $bp="%r12"; 544 $STRIDE=2**5*8; # 5 is "window size" 545 $N=$STRIDE/4; # should match cache line size 546 $tp=$i; 547$code.=<<___; 548 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 549 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 550 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 551 lea 128(%rdx),$bp # size optimization 552 553 pshufd \$0,%xmm5,%xmm5 # broadcast index 554 movdqa %xmm1,%xmm4 555 .byte 0x67,0x67 556 movdqa %xmm1,%xmm2 557___ 558######################################################################## 559# calculate mask by comparing 0..31 to index and save result to stack 560# 561$code.=<<___; 562 paddd %xmm0,%xmm1 563 pcmpeqd %xmm5,%xmm0 # compare to 1,0 564 .byte 0x67 565 movdqa %xmm4,%xmm3 566___ 567for($i=0;$i<$STRIDE/16-4;$i+=4) { 568$code.=<<___; 569 paddd %xmm1,%xmm2 570 pcmpeqd %xmm5,%xmm1 # compare to 3,2 571 movdqa %xmm0,`16*($i+0)+112`(%r10) 572 movdqa %xmm4,%xmm0 573 574 paddd %xmm2,%xmm3 575 pcmpeqd %xmm5,%xmm2 # compare to 5,4 576 movdqa %xmm1,`16*($i+1)+112`(%r10) 577 movdqa %xmm4,%xmm1 578 579 paddd %xmm3,%xmm0 580 pcmpeqd %xmm5,%xmm3 # compare to 7,6 581 movdqa %xmm2,`16*($i+2)+112`(%r10) 582 movdqa %xmm4,%xmm2 583 584 paddd %xmm0,%xmm1 585 pcmpeqd %xmm5,%xmm0 586 movdqa %xmm3,`16*($i+3)+112`(%r10) 587 movdqa %xmm4,%xmm3 588___ 589} 590$code.=<<___; # last iteration can be optimized 591 paddd %xmm1,%xmm2 592 pcmpeqd %xmm5,%xmm1 593 movdqa %xmm0,`16*($i+0)+112`(%r10) 594 595 paddd %xmm2,%xmm3 596 .byte 0x67 597 pcmpeqd %xmm5,%xmm2 598 movdqa %xmm1,`16*($i+1)+112`(%r10) 599 600 pcmpeqd %xmm5,%xmm3 601 movdqa %xmm2,`16*($i+2)+112`(%r10) 602 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 603 604 pand `16*($i+1)-128`($bp),%xmm1 605 pand `16*($i+2)-128`($bp),%xmm2 606 movdqa %xmm3,`16*($i+3)+112`(%r10) 607 pand `16*($i+3)-128`($bp),%xmm3 608 por %xmm2,%xmm0 609 por %xmm3,%xmm1 610___ 611for($i=0;$i<$STRIDE/16-4;$i+=4) { 612$code.=<<___; 613 movdqa `16*($i+0)-128`($bp),%xmm4 614 movdqa `16*($i+1)-128`($bp),%xmm5 615 movdqa `16*($i+2)-128`($bp),%xmm2 616 pand `16*($i+0)+112`(%r10),%xmm4 617 movdqa `16*($i+3)-128`($bp),%xmm3 618 pand `16*($i+1)+112`(%r10),%xmm5 619 por %xmm4,%xmm0 620 pand `16*($i+2)+112`(%r10),%xmm2 621 por %xmm5,%xmm1 622 pand `16*($i+3)+112`(%r10),%xmm3 623 por %xmm2,%xmm0 624 por %xmm3,%xmm1 625___ 626} 627$code.=<<___; 628 por %xmm1,%xmm0 629 pshufd \$0x4e,%xmm0,%xmm1 630 por %xmm1,%xmm0 631 lea $STRIDE($bp),$bp 632 movq %xmm0,$m0 # m0=bp[0] 633 634 mov %r13,16+8(%rsp) # save end of b[num] 635 mov $rp, 56+8(%rsp) # save $rp 636 637 mov ($n0),$n0 # pull n0[0] value 638 mov ($ap),%rax 639 lea ($ap,$num),$ap # end of a[num] 640 neg $num 641 642 mov $n0,$m1 643 mulq $m0 # ap[0]*bp[0] 644 mov %rax,$A[0] 645 mov ($np),%rax 646 647 imulq $A[0],$m1 # "tp[0]"*n0 648 lea 64+8(%rsp),$tp 649 mov %rdx,$A[1] 650 651 mulq $m1 # np[0]*m1 652 add %rax,$A[0] # discarded 653 mov 8($ap,$num),%rax 654 adc \$0,%rdx 655 mov %rdx,$N[1] 656 657 mulq $m0 658 add %rax,$A[1] 659 mov 8*1($np),%rax 660 adc \$0,%rdx 661 mov %rdx,$A[0] 662 663 mulq $m1 664 add %rax,$N[1] 665 mov 16($ap,$num),%rax 666 adc \$0,%rdx 667 add $A[1],$N[1] 668 lea 4*8($num),$j # j=4 669 lea 8*4($np),$np 670 adc \$0,%rdx 671 mov $N[1],($tp) 672 mov %rdx,$N[0] 673 jmp .L1st4x 674 675.align 32 676.L1st4x: 677 mulq $m0 # ap[j]*bp[0] 678 add %rax,$A[0] 679 mov -8*2($np),%rax 680 lea 32($tp),$tp 681 adc \$0,%rdx 682 mov %rdx,$A[1] 683 684 mulq $m1 # np[j]*m1 685 add %rax,$N[0] 686 mov -8($ap,$j),%rax 687 adc \$0,%rdx 688 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 689 adc \$0,%rdx 690 mov $N[0],-24($tp) # tp[j-1] 691 mov %rdx,$N[1] 692 693 mulq $m0 # ap[j]*bp[0] 694 add %rax,$A[1] 695 mov -8*1($np),%rax 696 adc \$0,%rdx 697 mov %rdx,$A[0] 698 699 mulq $m1 # np[j]*m1 700 add %rax,$N[1] 701 mov ($ap,$j),%rax 702 adc \$0,%rdx 703 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 704 adc \$0,%rdx 705 mov $N[1],-16($tp) # tp[j-1] 706 mov %rdx,$N[0] 707 708 mulq $m0 # ap[j]*bp[0] 709 add %rax,$A[0] 710 mov 8*0($np),%rax 711 adc \$0,%rdx 712 mov %rdx,$A[1] 713 714 mulq $m1 # np[j]*m1 715 add %rax,$N[0] 716 mov 8($ap,$j),%rax 717 adc \$0,%rdx 718 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 719 adc \$0,%rdx 720 mov $N[0],-8($tp) # tp[j-1] 721 mov %rdx,$N[1] 722 723 mulq $m0 # ap[j]*bp[0] 724 add %rax,$A[1] 725 mov 8*1($np),%rax 726 adc \$0,%rdx 727 mov %rdx,$A[0] 728 729 mulq $m1 # np[j]*m1 730 add %rax,$N[1] 731 mov 16($ap,$j),%rax 732 adc \$0,%rdx 733 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 734 lea 8*4($np),$np 735 adc \$0,%rdx 736 mov $N[1],($tp) # tp[j-1] 737 mov %rdx,$N[0] 738 739 add \$32,$j # j+=4 740 jnz .L1st4x 741 742 mulq $m0 # ap[j]*bp[0] 743 add %rax,$A[0] 744 mov -8*2($np),%rax 745 lea 32($tp),$tp 746 adc \$0,%rdx 747 mov %rdx,$A[1] 748 749 mulq $m1 # np[j]*m1 750 add %rax,$N[0] 751 mov -8($ap),%rax 752 adc \$0,%rdx 753 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 754 adc \$0,%rdx 755 mov $N[0],-24($tp) # tp[j-1] 756 mov %rdx,$N[1] 757 758 mulq $m0 # ap[j]*bp[0] 759 add %rax,$A[1] 760 mov -8*1($np),%rax 761 adc \$0,%rdx 762 mov %rdx,$A[0] 763 764 mulq $m1 # np[j]*m1 765 add %rax,$N[1] 766 mov ($ap,$num),%rax # ap[0] 767 adc \$0,%rdx 768 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 769 adc \$0,%rdx 770 mov $N[1],-16($tp) # tp[j-1] 771 mov %rdx,$N[0] 772 773 lea ($np,$num),$np # rewind $np 774 775 xor $N[1],$N[1] 776 add $A[0],$N[0] 777 adc \$0,$N[1] 778 mov $N[0],-8($tp) 779 780 jmp .Louter4x 781 782.align 32 783.Louter4x: 784 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 785 pxor %xmm4,%xmm4 786 pxor %xmm5,%xmm5 787___ 788for($i=0;$i<$STRIDE/16;$i+=4) { 789$code.=<<___; 790 movdqa `16*($i+0)-128`($bp),%xmm0 791 movdqa `16*($i+1)-128`($bp),%xmm1 792 movdqa `16*($i+2)-128`($bp),%xmm2 793 movdqa `16*($i+3)-128`($bp),%xmm3 794 pand `16*($i+0)-128`(%rdx),%xmm0 795 pand `16*($i+1)-128`(%rdx),%xmm1 796 por %xmm0,%xmm4 797 pand `16*($i+2)-128`(%rdx),%xmm2 798 por %xmm1,%xmm5 799 pand `16*($i+3)-128`(%rdx),%xmm3 800 por %xmm2,%xmm4 801 por %xmm3,%xmm5 802___ 803} 804$code.=<<___; 805 por %xmm5,%xmm4 806 pshufd \$0x4e,%xmm4,%xmm0 807 por %xmm4,%xmm0 808 lea $STRIDE($bp),$bp 809 movq %xmm0,$m0 # m0=bp[i] 810 811 mov ($tp,$num),$A[0] 812 mov $n0,$m1 813 mulq $m0 # ap[0]*bp[i] 814 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 815 mov ($np),%rax 816 adc \$0,%rdx 817 818 imulq $A[0],$m1 # tp[0]*n0 819 mov %rdx,$A[1] 820 mov $N[1],($tp) # store upmost overflow bit 821 822 lea ($tp,$num),$tp # rewind $tp 823 824 mulq $m1 # np[0]*m1 825 add %rax,$A[0] # "$N[0]", discarded 826 mov 8($ap,$num),%rax 827 adc \$0,%rdx 828 mov %rdx,$N[1] 829 830 mulq $m0 # ap[j]*bp[i] 831 add %rax,$A[1] 832 mov 8*1($np),%rax 833 adc \$0,%rdx 834 add 8($tp),$A[1] # +tp[1] 835 adc \$0,%rdx 836 mov %rdx,$A[0] 837 838 mulq $m1 # np[j]*m1 839 add %rax,$N[1] 840 mov 16($ap,$num),%rax 841 adc \$0,%rdx 842 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 843 lea 4*8($num),$j # j=4 844 lea 8*4($np),$np 845 adc \$0,%rdx 846 mov %rdx,$N[0] 847 jmp .Linner4x 848 849.align 32 850.Linner4x: 851 mulq $m0 # ap[j]*bp[i] 852 add %rax,$A[0] 853 mov -8*2($np),%rax 854 adc \$0,%rdx 855 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 856 lea 32($tp),$tp 857 adc \$0,%rdx 858 mov %rdx,$A[1] 859 860 mulq $m1 # np[j]*m1 861 add %rax,$N[0] 862 mov -8($ap,$j),%rax 863 adc \$0,%rdx 864 add $A[0],$N[0] 865 adc \$0,%rdx 866 mov $N[1],-32($tp) # tp[j-1] 867 mov %rdx,$N[1] 868 869 mulq $m0 # ap[j]*bp[i] 870 add %rax,$A[1] 871 mov -8*1($np),%rax 872 adc \$0,%rdx 873 add -8($tp),$A[1] 874 adc \$0,%rdx 875 mov %rdx,$A[0] 876 877 mulq $m1 # np[j]*m1 878 add %rax,$N[1] 879 mov ($ap,$j),%rax 880 adc \$0,%rdx 881 add $A[1],$N[1] 882 adc \$0,%rdx 883 mov $N[0],-24($tp) # tp[j-1] 884 mov %rdx,$N[0] 885 886 mulq $m0 # ap[j]*bp[i] 887 add %rax,$A[0] 888 mov 8*0($np),%rax 889 adc \$0,%rdx 890 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 891 adc \$0,%rdx 892 mov %rdx,$A[1] 893 894 mulq $m1 # np[j]*m1 895 add %rax,$N[0] 896 mov 8($ap,$j),%rax 897 adc \$0,%rdx 898 add $A[0],$N[0] 899 adc \$0,%rdx 900 mov $N[1],-16($tp) # tp[j-1] 901 mov %rdx,$N[1] 902 903 mulq $m0 # ap[j]*bp[i] 904 add %rax,$A[1] 905 mov 8*1($np),%rax 906 adc \$0,%rdx 907 add 8($tp),$A[1] 908 adc \$0,%rdx 909 mov %rdx,$A[0] 910 911 mulq $m1 # np[j]*m1 912 add %rax,$N[1] 913 mov 16($ap,$j),%rax 914 adc \$0,%rdx 915 add $A[1],$N[1] 916 lea 8*4($np),$np 917 adc \$0,%rdx 918 mov $N[0],-8($tp) # tp[j-1] 919 mov %rdx,$N[0] 920 921 add \$32,$j # j+=4 922 jnz .Linner4x 923 924 mulq $m0 # ap[j]*bp[i] 925 add %rax,$A[0] 926 mov -8*2($np),%rax 927 adc \$0,%rdx 928 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 929 lea 32($tp),$tp 930 adc \$0,%rdx 931 mov %rdx,$A[1] 932 933 mulq $m1 # np[j]*m1 934 add %rax,$N[0] 935 mov -8($ap),%rax 936 adc \$0,%rdx 937 add $A[0],$N[0] 938 adc \$0,%rdx 939 mov $N[1],-32($tp) # tp[j-1] 940 mov %rdx,$N[1] 941 942 mulq $m0 # ap[j]*bp[i] 943 add %rax,$A[1] 944 mov $m1,%rax 945 mov -8*1($np),$m1 946 adc \$0,%rdx 947 add -8($tp),$A[1] 948 adc \$0,%rdx 949 mov %rdx,$A[0] 950 951 mulq $m1 # np[j]*m1 952 add %rax,$N[1] 953 mov ($ap,$num),%rax # ap[0] 954 adc \$0,%rdx 955 add $A[1],$N[1] 956 adc \$0,%rdx 957 mov $N[0],-24($tp) # tp[j-1] 958 mov %rdx,$N[0] 959 960 mov $N[1],-16($tp) # tp[j-1] 961 lea ($np,$num),$np # rewind $np 962 963 xor $N[1],$N[1] 964 add $A[0],$N[0] 965 adc \$0,$N[1] 966 add ($tp),$N[0] # pull upmost overflow bit 967 adc \$0,$N[1] # upmost overflow bit 968 mov $N[0],-8($tp) 969 970 cmp 16+8(%rsp),$bp 971 jb .Louter4x 972___ 973if (1) { 974$code.=<<___; 975 xor %rax,%rax 976 sub $N[0],$m1 # compare top-most words 977 adc $j,$j # $j is zero 978 or $j,$N[1] 979 sub $N[1],%rax # %rax=-$N[1] 980 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 981 mov ($np),%r12 982 lea ($np),%rbp # nptr in .sqr4x_sub 983 mov %r9,%rcx 984 sar \$3+2,%rcx 985 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 986 dec %r12 # so that after 'not' we get -n[0] 987 xor %r10,%r10 988 mov 8*1(%rbp),%r13 989 mov 8*2(%rbp),%r14 990 mov 8*3(%rbp),%r15 991 jmp .Lsqr4x_sub_entry 992___ 993} else { 994my @ri=("%rax",$bp,$m0,$m1); 995my $rp="%rdx"; 996$code.=<<___ 997 xor \$1,$N[1] 998 lea ($tp,$num),$tp # rewind $tp 999 sar \$5,$num # cf=0 1000 lea ($np,$N[1],8),$np 1001 mov 56+8(%rsp),$rp # restore $rp 1002 jmp .Lsub4x 1003 1004.align 32 1005.Lsub4x: 1006 .byte 0x66 1007 mov 8*0($tp),@ri[0] 1008 mov 8*1($tp),@ri[1] 1009 .byte 0x66 1010 sbb 16*0($np),@ri[0] 1011 mov 8*2($tp),@ri[2] 1012 sbb 16*1($np),@ri[1] 1013 mov 3*8($tp),@ri[3] 1014 lea 4*8($tp),$tp 1015 sbb 16*2($np),@ri[2] 1016 mov @ri[0],8*0($rp) 1017 sbb 16*3($np),@ri[3] 1018 lea 16*4($np),$np 1019 mov @ri[1],8*1($rp) 1020 mov @ri[2],8*2($rp) 1021 mov @ri[3],8*3($rp) 1022 lea 8*4($rp),$rp 1023 1024 inc $num 1025 jnz .Lsub4x 1026 1027 ret 1028___ 1029} 1030$code.=<<___; 1031.size mul4x_internal,.-mul4x_internal 1032___ 1033}}} 1034{{{ 1035###################################################################### 1036# void bn_power5( 1037my $rptr="%rdi"; # BN_ULONG *rptr, 1038my $aptr="%rsi"; # const BN_ULONG *aptr, 1039my $bptr="%rdx"; # const void *table, 1040my $nptr="%rcx"; # const BN_ULONG *nptr, 1041my $n0 ="%r8"; # const BN_ULONG *n0); 1042my $num ="%r9"; # int num, has to be divisible by 8 1043 # int pwr 1044 1045my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1046my @A0=("%r10","%r11"); 1047my @A1=("%r12","%r13"); 1048my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1049 1050$code.=<<___; 1051.globl bn_power5 1052.type bn_power5,\@function,6 1053.align 32 1054bn_power5: 1055 mov %rsp,%rax 1056___ 1057$code.=<<___ if ($addx); 1058 mov OPENSSL_ia32cap_P+8(%rip),%r11d 1059 and \$0x80108,%r11d 1060 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1061 je .Lpowerx5_enter 1062___ 1063$code.=<<___; 1064 push %rbx 1065 push %rbp 1066 push %r12 1067 push %r13 1068 push %r14 1069 push %r15 1070.Lpower5_prologue: 1071 1072 shl \$3,${num}d # convert $num to bytes 1073 lea ($num,$num,2),%r10d # 3*$num 1074 neg $num 1075 mov ($n0),$n0 # *n0 1076 1077 ############################################################## 1078 # Ensure that stack frame doesn't alias with $rptr+3*$num 1079 # modulo 4096, which covers ret[num], am[num] and n[num] 1080 # (see bn_exp.c). This is done to allow memory disambiguation 1081 # logic do its magic. [Extra 256 bytes is for power mask 1082 # calculated from 7th argument, the index.] 1083 # 1084 lea -320(%rsp,$num,2),%r11 1085 mov %rsp,%rbp 1086 sub $rptr,%r11 1087 and \$4095,%r11 1088 cmp %r11,%r10 1089 jb .Lpwr_sp_alt 1090 sub %r11,%rbp # align with $aptr 1091 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1092 jmp .Lpwr_sp_done 1093 1094.align 32 1095.Lpwr_sp_alt: 1096 lea 4096-320(,$num,2),%r10 1097 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1098 sub %r10,%r11 1099 mov \$0,%r10 1100 cmovc %r10,%r11 1101 sub %r11,%rbp 1102.Lpwr_sp_done: 1103 and \$-64,%rbp 1104 mov %rsp,%r11 1105 sub %rbp,%r11 1106 and \$-4096,%r11 1107 lea (%rbp,%r11),%rsp 1108 mov (%rsp),%r10 1109 cmp %rbp,%rsp 1110 ja .Lpwr_page_walk 1111 jmp .Lpwr_page_walk_done 1112 1113.Lpwr_page_walk: 1114 lea -4096(%rsp),%rsp 1115 mov (%rsp),%r10 1116 cmp %rbp,%rsp 1117 ja .Lpwr_page_walk 1118.Lpwr_page_walk_done: 1119 1120 mov $num,%r10 1121 neg $num 1122 1123 ############################################################## 1124 # Stack layout 1125 # 1126 # +0 saved $num, used in reduction section 1127 # +8 &t[2*$num], used in reduction section 1128 # +32 saved *n0 1129 # +40 saved %rsp 1130 # +48 t[2*$num] 1131 # 1132 mov $n0, 32(%rsp) 1133 mov %rax, 40(%rsp) # save original %rsp 1134.Lpower5_body: 1135 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1136 movq $nptr,%xmm2 # save $nptr 1137 movq %r10, %xmm3 # -$num, used in sqr8x 1138 movq $bptr,%xmm4 1139 1140 call __bn_sqr8x_internal 1141 call __bn_post4x_internal 1142 call __bn_sqr8x_internal 1143 call __bn_post4x_internal 1144 call __bn_sqr8x_internal 1145 call __bn_post4x_internal 1146 call __bn_sqr8x_internal 1147 call __bn_post4x_internal 1148 call __bn_sqr8x_internal 1149 call __bn_post4x_internal 1150 1151 movq %xmm2,$nptr 1152 movq %xmm4,$bptr 1153 mov $aptr,$rptr 1154 mov 40(%rsp),%rax 1155 lea 32(%rsp),$n0 1156 1157 call mul4x_internal 1158 1159 mov 40(%rsp),%rsi # restore %rsp 1160 mov \$1,%rax 1161 mov -48(%rsi),%r15 1162 mov -40(%rsi),%r14 1163 mov -32(%rsi),%r13 1164 mov -24(%rsi),%r12 1165 mov -16(%rsi),%rbp 1166 mov -8(%rsi),%rbx 1167 lea (%rsi),%rsp 1168.Lpower5_epilogue: 1169 ret 1170.size bn_power5,.-bn_power5 1171 1172.globl bn_sqr8x_internal 1173.hidden bn_sqr8x_internal 1174.type bn_sqr8x_internal,\@abi-omnipotent 1175.align 32 1176bn_sqr8x_internal: 1177__bn_sqr8x_internal: 1178 ############################################################## 1179 # Squaring part: 1180 # 1181 # a) multiply-n-add everything but a[i]*a[i]; 1182 # b) shift result of a) by 1 to the left and accumulate 1183 # a[i]*a[i] products; 1184 # 1185 ############################################################## 1186 # a[1]a[0] 1187 # a[2]a[0] 1188 # a[3]a[0] 1189 # a[2]a[1] 1190 # a[4]a[0] 1191 # a[3]a[1] 1192 # a[5]a[0] 1193 # a[4]a[1] 1194 # a[3]a[2] 1195 # a[6]a[0] 1196 # a[5]a[1] 1197 # a[4]a[2] 1198 # a[7]a[0] 1199 # a[6]a[1] 1200 # a[5]a[2] 1201 # a[4]a[3] 1202 # a[7]a[1] 1203 # a[6]a[2] 1204 # a[5]a[3] 1205 # a[7]a[2] 1206 # a[6]a[3] 1207 # a[5]a[4] 1208 # a[7]a[3] 1209 # a[6]a[4] 1210 # a[7]a[4] 1211 # a[6]a[5] 1212 # a[7]a[5] 1213 # a[7]a[6] 1214 # a[1]a[0] 1215 # a[2]a[0] 1216 # a[3]a[0] 1217 # a[4]a[0] 1218 # a[5]a[0] 1219 # a[6]a[0] 1220 # a[7]a[0] 1221 # a[2]a[1] 1222 # a[3]a[1] 1223 # a[4]a[1] 1224 # a[5]a[1] 1225 # a[6]a[1] 1226 # a[7]a[1] 1227 # a[3]a[2] 1228 # a[4]a[2] 1229 # a[5]a[2] 1230 # a[6]a[2] 1231 # a[7]a[2] 1232 # a[4]a[3] 1233 # a[5]a[3] 1234 # a[6]a[3] 1235 # a[7]a[3] 1236 # a[5]a[4] 1237 # a[6]a[4] 1238 # a[7]a[4] 1239 # a[6]a[5] 1240 # a[7]a[5] 1241 # a[7]a[6] 1242 # a[0]a[0] 1243 # a[1]a[1] 1244 # a[2]a[2] 1245 # a[3]a[3] 1246 # a[4]a[4] 1247 # a[5]a[5] 1248 # a[6]a[6] 1249 # a[7]a[7] 1250 1251 lea 32(%r10),$i # $i=-($num-32) 1252 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1253 1254 mov $num,$j # $j=$num 1255 1256 # comments apply to $num==8 case 1257 mov -32($aptr,$i),$a0 # a[0] 1258 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1259 mov -24($aptr,$i),%rax # a[1] 1260 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1261 mov -16($aptr,$i),$ai # a[2] 1262 mov %rax,$a1 1263 1264 mul $a0 # a[1]*a[0] 1265 mov %rax,$A0[0] # a[1]*a[0] 1266 mov $ai,%rax # a[2] 1267 mov %rdx,$A0[1] 1268 mov $A0[0],-24($tptr,$i) # t[1] 1269 1270 mul $a0 # a[2]*a[0] 1271 add %rax,$A0[1] 1272 mov $ai,%rax 1273 adc \$0,%rdx 1274 mov $A0[1],-16($tptr,$i) # t[2] 1275 mov %rdx,$A0[0] 1276 1277 1278 mov -8($aptr,$i),$ai # a[3] 1279 mul $a1 # a[2]*a[1] 1280 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1281 mov $ai,%rax 1282 mov %rdx,$A1[1] 1283 1284 lea ($i),$j 1285 mul $a0 # a[3]*a[0] 1286 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1287 mov $ai,%rax 1288 mov %rdx,$A0[1] 1289 adc \$0,$A0[1] 1290 add $A1[0],$A0[0] 1291 adc \$0,$A0[1] 1292 mov $A0[0],-8($tptr,$j) # t[3] 1293 jmp .Lsqr4x_1st 1294 1295.align 32 1296.Lsqr4x_1st: 1297 mov ($aptr,$j),$ai # a[4] 1298 mul $a1 # a[3]*a[1] 1299 add %rax,$A1[1] # a[3]*a[1]+t[4] 1300 mov $ai,%rax 1301 mov %rdx,$A1[0] 1302 adc \$0,$A1[0] 1303 1304 mul $a0 # a[4]*a[0] 1305 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1306 mov $ai,%rax # a[3] 1307 mov 8($aptr,$j),$ai # a[5] 1308 mov %rdx,$A0[0] 1309 adc \$0,$A0[0] 1310 add $A1[1],$A0[1] 1311 adc \$0,$A0[0] 1312 1313 1314 mul $a1 # a[4]*a[3] 1315 add %rax,$A1[0] # a[4]*a[3]+t[5] 1316 mov $ai,%rax 1317 mov $A0[1],($tptr,$j) # t[4] 1318 mov %rdx,$A1[1] 1319 adc \$0,$A1[1] 1320 1321 mul $a0 # a[5]*a[2] 1322 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1323 mov $ai,%rax 1324 mov 16($aptr,$j),$ai # a[6] 1325 mov %rdx,$A0[1] 1326 adc \$0,$A0[1] 1327 add $A1[0],$A0[0] 1328 adc \$0,$A0[1] 1329 1330 mul $a1 # a[5]*a[3] 1331 add %rax,$A1[1] # a[5]*a[3]+t[6] 1332 mov $ai,%rax 1333 mov $A0[0],8($tptr,$j) # t[5] 1334 mov %rdx,$A1[0] 1335 adc \$0,$A1[0] 1336 1337 mul $a0 # a[6]*a[2] 1338 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1339 mov $ai,%rax # a[3] 1340 mov 24($aptr,$j),$ai # a[7] 1341 mov %rdx,$A0[0] 1342 adc \$0,$A0[0] 1343 add $A1[1],$A0[1] 1344 adc \$0,$A0[0] 1345 1346 1347 mul $a1 # a[6]*a[5] 1348 add %rax,$A1[0] # a[6]*a[5]+t[7] 1349 mov $ai,%rax 1350 mov $A0[1],16($tptr,$j) # t[6] 1351 mov %rdx,$A1[1] 1352 adc \$0,$A1[1] 1353 lea 32($j),$j 1354 1355 mul $a0 # a[7]*a[4] 1356 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1357 mov $ai,%rax 1358 mov %rdx,$A0[1] 1359 adc \$0,$A0[1] 1360 add $A1[0],$A0[0] 1361 adc \$0,$A0[1] 1362 mov $A0[0],-8($tptr,$j) # t[7] 1363 1364 cmp \$0,$j 1365 jne .Lsqr4x_1st 1366 1367 mul $a1 # a[7]*a[5] 1368 add %rax,$A1[1] 1369 lea 16($i),$i 1370 adc \$0,%rdx 1371 add $A0[1],$A1[1] 1372 adc \$0,%rdx 1373 1374 mov $A1[1],($tptr) # t[8] 1375 mov %rdx,$A1[0] 1376 mov %rdx,8($tptr) # t[9] 1377 jmp .Lsqr4x_outer 1378 1379.align 32 1380.Lsqr4x_outer: # comments apply to $num==6 case 1381 mov -32($aptr,$i),$a0 # a[0] 1382 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1383 mov -24($aptr,$i),%rax # a[1] 1384 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1385 mov -16($aptr,$i),$ai # a[2] 1386 mov %rax,$a1 1387 1388 mul $a0 # a[1]*a[0] 1389 mov -24($tptr,$i),$A0[0] # t[1] 1390 add %rax,$A0[0] # a[1]*a[0]+t[1] 1391 mov $ai,%rax # a[2] 1392 adc \$0,%rdx 1393 mov $A0[0],-24($tptr,$i) # t[1] 1394 mov %rdx,$A0[1] 1395 1396 mul $a0 # a[2]*a[0] 1397 add %rax,$A0[1] 1398 mov $ai,%rax 1399 adc \$0,%rdx 1400 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1401 mov %rdx,$A0[0] 1402 adc \$0,$A0[0] 1403 mov $A0[1],-16($tptr,$i) # t[2] 1404 1405 xor $A1[0],$A1[0] 1406 1407 mov -8($aptr,$i),$ai # a[3] 1408 mul $a1 # a[2]*a[1] 1409 add %rax,$A1[0] # a[2]*a[1]+t[3] 1410 mov $ai,%rax 1411 adc \$0,%rdx 1412 add -8($tptr,$i),$A1[0] 1413 mov %rdx,$A1[1] 1414 adc \$0,$A1[1] 1415 1416 mul $a0 # a[3]*a[0] 1417 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1418 mov $ai,%rax 1419 adc \$0,%rdx 1420 add $A1[0],$A0[0] 1421 mov %rdx,$A0[1] 1422 adc \$0,$A0[1] 1423 mov $A0[0],-8($tptr,$i) # t[3] 1424 1425 lea ($i),$j 1426 jmp .Lsqr4x_inner 1427 1428.align 32 1429.Lsqr4x_inner: 1430 mov ($aptr,$j),$ai # a[4] 1431 mul $a1 # a[3]*a[1] 1432 add %rax,$A1[1] # a[3]*a[1]+t[4] 1433 mov $ai,%rax 1434 mov %rdx,$A1[0] 1435 adc \$0,$A1[0] 1436 add ($tptr,$j),$A1[1] 1437 adc \$0,$A1[0] 1438 1439 .byte 0x67 1440 mul $a0 # a[4]*a[0] 1441 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1442 mov $ai,%rax # a[3] 1443 mov 8($aptr,$j),$ai # a[5] 1444 mov %rdx,$A0[0] 1445 adc \$0,$A0[0] 1446 add $A1[1],$A0[1] 1447 adc \$0,$A0[0] 1448 1449 mul $a1 # a[4]*a[3] 1450 add %rax,$A1[0] # a[4]*a[3]+t[5] 1451 mov $A0[1],($tptr,$j) # t[4] 1452 mov $ai,%rax 1453 mov %rdx,$A1[1] 1454 adc \$0,$A1[1] 1455 add 8($tptr,$j),$A1[0] 1456 lea 16($j),$j # j++ 1457 adc \$0,$A1[1] 1458 1459 mul $a0 # a[5]*a[2] 1460 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1461 mov $ai,%rax 1462 adc \$0,%rdx 1463 add $A1[0],$A0[0] 1464 mov %rdx,$A0[1] 1465 adc \$0,$A0[1] 1466 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1467 1468 cmp \$0,$j 1469 jne .Lsqr4x_inner 1470 1471 .byte 0x67 1472 mul $a1 # a[5]*a[3] 1473 add %rax,$A1[1] 1474 adc \$0,%rdx 1475 add $A0[1],$A1[1] 1476 adc \$0,%rdx 1477 1478 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1479 mov %rdx,$A1[0] 1480 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1481 1482 add \$16,$i 1483 jnz .Lsqr4x_outer 1484 1485 # comments apply to $num==4 case 1486 mov -32($aptr),$a0 # a[0] 1487 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1488 mov -24($aptr),%rax # a[1] 1489 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1490 mov -16($aptr),$ai # a[2] 1491 mov %rax,$a1 1492 1493 mul $a0 # a[1]*a[0] 1494 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1495 mov $ai,%rax # a[2] 1496 mov %rdx,$A0[1] 1497 adc \$0,$A0[1] 1498 1499 mul $a0 # a[2]*a[0] 1500 add %rax,$A0[1] 1501 mov $ai,%rax 1502 mov $A0[0],-24($tptr) # t[1] 1503 mov %rdx,$A0[0] 1504 adc \$0,$A0[0] 1505 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1506 mov -8($aptr),$ai # a[3] 1507 adc \$0,$A0[0] 1508 1509 mul $a1 # a[2]*a[1] 1510 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1511 mov $ai,%rax 1512 mov $A0[1],-16($tptr) # t[2] 1513 mov %rdx,$A1[1] 1514 adc \$0,$A1[1] 1515 1516 mul $a0 # a[3]*a[0] 1517 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1518 mov $ai,%rax 1519 mov %rdx,$A0[1] 1520 adc \$0,$A0[1] 1521 add $A1[0],$A0[0] 1522 adc \$0,$A0[1] 1523 mov $A0[0],-8($tptr) # t[3] 1524 1525 mul $a1 # a[3]*a[1] 1526 add %rax,$A1[1] 1527 mov -16($aptr),%rax # a[2] 1528 adc \$0,%rdx 1529 add $A0[1],$A1[1] 1530 adc \$0,%rdx 1531 1532 mov $A1[1],($tptr) # t[4] 1533 mov %rdx,$A1[0] 1534 mov %rdx,8($tptr) # t[5] 1535 1536 mul $ai # a[2]*a[3] 1537___ 1538{ 1539my ($shift,$carry)=($a0,$a1); 1540my @S=(@A1,$ai,$n0); 1541$code.=<<___; 1542 add \$16,$i 1543 xor $shift,$shift 1544 sub $num,$i # $i=16-$num 1545 xor $carry,$carry 1546 1547 add $A1[0],%rax # t[5] 1548 adc \$0,%rdx 1549 mov %rax,8($tptr) # t[5] 1550 mov %rdx,16($tptr) # t[6] 1551 mov $carry,24($tptr) # t[7] 1552 1553 mov -16($aptr,$i),%rax # a[0] 1554 lea 48+8(%rsp),$tptr 1555 xor $A0[0],$A0[0] # t[0] 1556 mov 8($tptr),$A0[1] # t[1] 1557 1558 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1559 shr \$63,$A0[0] 1560 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1561 shr \$63,$A0[1] 1562 or $A0[0],$S[1] # | t[2*i]>>63 1563 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1564 mov $A0[1],$shift # shift=t[2*i+1]>>63 1565 mul %rax # a[i]*a[i] 1566 neg $carry # mov $carry,cf 1567 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1568 adc %rax,$S[0] 1569 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1570 mov $S[0],($tptr) 1571 adc %rdx,$S[1] 1572 1573 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1574 mov $S[1],8($tptr) 1575 sbb $carry,$carry # mov cf,$carry 1576 shr \$63,$A0[0] 1577 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1578 shr \$63,$A0[1] 1579 or $A0[0],$S[3] # | t[2*i]>>63 1580 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1581 mov $A0[1],$shift # shift=t[2*i+1]>>63 1582 mul %rax # a[i]*a[i] 1583 neg $carry # mov $carry,cf 1584 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1585 adc %rax,$S[2] 1586 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1587 mov $S[2],16($tptr) 1588 adc %rdx,$S[3] 1589 lea 16($i),$i 1590 mov $S[3],24($tptr) 1591 sbb $carry,$carry # mov cf,$carry 1592 lea 64($tptr),$tptr 1593 jmp .Lsqr4x_shift_n_add 1594 1595.align 32 1596.Lsqr4x_shift_n_add: 1597 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1598 shr \$63,$A0[0] 1599 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1600 shr \$63,$A0[1] 1601 or $A0[0],$S[1] # | t[2*i]>>63 1602 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1603 mov $A0[1],$shift # shift=t[2*i+1]>>63 1604 mul %rax # a[i]*a[i] 1605 neg $carry # mov $carry,cf 1606 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1607 adc %rax,$S[0] 1608 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1609 mov $S[0],-32($tptr) 1610 adc %rdx,$S[1] 1611 1612 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1613 mov $S[1],-24($tptr) 1614 sbb $carry,$carry # mov cf,$carry 1615 shr \$63,$A0[0] 1616 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1617 shr \$63,$A0[1] 1618 or $A0[0],$S[3] # | t[2*i]>>63 1619 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1620 mov $A0[1],$shift # shift=t[2*i+1]>>63 1621 mul %rax # a[i]*a[i] 1622 neg $carry # mov $carry,cf 1623 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1624 adc %rax,$S[2] 1625 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1626 mov $S[2],-16($tptr) 1627 adc %rdx,$S[3] 1628 1629 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1630 mov $S[3],-8($tptr) 1631 sbb $carry,$carry # mov cf,$carry 1632 shr \$63,$A0[0] 1633 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1634 shr \$63,$A0[1] 1635 or $A0[0],$S[1] # | t[2*i]>>63 1636 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1637 mov $A0[1],$shift # shift=t[2*i+1]>>63 1638 mul %rax # a[i]*a[i] 1639 neg $carry # mov $carry,cf 1640 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1641 adc %rax,$S[0] 1642 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1643 mov $S[0],0($tptr) 1644 adc %rdx,$S[1] 1645 1646 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1647 mov $S[1],8($tptr) 1648 sbb $carry,$carry # mov cf,$carry 1649 shr \$63,$A0[0] 1650 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1651 shr \$63,$A0[1] 1652 or $A0[0],$S[3] # | t[2*i]>>63 1653 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1654 mov $A0[1],$shift # shift=t[2*i+1]>>63 1655 mul %rax # a[i]*a[i] 1656 neg $carry # mov $carry,cf 1657 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1658 adc %rax,$S[2] 1659 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1660 mov $S[2],16($tptr) 1661 adc %rdx,$S[3] 1662 mov $S[3],24($tptr) 1663 sbb $carry,$carry # mov cf,$carry 1664 lea 64($tptr),$tptr 1665 add \$32,$i 1666 jnz .Lsqr4x_shift_n_add 1667 1668 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1669 .byte 0x67 1670 shr \$63,$A0[0] 1671 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1672 shr \$63,$A0[1] 1673 or $A0[0],$S[1] # | t[2*i]>>63 1674 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1675 mov $A0[1],$shift # shift=t[2*i+1]>>63 1676 mul %rax # a[i]*a[i] 1677 neg $carry # mov $carry,cf 1678 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1679 adc %rax,$S[0] 1680 mov -8($aptr),%rax # a[i+1] # prefetch 1681 mov $S[0],-32($tptr) 1682 adc %rdx,$S[1] 1683 1684 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1685 mov $S[1],-24($tptr) 1686 sbb $carry,$carry # mov cf,$carry 1687 shr \$63,$A0[0] 1688 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1689 shr \$63,$A0[1] 1690 or $A0[0],$S[3] # | t[2*i]>>63 1691 mul %rax # a[i]*a[i] 1692 neg $carry # mov $carry,cf 1693 adc %rax,$S[2] 1694 adc %rdx,$S[3] 1695 mov $S[2],-16($tptr) 1696 mov $S[3],-8($tptr) 1697___ 1698} 1699###################################################################### 1700# Montgomery reduction part, "word-by-word" algorithm. 1701# 1702# This new path is inspired by multiple submissions from Intel, by 1703# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1704# Vinodh Gopal... 1705{ 1706my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1707 1708$code.=<<___; 1709 movq %xmm2,$nptr 1710__bn_sqr8x_reduction: 1711 xor %rax,%rax 1712 lea ($nptr,$num),%rcx # end of n[] 1713 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1714 mov %rcx,0+8(%rsp) 1715 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1716 mov %rdx,8+8(%rsp) 1717 neg $num 1718 jmp .L8x_reduction_loop 1719 1720.align 32 1721.L8x_reduction_loop: 1722 lea ($tptr,$num),$tptr # start of current t[] window 1723 .byte 0x66 1724 mov 8*0($tptr),$m0 1725 mov 8*1($tptr),%r9 1726 mov 8*2($tptr),%r10 1727 mov 8*3($tptr),%r11 1728 mov 8*4($tptr),%r12 1729 mov 8*5($tptr),%r13 1730 mov 8*6($tptr),%r14 1731 mov 8*7($tptr),%r15 1732 mov %rax,(%rdx) # store top-most carry bit 1733 lea 8*8($tptr),$tptr 1734 1735 .byte 0x67 1736 mov $m0,%r8 1737 imulq 32+8(%rsp),$m0 # n0*a[0] 1738 mov 8*0($nptr),%rax # n[0] 1739 mov \$8,%ecx 1740 jmp .L8x_reduce 1741 1742.align 32 1743.L8x_reduce: 1744 mulq $m0 1745 mov 8*1($nptr),%rax # n[1] 1746 neg %r8 1747 mov %rdx,%r8 1748 adc \$0,%r8 1749 1750 mulq $m0 1751 add %rax,%r9 1752 mov 8*2($nptr),%rax 1753 adc \$0,%rdx 1754 add %r9,%r8 1755 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1756 mov %rdx,%r9 1757 adc \$0,%r9 1758 1759 mulq $m0 1760 add %rax,%r10 1761 mov 8*3($nptr),%rax 1762 adc \$0,%rdx 1763 add %r10,%r9 1764 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1765 mov %rdx,%r10 1766 adc \$0,%r10 1767 1768 mulq $m0 1769 add %rax,%r11 1770 mov 8*4($nptr),%rax 1771 adc \$0,%rdx 1772 imulq %r8,$carry # modulo-scheduled 1773 add %r11,%r10 1774 mov %rdx,%r11 1775 adc \$0,%r11 1776 1777 mulq $m0 1778 add %rax,%r12 1779 mov 8*5($nptr),%rax 1780 adc \$0,%rdx 1781 add %r12,%r11 1782 mov %rdx,%r12 1783 adc \$0,%r12 1784 1785 mulq $m0 1786 add %rax,%r13 1787 mov 8*6($nptr),%rax 1788 adc \$0,%rdx 1789 add %r13,%r12 1790 mov %rdx,%r13 1791 adc \$0,%r13 1792 1793 mulq $m0 1794 add %rax,%r14 1795 mov 8*7($nptr),%rax 1796 adc \$0,%rdx 1797 add %r14,%r13 1798 mov %rdx,%r14 1799 adc \$0,%r14 1800 1801 mulq $m0 1802 mov $carry,$m0 # n0*a[i] 1803 add %rax,%r15 1804 mov 8*0($nptr),%rax # n[0] 1805 adc \$0,%rdx 1806 add %r15,%r14 1807 mov %rdx,%r15 1808 adc \$0,%r15 1809 1810 dec %ecx 1811 jnz .L8x_reduce 1812 1813 lea 8*8($nptr),$nptr 1814 xor %rax,%rax 1815 mov 8+8(%rsp),%rdx # pull end of t[] 1816 cmp 0+8(%rsp),$nptr # end of n[]? 1817 jae .L8x_no_tail 1818 1819 .byte 0x66 1820 add 8*0($tptr),%r8 1821 adc 8*1($tptr),%r9 1822 adc 8*2($tptr),%r10 1823 adc 8*3($tptr),%r11 1824 adc 8*4($tptr),%r12 1825 adc 8*5($tptr),%r13 1826 adc 8*6($tptr),%r14 1827 adc 8*7($tptr),%r15 1828 sbb $carry,$carry # top carry 1829 1830 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1831 mov \$8,%ecx 1832 mov 8*0($nptr),%rax 1833 jmp .L8x_tail 1834 1835.align 32 1836.L8x_tail: 1837 mulq $m0 1838 add %rax,%r8 1839 mov 8*1($nptr),%rax 1840 mov %r8,($tptr) # save result 1841 mov %rdx,%r8 1842 adc \$0,%r8 1843 1844 mulq $m0 1845 add %rax,%r9 1846 mov 8*2($nptr),%rax 1847 adc \$0,%rdx 1848 add %r9,%r8 1849 lea 8($tptr),$tptr # $tptr++ 1850 mov %rdx,%r9 1851 adc \$0,%r9 1852 1853 mulq $m0 1854 add %rax,%r10 1855 mov 8*3($nptr),%rax 1856 adc \$0,%rdx 1857 add %r10,%r9 1858 mov %rdx,%r10 1859 adc \$0,%r10 1860 1861 mulq $m0 1862 add %rax,%r11 1863 mov 8*4($nptr),%rax 1864 adc \$0,%rdx 1865 add %r11,%r10 1866 mov %rdx,%r11 1867 adc \$0,%r11 1868 1869 mulq $m0 1870 add %rax,%r12 1871 mov 8*5($nptr),%rax 1872 adc \$0,%rdx 1873 add %r12,%r11 1874 mov %rdx,%r12 1875 adc \$0,%r12 1876 1877 mulq $m0 1878 add %rax,%r13 1879 mov 8*6($nptr),%rax 1880 adc \$0,%rdx 1881 add %r13,%r12 1882 mov %rdx,%r13 1883 adc \$0,%r13 1884 1885 mulq $m0 1886 add %rax,%r14 1887 mov 8*7($nptr),%rax 1888 adc \$0,%rdx 1889 add %r14,%r13 1890 mov %rdx,%r14 1891 adc \$0,%r14 1892 1893 mulq $m0 1894 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1895 add %rax,%r15 1896 adc \$0,%rdx 1897 add %r15,%r14 1898 mov 8*0($nptr),%rax # pull n[0] 1899 mov %rdx,%r15 1900 adc \$0,%r15 1901 1902 dec %ecx 1903 jnz .L8x_tail 1904 1905 lea 8*8($nptr),$nptr 1906 mov 8+8(%rsp),%rdx # pull end of t[] 1907 cmp 0+8(%rsp),$nptr # end of n[]? 1908 jae .L8x_tail_done # break out of loop 1909 1910 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1911 neg $carry 1912 mov 8*0($nptr),%rax # pull n[0] 1913 adc 8*0($tptr),%r8 1914 adc 8*1($tptr),%r9 1915 adc 8*2($tptr),%r10 1916 adc 8*3($tptr),%r11 1917 adc 8*4($tptr),%r12 1918 adc 8*5($tptr),%r13 1919 adc 8*6($tptr),%r14 1920 adc 8*7($tptr),%r15 1921 sbb $carry,$carry # top carry 1922 1923 mov \$8,%ecx 1924 jmp .L8x_tail 1925 1926.align 32 1927.L8x_tail_done: 1928 add (%rdx),%r8 # can this overflow? 1929 adc \$0,%r9 1930 adc \$0,%r10 1931 adc \$0,%r11 1932 adc \$0,%r12 1933 adc \$0,%r13 1934 adc \$0,%r14 1935 adc \$0,%r15 # can't overflow, because we 1936 # started with "overhung" part 1937 # of multiplication 1938 xor %rax,%rax 1939 1940 neg $carry 1941.L8x_no_tail: 1942 adc 8*0($tptr),%r8 1943 adc 8*1($tptr),%r9 1944 adc 8*2($tptr),%r10 1945 adc 8*3($tptr),%r11 1946 adc 8*4($tptr),%r12 1947 adc 8*5($tptr),%r13 1948 adc 8*6($tptr),%r14 1949 adc 8*7($tptr),%r15 1950 adc \$0,%rax # top-most carry 1951 mov -8($nptr),%rcx # np[num-1] 1952 xor $carry,$carry 1953 1954 movq %xmm2,$nptr # restore $nptr 1955 1956 mov %r8,8*0($tptr) # store top 512 bits 1957 mov %r9,8*1($tptr) 1958 movq %xmm3,$num # $num is %r9, can't be moved upwards 1959 mov %r10,8*2($tptr) 1960 mov %r11,8*3($tptr) 1961 mov %r12,8*4($tptr) 1962 mov %r13,8*5($tptr) 1963 mov %r14,8*6($tptr) 1964 mov %r15,8*7($tptr) 1965 lea 8*8($tptr),$tptr 1966 1967 cmp %rdx,$tptr # end of t[]? 1968 jb .L8x_reduction_loop 1969 ret 1970.size bn_sqr8x_internal,.-bn_sqr8x_internal 1971___ 1972} 1973############################################################## 1974# Post-condition, 4x unrolled 1975# 1976{ 1977my ($tptr,$nptr)=("%rbx","%rbp"); 1978$code.=<<___; 1979.type __bn_post4x_internal,\@abi-omnipotent 1980.align 32 1981__bn_post4x_internal: 1982 mov 8*0($nptr),%r12 1983 lea (%rdi,$num),$tptr # %rdi was $tptr above 1984 mov $num,%rcx 1985 movq %xmm1,$rptr # restore $rptr 1986 neg %rax 1987 movq %xmm1,$aptr # prepare for back-to-back call 1988 sar \$3+2,%rcx 1989 dec %r12 # so that after 'not' we get -n[0] 1990 xor %r10,%r10 1991 mov 8*1($nptr),%r13 1992 mov 8*2($nptr),%r14 1993 mov 8*3($nptr),%r15 1994 jmp .Lsqr4x_sub_entry 1995 1996.align 16 1997.Lsqr4x_sub: 1998 mov 8*0($nptr),%r12 1999 mov 8*1($nptr),%r13 2000 mov 8*2($nptr),%r14 2001 mov 8*3($nptr),%r15 2002.Lsqr4x_sub_entry: 2003 lea 8*4($nptr),$nptr 2004 not %r12 2005 not %r13 2006 not %r14 2007 not %r15 2008 and %rax,%r12 2009 and %rax,%r13 2010 and %rax,%r14 2011 and %rax,%r15 2012 2013 neg %r10 # mov %r10,%cf 2014 adc 8*0($tptr),%r12 2015 adc 8*1($tptr),%r13 2016 adc 8*2($tptr),%r14 2017 adc 8*3($tptr),%r15 2018 mov %r12,8*0($rptr) 2019 lea 8*4($tptr),$tptr 2020 mov %r13,8*1($rptr) 2021 sbb %r10,%r10 # mov %cf,%r10 2022 mov %r14,8*2($rptr) 2023 mov %r15,8*3($rptr) 2024 lea 8*4($rptr),$rptr 2025 2026 inc %rcx # pass %cf 2027 jnz .Lsqr4x_sub 2028 2029 mov $num,%r10 # prepare for back-to-back call 2030 neg $num # restore $num 2031 ret 2032.size __bn_post4x_internal,.-__bn_post4x_internal 2033___ 2034} 2035{ 2036$code.=<<___; 2037.globl bn_from_montgomery 2038.type bn_from_montgomery,\@abi-omnipotent 2039.align 32 2040bn_from_montgomery: 2041 testl \$7,`($win64?"48(%rsp)":"%r9d")` 2042 jz bn_from_mont8x 2043 xor %eax,%eax 2044 ret 2045.size bn_from_montgomery,.-bn_from_montgomery 2046 2047.type bn_from_mont8x,\@function,6 2048.align 32 2049bn_from_mont8x: 2050 .byte 0x67 2051 mov %rsp,%rax 2052 push %rbx 2053 push %rbp 2054 push %r12 2055 push %r13 2056 push %r14 2057 push %r15 2058.Lfrom_prologue: 2059 2060 shl \$3,${num}d # convert $num to bytes 2061 lea ($num,$num,2),%r10 # 3*$num in bytes 2062 neg $num 2063 mov ($n0),$n0 # *n0 2064 2065 ############################################################## 2066 # Ensure that stack frame doesn't alias with $rptr+3*$num 2067 # modulo 4096, which covers ret[num], am[num] and n[num] 2068 # (see bn_exp.c). The stack is allocated to aligned with 2069 # bn_power5's frame, and as bn_from_montgomery happens to be 2070 # last operation, we use the opportunity to cleanse it. 2071 # 2072 lea -320(%rsp,$num,2),%r11 2073 mov %rsp,%rbp 2074 sub $rptr,%r11 2075 and \$4095,%r11 2076 cmp %r11,%r10 2077 jb .Lfrom_sp_alt 2078 sub %r11,%rbp # align with $aptr 2079 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2080 jmp .Lfrom_sp_done 2081 2082.align 32 2083.Lfrom_sp_alt: 2084 lea 4096-320(,$num,2),%r10 2085 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2086 sub %r10,%r11 2087 mov \$0,%r10 2088 cmovc %r10,%r11 2089 sub %r11,%rbp 2090.Lfrom_sp_done: 2091 and \$-64,%rbp 2092 mov %rsp,%r11 2093 sub %rbp,%r11 2094 and \$-4096,%r11 2095 lea (%rbp,%r11),%rsp 2096 mov (%rsp),%r10 2097 cmp %rbp,%rsp 2098 ja .Lfrom_page_walk 2099 jmp .Lfrom_page_walk_done 2100 2101.Lfrom_page_walk: 2102 lea -4096(%rsp),%rsp 2103 mov (%rsp),%r10 2104 cmp %rbp,%rsp 2105 ja .Lfrom_page_walk 2106.Lfrom_page_walk_done: 2107 2108 mov $num,%r10 2109 neg $num 2110 2111 ############################################################## 2112 # Stack layout 2113 # 2114 # +0 saved $num, used in reduction section 2115 # +8 &t[2*$num], used in reduction section 2116 # +32 saved *n0 2117 # +40 saved %rsp 2118 # +48 t[2*$num] 2119 # 2120 mov $n0, 32(%rsp) 2121 mov %rax, 40(%rsp) # save original %rsp 2122.Lfrom_body: 2123 mov $num,%r11 2124 lea 48(%rsp),%rax 2125 pxor %xmm0,%xmm0 2126 jmp .Lmul_by_1 2127 2128.align 32 2129.Lmul_by_1: 2130 movdqu ($aptr),%xmm1 2131 movdqu 16($aptr),%xmm2 2132 movdqu 32($aptr),%xmm3 2133 movdqa %xmm0,(%rax,$num) 2134 movdqu 48($aptr),%xmm4 2135 movdqa %xmm0,16(%rax,$num) 2136 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 2137 movdqa %xmm1,(%rax) 2138 movdqa %xmm0,32(%rax,$num) 2139 movdqa %xmm2,16(%rax) 2140 movdqa %xmm0,48(%rax,$num) 2141 movdqa %xmm3,32(%rax) 2142 movdqa %xmm4,48(%rax) 2143 lea 64(%rax),%rax 2144 sub \$64,%r11 2145 jnz .Lmul_by_1 2146 2147 movq $rptr,%xmm1 2148 movq $nptr,%xmm2 2149 .byte 0x67 2150 mov $nptr,%rbp 2151 movq %r10, %xmm3 # -num 2152___ 2153$code.=<<___ if ($addx); 2154 mov OPENSSL_ia32cap_P+8(%rip),%r11d 2155 and \$0x80108,%r11d 2156 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 2157 jne .Lfrom_mont_nox 2158 2159 lea (%rax,$num),$rptr 2160 call __bn_sqrx8x_reduction 2161 call __bn_postx4x_internal 2162 2163 pxor %xmm0,%xmm0 2164 lea 48(%rsp),%rax 2165 mov 40(%rsp),%rsi # restore %rsp 2166 jmp .Lfrom_mont_zero 2167 2168.align 32 2169.Lfrom_mont_nox: 2170___ 2171$code.=<<___; 2172 call __bn_sqr8x_reduction 2173 call __bn_post4x_internal 2174 2175 pxor %xmm0,%xmm0 2176 lea 48(%rsp),%rax 2177 mov 40(%rsp),%rsi # restore %rsp 2178 jmp .Lfrom_mont_zero 2179 2180.align 32 2181.Lfrom_mont_zero: 2182 movdqa %xmm0,16*0(%rax) 2183 movdqa %xmm0,16*1(%rax) 2184 movdqa %xmm0,16*2(%rax) 2185 movdqa %xmm0,16*3(%rax) 2186 lea 16*4(%rax),%rax 2187 sub \$32,$num 2188 jnz .Lfrom_mont_zero 2189 2190 mov \$1,%rax 2191 mov -48(%rsi),%r15 2192 mov -40(%rsi),%r14 2193 mov -32(%rsi),%r13 2194 mov -24(%rsi),%r12 2195 mov -16(%rsi),%rbp 2196 mov -8(%rsi),%rbx 2197 lea (%rsi),%rsp 2198.Lfrom_epilogue: 2199 ret 2200.size bn_from_mont8x,.-bn_from_mont8x 2201___ 2202} 2203}}} 2204 2205if ($addx) {{{ 2206my $bp="%rdx"; # restore original value 2207 2208$code.=<<___; 2209.type bn_mulx4x_mont_gather5,\@function,6 2210.align 32 2211bn_mulx4x_mont_gather5: 2212 mov %rsp,%rax 2213.Lmulx4x_enter: 2214 push %rbx 2215 push %rbp 2216 push %r12 2217 push %r13 2218 push %r14 2219 push %r15 2220.Lmulx4x_prologue: 2221 2222 shl \$3,${num}d # convert $num to bytes 2223 lea ($num,$num,2),%r10 # 3*$num in bytes 2224 neg $num # -$num 2225 mov ($n0),$n0 # *n0 2226 2227 ############################################################## 2228 # Ensure that stack frame doesn't alias with $rptr+3*$num 2229 # modulo 4096, which covers ret[num], am[num] and n[num] 2230 # (see bn_exp.c). This is done to allow memory disambiguation 2231 # logic do its magic. [Extra [num] is allocated in order 2232 # to align with bn_power5's frame, which is cleansed after 2233 # completing exponentiation. Extra 256 bytes is for power mask 2234 # calculated from 7th argument, the index.] 2235 # 2236 lea -320(%rsp,$num,2),%r11 2237 mov %rsp,%rbp 2238 sub $rp,%r11 2239 and \$4095,%r11 2240 cmp %r11,%r10 2241 jb .Lmulx4xsp_alt 2242 sub %r11,%rbp # align with $aptr 2243 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2244 jmp .Lmulx4xsp_done 2245 2246.Lmulx4xsp_alt: 2247 lea 4096-320(,$num,2),%r10 2248 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2249 sub %r10,%r11 2250 mov \$0,%r10 2251 cmovc %r10,%r11 2252 sub %r11,%rbp 2253.Lmulx4xsp_done: 2254 and \$-64,%rbp # ensure alignment 2255 mov %rsp,%r11 2256 sub %rbp,%r11 2257 and \$-4096,%r11 2258 lea (%rbp,%r11),%rsp 2259 mov (%rsp),%r10 2260 cmp %rbp,%rsp 2261 ja .Lmulx4x_page_walk 2262 jmp .Lmulx4x_page_walk_done 2263 2264.Lmulx4x_page_walk: 2265 lea -4096(%rsp),%rsp 2266 mov (%rsp),%r10 2267 cmp %rbp,%rsp 2268 ja .Lmulx4x_page_walk 2269.Lmulx4x_page_walk_done: 2270 2271 ############################################################## 2272 # Stack layout 2273 # +0 -num 2274 # +8 off-loaded &b[i] 2275 # +16 end of b[num] 2276 # +24 inner counter 2277 # +32 saved n0 2278 # +40 saved %rsp 2279 # +48 2280 # +56 saved rp 2281 # +64 tmp[num+1] 2282 # 2283 mov $n0, 32(%rsp) # save *n0 2284 mov %rax,40(%rsp) # save original %rsp 2285.Lmulx4x_body: 2286 call mulx4x_internal 2287 2288 mov 40(%rsp),%rsi # restore %rsp 2289 mov \$1,%rax 2290 2291 mov -48(%rsi),%r15 2292 mov -40(%rsi),%r14 2293 mov -32(%rsi),%r13 2294 mov -24(%rsi),%r12 2295 mov -16(%rsi),%rbp 2296 mov -8(%rsi),%rbx 2297 lea (%rsi),%rsp 2298.Lmulx4x_epilogue: 2299 ret 2300.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2301 2302.type mulx4x_internal,\@abi-omnipotent 2303.align 32 2304mulx4x_internal: 2305 mov $num,8(%rsp) # save -$num (it was in bytes) 2306 mov $num,%r10 2307 neg $num # restore $num 2308 shl \$5,$num 2309 neg %r10 # restore $num 2310 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2311 shr \$5+5,$num 2312 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2313 sub \$1,$num 2314 lea .Linc(%rip),%rax 2315 mov %r13,16+8(%rsp) # end of b[num] 2316 mov $num,24+8(%rsp) # inner counter 2317 mov $rp, 56+8(%rsp) # save $rp 2318___ 2319my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2320 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2321my $rptr=$bptr; 2322my $STRIDE=2**5*8; # 5 is "window size" 2323my $N=$STRIDE/4; # should match cache line size 2324$code.=<<___; 2325 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 2326 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2327 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) 2328 lea 128($bp),$bptr # size optimization 2329 2330 pshufd \$0,%xmm5,%xmm5 # broadcast index 2331 movdqa %xmm1,%xmm4 2332 .byte 0x67 2333 movdqa %xmm1,%xmm2 2334___ 2335######################################################################## 2336# calculate mask by comparing 0..31 to index and save result to stack 2337# 2338$code.=<<___; 2339 .byte 0x67 2340 paddd %xmm0,%xmm1 2341 pcmpeqd %xmm5,%xmm0 # compare to 1,0 2342 movdqa %xmm4,%xmm3 2343___ 2344for($i=0;$i<$STRIDE/16-4;$i+=4) { 2345$code.=<<___; 2346 paddd %xmm1,%xmm2 2347 pcmpeqd %xmm5,%xmm1 # compare to 3,2 2348 movdqa %xmm0,`16*($i+0)+112`(%r10) 2349 movdqa %xmm4,%xmm0 2350 2351 paddd %xmm2,%xmm3 2352 pcmpeqd %xmm5,%xmm2 # compare to 5,4 2353 movdqa %xmm1,`16*($i+1)+112`(%r10) 2354 movdqa %xmm4,%xmm1 2355 2356 paddd %xmm3,%xmm0 2357 pcmpeqd %xmm5,%xmm3 # compare to 7,6 2358 movdqa %xmm2,`16*($i+2)+112`(%r10) 2359 movdqa %xmm4,%xmm2 2360 2361 paddd %xmm0,%xmm1 2362 pcmpeqd %xmm5,%xmm0 2363 movdqa %xmm3,`16*($i+3)+112`(%r10) 2364 movdqa %xmm4,%xmm3 2365___ 2366} 2367$code.=<<___; # last iteration can be optimized 2368 .byte 0x67 2369 paddd %xmm1,%xmm2 2370 pcmpeqd %xmm5,%xmm1 2371 movdqa %xmm0,`16*($i+0)+112`(%r10) 2372 2373 paddd %xmm2,%xmm3 2374 pcmpeqd %xmm5,%xmm2 2375 movdqa %xmm1,`16*($i+1)+112`(%r10) 2376 2377 pcmpeqd %xmm5,%xmm3 2378 movdqa %xmm2,`16*($i+2)+112`(%r10) 2379 2380 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register 2381 pand `16*($i+1)-128`($bptr),%xmm1 2382 pand `16*($i+2)-128`($bptr),%xmm2 2383 movdqa %xmm3,`16*($i+3)+112`(%r10) 2384 pand `16*($i+3)-128`($bptr),%xmm3 2385 por %xmm2,%xmm0 2386 por %xmm3,%xmm1 2387___ 2388for($i=0;$i<$STRIDE/16-4;$i+=4) { 2389$code.=<<___; 2390 movdqa `16*($i+0)-128`($bptr),%xmm4 2391 movdqa `16*($i+1)-128`($bptr),%xmm5 2392 movdqa `16*($i+2)-128`($bptr),%xmm2 2393 pand `16*($i+0)+112`(%r10),%xmm4 2394 movdqa `16*($i+3)-128`($bptr),%xmm3 2395 pand `16*($i+1)+112`(%r10),%xmm5 2396 por %xmm4,%xmm0 2397 pand `16*($i+2)+112`(%r10),%xmm2 2398 por %xmm5,%xmm1 2399 pand `16*($i+3)+112`(%r10),%xmm3 2400 por %xmm2,%xmm0 2401 por %xmm3,%xmm1 2402___ 2403} 2404$code.=<<___; 2405 pxor %xmm1,%xmm0 2406 pshufd \$0x4e,%xmm0,%xmm1 2407 por %xmm1,%xmm0 2408 lea $STRIDE($bptr),$bptr 2409 movq %xmm0,%rdx # bp[0] 2410 lea 64+8*4+8(%rsp),$tptr 2411 2412 mov %rdx,$bi 2413 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2414 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2415 add %rax,%r11 2416 mulx 2*8($aptr),%rax,%r13 # ... 2417 adc %rax,%r12 2418 adc \$0,%r13 2419 mulx 3*8($aptr),%rax,%r14 2420 2421 mov $mi,%r15 2422 imulq 32+8(%rsp),$mi # "t[0]"*n0 2423 xor $zero,$zero # cf=0, of=0 2424 mov $mi,%rdx 2425 2426 mov $bptr,8+8(%rsp) # off-load &b[i] 2427 2428 lea 4*8($aptr),$aptr 2429 adcx %rax,%r13 2430 adcx $zero,%r14 # cf=0 2431 2432 mulx 0*8($nptr),%rax,%r10 2433 adcx %rax,%r15 # discarded 2434 adox %r11,%r10 2435 mulx 1*8($nptr),%rax,%r11 2436 adcx %rax,%r10 2437 adox %r12,%r11 2438 mulx 2*8($nptr),%rax,%r12 2439 mov 24+8(%rsp),$bptr # counter value 2440 mov %r10,-8*4($tptr) 2441 adcx %rax,%r11 2442 adox %r13,%r12 2443 mulx 3*8($nptr),%rax,%r15 2444 mov $bi,%rdx 2445 mov %r11,-8*3($tptr) 2446 adcx %rax,%r12 2447 adox $zero,%r15 # of=0 2448 lea 4*8($nptr),$nptr 2449 mov %r12,-8*2($tptr) 2450 jmp .Lmulx4x_1st 2451 2452.align 32 2453.Lmulx4x_1st: 2454 adcx $zero,%r15 # cf=0, modulo-scheduled 2455 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2456 adcx %r14,%r10 2457 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2458 adcx %rax,%r11 2459 mulx 2*8($aptr),%r12,%rax # ... 2460 adcx %r14,%r12 2461 mulx 3*8($aptr),%r13,%r14 2462 .byte 0x67,0x67 2463 mov $mi,%rdx 2464 adcx %rax,%r13 2465 adcx $zero,%r14 # cf=0 2466 lea 4*8($aptr),$aptr 2467 lea 4*8($tptr),$tptr 2468 2469 adox %r15,%r10 2470 mulx 0*8($nptr),%rax,%r15 2471 adcx %rax,%r10 2472 adox %r15,%r11 2473 mulx 1*8($nptr),%rax,%r15 2474 adcx %rax,%r11 2475 adox %r15,%r12 2476 mulx 2*8($nptr),%rax,%r15 2477 mov %r10,-5*8($tptr) 2478 adcx %rax,%r12 2479 mov %r11,-4*8($tptr) 2480 adox %r15,%r13 2481 mulx 3*8($nptr),%rax,%r15 2482 mov $bi,%rdx 2483 mov %r12,-3*8($tptr) 2484 adcx %rax,%r13 2485 adox $zero,%r15 2486 lea 4*8($nptr),$nptr 2487 mov %r13,-2*8($tptr) 2488 2489 dec $bptr # of=0, pass cf 2490 jnz .Lmulx4x_1st 2491 2492 mov 8(%rsp),$num # load -num 2493 adc $zero,%r15 # modulo-scheduled 2494 lea ($aptr,$num),$aptr # rewind $aptr 2495 add %r15,%r14 2496 mov 8+8(%rsp),$bptr # re-load &b[i] 2497 adc $zero,$zero # top-most carry 2498 mov %r14,-1*8($tptr) 2499 jmp .Lmulx4x_outer 2500 2501.align 32 2502.Lmulx4x_outer: 2503 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) 2504 pxor %xmm4,%xmm4 2505 .byte 0x67,0x67 2506 pxor %xmm5,%xmm5 2507___ 2508for($i=0;$i<$STRIDE/16;$i+=4) { 2509$code.=<<___; 2510 movdqa `16*($i+0)-128`($bptr),%xmm0 2511 movdqa `16*($i+1)-128`($bptr),%xmm1 2512 movdqa `16*($i+2)-128`($bptr),%xmm2 2513 pand `16*($i+0)+256`(%r10),%xmm0 2514 movdqa `16*($i+3)-128`($bptr),%xmm3 2515 pand `16*($i+1)+256`(%r10),%xmm1 2516 por %xmm0,%xmm4 2517 pand `16*($i+2)+256`(%r10),%xmm2 2518 por %xmm1,%xmm5 2519 pand `16*($i+3)+256`(%r10),%xmm3 2520 por %xmm2,%xmm4 2521 por %xmm3,%xmm5 2522___ 2523} 2524$code.=<<___; 2525 por %xmm5,%xmm4 2526 pshufd \$0x4e,%xmm4,%xmm0 2527 por %xmm4,%xmm0 2528 lea $STRIDE($bptr),$bptr 2529 movq %xmm0,%rdx # m0=bp[i] 2530 2531 mov $zero,($tptr) # save top-most carry 2532 lea 4*8($tptr,$num),$tptr # rewind $tptr 2533 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2534 xor $zero,$zero # cf=0, of=0 2535 mov %rdx,$bi 2536 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2537 adox -4*8($tptr),$mi # +t[0] 2538 adcx %r14,%r11 2539 mulx 2*8($aptr),%r15,%r13 # ... 2540 adox -3*8($tptr),%r11 2541 adcx %r15,%r12 2542 mulx 3*8($aptr),%rdx,%r14 2543 adox -2*8($tptr),%r12 2544 adcx %rdx,%r13 2545 lea ($nptr,$num),$nptr # rewind $nptr 2546 lea 4*8($aptr),$aptr 2547 adox -1*8($tptr),%r13 2548 adcx $zero,%r14 2549 adox $zero,%r14 2550 2551 mov $mi,%r15 2552 imulq 32+8(%rsp),$mi # "t[0]"*n0 2553 2554 mov $mi,%rdx 2555 xor $zero,$zero # cf=0, of=0 2556 mov $bptr,8+8(%rsp) # off-load &b[i] 2557 2558 mulx 0*8($nptr),%rax,%r10 2559 adcx %rax,%r15 # discarded 2560 adox %r11,%r10 2561 mulx 1*8($nptr),%rax,%r11 2562 adcx %rax,%r10 2563 adox %r12,%r11 2564 mulx 2*8($nptr),%rax,%r12 2565 adcx %rax,%r11 2566 adox %r13,%r12 2567 mulx 3*8($nptr),%rax,%r15 2568 mov $bi,%rdx 2569 mov 24+8(%rsp),$bptr # counter value 2570 mov %r10,-8*4($tptr) 2571 adcx %rax,%r12 2572 mov %r11,-8*3($tptr) 2573 adox $zero,%r15 # of=0 2574 mov %r12,-8*2($tptr) 2575 lea 4*8($nptr),$nptr 2576 jmp .Lmulx4x_inner 2577 2578.align 32 2579.Lmulx4x_inner: 2580 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2581 adcx $zero,%r15 # cf=0, modulo-scheduled 2582 adox %r14,%r10 2583 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2584 adcx 0*8($tptr),%r10 2585 adox %rax,%r11 2586 mulx 2*8($aptr),%r12,%rax # ... 2587 adcx 1*8($tptr),%r11 2588 adox %r14,%r12 2589 mulx 3*8($aptr),%r13,%r14 2590 mov $mi,%rdx 2591 adcx 2*8($tptr),%r12 2592 adox %rax,%r13 2593 adcx 3*8($tptr),%r13 2594 adox $zero,%r14 # of=0 2595 lea 4*8($aptr),$aptr 2596 lea 4*8($tptr),$tptr 2597 adcx $zero,%r14 # cf=0 2598 2599 adox %r15,%r10 2600 mulx 0*8($nptr),%rax,%r15 2601 adcx %rax,%r10 2602 adox %r15,%r11 2603 mulx 1*8($nptr),%rax,%r15 2604 adcx %rax,%r11 2605 adox %r15,%r12 2606 mulx 2*8($nptr),%rax,%r15 2607 mov %r10,-5*8($tptr) 2608 adcx %rax,%r12 2609 adox %r15,%r13 2610 mov %r11,-4*8($tptr) 2611 mulx 3*8($nptr),%rax,%r15 2612 mov $bi,%rdx 2613 lea 4*8($nptr),$nptr 2614 mov %r12,-3*8($tptr) 2615 adcx %rax,%r13 2616 adox $zero,%r15 2617 mov %r13,-2*8($tptr) 2618 2619 dec $bptr # of=0, pass cf 2620 jnz .Lmulx4x_inner 2621 2622 mov 0+8(%rsp),$num # load -num 2623 adc $zero,%r15 # modulo-scheduled 2624 sub 0*8($tptr),$bptr # pull top-most carry to %cf 2625 mov 8+8(%rsp),$bptr # re-load &b[i] 2626 mov 16+8(%rsp),%r10 2627 adc %r15,%r14 2628 lea ($aptr,$num),$aptr # rewind $aptr 2629 adc $zero,$zero # top-most carry 2630 mov %r14,-1*8($tptr) 2631 2632 cmp %r10,$bptr 2633 jb .Lmulx4x_outer 2634 2635 mov -8($nptr),%r10 2636 mov $zero,%r8 2637 mov ($nptr,$num),%r12 2638 lea ($nptr,$num),%rbp # rewind $nptr 2639 mov $num,%rcx 2640 lea ($tptr,$num),%rdi # rewind $tptr 2641 xor %eax,%eax 2642 xor %r15,%r15 2643 sub %r14,%r10 # compare top-most words 2644 adc %r15,%r15 2645 or %r15,%r8 2646 sar \$3+2,%rcx 2647 sub %r8,%rax # %rax=-%r8 2648 mov 56+8(%rsp),%rdx # restore rp 2649 dec %r12 # so that after 'not' we get -n[0] 2650 mov 8*1(%rbp),%r13 2651 xor %r8,%r8 2652 mov 8*2(%rbp),%r14 2653 mov 8*3(%rbp),%r15 2654 jmp .Lsqrx4x_sub_entry # common post-condition 2655.size mulx4x_internal,.-mulx4x_internal 2656___ 2657}{ 2658###################################################################### 2659# void bn_power5( 2660my $rptr="%rdi"; # BN_ULONG *rptr, 2661my $aptr="%rsi"; # const BN_ULONG *aptr, 2662my $bptr="%rdx"; # const void *table, 2663my $nptr="%rcx"; # const BN_ULONG *nptr, 2664my $n0 ="%r8"; # const BN_ULONG *n0); 2665my $num ="%r9"; # int num, has to be divisible by 8 2666 # int pwr); 2667 2668my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 2669my @A0=("%r10","%r11"); 2670my @A1=("%r12","%r13"); 2671my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 2672 2673$code.=<<___; 2674.type bn_powerx5,\@function,6 2675.align 32 2676bn_powerx5: 2677 mov %rsp,%rax 2678.Lpowerx5_enter: 2679 push %rbx 2680 push %rbp 2681 push %r12 2682 push %r13 2683 push %r14 2684 push %r15 2685.Lpowerx5_prologue: 2686 2687 shl \$3,${num}d # convert $num to bytes 2688 lea ($num,$num,2),%r10 # 3*$num in bytes 2689 neg $num 2690 mov ($n0),$n0 # *n0 2691 2692 ############################################################## 2693 # Ensure that stack frame doesn't alias with $rptr+3*$num 2694 # modulo 4096, which covers ret[num], am[num] and n[num] 2695 # (see bn_exp.c). This is done to allow memory disambiguation 2696 # logic do its magic. [Extra 256 bytes is for power mask 2697 # calculated from 7th argument, the index.] 2698 # 2699 lea -320(%rsp,$num,2),%r11 2700 mov %rsp,%rbp 2701 sub $rptr,%r11 2702 and \$4095,%r11 2703 cmp %r11,%r10 2704 jb .Lpwrx_sp_alt 2705 sub %r11,%rbp # align with $aptr 2706 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2707 jmp .Lpwrx_sp_done 2708 2709.align 32 2710.Lpwrx_sp_alt: 2711 lea 4096-320(,$num,2),%r10 2712 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) 2713 sub %r10,%r11 2714 mov \$0,%r10 2715 cmovc %r10,%r11 2716 sub %r11,%rbp 2717.Lpwrx_sp_done: 2718 and \$-64,%rbp 2719 mov %rsp,%r11 2720 sub %rbp,%r11 2721 and \$-4096,%r11 2722 lea (%rbp,%r11),%rsp 2723 mov (%rsp),%r10 2724 cmp %rbp,%rsp 2725 ja .Lpwrx_page_walk 2726 jmp .Lpwrx_page_walk_done 2727 2728.Lpwrx_page_walk: 2729 lea -4096(%rsp),%rsp 2730 mov (%rsp),%r10 2731 cmp %rbp,%rsp 2732 ja .Lpwrx_page_walk 2733.Lpwrx_page_walk_done: 2734 2735 mov $num,%r10 2736 neg $num 2737 2738 ############################################################## 2739 # Stack layout 2740 # 2741 # +0 saved $num, used in reduction section 2742 # +8 &t[2*$num], used in reduction section 2743 # +16 intermediate carry bit 2744 # +24 top-most carry bit, used in reduction section 2745 # +32 saved *n0 2746 # +40 saved %rsp 2747 # +48 t[2*$num] 2748 # 2749 pxor %xmm0,%xmm0 2750 movq $rptr,%xmm1 # save $rptr 2751 movq $nptr,%xmm2 # save $nptr 2752 movq %r10, %xmm3 # -$num 2753 movq $bptr,%xmm4 2754 mov $n0, 32(%rsp) 2755 mov %rax, 40(%rsp) # save original %rsp 2756.Lpowerx5_body: 2757 2758 call __bn_sqrx8x_internal 2759 call __bn_postx4x_internal 2760 call __bn_sqrx8x_internal 2761 call __bn_postx4x_internal 2762 call __bn_sqrx8x_internal 2763 call __bn_postx4x_internal 2764 call __bn_sqrx8x_internal 2765 call __bn_postx4x_internal 2766 call __bn_sqrx8x_internal 2767 call __bn_postx4x_internal 2768 2769 mov %r10,$num # -num 2770 mov $aptr,$rptr 2771 movq %xmm2,$nptr 2772 movq %xmm4,$bptr 2773 mov 40(%rsp),%rax 2774 2775 call mulx4x_internal 2776 2777 mov 40(%rsp),%rsi # restore %rsp 2778 mov \$1,%rax 2779 2780 mov -48(%rsi),%r15 2781 mov -40(%rsi),%r14 2782 mov -32(%rsi),%r13 2783 mov -24(%rsi),%r12 2784 mov -16(%rsi),%rbp 2785 mov -8(%rsi),%rbx 2786 lea (%rsi),%rsp 2787.Lpowerx5_epilogue: 2788 ret 2789.size bn_powerx5,.-bn_powerx5 2790 2791.globl bn_sqrx8x_internal 2792.hidden bn_sqrx8x_internal 2793.type bn_sqrx8x_internal,\@abi-omnipotent 2794.align 32 2795bn_sqrx8x_internal: 2796__bn_sqrx8x_internal: 2797 ################################################################## 2798 # Squaring part: 2799 # 2800 # a) multiply-n-add everything but a[i]*a[i]; 2801 # b) shift result of a) by 1 to the left and accumulate 2802 # a[i]*a[i] products; 2803 # 2804 ################################################################## 2805 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2806 # a[1]a[0] 2807 # a[2]a[0] 2808 # a[3]a[0] 2809 # a[2]a[1] 2810 # a[3]a[1] 2811 # a[3]a[2] 2812 # 2813 # a[4]a[0] 2814 # a[5]a[0] 2815 # a[6]a[0] 2816 # a[7]a[0] 2817 # a[4]a[1] 2818 # a[5]a[1] 2819 # a[6]a[1] 2820 # a[7]a[1] 2821 # a[4]a[2] 2822 # a[5]a[2] 2823 # a[6]a[2] 2824 # a[7]a[2] 2825 # a[4]a[3] 2826 # a[5]a[3] 2827 # a[6]a[3] 2828 # a[7]a[3] 2829 # 2830 # a[5]a[4] 2831 # a[6]a[4] 2832 # a[7]a[4] 2833 # a[6]a[5] 2834 # a[7]a[5] 2835 # a[7]a[6] 2836 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2837___ 2838{ 2839my ($zero,$carry)=("%rbp","%rcx"); 2840my $aaptr=$zero; 2841$code.=<<___; 2842 lea 48+8(%rsp),$tptr 2843 lea ($aptr,$num),$aaptr 2844 mov $num,0+8(%rsp) # save $num 2845 mov $aaptr,8+8(%rsp) # save end of $aptr 2846 jmp .Lsqr8x_zero_start 2847 2848.align 32 2849.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2850.Lsqrx8x_zero: 2851 .byte 0x3e 2852 movdqa %xmm0,0*8($tptr) 2853 movdqa %xmm0,2*8($tptr) 2854 movdqa %xmm0,4*8($tptr) 2855 movdqa %xmm0,6*8($tptr) 2856.Lsqr8x_zero_start: # aligned at 32 2857 movdqa %xmm0,8*8($tptr) 2858 movdqa %xmm0,10*8($tptr) 2859 movdqa %xmm0,12*8($tptr) 2860 movdqa %xmm0,14*8($tptr) 2861 lea 16*8($tptr),$tptr 2862 sub \$64,$num 2863 jnz .Lsqrx8x_zero 2864 2865 mov 0*8($aptr),%rdx # a[0], modulo-scheduled 2866 #xor %r9,%r9 # t[1], ex-$num, zero already 2867 xor %r10,%r10 2868 xor %r11,%r11 2869 xor %r12,%r12 2870 xor %r13,%r13 2871 xor %r14,%r14 2872 xor %r15,%r15 2873 lea 48+8(%rsp),$tptr 2874 xor $zero,$zero # cf=0, cf=0 2875 jmp .Lsqrx8x_outer_loop 2876 2877.align 32 2878.Lsqrx8x_outer_loop: 2879 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 2880 adcx %r9,%r8 # a[1]*a[0]+=t[1] 2881 adox %rax,%r10 2882 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 2883 adcx %r10,%r9 2884 adox %rax,%r11 2885 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 2886 adcx %r11,%r10 2887 adox %rax,%r12 2888 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 2889 adcx %r12,%r11 2890 adox %rax,%r13 2891 mulx 5*8($aptr),%r12,%rax 2892 adcx %r13,%r12 2893 adox %rax,%r14 2894 mulx 6*8($aptr),%r13,%rax 2895 adcx %r14,%r13 2896 adox %r15,%rax 2897 mulx 7*8($aptr),%r14,%r15 2898 mov 1*8($aptr),%rdx # a[1] 2899 adcx %rax,%r14 2900 adox $zero,%r15 2901 adc 8*8($tptr),%r15 2902 mov %r8,1*8($tptr) # t[1] 2903 mov %r9,2*8($tptr) # t[2] 2904 sbb $carry,$carry # mov %cf,$carry 2905 xor $zero,$zero # cf=0, of=0 2906 2907 2908 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 2909 mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 2910 adcx %r10,%r8 2911 adox %rbx,%r9 2912 mulx 4*8($aptr),%r10,%rbx # ... 2913 adcx %r11,%r9 2914 adox %rax,%r10 2915 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 2916 adcx %r12,%r10 2917 adox %rbx,%r11 2918 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 2919 adcx %r13,%r11 2920 adox %r14,%r12 2921 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 2922 mov 2*8($aptr),%rdx # a[2] 2923 adcx %rax,%r12 2924 adox %rbx,%r13 2925 adcx %r15,%r13 2926 adox $zero,%r14 # of=0 2927 adcx $zero,%r14 # cf=0 2928 2929 mov %r8,3*8($tptr) # t[3] 2930 mov %r9,4*8($tptr) # t[4] 2931 2932 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 2933 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 2934 adcx %r10,%r8 2935 adox %rbx,%r9 2936 mulx 5*8($aptr),%r10,%rbx # ... 2937 adcx %r11,%r9 2938 adox %rax,%r10 2939 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 2940 adcx %r12,%r10 2941 adox %r13,%r11 2942 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 2943 .byte 0x3e 2944 mov 3*8($aptr),%rdx # a[3] 2945 adcx %rbx,%r11 2946 adox %rax,%r12 2947 adcx %r14,%r12 2948 mov %r8,5*8($tptr) # t[5] 2949 mov %r9,6*8($tptr) # t[6] 2950 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 2951 adox $zero,%r13 # of=0 2952 adcx $zero,%r13 # cf=0 2953 2954 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 2955 adcx %r10,%r8 2956 adox %rax,%r9 2957 mulx 6*8($aptr),%r10,%rax # ... 2958 adcx %r11,%r9 2959 adox %r12,%r10 2960 mulx 7*8($aptr),%r11,%r12 2961 mov 4*8($aptr),%rdx # a[4] 2962 mov 5*8($aptr),%r14 # a[5] 2963 adcx %rbx,%r10 2964 adox %rax,%r11 2965 mov 6*8($aptr),%r15 # a[6] 2966 adcx %r13,%r11 2967 adox $zero,%r12 # of=0 2968 adcx $zero,%r12 # cf=0 2969 2970 mov %r8,7*8($tptr) # t[7] 2971 mov %r9,8*8($tptr) # t[8] 2972 2973 mulx %r14,%r9,%rax # a[5]*a[4] 2974 mov 7*8($aptr),%r8 # a[7] 2975 adcx %r10,%r9 2976 mulx %r15,%r10,%rbx # a[6]*a[4] 2977 adox %rax,%r10 2978 adcx %r11,%r10 2979 mulx %r8,%r11,%rax # a[7]*a[4] 2980 mov %r14,%rdx # a[5] 2981 adox %rbx,%r11 2982 adcx %r12,%r11 2983 #adox $zero,%rax # of=0 2984 adcx $zero,%rax # cf=0 2985 2986 mulx %r15,%r14,%rbx # a[6]*a[5] 2987 mulx %r8,%r12,%r13 # a[7]*a[5] 2988 mov %r15,%rdx # a[6] 2989 lea 8*8($aptr),$aptr 2990 adcx %r14,%r11 2991 adox %rbx,%r12 2992 adcx %rax,%r12 2993 adox $zero,%r13 2994 2995 .byte 0x67,0x67 2996 mulx %r8,%r8,%r14 # a[7]*a[6] 2997 adcx %r8,%r13 2998 adcx $zero,%r14 2999 3000 cmp 8+8(%rsp),$aptr 3001 je .Lsqrx8x_outer_break 3002 3003 neg $carry # mov $carry,%cf 3004 mov \$-8,%rcx 3005 mov $zero,%r15 3006 mov 8*8($tptr),%r8 3007 adcx 9*8($tptr),%r9 # +=t[9] 3008 adcx 10*8($tptr),%r10 # ... 3009 adcx 11*8($tptr),%r11 3010 adc 12*8($tptr),%r12 3011 adc 13*8($tptr),%r13 3012 adc 14*8($tptr),%r14 3013 adc 15*8($tptr),%r15 3014 lea ($aptr),$aaptr 3015 lea 2*64($tptr),$tptr 3016 sbb %rax,%rax # mov %cf,$carry 3017 3018 mov -64($aptr),%rdx # a[0] 3019 mov %rax,16+8(%rsp) # offload $carry 3020 mov $tptr,24+8(%rsp) 3021 3022 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 3023 xor %eax,%eax # cf=0, of=0 3024 jmp .Lsqrx8x_loop 3025 3026.align 32 3027.Lsqrx8x_loop: 3028 mov %r8,%rbx 3029 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 3030 adcx %rax,%rbx # +=t[8] 3031 adox %r9,%r8 3032 3033 mulx 1*8($aaptr),%rax,%r9 # ... 3034 adcx %rax,%r8 3035 adox %r10,%r9 3036 3037 mulx 2*8($aaptr),%rax,%r10 3038 adcx %rax,%r9 3039 adox %r11,%r10 3040 3041 mulx 3*8($aaptr),%rax,%r11 3042 adcx %rax,%r10 3043 adox %r12,%r11 3044 3045 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 3046 adcx %rax,%r11 3047 adox %r13,%r12 3048 3049 mulx 5*8($aaptr),%rax,%r13 3050 adcx %rax,%r12 3051 adox %r14,%r13 3052 3053 mulx 6*8($aaptr),%rax,%r14 3054 mov %rbx,($tptr,%rcx,8) # store t[8+i] 3055 mov \$0,%ebx 3056 adcx %rax,%r13 3057 adox %r15,%r14 3058 3059 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 3060 mov 8($aptr,%rcx,8),%rdx # a[i] 3061 adcx %rax,%r14 3062 adox %rbx,%r15 # %rbx is 0, of=0 3063 adcx %rbx,%r15 # cf=0 3064 3065 .byte 0x67 3066 inc %rcx # of=0 3067 jnz .Lsqrx8x_loop 3068 3069 lea 8*8($aaptr),$aaptr 3070 mov \$-8,%rcx 3071 cmp 8+8(%rsp),$aaptr # done? 3072 je .Lsqrx8x_break 3073 3074 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3075 .byte 0x66 3076 mov -64($aptr),%rdx 3077 adcx 0*8($tptr),%r8 3078 adcx 1*8($tptr),%r9 3079 adc 2*8($tptr),%r10 3080 adc 3*8($tptr),%r11 3081 adc 4*8($tptr),%r12 3082 adc 5*8($tptr),%r13 3083 adc 6*8($tptr),%r14 3084 adc 7*8($tptr),%r15 3085 lea 8*8($tptr),$tptr 3086 .byte 0x67 3087 sbb %rax,%rax # mov %cf,%rax 3088 xor %ebx,%ebx # cf=0, of=0 3089 mov %rax,16+8(%rsp) # offload carry 3090 jmp .Lsqrx8x_loop 3091 3092.align 32 3093.Lsqrx8x_break: 3094 sub 16+8(%rsp),%r8 # consume last carry 3095 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 3096 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 3097 xor %ebp,%ebp # xor $zero,$zero 3098 mov %r8,0*8($tptr) 3099 cmp $carry,$tptr # cf=0, of=0 3100 je .Lsqrx8x_outer_loop 3101 3102 mov %r9,1*8($tptr) 3103 mov 1*8($carry),%r9 3104 mov %r10,2*8($tptr) 3105 mov 2*8($carry),%r10 3106 mov %r11,3*8($tptr) 3107 mov 3*8($carry),%r11 3108 mov %r12,4*8($tptr) 3109 mov 4*8($carry),%r12 3110 mov %r13,5*8($tptr) 3111 mov 5*8($carry),%r13 3112 mov %r14,6*8($tptr) 3113 mov 6*8($carry),%r14 3114 mov %r15,7*8($tptr) 3115 mov 7*8($carry),%r15 3116 mov $carry,$tptr 3117 jmp .Lsqrx8x_outer_loop 3118 3119.align 32 3120.Lsqrx8x_outer_break: 3121 mov %r9,9*8($tptr) # t[9] 3122 movq %xmm3,%rcx # -$num 3123 mov %r10,10*8($tptr) # ... 3124 mov %r11,11*8($tptr) 3125 mov %r12,12*8($tptr) 3126 mov %r13,13*8($tptr) 3127 mov %r14,14*8($tptr) 3128___ 3129}{ 3130my $i="%rcx"; 3131$code.=<<___; 3132 lea 48+8(%rsp),$tptr 3133 mov ($aptr,$i),%rdx # a[0] 3134 3135 mov 8($tptr),$A0[1] # t[1] 3136 xor $A0[0],$A0[0] # t[0], of=0, cf=0 3137 mov 0+8(%rsp),$num # restore $num 3138 adox $A0[1],$A0[1] 3139 mov 16($tptr),$A1[0] # t[2] # prefetch 3140 mov 24($tptr),$A1[1] # t[3] # prefetch 3141 #jmp .Lsqrx4x_shift_n_add # happens to be aligned 3142 3143.align 32 3144.Lsqrx4x_shift_n_add: 3145 mulx %rdx,%rax,%rbx 3146 adox $A1[0],$A1[0] 3147 adcx $A0[0],%rax 3148 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 3149 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch 3150 adox $A1[1],$A1[1] 3151 adcx $A0[1],%rbx 3152 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 3153 mov %rax,0($tptr) 3154 mov %rbx,8($tptr) 3155 3156 mulx %rdx,%rax,%rbx 3157 adox $A0[0],$A0[0] 3158 adcx $A1[0],%rax 3159 mov 16($aptr,$i),%rdx # a[i+2] # prefetch 3160 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 3161 adox $A0[1],$A0[1] 3162 adcx $A1[1],%rbx 3163 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 3164 mov %rax,16($tptr) 3165 mov %rbx,24($tptr) 3166 3167 mulx %rdx,%rax,%rbx 3168 adox $A1[0],$A1[0] 3169 adcx $A0[0],%rax 3170 mov 24($aptr,$i),%rdx # a[i+3] # prefetch 3171 lea 32($i),$i 3172 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 3173 adox $A1[1],$A1[1] 3174 adcx $A0[1],%rbx 3175 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 3176 mov %rax,32($tptr) 3177 mov %rbx,40($tptr) 3178 3179 mulx %rdx,%rax,%rbx 3180 adox $A0[0],$A0[0] 3181 adcx $A1[0],%rax 3182 jrcxz .Lsqrx4x_shift_n_add_break 3183 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 3184 adox $A0[1],$A0[1] 3185 adcx $A1[1],%rbx 3186 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 3187 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 3188 mov %rax,48($tptr) 3189 mov %rbx,56($tptr) 3190 lea 64($tptr),$tptr 3191 nop 3192 jmp .Lsqrx4x_shift_n_add 3193 3194.align 32 3195.Lsqrx4x_shift_n_add_break: 3196 adcx $A1[1],%rbx 3197 mov %rax,48($tptr) 3198 mov %rbx,56($tptr) 3199 lea 64($tptr),$tptr # end of t[] buffer 3200___ 3201} 3202###################################################################### 3203# Montgomery reduction part, "word-by-word" algorithm. 3204# 3205# This new path is inspired by multiple submissions from Intel, by 3206# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 3207# Vinodh Gopal... 3208{ 3209my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 3210 3211$code.=<<___; 3212 movq %xmm2,$nptr 3213__bn_sqrx8x_reduction: 3214 xor %eax,%eax # initial top-most carry bit 3215 mov 32+8(%rsp),%rbx # n0 3216 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 3217 lea -8*8($nptr,$num),%rcx # end of n[] 3218 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 3219 mov %rcx, 0+8(%rsp) # save end of n[] 3220 mov $tptr,8+8(%rsp) # save end of t[] 3221 3222 lea 48+8(%rsp),$tptr # initial t[] window 3223 jmp .Lsqrx8x_reduction_loop 3224 3225.align 32 3226.Lsqrx8x_reduction_loop: 3227 mov 8*1($tptr),%r9 3228 mov 8*2($tptr),%r10 3229 mov 8*3($tptr),%r11 3230 mov 8*4($tptr),%r12 3231 mov %rdx,%r8 3232 imulq %rbx,%rdx # n0*a[i] 3233 mov 8*5($tptr),%r13 3234 mov 8*6($tptr),%r14 3235 mov 8*7($tptr),%r15 3236 mov %rax,24+8(%rsp) # store top-most carry bit 3237 3238 lea 8*8($tptr),$tptr 3239 xor $carry,$carry # cf=0,of=0 3240 mov \$-8,%rcx 3241 jmp .Lsqrx8x_reduce 3242 3243.align 32 3244.Lsqrx8x_reduce: 3245 mov %r8, %rbx 3246 mulx 8*0($nptr),%rax,%r8 # n[0] 3247 adcx %rbx,%rax # discarded 3248 adox %r9,%r8 3249 3250 mulx 8*1($nptr),%rbx,%r9 # n[1] 3251 adcx %rbx,%r8 3252 adox %r10,%r9 3253 3254 mulx 8*2($nptr),%rbx,%r10 3255 adcx %rbx,%r9 3256 adox %r11,%r10 3257 3258 mulx 8*3($nptr),%rbx,%r11 3259 adcx %rbx,%r10 3260 adox %r12,%r11 3261 3262 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 3263 mov %rdx,%rax 3264 mov %r8,%rdx 3265 adcx %rbx,%r11 3266 adox %r13,%r12 3267 3268 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3269 mov %rax,%rdx 3270 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3271 3272 mulx 8*5($nptr),%rax,%r13 3273 adcx %rax,%r12 3274 adox %r14,%r13 3275 3276 mulx 8*6($nptr),%rax,%r14 3277 adcx %rax,%r13 3278 adox %r15,%r14 3279 3280 mulx 8*7($nptr),%rax,%r15 3281 mov %rbx,%rdx 3282 adcx %rax,%r14 3283 adox $carry,%r15 # $carry is 0 3284 adcx $carry,%r15 # cf=0 3285 3286 .byte 0x67,0x67,0x67 3287 inc %rcx # of=0 3288 jnz .Lsqrx8x_reduce 3289 3290 mov $carry,%rax # xor %rax,%rax 3291 cmp 0+8(%rsp),$nptr # end of n[]? 3292 jae .Lsqrx8x_no_tail 3293 3294 mov 48+8(%rsp),%rdx # pull n0*a[0] 3295 add 8*0($tptr),%r8 3296 lea 8*8($nptr),$nptr 3297 mov \$-8,%rcx 3298 adcx 8*1($tptr),%r9 3299 adcx 8*2($tptr),%r10 3300 adc 8*3($tptr),%r11 3301 adc 8*4($tptr),%r12 3302 adc 8*5($tptr),%r13 3303 adc 8*6($tptr),%r14 3304 adc 8*7($tptr),%r15 3305 lea 8*8($tptr),$tptr 3306 sbb %rax,%rax # top carry 3307 3308 xor $carry,$carry # of=0, cf=0 3309 mov %rax,16+8(%rsp) 3310 jmp .Lsqrx8x_tail 3311 3312.align 32 3313.Lsqrx8x_tail: 3314 mov %r8,%rbx 3315 mulx 8*0($nptr),%rax,%r8 3316 adcx %rax,%rbx 3317 adox %r9,%r8 3318 3319 mulx 8*1($nptr),%rax,%r9 3320 adcx %rax,%r8 3321 adox %r10,%r9 3322 3323 mulx 8*2($nptr),%rax,%r10 3324 adcx %rax,%r9 3325 adox %r11,%r10 3326 3327 mulx 8*3($nptr),%rax,%r11 3328 adcx %rax,%r10 3329 adox %r12,%r11 3330 3331 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 3332 adcx %rax,%r11 3333 adox %r13,%r12 3334 3335 mulx 8*5($nptr),%rax,%r13 3336 adcx %rax,%r12 3337 adox %r14,%r13 3338 3339 mulx 8*6($nptr),%rax,%r14 3340 adcx %rax,%r13 3341 adox %r15,%r14 3342 3343 mulx 8*7($nptr),%rax,%r15 3344 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3345 adcx %rax,%r14 3346 adox $carry,%r15 3347 mov %rbx,($tptr,%rcx,8) # save result 3348 mov %r8,%rbx 3349 adcx $carry,%r15 # cf=0 3350 3351 inc %rcx # of=0 3352 jnz .Lsqrx8x_tail 3353 3354 cmp 0+8(%rsp),$nptr # end of n[]? 3355 jae .Lsqrx8x_tail_done # break out of loop 3356 3357 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3358 mov 48+8(%rsp),%rdx # pull n0*a[0] 3359 lea 8*8($nptr),$nptr 3360 adc 8*0($tptr),%r8 3361 adc 8*1($tptr),%r9 3362 adc 8*2($tptr),%r10 3363 adc 8*3($tptr),%r11 3364 adc 8*4($tptr),%r12 3365 adc 8*5($tptr),%r13 3366 adc 8*6($tptr),%r14 3367 adc 8*7($tptr),%r15 3368 lea 8*8($tptr),$tptr 3369 sbb %rax,%rax 3370 sub \$8,%rcx # mov \$-8,%rcx 3371 3372 xor $carry,$carry # of=0, cf=0 3373 mov %rax,16+8(%rsp) 3374 jmp .Lsqrx8x_tail 3375 3376.align 32 3377.Lsqrx8x_tail_done: 3378 add 24+8(%rsp),%r8 # can this overflow? 3379 adc \$0,%r9 3380 adc \$0,%r10 3381 adc \$0,%r11 3382 adc \$0,%r12 3383 adc \$0,%r13 3384 adc \$0,%r14 3385 adc \$0,%r15 # can't overflow, because we 3386 # started with "overhung" part 3387 # of multiplication 3388 mov $carry,%rax # xor %rax,%rax 3389 3390 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3391.Lsqrx8x_no_tail: # %cf is 0 if jumped here 3392 adc 8*0($tptr),%r8 3393 movq %xmm3,%rcx 3394 adc 8*1($tptr),%r9 3395 mov 8*7($nptr),$carry 3396 movq %xmm2,$nptr # restore $nptr 3397 adc 8*2($tptr),%r10 3398 adc 8*3($tptr),%r11 3399 adc 8*4($tptr),%r12 3400 adc 8*5($tptr),%r13 3401 adc 8*6($tptr),%r14 3402 adc 8*7($tptr),%r15 3403 adc %rax,%rax # top-most carry 3404 3405 mov 32+8(%rsp),%rbx # n0 3406 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3407 3408 mov %r8,8*0($tptr) # store top 512 bits 3409 lea 8*8($tptr),%r8 # borrow %r8 3410 mov %r9,8*1($tptr) 3411 mov %r10,8*2($tptr) 3412 mov %r11,8*3($tptr) 3413 mov %r12,8*4($tptr) 3414 mov %r13,8*5($tptr) 3415 mov %r14,8*6($tptr) 3416 mov %r15,8*7($tptr) 3417 3418 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3419 cmp 8+8(%rsp),%r8 # end of t[]? 3420 jb .Lsqrx8x_reduction_loop 3421 ret 3422.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3423___ 3424} 3425############################################################## 3426# Post-condition, 4x unrolled 3427# 3428{ 3429my ($rptr,$nptr)=("%rdx","%rbp"); 3430$code.=<<___; 3431.align 32 3432__bn_postx4x_internal: 3433 mov 8*0($nptr),%r12 3434 mov %rcx,%r10 # -$num 3435 mov %rcx,%r9 # -$num 3436 neg %rax 3437 sar \$3+2,%rcx 3438 #lea 48+8(%rsp,%r9),$tptr 3439 movq %xmm1,$rptr # restore $rptr 3440 movq %xmm1,$aptr # prepare for back-to-back call 3441 dec %r12 # so that after 'not' we get -n[0] 3442 mov 8*1($nptr),%r13 3443 xor %r8,%r8 3444 mov 8*2($nptr),%r14 3445 mov 8*3($nptr),%r15 3446 jmp .Lsqrx4x_sub_entry 3447 3448.align 16 3449.Lsqrx4x_sub: 3450 mov 8*0($nptr),%r12 3451 mov 8*1($nptr),%r13 3452 mov 8*2($nptr),%r14 3453 mov 8*3($nptr),%r15 3454.Lsqrx4x_sub_entry: 3455 andn %rax,%r12,%r12 3456 lea 8*4($nptr),$nptr 3457 andn %rax,%r13,%r13 3458 andn %rax,%r14,%r14 3459 andn %rax,%r15,%r15 3460 3461 neg %r8 # mov %r8,%cf 3462 adc 8*0($tptr),%r12 3463 adc 8*1($tptr),%r13 3464 adc 8*2($tptr),%r14 3465 adc 8*3($tptr),%r15 3466 mov %r12,8*0($rptr) 3467 lea 8*4($tptr),$tptr 3468 mov %r13,8*1($rptr) 3469 sbb %r8,%r8 # mov %cf,%r8 3470 mov %r14,8*2($rptr) 3471 mov %r15,8*3($rptr) 3472 lea 8*4($rptr),$rptr 3473 3474 inc %rcx 3475 jnz .Lsqrx4x_sub 3476 3477 neg %r9 # restore $num 3478 3479 ret 3480.size __bn_postx4x_internal,.-__bn_postx4x_internal 3481___ 3482} 3483}}} 3484{ 3485my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order 3486 ("%rdi","%esi","%rdx","%ecx"); # Unix order 3487my $out=$inp; 3488my $STRIDE=2**5*8; 3489my $N=$STRIDE/4; 3490 3491$code.=<<___; 3492.globl bn_get_bits5 3493.type bn_get_bits5,\@abi-omnipotent 3494.align 16 3495bn_get_bits5: 3496 lea 0($inp),%r10 3497 lea 1($inp),%r11 3498 mov $num,%ecx 3499 shr \$4,$num 3500 and \$15,%ecx 3501 lea -8(%ecx),%eax 3502 cmp \$11,%ecx 3503 cmova %r11,%r10 3504 cmova %eax,%ecx 3505 movzw (%r10,$num,2),%eax 3506 shrl %cl,%eax 3507 and \$31,%eax 3508 ret 3509.size bn_get_bits5,.-bn_get_bits5 3510 3511.globl bn_scatter5 3512.type bn_scatter5,\@abi-omnipotent 3513.align 16 3514bn_scatter5: 3515 cmp \$0, $num 3516 jz .Lscatter_epilogue 3517 lea ($tbl,$idx,8),$tbl 3518.Lscatter: 3519 mov ($inp),%rax 3520 lea 8($inp),$inp 3521 mov %rax,($tbl) 3522 lea 32*8($tbl),$tbl 3523 sub \$1,$num 3524 jnz .Lscatter 3525.Lscatter_epilogue: 3526 ret 3527.size bn_scatter5,.-bn_scatter5 3528 3529.globl bn_gather5 3530.type bn_gather5,\@abi-omnipotent 3531.align 32 3532bn_gather5: 3533.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases 3534 # I can't trust assembler to use specific encoding:-( 3535 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 3536 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp 3537 lea .Linc(%rip),%rax 3538 and \$-16,%rsp # shouldn't be formally required 3539 3540 movd $idx,%xmm5 3541 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 3542 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 3543 lea 128($tbl),%r11 # size optimization 3544 lea 128(%rsp),%rax # size optimization 3545 3546 pshufd \$0,%xmm5,%xmm5 # broadcast $idx 3547 movdqa %xmm1,%xmm4 3548 movdqa %xmm1,%xmm2 3549___ 3550######################################################################## 3551# calculate mask by comparing 0..31 to $idx and save result to stack 3552# 3553for($i=0;$i<$STRIDE/16;$i+=4) { 3554$code.=<<___; 3555 paddd %xmm0,%xmm1 3556 pcmpeqd %xmm5,%xmm0 # compare to 1,0 3557___ 3558$code.=<<___ if ($i); 3559 movdqa %xmm3,`16*($i-1)-128`(%rax) 3560___ 3561$code.=<<___; 3562 movdqa %xmm4,%xmm3 3563 3564 paddd %xmm1,%xmm2 3565 pcmpeqd %xmm5,%xmm1 # compare to 3,2 3566 movdqa %xmm0,`16*($i+0)-128`(%rax) 3567 movdqa %xmm4,%xmm0 3568 3569 paddd %xmm2,%xmm3 3570 pcmpeqd %xmm5,%xmm2 # compare to 5,4 3571 movdqa %xmm1,`16*($i+1)-128`(%rax) 3572 movdqa %xmm4,%xmm1 3573 3574 paddd %xmm3,%xmm0 3575 pcmpeqd %xmm5,%xmm3 # compare to 7,6 3576 movdqa %xmm2,`16*($i+2)-128`(%rax) 3577 movdqa %xmm4,%xmm2 3578___ 3579} 3580$code.=<<___; 3581 movdqa %xmm3,`16*($i-1)-128`(%rax) 3582 jmp .Lgather 3583 3584.align 32 3585.Lgather: 3586 pxor %xmm4,%xmm4 3587 pxor %xmm5,%xmm5 3588___ 3589for($i=0;$i<$STRIDE/16;$i+=4) { 3590$code.=<<___; 3591 movdqa `16*($i+0)-128`(%r11),%xmm0 3592 movdqa `16*($i+1)-128`(%r11),%xmm1 3593 movdqa `16*($i+2)-128`(%r11),%xmm2 3594 pand `16*($i+0)-128`(%rax),%xmm0 3595 movdqa `16*($i+3)-128`(%r11),%xmm3 3596 pand `16*($i+1)-128`(%rax),%xmm1 3597 por %xmm0,%xmm4 3598 pand `16*($i+2)-128`(%rax),%xmm2 3599 por %xmm1,%xmm5 3600 pand `16*($i+3)-128`(%rax),%xmm3 3601 por %xmm2,%xmm4 3602 por %xmm3,%xmm5 3603___ 3604} 3605$code.=<<___; 3606 por %xmm5,%xmm4 3607 lea $STRIDE(%r11),%r11 3608 pshufd \$0x4e,%xmm4,%xmm0 3609 por %xmm4,%xmm0 3610 movq %xmm0,($out) # m0=bp[0] 3611 lea 8($out),$out 3612 sub \$1,$num 3613 jnz .Lgather 3614 3615 lea (%r10),%rsp 3616 ret 3617.LSEH_end_bn_gather5: 3618.size bn_gather5,.-bn_gather5 3619___ 3620} 3621$code.=<<___; 3622.align 64 3623.Linc: 3624 .long 0,0, 1,1 3625 .long 2,2, 2,2 3626.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3627___ 3628 3629# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3630# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3631if ($win64) { 3632$rec="%rcx"; 3633$frame="%rdx"; 3634$context="%r8"; 3635$disp="%r9"; 3636 3637$code.=<<___; 3638.extern __imp_RtlVirtualUnwind 3639.type mul_handler,\@abi-omnipotent 3640.align 16 3641mul_handler: 3642 push %rsi 3643 push %rdi 3644 push %rbx 3645 push %rbp 3646 push %r12 3647 push %r13 3648 push %r14 3649 push %r15 3650 pushfq 3651 sub \$64,%rsp 3652 3653 mov 120($context),%rax # pull context->Rax 3654 mov 248($context),%rbx # pull context->Rip 3655 3656 mov 8($disp),%rsi # disp->ImageBase 3657 mov 56($disp),%r11 # disp->HandlerData 3658 3659 mov 0(%r11),%r10d # HandlerData[0] 3660 lea (%rsi,%r10),%r10 # end of prologue label 3661 cmp %r10,%rbx # context->Rip<end of prologue label 3662 jb .Lcommon_seh_tail 3663 3664 mov 4(%r11),%r10d # HandlerData[1] 3665 lea (%rsi,%r10),%r10 # epilogue label 3666 cmp %r10,%rbx # context->Rip>=epilogue label 3667 jb .Lcommon_pop_regs 3668 3669 mov 152($context),%rax # pull context->Rsp 3670 3671 mov 8(%r11),%r10d # HandlerData[2] 3672 lea (%rsi,%r10),%r10 # epilogue label 3673 cmp %r10,%rbx # context->Rip>=epilogue label 3674 jae .Lcommon_seh_tail 3675 3676 lea .Lmul_epilogue(%rip),%r10 3677 cmp %r10,%rbx 3678 ja .Lbody_40 3679 3680 mov 192($context),%r10 # pull $num 3681 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 3682 3683 jmp .Lcommon_pop_regs 3684 3685.Lbody_40: 3686 mov 40(%rax),%rax # pull saved stack pointer 3687.Lcommon_pop_regs: 3688 mov -8(%rax),%rbx 3689 mov -16(%rax),%rbp 3690 mov -24(%rax),%r12 3691 mov -32(%rax),%r13 3692 mov -40(%rax),%r14 3693 mov -48(%rax),%r15 3694 mov %rbx,144($context) # restore context->Rbx 3695 mov %rbp,160($context) # restore context->Rbp 3696 mov %r12,216($context) # restore context->R12 3697 mov %r13,224($context) # restore context->R13 3698 mov %r14,232($context) # restore context->R14 3699 mov %r15,240($context) # restore context->R15 3700 3701.Lcommon_seh_tail: 3702 mov 8(%rax),%rdi 3703 mov 16(%rax),%rsi 3704 mov %rax,152($context) # restore context->Rsp 3705 mov %rsi,168($context) # restore context->Rsi 3706 mov %rdi,176($context) # restore context->Rdi 3707 3708 mov 40($disp),%rdi # disp->ContextRecord 3709 mov $context,%rsi # context 3710 mov \$154,%ecx # sizeof(CONTEXT) 3711 .long 0xa548f3fc # cld; rep movsq 3712 3713 mov $disp,%rsi 3714 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3715 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3716 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3717 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3718 mov 40(%rsi),%r10 # disp->ContextRecord 3719 lea 56(%rsi),%r11 # &disp->HandlerData 3720 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3721 mov %r10,32(%rsp) # arg5 3722 mov %r11,40(%rsp) # arg6 3723 mov %r12,48(%rsp) # arg7 3724 mov %rcx,56(%rsp) # arg8, (NULL) 3725 call *__imp_RtlVirtualUnwind(%rip) 3726 3727 mov \$1,%eax # ExceptionContinueSearch 3728 add \$64,%rsp 3729 popfq 3730 pop %r15 3731 pop %r14 3732 pop %r13 3733 pop %r12 3734 pop %rbp 3735 pop %rbx 3736 pop %rdi 3737 pop %rsi 3738 ret 3739.size mul_handler,.-mul_handler 3740 3741.section .pdata 3742.align 4 3743 .rva .LSEH_begin_bn_mul_mont_gather5 3744 .rva .LSEH_end_bn_mul_mont_gather5 3745 .rva .LSEH_info_bn_mul_mont_gather5 3746 3747 .rva .LSEH_begin_bn_mul4x_mont_gather5 3748 .rva .LSEH_end_bn_mul4x_mont_gather5 3749 .rva .LSEH_info_bn_mul4x_mont_gather5 3750 3751 .rva .LSEH_begin_bn_power5 3752 .rva .LSEH_end_bn_power5 3753 .rva .LSEH_info_bn_power5 3754 3755 .rva .LSEH_begin_bn_from_mont8x 3756 .rva .LSEH_end_bn_from_mont8x 3757 .rva .LSEH_info_bn_from_mont8x 3758___ 3759$code.=<<___ if ($addx); 3760 .rva .LSEH_begin_bn_mulx4x_mont_gather5 3761 .rva .LSEH_end_bn_mulx4x_mont_gather5 3762 .rva .LSEH_info_bn_mulx4x_mont_gather5 3763 3764 .rva .LSEH_begin_bn_powerx5 3765 .rva .LSEH_end_bn_powerx5 3766 .rva .LSEH_info_bn_powerx5 3767___ 3768$code.=<<___; 3769 .rva .LSEH_begin_bn_gather5 3770 .rva .LSEH_end_bn_gather5 3771 .rva .LSEH_info_bn_gather5 3772 3773.section .xdata 3774.align 8 3775.LSEH_info_bn_mul_mont_gather5: 3776 .byte 9,0,0,0 3777 .rva mul_handler 3778 .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] 3779.align 8 3780.LSEH_info_bn_mul4x_mont_gather5: 3781 .byte 9,0,0,0 3782 .rva mul_handler 3783 .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 3784.align 8 3785.LSEH_info_bn_power5: 3786 .byte 9,0,0,0 3787 .rva mul_handler 3788 .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] 3789.align 8 3790.LSEH_info_bn_from_mont8x: 3791 .byte 9,0,0,0 3792 .rva mul_handler 3793 .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] 3794___ 3795$code.=<<___ if ($addx); 3796.align 8 3797.LSEH_info_bn_mulx4x_mont_gather5: 3798 .byte 9,0,0,0 3799 .rva mul_handler 3800 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] 3801.align 8 3802.LSEH_info_bn_powerx5: 3803 .byte 9,0,0,0 3804 .rva mul_handler 3805 .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] 3806___ 3807$code.=<<___; 3808.align 8 3809.LSEH_info_bn_gather5: 3810 .byte 0x01,0x0b,0x03,0x0a 3811 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 3812 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) 3813.align 8 3814___ 3815} 3816 3817$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3818 3819print $code; 3820close STDOUT; 3821