#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scattering/gathering can be tuned without
# bn_exp.c modifications.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
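
######################################################################
# All gather5 subroutines fetch b[i] from the powers table without any
# index-dependent memory addresses or branches: the index is compared
# against 0..31 with pcmpeqd, the resulting masks are saved on the
# stack, and every column of the table is ANDed with its mask and ORed
# into the result. A rough Perl reference model of that selection
# (sketch only; gather_ref is a hypothetical helper, not part of this
# module):
#
#	sub gather_ref {
#	    my ($table, $idx) = @_;	# 32-entry table, 0 <= $idx < 32
#	    my $r = 0;
#	    for my $k (0 .. 31) {
#	        my $mask = ($k == $idx) ? ~0 : 0;	# pcmpeqd analogue
#	        $r |= $table->[$k] & $mask;		# pand + por analogue
#	    }
#	    return $r;			# equals $table->[$idx]
#	}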
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	test	\$7,${num}d
	jnz	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farmost one
	# can be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that a villain thread hits
	# the guard page before it can do damage to an innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	lea	.Linc(%rip),%r10
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:

	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
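# The unrolled chunk above already folded in the last four columns of
# the powers table while their masks were still in registers; the loop
# below emits the rest of the gather: each remaining 16-byte column at
# `16*$k-128`($bp) is ANDed with its mask saved at `16*$k+112`(%r10)
# and OR-accumulated into %xmm0/%xmm1, so the whole 256-byte stride is
# read regardless of the index (for $k==4, say, column 4 is read at
# -64($bp) and its mask at 176(%r10)).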
210$code.=<<___; 211 movdqa `16*($k+0)-128`($bp),%xmm4 212 movdqa `16*($k+1)-128`($bp),%xmm5 213 movdqa `16*($k+2)-128`($bp),%xmm2 214 pand `16*($k+0)+112`(%r10),%xmm4 215 movdqa `16*($k+3)-128`($bp),%xmm3 216 pand `16*($k+1)+112`(%r10),%xmm5 217 por %xmm4,%xmm0 218 pand `16*($k+2)+112`(%r10),%xmm2 219 por %xmm5,%xmm1 220 pand `16*($k+3)+112`(%r10),%xmm3 221 por %xmm2,%xmm0 222 por %xmm3,%xmm1 223___ 224} 225$code.=<<___; 226 por %xmm1,%xmm0 227 pshufd \$0x4e,%xmm0,%xmm1 228 por %xmm1,%xmm0 229 lea $STRIDE($bp),$bp 230 movq %xmm0,$m0 # m0=bp[0] 231 232 mov ($n0),$n0 # pull n0[0] value 233 mov ($ap),%rax 234 235 xor $i,$i # i=0 236 xor $j,$j # j=0 237 238 mov $n0,$m1 239 mulq $m0 # ap[0]*bp[0] 240 mov %rax,$lo0 241 mov ($np),%rax 242 243 imulq $lo0,$m1 # "tp[0]"*n0 244 mov %rdx,$hi0 245 246 mulq $m1 # np[0]*m1 247 add %rax,$lo0 # discarded 248 mov 8($ap),%rax 249 adc \$0,%rdx 250 mov %rdx,$hi1 251 252 lea 1($j),$j # j++ 253 jmp .L1st_enter 254 255.align 16 256.L1st: 257 add %rax,$hi1 258 mov ($ap,$j,8),%rax 259 adc \$0,%rdx 260 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 261 mov $lo0,$hi0 262 adc \$0,%rdx 263 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 264 mov %rdx,$hi1 265 266.L1st_enter: 267 mulq $m0 # ap[j]*bp[0] 268 add %rax,$hi0 269 mov ($np,$j,8),%rax 270 adc \$0,%rdx 271 lea 1($j),$j # j++ 272 mov %rdx,$lo0 273 274 mulq $m1 # np[j]*m1 275 cmp $num,$j 276 jne .L1st # note that upon exit $j==$num, so 277 # they can be used interchangeably 278 279 add %rax,$hi1 280 adc \$0,%rdx 281 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 282 adc \$0,%rdx 283 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 284 mov %rdx,$hi1 285 mov $lo0,$hi0 286 287 xor %rdx,%rdx 288 add $hi0,$hi1 289 adc \$0,%rdx 290 mov $hi1,-8(%rsp,$num,8) 291 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 292 293 lea 1($i),$i # i++ 294 jmp .Louter 295.align 16 296.Louter: 297 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 298 and \$-16,%rdx 299 pxor %xmm4,%xmm4 300 pxor %xmm5,%xmm5 301___ 302for($k=0;$k<$STRIDE/16;$k+=4) { 303$code.=<<___; 304 movdqa `16*($k+0)-128`($bp),%xmm0 305 movdqa `16*($k+1)-128`($bp),%xmm1 306 movdqa `16*($k+2)-128`($bp),%xmm2 307 movdqa `16*($k+3)-128`($bp),%xmm3 308 pand `16*($k+0)-128`(%rdx),%xmm0 309 pand `16*($k+1)-128`(%rdx),%xmm1 310 por %xmm0,%xmm4 311 pand `16*($k+2)-128`(%rdx),%xmm2 312 por %xmm1,%xmm5 313 pand `16*($k+3)-128`(%rdx),%xmm3 314 por %xmm2,%xmm4 315 por %xmm3,%xmm5 316___ 317} 318$code.=<<___; 319 por %xmm5,%xmm4 320 pshufd \$0x4e,%xmm4,%xmm0 321 por %xmm4,%xmm0 322 lea $STRIDE($bp),$bp 323 324 mov ($ap),%rax # ap[0] 325 movq %xmm0,$m0 # m0=bp[i] 326 327 xor $j,$j # j=0 328 mov $n0,$m1 329 mov (%rsp),$lo0 330 331 mulq $m0 # ap[0]*bp[i] 332 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 333 mov ($np),%rax 334 adc \$0,%rdx 335 336 imulq $lo0,$m1 # tp[0]*n0 337 mov %rdx,$hi0 338 339 mulq $m1 # np[0]*m1 340 add %rax,$lo0 # discarded 341 mov 8($ap),%rax 342 adc \$0,%rdx 343 mov 8(%rsp),$lo0 # tp[1] 344 mov %rdx,$hi1 345 346 lea 1($j),$j # j++ 347 jmp .Linner_enter 348 349.align 16 350.Linner: 351 add %rax,$hi1 352 mov ($ap,$j,8),%rax 353 adc \$0,%rdx 354 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 355 mov (%rsp,$j,8),$lo0 356 adc \$0,%rdx 357 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 358 mov %rdx,$hi1 359 360.Linner_enter: 361 mulq $m0 # ap[j]*bp[i] 362 add %rax,$hi0 363 mov ($np,$j,8),%rax 364 adc \$0,%rdx 365 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 366 mov %rdx,$hi0 367 adc \$0,$hi0 368 lea 1($j),$j # j++ 369 370 mulq $m1 # np[j]*m1 371 cmp $num,$j 372 jne .Linner # note that upon exit $j==$num, so 373 # they can 
be used interchangeably 374 add %rax,$hi1 375 adc \$0,%rdx 376 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 377 mov (%rsp,$num,8),$lo0 378 adc \$0,%rdx 379 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 380 mov %rdx,$hi1 381 382 xor %rdx,%rdx 383 add $hi0,$hi1 384 adc \$0,%rdx 385 add $lo0,$hi1 # pull upmost overflow bit 386 adc \$0,%rdx 387 mov $hi1,-8(%rsp,$num,8) 388 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 389 390 lea 1($i),$i # i++ 391 cmp $num,$i 392 jb .Louter 393 394 xor $i,$i # i=0 and clear CF! 395 mov (%rsp),%rax # tp[0] 396 lea (%rsp),$ap # borrow ap for tp 397 mov $num,$j # j=num 398 jmp .Lsub 399.align 16 400.Lsub: sbb ($np,$i,8),%rax 401 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 402 mov 8($ap,$i,8),%rax # tp[i+1] 403 lea 1($i),$i # i++ 404 dec $j # doesnn't affect CF! 405 jnz .Lsub 406 407 sbb \$0,%rax # handle upmost overflow bit 408 mov \$-1,%rbx 409 xor %rax,%rbx 410 xor $i,$i 411 mov $num,$j # j=num 412 413.Lcopy: # conditional copy 414 mov ($rp,$i,8),%rcx 415 mov (%rsp,$i,8),%rdx 416 and %rbx,%rcx 417 and %rax,%rdx 418 mov $i,(%rsp,$i,8) # zap temporary vector 419 or %rcx,%rdx 420 mov %rdx,($rp,$i,8) # rp[i]=tp[i] 421 lea 1($i),$i 422 sub \$1,$j 423 jnz .Lcopy 424 425 mov 8(%rsp,$num,8),%rsi # restore %rsp 426 mov \$1,%rax 427 428 mov -48(%rsi),%r15 429 mov -40(%rsi),%r14 430 mov -32(%rsi),%r13 431 mov -24(%rsi),%r12 432 mov -16(%rsi),%rbp 433 mov -8(%rsi),%rbx 434 lea (%rsi),%rsp 435.Lmul_epilogue: 436 ret 437.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 438___ 439{{{ 440my @A=("%r10","%r11"); 441my @N=("%r13","%rdi"); 442$code.=<<___; 443.type bn_mul4x_mont_gather5,\@function,6 444.align 32 445bn_mul4x_mont_gather5: 446 .byte 0x67 447 mov %rsp,%rax 448.Lmul4x_enter: 449___ 450$code.=<<___ if ($addx); 451 and \$0x80108,%r11d 452 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 453 je .Lmulx4x_enter 454___ 455$code.=<<___; 456 push %rbx 457 push %rbp 458 push %r12 459 push %r13 460 push %r14 461 push %r15 462.Lmul4x_prologue: 463 464 .byte 0x67 465 shl \$3,${num}d # convert $num to bytes 466 lea ($num,$num,2),%r10 # 3*$num in bytes 467 neg $num # -$num 468 469 ############################################################## 470 # Ensure that stack frame doesn't alias with $rptr+3*$num 471 # modulo 4096, which covers ret[num], am[num] and n[num] 472 # (see bn_exp.c). This is done to allow memory disambiguation 473 # logic do its magic. [Extra [num] is allocated in order 474 # to align with bn_power5's frame, which is cleansed after 475 # completing exponentiation. Extra 256 bytes is for power mask 476 # calculated from 7th argument, the index.] 
477 # 478 lea -320(%rsp,$num,2),%r11 479 mov %rsp,%rbp 480 sub $rp,%r11 481 and \$4095,%r11 482 cmp %r11,%r10 483 jb .Lmul4xsp_alt 484 sub %r11,%rbp # align with $rp 485 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 486 jmp .Lmul4xsp_done 487 488.align 32 489.Lmul4xsp_alt: 490 lea 4096-320(,$num,2),%r10 491 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 492 sub %r10,%r11 493 mov \$0,%r10 494 cmovc %r10,%r11 495 sub %r11,%rbp 496.Lmul4xsp_done: 497 and \$-64,%rbp 498 mov %rsp,%r11 499 sub %rbp,%r11 500 and \$-4096,%r11 501 lea (%rbp,%r11),%rsp 502 mov (%rsp),%r10 503 cmp %rbp,%rsp 504 ja .Lmul4x_page_walk 505 jmp .Lmul4x_page_walk_done 506 507.Lmul4x_page_walk: 508 lea -4096(%rsp),%rsp 509 mov (%rsp),%r10 510 cmp %rbp,%rsp 511 ja .Lmul4x_page_walk 512.Lmul4x_page_walk_done: 513 514 neg $num 515 516 mov %rax,40(%rsp) 517.Lmul4x_body: 518 519 call mul4x_internal 520 521 mov 40(%rsp),%rsi # restore %rsp 522 mov \$1,%rax 523 524 mov -48(%rsi),%r15 525 mov -40(%rsi),%r14 526 mov -32(%rsi),%r13 527 mov -24(%rsi),%r12 528 mov -16(%rsi),%rbp 529 mov -8(%rsi),%rbx 530 lea (%rsi),%rsp 531.Lmul4x_epilogue: 532 ret 533.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 534 535.type mul4x_internal,\@abi-omnipotent 536.align 32 537mul4x_internal: 538 shl \$5,$num # $num was in bytes 539 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 540 lea .Linc(%rip),%rax 541 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 542 shr \$5,$num # restore $num 543___ 544 $bp="%r12"; 545 $STRIDE=2**5*8; # 5 is "window size" 546 $N=$STRIDE/4; # should match cache line size 547 $tp=$i; 548$code.=<<___; 549 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 550 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 551 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 552 lea 128(%rdx),$bp # size optimization 553 554 pshufd \$0,%xmm5,%xmm5 # broadcast index 555 movdqa %xmm1,%xmm4 556 .byte 0x67,0x67 557 movdqa %xmm1,%xmm2 558___ 559######################################################################## 560# calculate mask by comparing 0..31 to index and save result to stack 561# 562$code.=<<___; 563 paddd %xmm0,%xmm1 564 pcmpeqd %xmm5,%xmm0 # compare to 1,0 565 .byte 0x67 566 movdqa %xmm4,%xmm3 567___ 568for($i=0;$i<$STRIDE/16-4;$i+=4) { 569$code.=<<___; 570 paddd %xmm1,%xmm2 571 pcmpeqd %xmm5,%xmm1 # compare to 3,2 572 movdqa %xmm0,`16*($i+0)+112`(%r10) 573 movdqa %xmm4,%xmm0 574 575 paddd %xmm2,%xmm3 576 pcmpeqd %xmm5,%xmm2 # compare to 5,4 577 movdqa %xmm1,`16*($i+1)+112`(%r10) 578 movdqa %xmm4,%xmm1 579 580 paddd %xmm3,%xmm0 581 pcmpeqd %xmm5,%xmm3 # compare to 7,6 582 movdqa %xmm2,`16*($i+2)+112`(%r10) 583 movdqa %xmm4,%xmm2 584 585 paddd %xmm0,%xmm1 586 pcmpeqd %xmm5,%xmm0 587 movdqa %xmm3,`16*($i+3)+112`(%r10) 588 movdqa %xmm4,%xmm3 589___ 590} 591$code.=<<___; # last iteration can be optimized 592 paddd %xmm1,%xmm2 593 pcmpeqd %xmm5,%xmm1 594 movdqa %xmm0,`16*($i+0)+112`(%r10) 595 596 paddd %xmm2,%xmm3 597 .byte 0x67 598 pcmpeqd %xmm5,%xmm2 599 movdqa %xmm1,`16*($i+1)+112`(%r10) 600 601 pcmpeqd %xmm5,%xmm3 602 movdqa %xmm2,`16*($i+2)+112`(%r10) 603 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 604 605 pand `16*($i+1)-128`($bp),%xmm1 606 pand `16*($i+2)-128`($bp),%xmm2 607 movdqa %xmm3,`16*($i+3)+112`(%r10) 608 pand `16*($i+3)-128`($bp),%xmm3 609 por %xmm2,%xmm0 610 por %xmm3,%xmm1 611___ 612for($i=0;$i<$STRIDE/16-4;$i+=4) { 613$code.=<<___; 614 movdqa `16*($i+0)-128`($bp),%xmm4 615 movdqa 
`16*($i+1)-128`($bp),%xmm5 616 movdqa `16*($i+2)-128`($bp),%xmm2 617 pand `16*($i+0)+112`(%r10),%xmm4 618 movdqa `16*($i+3)-128`($bp),%xmm3 619 pand `16*($i+1)+112`(%r10),%xmm5 620 por %xmm4,%xmm0 621 pand `16*($i+2)+112`(%r10),%xmm2 622 por %xmm5,%xmm1 623 pand `16*($i+3)+112`(%r10),%xmm3 624 por %xmm2,%xmm0 625 por %xmm3,%xmm1 626___ 627} 628$code.=<<___; 629 por %xmm1,%xmm0 630 pshufd \$0x4e,%xmm0,%xmm1 631 por %xmm1,%xmm0 632 lea $STRIDE($bp),$bp 633 movq %xmm0,$m0 # m0=bp[0] 634 635 mov %r13,16+8(%rsp) # save end of b[num] 636 mov $rp, 56+8(%rsp) # save $rp 637 638 mov ($n0),$n0 # pull n0[0] value 639 mov ($ap),%rax 640 lea ($ap,$num),$ap # end of a[num] 641 neg $num 642 643 mov $n0,$m1 644 mulq $m0 # ap[0]*bp[0] 645 mov %rax,$A[0] 646 mov ($np),%rax 647 648 imulq $A[0],$m1 # "tp[0]"*n0 649 lea 64+8(%rsp),$tp 650 mov %rdx,$A[1] 651 652 mulq $m1 # np[0]*m1 653 add %rax,$A[0] # discarded 654 mov 8($ap,$num),%rax 655 adc \$0,%rdx 656 mov %rdx,$N[1] 657 658 mulq $m0 659 add %rax,$A[1] 660 mov 8*1($np),%rax 661 adc \$0,%rdx 662 mov %rdx,$A[0] 663 664 mulq $m1 665 add %rax,$N[1] 666 mov 16($ap,$num),%rax 667 adc \$0,%rdx 668 add $A[1],$N[1] 669 lea 4*8($num),$j # j=4 670 lea 8*4($np),$np 671 adc \$0,%rdx 672 mov $N[1],($tp) 673 mov %rdx,$N[0] 674 jmp .L1st4x 675 676.align 32 677.L1st4x: 678 mulq $m0 # ap[j]*bp[0] 679 add %rax,$A[0] 680 mov -8*2($np),%rax 681 lea 32($tp),$tp 682 adc \$0,%rdx 683 mov %rdx,$A[1] 684 685 mulq $m1 # np[j]*m1 686 add %rax,$N[0] 687 mov -8($ap,$j),%rax 688 adc \$0,%rdx 689 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 690 adc \$0,%rdx 691 mov $N[0],-24($tp) # tp[j-1] 692 mov %rdx,$N[1] 693 694 mulq $m0 # ap[j]*bp[0] 695 add %rax,$A[1] 696 mov -8*1($np),%rax 697 adc \$0,%rdx 698 mov %rdx,$A[0] 699 700 mulq $m1 # np[j]*m1 701 add %rax,$N[1] 702 mov ($ap,$j),%rax 703 adc \$0,%rdx 704 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 705 adc \$0,%rdx 706 mov $N[1],-16($tp) # tp[j-1] 707 mov %rdx,$N[0] 708 709 mulq $m0 # ap[j]*bp[0] 710 add %rax,$A[0] 711 mov 8*0($np),%rax 712 adc \$0,%rdx 713 mov %rdx,$A[1] 714 715 mulq $m1 # np[j]*m1 716 add %rax,$N[0] 717 mov 8($ap,$j),%rax 718 adc \$0,%rdx 719 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 720 adc \$0,%rdx 721 mov $N[0],-8($tp) # tp[j-1] 722 mov %rdx,$N[1] 723 724 mulq $m0 # ap[j]*bp[0] 725 add %rax,$A[1] 726 mov 8*1($np),%rax 727 adc \$0,%rdx 728 mov %rdx,$A[0] 729 730 mulq $m1 # np[j]*m1 731 add %rax,$N[1] 732 mov 16($ap,$j),%rax 733 adc \$0,%rdx 734 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 735 lea 8*4($np),$np 736 adc \$0,%rdx 737 mov $N[1],($tp) # tp[j-1] 738 mov %rdx,$N[0] 739 740 add \$32,$j # j+=4 741 jnz .L1st4x 742 743 mulq $m0 # ap[j]*bp[0] 744 add %rax,$A[0] 745 mov -8*2($np),%rax 746 lea 32($tp),$tp 747 adc \$0,%rdx 748 mov %rdx,$A[1] 749 750 mulq $m1 # np[j]*m1 751 add %rax,$N[0] 752 mov -8($ap),%rax 753 adc \$0,%rdx 754 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 755 adc \$0,%rdx 756 mov $N[0],-24($tp) # tp[j-1] 757 mov %rdx,$N[1] 758 759 mulq $m0 # ap[j]*bp[0] 760 add %rax,$A[1] 761 mov -8*1($np),%rax 762 adc \$0,%rdx 763 mov %rdx,$A[0] 764 765 mulq $m1 # np[j]*m1 766 add %rax,$N[1] 767 mov ($ap,$num),%rax # ap[0] 768 adc \$0,%rdx 769 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 770 adc \$0,%rdx 771 mov $N[1],-16($tp) # tp[j-1] 772 mov %rdx,$N[0] 773 774 lea ($np,$num),$np # rewind $np 775 776 xor $N[1],$N[1] 777 add $A[0],$N[0] 778 adc \$0,$N[1] 779 mov $N[0],-8($tp) 780 781 jmp .Louter4x 782 783.align 32 784.Louter4x: 785 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 786 pxor %xmm4,%xmm4 787 pxor %xmm5,%xmm5 788___ 
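######################################################################
# The loop below emits the same masked gather for the next b[i]: every
# 16-byte column of the current 256-byte slice of the powers table is
# ANDed with its mask (addressed via %rdx) and OR-accumulated, leaving
# b[i] in %xmm0. Each pass of .Louter4x then folds that b[i] into the
# running Montgomery product; as a word-level sketch of one outer
# iteration (reference recurrence only, word size 2^64):
#
#	m0 = b[i]				# the gathered value
#	m1 = (tp[0] + a[0]*m0) * n0  mod 2^64
#	tp = (tp + a[]*m0 + n[]*m1) / 2^64	# exact, low word becomes 0
#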
789for($i=0;$i<$STRIDE/16;$i+=4) { 790$code.=<<___; 791 movdqa `16*($i+0)-128`($bp),%xmm0 792 movdqa `16*($i+1)-128`($bp),%xmm1 793 movdqa `16*($i+2)-128`($bp),%xmm2 794 movdqa `16*($i+3)-128`($bp),%xmm3 795 pand `16*($i+0)-128`(%rdx),%xmm0 796 pand `16*($i+1)-128`(%rdx),%xmm1 797 por %xmm0,%xmm4 798 pand `16*($i+2)-128`(%rdx),%xmm2 799 por %xmm1,%xmm5 800 pand `16*($i+3)-128`(%rdx),%xmm3 801 por %xmm2,%xmm4 802 por %xmm3,%xmm5 803___ 804} 805$code.=<<___; 806 por %xmm5,%xmm4 807 pshufd \$0x4e,%xmm4,%xmm0 808 por %xmm4,%xmm0 809 lea $STRIDE($bp),$bp 810 movq %xmm0,$m0 # m0=bp[i] 811 812 mov ($tp,$num),$A[0] 813 mov $n0,$m1 814 mulq $m0 # ap[0]*bp[i] 815 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 816 mov ($np),%rax 817 adc \$0,%rdx 818 819 imulq $A[0],$m1 # tp[0]*n0 820 mov %rdx,$A[1] 821 mov $N[1],($tp) # store upmost overflow bit 822 823 lea ($tp,$num),$tp # rewind $tp 824 825 mulq $m1 # np[0]*m1 826 add %rax,$A[0] # "$N[0]", discarded 827 mov 8($ap,$num),%rax 828 adc \$0,%rdx 829 mov %rdx,$N[1] 830 831 mulq $m0 # ap[j]*bp[i] 832 add %rax,$A[1] 833 mov 8*1($np),%rax 834 adc \$0,%rdx 835 add 8($tp),$A[1] # +tp[1] 836 adc \$0,%rdx 837 mov %rdx,$A[0] 838 839 mulq $m1 # np[j]*m1 840 add %rax,$N[1] 841 mov 16($ap,$num),%rax 842 adc \$0,%rdx 843 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 844 lea 4*8($num),$j # j=4 845 lea 8*4($np),$np 846 adc \$0,%rdx 847 mov %rdx,$N[0] 848 jmp .Linner4x 849 850.align 32 851.Linner4x: 852 mulq $m0 # ap[j]*bp[i] 853 add %rax,$A[0] 854 mov -8*2($np),%rax 855 adc \$0,%rdx 856 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 857 lea 32($tp),$tp 858 adc \$0,%rdx 859 mov %rdx,$A[1] 860 861 mulq $m1 # np[j]*m1 862 add %rax,$N[0] 863 mov -8($ap,$j),%rax 864 adc \$0,%rdx 865 add $A[0],$N[0] 866 adc \$0,%rdx 867 mov $N[1],-32($tp) # tp[j-1] 868 mov %rdx,$N[1] 869 870 mulq $m0 # ap[j]*bp[i] 871 add %rax,$A[1] 872 mov -8*1($np),%rax 873 adc \$0,%rdx 874 add -8($tp),$A[1] 875 adc \$0,%rdx 876 mov %rdx,$A[0] 877 878 mulq $m1 # np[j]*m1 879 add %rax,$N[1] 880 mov ($ap,$j),%rax 881 adc \$0,%rdx 882 add $A[1],$N[1] 883 adc \$0,%rdx 884 mov $N[0],-24($tp) # tp[j-1] 885 mov %rdx,$N[0] 886 887 mulq $m0 # ap[j]*bp[i] 888 add %rax,$A[0] 889 mov 8*0($np),%rax 890 adc \$0,%rdx 891 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 892 adc \$0,%rdx 893 mov %rdx,$A[1] 894 895 mulq $m1 # np[j]*m1 896 add %rax,$N[0] 897 mov 8($ap,$j),%rax 898 adc \$0,%rdx 899 add $A[0],$N[0] 900 adc \$0,%rdx 901 mov $N[1],-16($tp) # tp[j-1] 902 mov %rdx,$N[1] 903 904 mulq $m0 # ap[j]*bp[i] 905 add %rax,$A[1] 906 mov 8*1($np),%rax 907 adc \$0,%rdx 908 add 8($tp),$A[1] 909 adc \$0,%rdx 910 mov %rdx,$A[0] 911 912 mulq $m1 # np[j]*m1 913 add %rax,$N[1] 914 mov 16($ap,$j),%rax 915 adc \$0,%rdx 916 add $A[1],$N[1] 917 lea 8*4($np),$np 918 adc \$0,%rdx 919 mov $N[0],-8($tp) # tp[j-1] 920 mov %rdx,$N[0] 921 922 add \$32,$j # j+=4 923 jnz .Linner4x 924 925 mulq $m0 # ap[j]*bp[i] 926 add %rax,$A[0] 927 mov -8*2($np),%rax 928 adc \$0,%rdx 929 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 930 lea 32($tp),$tp 931 adc \$0,%rdx 932 mov %rdx,$A[1] 933 934 mulq $m1 # np[j]*m1 935 add %rax,$N[0] 936 mov -8($ap),%rax 937 adc \$0,%rdx 938 add $A[0],$N[0] 939 adc \$0,%rdx 940 mov $N[1],-32($tp) # tp[j-1] 941 mov %rdx,$N[1] 942 943 mulq $m0 # ap[j]*bp[i] 944 add %rax,$A[1] 945 mov $m1,%rax 946 mov -8*1($np),$m1 947 adc \$0,%rdx 948 add -8($tp),$A[1] 949 adc \$0,%rdx 950 mov %rdx,$A[0] 951 952 mulq $m1 # np[j]*m1 953 add %rax,$N[1] 954 mov ($ap,$num),%rax # ap[0] 955 adc \$0,%rdx 956 add $A[1],$N[1] 957 adc \$0,%rdx 958 mov $N[0],-24($tp) # tp[j-1] 959 mov %rdx,$N[0] 
960 961 mov $N[1],-16($tp) # tp[j-1] 962 lea ($np,$num),$np # rewind $np 963 964 xor $N[1],$N[1] 965 add $A[0],$N[0] 966 adc \$0,$N[1] 967 add ($tp),$N[0] # pull upmost overflow bit 968 adc \$0,$N[1] # upmost overflow bit 969 mov $N[0],-8($tp) 970 971 cmp 16+8(%rsp),$bp 972 jb .Louter4x 973___ 974if (1) { 975$code.=<<___; 976 xor %rax,%rax 977 sub $N[0],$m1 # compare top-most words 978 adc $j,$j # $j is zero 979 or $j,$N[1] 980 sub $N[1],%rax # %rax=-$N[1] 981 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 982 mov ($np),%r12 983 lea ($np),%rbp # nptr in .sqr4x_sub 984 mov %r9,%rcx 985 sar \$3+2,%rcx 986 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 987 dec %r12 # so that after 'not' we get -n[0] 988 xor %r10,%r10 989 mov 8*1(%rbp),%r13 990 mov 8*2(%rbp),%r14 991 mov 8*3(%rbp),%r15 992 jmp .Lsqr4x_sub_entry 993___ 994} else { 995my @ri=("%rax",$bp,$m0,$m1); 996my $rp="%rdx"; 997$code.=<<___ 998 xor \$1,$N[1] 999 lea ($tp,$num),$tp # rewind $tp 1000 sar \$5,$num # cf=0 1001 lea ($np,$N[1],8),$np 1002 mov 56+8(%rsp),$rp # restore $rp 1003 jmp .Lsub4x 1004 1005.align 32 1006.Lsub4x: 1007 .byte 0x66 1008 mov 8*0($tp),@ri[0] 1009 mov 8*1($tp),@ri[1] 1010 .byte 0x66 1011 sbb 16*0($np),@ri[0] 1012 mov 8*2($tp),@ri[2] 1013 sbb 16*1($np),@ri[1] 1014 mov 3*8($tp),@ri[3] 1015 lea 4*8($tp),$tp 1016 sbb 16*2($np),@ri[2] 1017 mov @ri[0],8*0($rp) 1018 sbb 16*3($np),@ri[3] 1019 lea 16*4($np),$np 1020 mov @ri[1],8*1($rp) 1021 mov @ri[2],8*2($rp) 1022 mov @ri[3],8*3($rp) 1023 lea 8*4($rp),$rp 1024 1025 inc $num 1026 jnz .Lsub4x 1027 1028 ret 1029___ 1030} 1031$code.=<<___; 1032.size mul4x_internal,.-mul4x_internal 1033___ 1034}}} 1035{{{ 1036###################################################################### 1037# void bn_power5( 1038my $rptr="%rdi"; # BN_ULONG *rptr, 1039my $aptr="%rsi"; # const BN_ULONG *aptr, 1040my $bptr="%rdx"; # const void *table, 1041my $nptr="%rcx"; # const BN_ULONG *nptr, 1042my $n0 ="%r8"; # const BN_ULONG *n0); 1043my $num ="%r9"; # int num, has to be divisible by 8 1044 # int pwr 1045 1046my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1047my @A0=("%r10","%r11"); 1048my @A1=("%r12","%r13"); 1049my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1050 1051$code.=<<___; 1052.globl bn_power5 1053.type bn_power5,\@function,6 1054.align 32 1055bn_power5: 1056 mov %rsp,%rax 1057___ 1058$code.=<<___ if ($addx); 1059 mov OPENSSL_ia32cap_P+8(%rip),%r11d 1060 and \$0x80108,%r11d 1061 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1062 je .Lpowerx5_enter 1063___ 1064$code.=<<___; 1065 push %rbx 1066 push %rbp 1067 push %r12 1068 push %r13 1069 push %r14 1070 push %r15 1071.Lpower5_prologue: 1072 1073 shl \$3,${num}d # convert $num to bytes 1074 lea ($num,$num,2),%r10d # 3*$num 1075 neg $num 1076 mov ($n0),$n0 # *n0 1077 1078 ############################################################## 1079 # Ensure that stack frame doesn't alias with $rptr+3*$num 1080 # modulo 4096, which covers ret[num], am[num] and n[num] 1081 # (see bn_exp.c). This is done to allow memory disambiguation 1082 # logic do its magic. [Extra 256 bytes is for power mask 1083 # calculated from 7th argument, the index.] 
1084 # 1085 lea -320(%rsp,$num,2),%r11 1086 mov %rsp,%rbp 1087 sub $rptr,%r11 1088 and \$4095,%r11 1089 cmp %r11,%r10 1090 jb .Lpwr_sp_alt 1091 sub %r11,%rbp # align with $aptr 1092 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1093 jmp .Lpwr_sp_done 1094 1095.align 32 1096.Lpwr_sp_alt: 1097 lea 4096-320(,$num,2),%r10 1098 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1099 sub %r10,%r11 1100 mov \$0,%r10 1101 cmovc %r10,%r11 1102 sub %r11,%rbp 1103.Lpwr_sp_done: 1104 and \$-64,%rbp 1105 mov %rsp,%r11 1106 sub %rbp,%r11 1107 and \$-4096,%r11 1108 lea (%rbp,%r11),%rsp 1109 mov (%rsp),%r10 1110 cmp %rbp,%rsp 1111 ja .Lpwr_page_walk 1112 jmp .Lpwr_page_walk_done 1113 1114.Lpwr_page_walk: 1115 lea -4096(%rsp),%rsp 1116 mov (%rsp),%r10 1117 cmp %rbp,%rsp 1118 ja .Lpwr_page_walk 1119.Lpwr_page_walk_done: 1120 1121 mov $num,%r10 1122 neg $num 1123 1124 ############################################################## 1125 # Stack layout 1126 # 1127 # +0 saved $num, used in reduction section 1128 # +8 &t[2*$num], used in reduction section 1129 # +32 saved *n0 1130 # +40 saved %rsp 1131 # +48 t[2*$num] 1132 # 1133 mov $n0, 32(%rsp) 1134 mov %rax, 40(%rsp) # save original %rsp 1135.Lpower5_body: 1136 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1137 movq $nptr,%xmm2 # save $nptr 1138 movq %r10, %xmm3 # -$num, used in sqr8x 1139 movq $bptr,%xmm4 1140 1141 call __bn_sqr8x_internal 1142 call __bn_post4x_internal 1143 call __bn_sqr8x_internal 1144 call __bn_post4x_internal 1145 call __bn_sqr8x_internal 1146 call __bn_post4x_internal 1147 call __bn_sqr8x_internal 1148 call __bn_post4x_internal 1149 call __bn_sqr8x_internal 1150 call __bn_post4x_internal 1151 1152 movq %xmm2,$nptr 1153 movq %xmm4,$bptr 1154 mov $aptr,$rptr 1155 mov 40(%rsp),%rax 1156 lea 32(%rsp),$n0 1157 1158 call mul4x_internal 1159 1160 mov 40(%rsp),%rsi # restore %rsp 1161 mov \$1,%rax 1162 mov -48(%rsi),%r15 1163 mov -40(%rsi),%r14 1164 mov -32(%rsi),%r13 1165 mov -24(%rsi),%r12 1166 mov -16(%rsi),%rbp 1167 mov -8(%rsi),%rbx 1168 lea (%rsi),%rsp 1169.Lpower5_epilogue: 1170 ret 1171.size bn_power5,.-bn_power5 1172 1173.globl bn_sqr8x_internal 1174.hidden bn_sqr8x_internal 1175.type bn_sqr8x_internal,\@abi-omnipotent 1176.align 32 1177bn_sqr8x_internal: 1178__bn_sqr8x_internal: 1179 ############################################################## 1180 # Squaring part: 1181 # 1182 # a) multiply-n-add everything but a[i]*a[i]; 1183 # b) shift result of a) by 1 to the left and accumulate 1184 # a[i]*a[i] products; 1185 # 1186 ############################################################## 1187 # a[1]a[0] 1188 # a[2]a[0] 1189 # a[3]a[0] 1190 # a[2]a[1] 1191 # a[4]a[0] 1192 # a[3]a[1] 1193 # a[5]a[0] 1194 # a[4]a[1] 1195 # a[3]a[2] 1196 # a[6]a[0] 1197 # a[5]a[1] 1198 # a[4]a[2] 1199 # a[7]a[0] 1200 # a[6]a[1] 1201 # a[5]a[2] 1202 # a[4]a[3] 1203 # a[7]a[1] 1204 # a[6]a[2] 1205 # a[5]a[3] 1206 # a[7]a[2] 1207 # a[6]a[3] 1208 # a[5]a[4] 1209 # a[7]a[3] 1210 # a[6]a[4] 1211 # a[7]a[4] 1212 # a[6]a[5] 1213 # a[7]a[5] 1214 # a[7]a[6] 1215 # a[1]a[0] 1216 # a[2]a[0] 1217 # a[3]a[0] 1218 # a[4]a[0] 1219 # a[5]a[0] 1220 # a[6]a[0] 1221 # a[7]a[0] 1222 # a[2]a[1] 1223 # a[3]a[1] 1224 # a[4]a[1] 1225 # a[5]a[1] 1226 # a[6]a[1] 1227 # a[7]a[1] 1228 # a[3]a[2] 1229 # a[4]a[2] 1230 # a[5]a[2] 1231 # a[6]a[2] 1232 # a[7]a[2] 1233 # a[4]a[3] 1234 # a[5]a[3] 1235 # a[6]a[3] 1236 # a[7]a[3] 1237 # a[5]a[4] 1238 # a[6]a[4] 1239 # a[7]a[4] 1240 # a[6]a[5] 1241 # a[7]a[5] 1242 # a[7]a[6] 1243 # a[0]a[0] 1244 # a[1]a[1] 1245 # a[2]a[2] 
1246 # a[3]a[3] 1247 # a[4]a[4] 1248 # a[5]a[5] 1249 # a[6]a[6] 1250 # a[7]a[7] 1251 1252 lea 32(%r10),$i # $i=-($num-32) 1253 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1254 1255 mov $num,$j # $j=$num 1256 1257 # comments apply to $num==8 case 1258 mov -32($aptr,$i),$a0 # a[0] 1259 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1260 mov -24($aptr,$i),%rax # a[1] 1261 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1262 mov -16($aptr,$i),$ai # a[2] 1263 mov %rax,$a1 1264 1265 mul $a0 # a[1]*a[0] 1266 mov %rax,$A0[0] # a[1]*a[0] 1267 mov $ai,%rax # a[2] 1268 mov %rdx,$A0[1] 1269 mov $A0[0],-24($tptr,$i) # t[1] 1270 1271 mul $a0 # a[2]*a[0] 1272 add %rax,$A0[1] 1273 mov $ai,%rax 1274 adc \$0,%rdx 1275 mov $A0[1],-16($tptr,$i) # t[2] 1276 mov %rdx,$A0[0] 1277 1278 1279 mov -8($aptr,$i),$ai # a[3] 1280 mul $a1 # a[2]*a[1] 1281 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1282 mov $ai,%rax 1283 mov %rdx,$A1[1] 1284 1285 lea ($i),$j 1286 mul $a0 # a[3]*a[0] 1287 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1288 mov $ai,%rax 1289 mov %rdx,$A0[1] 1290 adc \$0,$A0[1] 1291 add $A1[0],$A0[0] 1292 adc \$0,$A0[1] 1293 mov $A0[0],-8($tptr,$j) # t[3] 1294 jmp .Lsqr4x_1st 1295 1296.align 32 1297.Lsqr4x_1st: 1298 mov ($aptr,$j),$ai # a[4] 1299 mul $a1 # a[3]*a[1] 1300 add %rax,$A1[1] # a[3]*a[1]+t[4] 1301 mov $ai,%rax 1302 mov %rdx,$A1[0] 1303 adc \$0,$A1[0] 1304 1305 mul $a0 # a[4]*a[0] 1306 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1307 mov $ai,%rax # a[3] 1308 mov 8($aptr,$j),$ai # a[5] 1309 mov %rdx,$A0[0] 1310 adc \$0,$A0[0] 1311 add $A1[1],$A0[1] 1312 adc \$0,$A0[0] 1313 1314 1315 mul $a1 # a[4]*a[3] 1316 add %rax,$A1[0] # a[4]*a[3]+t[5] 1317 mov $ai,%rax 1318 mov $A0[1],($tptr,$j) # t[4] 1319 mov %rdx,$A1[1] 1320 adc \$0,$A1[1] 1321 1322 mul $a0 # a[5]*a[2] 1323 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1324 mov $ai,%rax 1325 mov 16($aptr,$j),$ai # a[6] 1326 mov %rdx,$A0[1] 1327 adc \$0,$A0[1] 1328 add $A1[0],$A0[0] 1329 adc \$0,$A0[1] 1330 1331 mul $a1 # a[5]*a[3] 1332 add %rax,$A1[1] # a[5]*a[3]+t[6] 1333 mov $ai,%rax 1334 mov $A0[0],8($tptr,$j) # t[5] 1335 mov %rdx,$A1[0] 1336 adc \$0,$A1[0] 1337 1338 mul $a0 # a[6]*a[2] 1339 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1340 mov $ai,%rax # a[3] 1341 mov 24($aptr,$j),$ai # a[7] 1342 mov %rdx,$A0[0] 1343 adc \$0,$A0[0] 1344 add $A1[1],$A0[1] 1345 adc \$0,$A0[0] 1346 1347 1348 mul $a1 # a[6]*a[5] 1349 add %rax,$A1[0] # a[6]*a[5]+t[7] 1350 mov $ai,%rax 1351 mov $A0[1],16($tptr,$j) # t[6] 1352 mov %rdx,$A1[1] 1353 adc \$0,$A1[1] 1354 lea 32($j),$j 1355 1356 mul $a0 # a[7]*a[4] 1357 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1358 mov $ai,%rax 1359 mov %rdx,$A0[1] 1360 adc \$0,$A0[1] 1361 add $A1[0],$A0[0] 1362 adc \$0,$A0[1] 1363 mov $A0[0],-8($tptr,$j) # t[7] 1364 1365 cmp \$0,$j 1366 jne .Lsqr4x_1st 1367 1368 mul $a1 # a[7]*a[5] 1369 add %rax,$A1[1] 1370 lea 16($i),$i 1371 adc \$0,%rdx 1372 add $A0[1],$A1[1] 1373 adc \$0,%rdx 1374 1375 mov $A1[1],($tptr) # t[8] 1376 mov %rdx,$A1[0] 1377 mov %rdx,8($tptr) # t[9] 1378 jmp .Lsqr4x_outer 1379 1380.align 32 1381.Lsqr4x_outer: # comments apply to $num==6 case 1382 mov -32($aptr,$i),$a0 # a[0] 1383 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1384 mov -24($aptr,$i),%rax # a[1] 1385 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1386 mov -16($aptr,$i),$ai # a[2] 1387 mov %rax,$a1 1388 1389 mul $a0 # a[1]*a[0] 1390 mov -24($tptr,$i),$A0[0] # t[1] 1391 add %rax,$A0[0] # a[1]*a[0]+t[1] 1392 mov $ai,%rax # a[2] 1393 adc \$0,%rdx 1394 mov 
$A0[0],-24($tptr,$i) # t[1] 1395 mov %rdx,$A0[1] 1396 1397 mul $a0 # a[2]*a[0] 1398 add %rax,$A0[1] 1399 mov $ai,%rax 1400 adc \$0,%rdx 1401 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1402 mov %rdx,$A0[0] 1403 adc \$0,$A0[0] 1404 mov $A0[1],-16($tptr,$i) # t[2] 1405 1406 xor $A1[0],$A1[0] 1407 1408 mov -8($aptr,$i),$ai # a[3] 1409 mul $a1 # a[2]*a[1] 1410 add %rax,$A1[0] # a[2]*a[1]+t[3] 1411 mov $ai,%rax 1412 adc \$0,%rdx 1413 add -8($tptr,$i),$A1[0] 1414 mov %rdx,$A1[1] 1415 adc \$0,$A1[1] 1416 1417 mul $a0 # a[3]*a[0] 1418 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1419 mov $ai,%rax 1420 adc \$0,%rdx 1421 add $A1[0],$A0[0] 1422 mov %rdx,$A0[1] 1423 adc \$0,$A0[1] 1424 mov $A0[0],-8($tptr,$i) # t[3] 1425 1426 lea ($i),$j 1427 jmp .Lsqr4x_inner 1428 1429.align 32 1430.Lsqr4x_inner: 1431 mov ($aptr,$j),$ai # a[4] 1432 mul $a1 # a[3]*a[1] 1433 add %rax,$A1[1] # a[3]*a[1]+t[4] 1434 mov $ai,%rax 1435 mov %rdx,$A1[0] 1436 adc \$0,$A1[0] 1437 add ($tptr,$j),$A1[1] 1438 adc \$0,$A1[0] 1439 1440 .byte 0x67 1441 mul $a0 # a[4]*a[0] 1442 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1443 mov $ai,%rax # a[3] 1444 mov 8($aptr,$j),$ai # a[5] 1445 mov %rdx,$A0[0] 1446 adc \$0,$A0[0] 1447 add $A1[1],$A0[1] 1448 adc \$0,$A0[0] 1449 1450 mul $a1 # a[4]*a[3] 1451 add %rax,$A1[0] # a[4]*a[3]+t[5] 1452 mov $A0[1],($tptr,$j) # t[4] 1453 mov $ai,%rax 1454 mov %rdx,$A1[1] 1455 adc \$0,$A1[1] 1456 add 8($tptr,$j),$A1[0] 1457 lea 16($j),$j # j++ 1458 adc \$0,$A1[1] 1459 1460 mul $a0 # a[5]*a[2] 1461 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1462 mov $ai,%rax 1463 adc \$0,%rdx 1464 add $A1[0],$A0[0] 1465 mov %rdx,$A0[1] 1466 adc \$0,$A0[1] 1467 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1468 1469 cmp \$0,$j 1470 jne .Lsqr4x_inner 1471 1472 .byte 0x67 1473 mul $a1 # a[5]*a[3] 1474 add %rax,$A1[1] 1475 adc \$0,%rdx 1476 add $A0[1],$A1[1] 1477 adc \$0,%rdx 1478 1479 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1480 mov %rdx,$A1[0] 1481 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1482 1483 add \$16,$i 1484 jnz .Lsqr4x_outer 1485 1486 # comments apply to $num==4 case 1487 mov -32($aptr),$a0 # a[0] 1488 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1489 mov -24($aptr),%rax # a[1] 1490 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1491 mov -16($aptr),$ai # a[2] 1492 mov %rax,$a1 1493 1494 mul $a0 # a[1]*a[0] 1495 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1496 mov $ai,%rax # a[2] 1497 mov %rdx,$A0[1] 1498 adc \$0,$A0[1] 1499 1500 mul $a0 # a[2]*a[0] 1501 add %rax,$A0[1] 1502 mov $ai,%rax 1503 mov $A0[0],-24($tptr) # t[1] 1504 mov %rdx,$A0[0] 1505 adc \$0,$A0[0] 1506 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1507 mov -8($aptr),$ai # a[3] 1508 adc \$0,$A0[0] 1509 1510 mul $a1 # a[2]*a[1] 1511 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1512 mov $ai,%rax 1513 mov $A0[1],-16($tptr) # t[2] 1514 mov %rdx,$A1[1] 1515 adc \$0,$A1[1] 1516 1517 mul $a0 # a[3]*a[0] 1518 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1519 mov $ai,%rax 1520 mov %rdx,$A0[1] 1521 adc \$0,$A0[1] 1522 add $A1[0],$A0[0] 1523 adc \$0,$A0[1] 1524 mov $A0[0],-8($tptr) # t[3] 1525 1526 mul $a1 # a[3]*a[1] 1527 add %rax,$A1[1] 1528 mov -16($aptr),%rax # a[2] 1529 adc \$0,%rdx 1530 add $A0[1],$A1[1] 1531 adc \$0,%rdx 1532 1533 mov $A1[1],($tptr) # t[4] 1534 mov %rdx,$A1[0] 1535 mov %rdx,8($tptr) # t[5] 1536 1537 mul $ai # a[2]*a[3] 1538___ 1539{ 1540my ($shift,$carry)=($a0,$a1); 1541my @S=(@A1,$ai,$n0); 1542$code.=<<___; 1543 add \$16,$i 1544 xor $shift,$shift 1545 sub $num,$i # $i=16-$num 1546 
xor $carry,$carry 1547 1548 add $A1[0],%rax # t[5] 1549 adc \$0,%rdx 1550 mov %rax,8($tptr) # t[5] 1551 mov %rdx,16($tptr) # t[6] 1552 mov $carry,24($tptr) # t[7] 1553 1554 mov -16($aptr,$i),%rax # a[0] 1555 lea 48+8(%rsp),$tptr 1556 xor $A0[0],$A0[0] # t[0] 1557 mov 8($tptr),$A0[1] # t[1] 1558 1559 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1560 shr \$63,$A0[0] 1561 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1562 shr \$63,$A0[1] 1563 or $A0[0],$S[1] # | t[2*i]>>63 1564 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1565 mov $A0[1],$shift # shift=t[2*i+1]>>63 1566 mul %rax # a[i]*a[i] 1567 neg $carry # mov $carry,cf 1568 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1569 adc %rax,$S[0] 1570 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1571 mov $S[0],($tptr) 1572 adc %rdx,$S[1] 1573 1574 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1575 mov $S[1],8($tptr) 1576 sbb $carry,$carry # mov cf,$carry 1577 shr \$63,$A0[0] 1578 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1579 shr \$63,$A0[1] 1580 or $A0[0],$S[3] # | t[2*i]>>63 1581 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1582 mov $A0[1],$shift # shift=t[2*i+1]>>63 1583 mul %rax # a[i]*a[i] 1584 neg $carry # mov $carry,cf 1585 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1586 adc %rax,$S[2] 1587 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1588 mov $S[2],16($tptr) 1589 adc %rdx,$S[3] 1590 lea 16($i),$i 1591 mov $S[3],24($tptr) 1592 sbb $carry,$carry # mov cf,$carry 1593 lea 64($tptr),$tptr 1594 jmp .Lsqr4x_shift_n_add 1595 1596.align 32 1597.Lsqr4x_shift_n_add: 1598 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1599 shr \$63,$A0[0] 1600 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1601 shr \$63,$A0[1] 1602 or $A0[0],$S[1] # | t[2*i]>>63 1603 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1604 mov $A0[1],$shift # shift=t[2*i+1]>>63 1605 mul %rax # a[i]*a[i] 1606 neg $carry # mov $carry,cf 1607 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1608 adc %rax,$S[0] 1609 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1610 mov $S[0],-32($tptr) 1611 adc %rdx,$S[1] 1612 1613 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1614 mov $S[1],-24($tptr) 1615 sbb $carry,$carry # mov cf,$carry 1616 shr \$63,$A0[0] 1617 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1618 shr \$63,$A0[1] 1619 or $A0[0],$S[3] # | t[2*i]>>63 1620 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1621 mov $A0[1],$shift # shift=t[2*i+1]>>63 1622 mul %rax # a[i]*a[i] 1623 neg $carry # mov $carry,cf 1624 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1625 adc %rax,$S[2] 1626 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1627 mov $S[2],-16($tptr) 1628 adc %rdx,$S[3] 1629 1630 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1631 mov $S[3],-8($tptr) 1632 sbb $carry,$carry # mov cf,$carry 1633 shr \$63,$A0[0] 1634 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1635 shr \$63,$A0[1] 1636 or $A0[0],$S[1] # | t[2*i]>>63 1637 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1638 mov $A0[1],$shift # shift=t[2*i+1]>>63 1639 mul %rax # a[i]*a[i] 1640 neg $carry # mov $carry,cf 1641 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1642 adc %rax,$S[0] 1643 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1644 mov $S[0],0($tptr) 1645 adc %rdx,$S[1] 1646 1647 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1648 mov $S[1],8($tptr) 1649 sbb $carry,$carry # mov cf,$carry 1650 shr \$63,$A0[0] 1651 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1652 shr \$63,$A0[1] 1653 or $A0[0],$S[3] # | t[2*i]>>63 1654 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1655 mov $A0[1],$shift # shift=t[2*i+1]>>63 1656 mul %rax # a[i]*a[i] 1657 neg $carry # mov $carry,cf 1658 mov 40($tptr),$A0[1] # t[2*i+2+1] # 
prefetch 1659 adc %rax,$S[2] 1660 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1661 mov $S[2],16($tptr) 1662 adc %rdx,$S[3] 1663 mov $S[3],24($tptr) 1664 sbb $carry,$carry # mov cf,$carry 1665 lea 64($tptr),$tptr 1666 add \$32,$i 1667 jnz .Lsqr4x_shift_n_add 1668 1669 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1670 .byte 0x67 1671 shr \$63,$A0[0] 1672 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1673 shr \$63,$A0[1] 1674 or $A0[0],$S[1] # | t[2*i]>>63 1675 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1676 mov $A0[1],$shift # shift=t[2*i+1]>>63 1677 mul %rax # a[i]*a[i] 1678 neg $carry # mov $carry,cf 1679 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1680 adc %rax,$S[0] 1681 mov -8($aptr),%rax # a[i+1] # prefetch 1682 mov $S[0],-32($tptr) 1683 adc %rdx,$S[1] 1684 1685 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1686 mov $S[1],-24($tptr) 1687 sbb $carry,$carry # mov cf,$carry 1688 shr \$63,$A0[0] 1689 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1690 shr \$63,$A0[1] 1691 or $A0[0],$S[3] # | t[2*i]>>63 1692 mul %rax # a[i]*a[i] 1693 neg $carry # mov $carry,cf 1694 adc %rax,$S[2] 1695 adc %rdx,$S[3] 1696 mov $S[2],-16($tptr) 1697 mov $S[3],-8($tptr) 1698___ 1699} 1700###################################################################### 1701# Montgomery reduction part, "word-by-word" algorithm. 1702# 1703# This new path is inspired by multiple submissions from Intel, by 1704# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1705# Vinodh Gopal... 1706{ 1707my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1708 1709$code.=<<___; 1710 movq %xmm2,$nptr 1711__bn_sqr8x_reduction: 1712 xor %rax,%rax 1713 lea ($nptr,$num),%rcx # end of n[] 1714 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1715 mov %rcx,0+8(%rsp) 1716 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1717 mov %rdx,8+8(%rsp) 1718 neg $num 1719 jmp .L8x_reduction_loop 1720 1721.align 32 1722.L8x_reduction_loop: 1723 lea ($tptr,$num),$tptr # start of current t[] window 1724 .byte 0x66 1725 mov 8*0($tptr),$m0 1726 mov 8*1($tptr),%r9 1727 mov 8*2($tptr),%r10 1728 mov 8*3($tptr),%r11 1729 mov 8*4($tptr),%r12 1730 mov 8*5($tptr),%r13 1731 mov 8*6($tptr),%r14 1732 mov 8*7($tptr),%r15 1733 mov %rax,(%rdx) # store top-most carry bit 1734 lea 8*8($tptr),$tptr 1735 1736 .byte 0x67 1737 mov $m0,%r8 1738 imulq 32+8(%rsp),$m0 # n0*a[0] 1739 mov 8*0($nptr),%rax # n[0] 1740 mov \$8,%ecx 1741 jmp .L8x_reduce 1742 1743.align 32 1744.L8x_reduce: 1745 mulq $m0 1746 mov 8*1($nptr),%rax # n[1] 1747 neg %r8 1748 mov %rdx,%r8 1749 adc \$0,%r8 1750 1751 mulq $m0 1752 add %rax,%r9 1753 mov 8*2($nptr),%rax 1754 adc \$0,%rdx 1755 add %r9,%r8 1756 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1757 mov %rdx,%r9 1758 adc \$0,%r9 1759 1760 mulq $m0 1761 add %rax,%r10 1762 mov 8*3($nptr),%rax 1763 adc \$0,%rdx 1764 add %r10,%r9 1765 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1766 mov %rdx,%r10 1767 adc \$0,%r10 1768 1769 mulq $m0 1770 add %rax,%r11 1771 mov 8*4($nptr),%rax 1772 adc \$0,%rdx 1773 imulq %r8,$carry # modulo-scheduled 1774 add %r11,%r10 1775 mov %rdx,%r11 1776 adc \$0,%r11 1777 1778 mulq $m0 1779 add %rax,%r12 1780 mov 8*5($nptr),%rax 1781 adc \$0,%rdx 1782 add %r12,%r11 1783 mov %rdx,%r12 1784 adc \$0,%r12 1785 1786 mulq $m0 1787 add %rax,%r13 1788 mov 8*6($nptr),%rax 1789 adc \$0,%rdx 1790 add %r13,%r12 1791 mov %rdx,%r13 1792 adc \$0,%r13 1793 1794 mulq $m0 1795 add %rax,%r14 1796 mov 8*7($nptr),%rax 1797 adc \$0,%rdx 1798 add %r14,%r13 1799 mov %rdx,%r14 1800 adc \$0,%r14 1801 1802 mulq $m0 1803 mov $carry,$m0 # n0*a[i] 1804 add 
%rax,%r15 1805 mov 8*0($nptr),%rax # n[0] 1806 adc \$0,%rdx 1807 add %r15,%r14 1808 mov %rdx,%r15 1809 adc \$0,%r15 1810 1811 dec %ecx 1812 jnz .L8x_reduce 1813 1814 lea 8*8($nptr),$nptr 1815 xor %rax,%rax 1816 mov 8+8(%rsp),%rdx # pull end of t[] 1817 cmp 0+8(%rsp),$nptr # end of n[]? 1818 jae .L8x_no_tail 1819 1820 .byte 0x66 1821 add 8*0($tptr),%r8 1822 adc 8*1($tptr),%r9 1823 adc 8*2($tptr),%r10 1824 adc 8*3($tptr),%r11 1825 adc 8*4($tptr),%r12 1826 adc 8*5($tptr),%r13 1827 adc 8*6($tptr),%r14 1828 adc 8*7($tptr),%r15 1829 sbb $carry,$carry # top carry 1830 1831 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1832 mov \$8,%ecx 1833 mov 8*0($nptr),%rax 1834 jmp .L8x_tail 1835 1836.align 32 1837.L8x_tail: 1838 mulq $m0 1839 add %rax,%r8 1840 mov 8*1($nptr),%rax 1841 mov %r8,($tptr) # save result 1842 mov %rdx,%r8 1843 adc \$0,%r8 1844 1845 mulq $m0 1846 add %rax,%r9 1847 mov 8*2($nptr),%rax 1848 adc \$0,%rdx 1849 add %r9,%r8 1850 lea 8($tptr),$tptr # $tptr++ 1851 mov %rdx,%r9 1852 adc \$0,%r9 1853 1854 mulq $m0 1855 add %rax,%r10 1856 mov 8*3($nptr),%rax 1857 adc \$0,%rdx 1858 add %r10,%r9 1859 mov %rdx,%r10 1860 adc \$0,%r10 1861 1862 mulq $m0 1863 add %rax,%r11 1864 mov 8*4($nptr),%rax 1865 adc \$0,%rdx 1866 add %r11,%r10 1867 mov %rdx,%r11 1868 adc \$0,%r11 1869 1870 mulq $m0 1871 add %rax,%r12 1872 mov 8*5($nptr),%rax 1873 adc \$0,%rdx 1874 add %r12,%r11 1875 mov %rdx,%r12 1876 adc \$0,%r12 1877 1878 mulq $m0 1879 add %rax,%r13 1880 mov 8*6($nptr),%rax 1881 adc \$0,%rdx 1882 add %r13,%r12 1883 mov %rdx,%r13 1884 adc \$0,%r13 1885 1886 mulq $m0 1887 add %rax,%r14 1888 mov 8*7($nptr),%rax 1889 adc \$0,%rdx 1890 add %r14,%r13 1891 mov %rdx,%r14 1892 adc \$0,%r14 1893 1894 mulq $m0 1895 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1896 add %rax,%r15 1897 adc \$0,%rdx 1898 add %r15,%r14 1899 mov 8*0($nptr),%rax # pull n[0] 1900 mov %rdx,%r15 1901 adc \$0,%r15 1902 1903 dec %ecx 1904 jnz .L8x_tail 1905 1906 lea 8*8($nptr),$nptr 1907 mov 8+8(%rsp),%rdx # pull end of t[] 1908 cmp 0+8(%rsp),$nptr # end of n[]? 1909 jae .L8x_tail_done # break out of loop 1910 1911 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1912 neg $carry 1913 mov 8*0($nptr),%rax # pull n[0] 1914 adc 8*0($tptr),%r8 1915 adc 8*1($tptr),%r9 1916 adc 8*2($tptr),%r10 1917 adc 8*3($tptr),%r11 1918 adc 8*4($tptr),%r12 1919 adc 8*5($tptr),%r13 1920 adc 8*6($tptr),%r14 1921 adc 8*7($tptr),%r15 1922 sbb $carry,$carry # top carry 1923 1924 mov \$8,%ecx 1925 jmp .L8x_tail 1926 1927.align 32 1928.L8x_tail_done: 1929 xor %rax,%rax 1930 add (%rdx),%r8 # can this overflow? 1931 adc \$0,%r9 1932 adc \$0,%r10 1933 adc \$0,%r11 1934 adc \$0,%r12 1935 adc \$0,%r13 1936 adc \$0,%r14 1937 adc \$0,%r15 1938 adc \$0,%rax 1939 1940 neg $carry 1941.L8x_no_tail: 1942 adc 8*0($tptr),%r8 1943 adc 8*1($tptr),%r9 1944 adc 8*2($tptr),%r10 1945 adc 8*3($tptr),%r11 1946 adc 8*4($tptr),%r12 1947 adc 8*5($tptr),%r13 1948 adc 8*6($tptr),%r14 1949 adc 8*7($tptr),%r15 1950 adc \$0,%rax # top-most carry 1951 mov -8($nptr),%rcx # np[num-1] 1952 xor $carry,$carry 1953 1954 movq %xmm2,$nptr # restore $nptr 1955 1956 mov %r8,8*0($tptr) # store top 512 bits 1957 mov %r9,8*1($tptr) 1958 movq %xmm3,$num # $num is %r9, can't be moved upwards 1959 mov %r10,8*2($tptr) 1960 mov %r11,8*3($tptr) 1961 mov %r12,8*4($tptr) 1962 mov %r13,8*5($tptr) 1963 mov %r14,8*6($tptr) 1964 mov %r15,8*7($tptr) 1965 lea 8*8($tptr),$tptr 1966 1967 cmp %rdx,$tptr # end of t[]? 
1968 jb .L8x_reduction_loop 1969 ret 1970.size bn_sqr8x_internal,.-bn_sqr8x_internal 1971___ 1972} 1973############################################################## 1974# Post-condition, 4x unrolled 1975# 1976{ 1977my ($tptr,$nptr)=("%rbx","%rbp"); 1978$code.=<<___; 1979.type __bn_post4x_internal,\@abi-omnipotent 1980.align 32 1981__bn_post4x_internal: 1982 mov 8*0($nptr),%r12 1983 lea (%rdi,$num),$tptr # %rdi was $tptr above 1984 mov $num,%rcx 1985 movq %xmm1,$rptr # restore $rptr 1986 neg %rax 1987 movq %xmm1,$aptr # prepare for back-to-back call 1988 sar \$3+2,%rcx 1989 dec %r12 # so that after 'not' we get -n[0] 1990 xor %r10,%r10 1991 mov 8*1($nptr),%r13 1992 mov 8*2($nptr),%r14 1993 mov 8*3($nptr),%r15 1994 jmp .Lsqr4x_sub_entry 1995 1996.align 16 1997.Lsqr4x_sub: 1998 mov 8*0($nptr),%r12 1999 mov 8*1($nptr),%r13 2000 mov 8*2($nptr),%r14 2001 mov 8*3($nptr),%r15 2002.Lsqr4x_sub_entry: 2003 lea 8*4($nptr),$nptr 2004 not %r12 2005 not %r13 2006 not %r14 2007 not %r15 2008 and %rax,%r12 2009 and %rax,%r13 2010 and %rax,%r14 2011 and %rax,%r15 2012 2013 neg %r10 # mov %r10,%cf 2014 adc 8*0($tptr),%r12 2015 adc 8*1($tptr),%r13 2016 adc 8*2($tptr),%r14 2017 adc 8*3($tptr),%r15 2018 mov %r12,8*0($rptr) 2019 lea 8*4($tptr),$tptr 2020 mov %r13,8*1($rptr) 2021 sbb %r10,%r10 # mov %cf,%r10 2022 mov %r14,8*2($rptr) 2023 mov %r15,8*3($rptr) 2024 lea 8*4($rptr),$rptr 2025 2026 inc %rcx # pass %cf 2027 jnz .Lsqr4x_sub 2028 2029 mov $num,%r10 # prepare for back-to-back call 2030 neg $num # restore $num 2031 ret 2032.size __bn_post4x_internal,.-__bn_post4x_internal 2033___ 2034} 2035{ 2036$code.=<<___; 2037.globl bn_from_montgomery 2038.type bn_from_montgomery,\@abi-omnipotent 2039.align 32 2040bn_from_montgomery: 2041 testl \$7,`($win64?"48(%rsp)":"%r9d")` 2042 jz bn_from_mont8x 2043 xor %eax,%eax 2044 ret 2045.size bn_from_montgomery,.-bn_from_montgomery 2046 2047.type bn_from_mont8x,\@function,6 2048.align 32 2049bn_from_mont8x: 2050 .byte 0x67 2051 mov %rsp,%rax 2052 push %rbx 2053 push %rbp 2054 push %r12 2055 push %r13 2056 push %r14 2057 push %r15 2058.Lfrom_prologue: 2059 2060 shl \$3,${num}d # convert $num to bytes 2061 lea ($num,$num,2),%r10 # 3*$num in bytes 2062 neg $num 2063 mov ($n0),$n0 # *n0 2064 2065 ############################################################## 2066 # Ensure that stack frame doesn't alias with $rptr+3*$num 2067 # modulo 4096, which covers ret[num], am[num] and n[num] 2068 # (see bn_exp.c). The stack is allocated to aligned with 2069 # bn_power5's frame, and as bn_from_montgomery happens to be 2070 # last operation, we use the opportunity to cleanse it. 
2071 # 2072 lea -320(%rsp,$num,2),%r11 2073 mov %rsp,%rbp 2074 sub $rptr,%r11 2075 and \$4095,%r11 2076 cmp %r11,%r10 2077 jb .Lfrom_sp_alt 2078 sub %r11,%rbp # align with $aptr 2079 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2080 jmp .Lfrom_sp_done 2081 2082.align 32 2083.Lfrom_sp_alt: 2084 lea 4096-320(,$num,2),%r10 2085 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2086 sub %r10,%r11 2087 mov \$0,%r10 2088 cmovc %r10,%r11 2089 sub %r11,%rbp 2090.Lfrom_sp_done: 2091 and \$-64,%rbp 2092 mov %rsp,%r11 2093 sub %rbp,%r11 2094 and \$-4096,%r11 2095 lea (%rbp,%r11),%rsp 2096 mov (%rsp),%r10 2097 cmp %rbp,%rsp 2098 ja .Lfrom_page_walk 2099 jmp .Lfrom_page_walk_done 2100 2101.Lfrom_page_walk: 2102 lea -4096(%rsp),%rsp 2103 mov (%rsp),%r10 2104 cmp %rbp,%rsp 2105 ja .Lfrom_page_walk 2106.Lfrom_page_walk_done: 2107 2108 mov $num,%r10 2109 neg $num 2110 2111 ############################################################## 2112 # Stack layout 2113 # 2114 # +0 saved $num, used in reduction section 2115 # +8 &t[2*$num], used in reduction section 2116 # +32 saved *n0 2117 # +40 saved %rsp 2118 # +48 t[2*$num] 2119 # 2120 mov $n0, 32(%rsp) 2121 mov %rax, 40(%rsp) # save original %rsp 2122.Lfrom_body: 2123 mov $num,%r11 2124 lea 48(%rsp),%rax 2125 pxor %xmm0,%xmm0 2126 jmp .Lmul_by_1 2127 2128.align 32 2129.Lmul_by_1: 2130 movdqu ($aptr),%xmm1 2131 movdqu 16($aptr),%xmm2 2132 movdqu 32($aptr),%xmm3 2133 movdqa %xmm0,(%rax,$num) 2134 movdqu 48($aptr),%xmm4 2135 movdqa %xmm0,16(%rax,$num) 2136 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 2137 movdqa %xmm1,(%rax) 2138 movdqa %xmm0,32(%rax,$num) 2139 movdqa %xmm2,16(%rax) 2140 movdqa %xmm0,48(%rax,$num) 2141 movdqa %xmm3,32(%rax) 2142 movdqa %xmm4,48(%rax) 2143 lea 64(%rax),%rax 2144 sub \$64,%r11 2145 jnz .Lmul_by_1 2146 2147 movq $rptr,%xmm1 2148 movq $nptr,%xmm2 2149 .byte 0x67 2150 mov $nptr,%rbp 2151 movq %r10, %xmm3 # -num 2152___ 2153$code.=<<___ if ($addx); 2154 mov OPENSSL_ia32cap_P+8(%rip),%r11d 2155 and \$0x80108,%r11d 2156 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 2157 jne .Lfrom_mont_nox 2158 2159 lea (%rax,$num),$rptr 2160 call __bn_sqrx8x_reduction 2161 call __bn_postx4x_internal 2162 2163 pxor %xmm0,%xmm0 2164 lea 48(%rsp),%rax 2165 mov 40(%rsp),%rsi # restore %rsp 2166 jmp .Lfrom_mont_zero 2167 2168.align 32 2169.Lfrom_mont_nox: 2170___ 2171$code.=<<___; 2172 call __bn_sqr8x_reduction 2173 call __bn_post4x_internal 2174 2175 pxor %xmm0,%xmm0 2176 lea 48(%rsp),%rax 2177 mov 40(%rsp),%rsi # restore %rsp 2178 jmp .Lfrom_mont_zero 2179 2180.align 32 2181.Lfrom_mont_zero: 2182 movdqa %xmm0,16*0(%rax) 2183 movdqa %xmm0,16*1(%rax) 2184 movdqa %xmm0,16*2(%rax) 2185 movdqa %xmm0,16*3(%rax) 2186 lea 16*4(%rax),%rax 2187 sub \$32,$num 2188 jnz .Lfrom_mont_zero 2189 2190 mov \$1,%rax 2191 mov -48(%rsi),%r15 2192 mov -40(%rsi),%r14 2193 mov -32(%rsi),%r13 2194 mov -24(%rsi),%r12 2195 mov -16(%rsi),%rbp 2196 mov -8(%rsi),%rbx 2197 lea (%rsi),%rsp 2198.Lfrom_epilogue: 2199 ret 2200.size bn_from_mont8x,.-bn_from_mont8x 2201___ 2202} 2203}}} 2204 2205if ($addx) {{{ 2206my $bp="%rdx"; # restore original value 2207 2208$code.=<<___; 2209.type bn_mulx4x_mont_gather5,\@function,6 2210.align 32 2211bn_mulx4x_mont_gather5: 2212 mov %rsp,%rax 2213.Lmulx4x_enter: 2214 push %rbx 2215 push %rbp 2216 push %r12 2217 push %r13 2218 push %r14 2219 push %r15 2220.Lmulx4x_prologue: 2221 2222 shl \$3,${num}d # convert $num to bytes 2223 lea ($num,$num,2),%r10 # 3*$num in bytes 2224 neg $num # -$num 2225 mov ($n0),$n0 
# *n0 2226 2227 ############################################################## 2228 # Ensure that stack frame doesn't alias with $rptr+3*$num 2229 # modulo 4096, which covers ret[num], am[num] and n[num] 2230 # (see bn_exp.c). This is done to allow memory disambiguation 2231 # logic do its magic. [Extra [num] is allocated in order 2232 # to align with bn_power5's frame, which is cleansed after 2233 # completing exponentiation. Extra 256 bytes is for power mask 2234 # calculated from 7th argument, the index.] 2235 # 2236 lea -320(%rsp,$num,2),%r11 2237 mov %rsp,%rbp 2238 sub $rp,%r11 2239 and \$4095,%r11 2240 cmp %r11,%r10 2241 jb .Lmulx4xsp_alt 2242 sub %r11,%rbp # align with $aptr 2243 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2244 jmp .Lmulx4xsp_done 2245 2246.Lmulx4xsp_alt: 2247 lea 4096-320(,$num,2),%r10 2248 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2249 sub %r10,%r11 2250 mov \$0,%r10 2251 cmovc %r10,%r11 2252 sub %r11,%rbp 2253.Lmulx4xsp_done: 2254 and \$-64,%rbp # ensure alignment 2255 mov %rsp,%r11 2256 sub %rbp,%r11 2257 and \$-4096,%r11 2258 lea (%rbp,%r11),%rsp 2259 mov (%rsp),%r10 2260 cmp %rbp,%rsp 2261 ja .Lmulx4x_page_walk 2262 jmp .Lmulx4x_page_walk_done 2263 2264.Lmulx4x_page_walk: 2265 lea -4096(%rsp),%rsp 2266 mov (%rsp),%r10 2267 cmp %rbp,%rsp 2268 ja .Lmulx4x_page_walk 2269.Lmulx4x_page_walk_done: 2270 2271 ############################################################## 2272 # Stack layout 2273 # +0 -num 2274 # +8 off-loaded &b[i] 2275 # +16 end of b[num] 2276 # +24 inner counter 2277 # +32 saved n0 2278 # +40 saved %rsp 2279 # +48 2280 # +56 saved rp 2281 # +64 tmp[num+1] 2282 # 2283 mov $n0, 32(%rsp) # save *n0 2284 mov %rax,40(%rsp) # save original %rsp 2285.Lmulx4x_body: 2286 call mulx4x_internal 2287 2288 mov 40(%rsp),%rsi # restore %rsp 2289 mov \$1,%rax 2290 2291 mov -48(%rsi),%r15 2292 mov -40(%rsi),%r14 2293 mov -32(%rsi),%r13 2294 mov -24(%rsi),%r12 2295 mov -16(%rsi),%rbp 2296 mov -8(%rsi),%rbx 2297 lea (%rsi),%rsp 2298.Lmulx4x_epilogue: 2299 ret 2300.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2301 2302.type mulx4x_internal,\@abi-omnipotent 2303.align 32 2304mulx4x_internal: 2305 mov $num,8(%rsp) # save -$num (it was in bytes) 2306 mov $num,%r10 2307 neg $num # restore $num 2308 shl \$5,$num 2309 neg %r10 # restore $num 2310 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2311 shr \$5+5,$num 2312 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2313 sub \$1,$num 2314 lea .Linc(%rip),%rax 2315 mov %r13,16+8(%rsp) # end of b[num] 2316 mov $num,24+8(%rsp) # inner counter 2317 mov $rp, 56+8(%rsp) # save $rp 2318___ 2319my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2320 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2321my $rptr=$bptr; 2322my $STRIDE=2**5*8; # 5 is "window size" 2323my $N=$STRIDE/4; # should match cache line size 2324$code.=<<___; 2325 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 2326 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2327 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) 2328 lea 128($bp),$bptr # size optimization 2329 2330 pshufd \$0,%xmm5,%xmm5 # broadcast index 2331 movdqa %xmm1,%xmm4 2332 .byte 0x67 2333 movdqa %xmm1,%xmm2 2334___ 2335######################################################################## 2336# calculate mask by comparing 0..31 to index and save result to stack 2337# 2338$code.=<<___; 2339 .byte 0x67 2340 paddd %xmm0,%xmm1 2341 pcmpeqd %xmm5,%xmm0 # compare to 1,0 
2342 movdqa %xmm4,%xmm3 2343___ 2344for($i=0;$i<$STRIDE/16-4;$i+=4) { 2345$code.=<<___; 2346 paddd %xmm1,%xmm2 2347 pcmpeqd %xmm5,%xmm1 # compare to 3,2 2348 movdqa %xmm0,`16*($i+0)+112`(%r10) 2349 movdqa %xmm4,%xmm0 2350 2351 paddd %xmm2,%xmm3 2352 pcmpeqd %xmm5,%xmm2 # compare to 5,4 2353 movdqa %xmm1,`16*($i+1)+112`(%r10) 2354 movdqa %xmm4,%xmm1 2355 2356 paddd %xmm3,%xmm0 2357 pcmpeqd %xmm5,%xmm3 # compare to 7,6 2358 movdqa %xmm2,`16*($i+2)+112`(%r10) 2359 movdqa %xmm4,%xmm2 2360 2361 paddd %xmm0,%xmm1 2362 pcmpeqd %xmm5,%xmm0 2363 movdqa %xmm3,`16*($i+3)+112`(%r10) 2364 movdqa %xmm4,%xmm3 2365___ 2366} 2367$code.=<<___; # last iteration can be optimized 2368 .byte 0x67 2369 paddd %xmm1,%xmm2 2370 pcmpeqd %xmm5,%xmm1 2371 movdqa %xmm0,`16*($i+0)+112`(%r10) 2372 2373 paddd %xmm2,%xmm3 2374 pcmpeqd %xmm5,%xmm2 2375 movdqa %xmm1,`16*($i+1)+112`(%r10) 2376 2377 pcmpeqd %xmm5,%xmm3 2378 movdqa %xmm2,`16*($i+2)+112`(%r10) 2379 2380 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register 2381 pand `16*($i+1)-128`($bptr),%xmm1 2382 pand `16*($i+2)-128`($bptr),%xmm2 2383 movdqa %xmm3,`16*($i+3)+112`(%r10) 2384 pand `16*($i+3)-128`($bptr),%xmm3 2385 por %xmm2,%xmm0 2386 por %xmm3,%xmm1 2387___ 2388for($i=0;$i<$STRIDE/16-4;$i+=4) { 2389$code.=<<___; 2390 movdqa `16*($i+0)-128`($bptr),%xmm4 2391 movdqa `16*($i+1)-128`($bptr),%xmm5 2392 movdqa `16*($i+2)-128`($bptr),%xmm2 2393 pand `16*($i+0)+112`(%r10),%xmm4 2394 movdqa `16*($i+3)-128`($bptr),%xmm3 2395 pand `16*($i+1)+112`(%r10),%xmm5 2396 por %xmm4,%xmm0 2397 pand `16*($i+2)+112`(%r10),%xmm2 2398 por %xmm5,%xmm1 2399 pand `16*($i+3)+112`(%r10),%xmm3 2400 por %xmm2,%xmm0 2401 por %xmm3,%xmm1 2402___ 2403} 2404$code.=<<___; 2405 pxor %xmm1,%xmm0 2406 pshufd \$0x4e,%xmm0,%xmm1 2407 por %xmm1,%xmm0 2408 lea $STRIDE($bptr),$bptr 2409 movq %xmm0,%rdx # bp[0] 2410 lea 64+8*4+8(%rsp),$tptr 2411 2412 mov %rdx,$bi 2413 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2414 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2415 add %rax,%r11 2416 mulx 2*8($aptr),%rax,%r13 # ... 2417 adc %rax,%r12 2418 adc \$0,%r13 2419 mulx 3*8($aptr),%rax,%r14 2420 2421 mov $mi,%r15 2422 imulq 32+8(%rsp),$mi # "t[0]"*n0 2423 xor $zero,$zero # cf=0, of=0 2424 mov $mi,%rdx 2425 2426 mov $bptr,8+8(%rsp) # off-load &b[i] 2427 2428 lea 4*8($aptr),$aptr 2429 adcx %rax,%r13 2430 adcx $zero,%r14 # cf=0 2431 2432 mulx 0*8($nptr),%rax,%r10 2433 adcx %rax,%r15 # discarded 2434 adox %r11,%r10 2435 mulx 1*8($nptr),%rax,%r11 2436 adcx %rax,%r10 2437 adox %r12,%r11 2438 mulx 2*8($nptr),%rax,%r12 2439 mov 24+8(%rsp),$bptr # counter value 2440 mov %r10,-8*4($tptr) 2441 adcx %rax,%r11 2442 adox %r13,%r12 2443 mulx 3*8($nptr),%rax,%r15 2444 mov $bi,%rdx 2445 mov %r11,-8*3($tptr) 2446 adcx %rax,%r12 2447 adox $zero,%r15 # of=0 2448 lea 4*8($nptr),$nptr 2449 mov %r12,-8*2($tptr) 2450 jmp .Lmulx4x_1st 2451 2452.align 32 2453.Lmulx4x_1st: 2454 adcx $zero,%r15 # cf=0, modulo-scheduled 2455 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2456 adcx %r14,%r10 2457 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2458 adcx %rax,%r11 2459 mulx 2*8($aptr),%r12,%rax # ... 
2460 adcx %r14,%r12 2461 mulx 3*8($aptr),%r13,%r14 2462 .byte 0x67,0x67 2463 mov $mi,%rdx 2464 adcx %rax,%r13 2465 adcx $zero,%r14 # cf=0 2466 lea 4*8($aptr),$aptr 2467 lea 4*8($tptr),$tptr 2468 2469 adox %r15,%r10 2470 mulx 0*8($nptr),%rax,%r15 2471 adcx %rax,%r10 2472 adox %r15,%r11 2473 mulx 1*8($nptr),%rax,%r15 2474 adcx %rax,%r11 2475 adox %r15,%r12 2476 mulx 2*8($nptr),%rax,%r15 2477 mov %r10,-5*8($tptr) 2478 adcx %rax,%r12 2479 mov %r11,-4*8($tptr) 2480 adox %r15,%r13 2481 mulx 3*8($nptr),%rax,%r15 2482 mov $bi,%rdx 2483 mov %r12,-3*8($tptr) 2484 adcx %rax,%r13 2485 adox $zero,%r15 2486 lea 4*8($nptr),$nptr 2487 mov %r13,-2*8($tptr) 2488 2489 dec $bptr # of=0, pass cf 2490 jnz .Lmulx4x_1st 2491 2492 mov 8(%rsp),$num # load -num 2493 adc $zero,%r15 # modulo-scheduled 2494 lea ($aptr,$num),$aptr # rewind $aptr 2495 add %r15,%r14 2496 mov 8+8(%rsp),$bptr # re-load &b[i] 2497 adc $zero,$zero # top-most carry 2498 mov %r14,-1*8($tptr) 2499 jmp .Lmulx4x_outer 2500 2501.align 32 2502.Lmulx4x_outer: 2503 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) 2504 pxor %xmm4,%xmm4 2505 .byte 0x67,0x67 2506 pxor %xmm5,%xmm5 2507___ 2508for($i=0;$i<$STRIDE/16;$i+=4) { 2509$code.=<<___; 2510 movdqa `16*($i+0)-128`($bptr),%xmm0 2511 movdqa `16*($i+1)-128`($bptr),%xmm1 2512 movdqa `16*($i+2)-128`($bptr),%xmm2 2513 pand `16*($i+0)+256`(%r10),%xmm0 2514 movdqa `16*($i+3)-128`($bptr),%xmm3 2515 pand `16*($i+1)+256`(%r10),%xmm1 2516 por %xmm0,%xmm4 2517 pand `16*($i+2)+256`(%r10),%xmm2 2518 por %xmm1,%xmm5 2519 pand `16*($i+3)+256`(%r10),%xmm3 2520 por %xmm2,%xmm4 2521 por %xmm3,%xmm5 2522___ 2523} 2524$code.=<<___; 2525 por %xmm5,%xmm4 2526 pshufd \$0x4e,%xmm4,%xmm0 2527 por %xmm4,%xmm0 2528 lea $STRIDE($bptr),$bptr 2529 movq %xmm0,%rdx # m0=bp[i] 2530 2531 mov $zero,($tptr) # save top-most carry 2532 lea 4*8($tptr,$num),$tptr # rewind $tptr 2533 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2534 xor $zero,$zero # cf=0, of=0 2535 mov %rdx,$bi 2536 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2537 adox -4*8($tptr),$mi # +t[0] 2538 adcx %r14,%r11 2539 mulx 2*8($aptr),%r15,%r13 # ... 2540 adox -3*8($tptr),%r11 2541 adcx %r15,%r12 2542 mulx 3*8($aptr),%rdx,%r14 2543 adox -2*8($tptr),%r12 2544 adcx %rdx,%r13 2545 lea ($nptr,$num),$nptr # rewind $nptr 2546 lea 4*8($aptr),$aptr 2547 adox -1*8($tptr),%r13 2548 adcx $zero,%r14 2549 adox $zero,%r14 2550 2551 mov $mi,%r15 2552 imulq 32+8(%rsp),$mi # "t[0]"*n0 2553 2554 mov $mi,%rdx 2555 xor $zero,$zero # cf=0, of=0 2556 mov $bptr,8+8(%rsp) # off-load &b[i] 2557 2558 mulx 0*8($nptr),%rax,%r10 2559 adcx %rax,%r15 # discarded 2560 adox %r11,%r10 2561 mulx 1*8($nptr),%rax,%r11 2562 adcx %rax,%r10 2563 adox %r12,%r11 2564 mulx 2*8($nptr),%rax,%r12 2565 adcx %rax,%r11 2566 adox %r13,%r12 2567 mulx 3*8($nptr),%rax,%r15 2568 mov $bi,%rdx 2569 mov 24+8(%rsp),$bptr # counter value 2570 mov %r10,-8*4($tptr) 2571 adcx %rax,%r12 2572 mov %r11,-8*3($tptr) 2573 adox $zero,%r15 # of=0 2574 mov %r12,-8*2($tptr) 2575 lea 4*8($nptr),$nptr 2576 jmp .Lmulx4x_inner 2577 2578.align 32 2579.Lmulx4x_inner: 2580 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2581 adcx $zero,%r15 # cf=0, modulo-scheduled 2582 adox %r14,%r10 2583 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2584 adcx 0*8($tptr),%r10 2585 adox %rax,%r11 2586 mulx 2*8($aptr),%r12,%rax # ... 
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mov	%r11,-4*8($tptr)
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	lea	4*8($nptr),$nptr
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0+8(%rsp),$num		# load -num
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
	mov	8+8(%rsp),$bptr		# re-load &b[i]
	mov	16+8(%rsp),%r10
	adc	%r15,%r14
	lea	($aptr,$num),$aptr	# rewind $aptr
	adc	$zero,$zero		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	%r10,$bptr
	jb	.Lmulx4x_outer

	mov	-8($nptr),%r10
	mov	$zero,%r8
	mov	($nptr,$num),%r12
	lea	($nptr,$num),%rbp	# rewind $nptr
	mov	$num,%rcx
	lea	($tptr,$num),%rdi	# rewind $tptr
	xor	%eax,%eax
	xor	%r15,%r15
	sub	%r14,%r10		# compare top-most words
	adc	%r15,%r15
	or	%r15,%r8
	sar	\$3+2,%rcx
	sub	%r8,%rax		# %rax=-%r8
	mov	56+8(%rsp),%rdx		# restore rp
	dec	%r12			# so that after 'not' we get -n[0]
	mov	8*1(%rbp),%r13
	xor	%r8,%r8
	mov	8*2(%rbp),%r14
	mov	8*3(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry	# common post-condition
.size	mulx4x_internal,.-mulx4x_internal
___
}{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr);

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___;
.type	bn_powerx5,\@function,6
.align	32
bn_powerx5:
	mov	%rsp,%rax
.Lpowerx5_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lpowerx5_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# Ensure that stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow memory disambiguation
	# logic to do its magic. [Extra 256 bytes is for power mask
	# calculated from 7th argument, the index.]
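	# [Editor's note, not in the original: as with bn_power5, one
	#  call to this routine performs a single fixed-window step of
	#  the constant-time exponentiation: five back-to-back squarings
	#  (__bn_sqrx8x_internal + __bn_postx4x_internal) followed by one
	#  multiplication by the table entry selected with the 5-bit
	#  window (mulx4x_internal), i.e. acc = acc^32 * table[pwr] mod n,
	#  all in Montgomery representation.]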
2698 # 2699 lea -320(%rsp,$num,2),%r11 2700 mov %rsp,%rbp 2701 sub $rptr,%r11 2702 and \$4095,%r11 2703 cmp %r11,%r10 2704 jb .Lpwrx_sp_alt 2705 sub %r11,%rbp # align with $aptr 2706 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2707 jmp .Lpwrx_sp_done 2708 2709.align 32 2710.Lpwrx_sp_alt: 2711 lea 4096-320(,$num,2),%r10 2712 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) 2713 sub %r10,%r11 2714 mov \$0,%r10 2715 cmovc %r10,%r11 2716 sub %r11,%rbp 2717.Lpwrx_sp_done: 2718 and \$-64,%rbp 2719 mov %rsp,%r11 2720 sub %rbp,%r11 2721 and \$-4096,%r11 2722 lea (%rbp,%r11),%rsp 2723 mov (%rsp),%r10 2724 cmp %rbp,%rsp 2725 ja .Lpwrx_page_walk 2726 jmp .Lpwrx_page_walk_done 2727 2728.Lpwrx_page_walk: 2729 lea -4096(%rsp),%rsp 2730 mov (%rsp),%r10 2731 cmp %rbp,%rsp 2732 ja .Lpwrx_page_walk 2733.Lpwrx_page_walk_done: 2734 2735 mov $num,%r10 2736 neg $num 2737 2738 ############################################################## 2739 # Stack layout 2740 # 2741 # +0 saved $num, used in reduction section 2742 # +8 &t[2*$num], used in reduction section 2743 # +16 intermediate carry bit 2744 # +24 top-most carry bit, used in reduction section 2745 # +32 saved *n0 2746 # +40 saved %rsp 2747 # +48 t[2*$num] 2748 # 2749 pxor %xmm0,%xmm0 2750 movq $rptr,%xmm1 # save $rptr 2751 movq $nptr,%xmm2 # save $nptr 2752 movq %r10, %xmm3 # -$num 2753 movq $bptr,%xmm4 2754 mov $n0, 32(%rsp) 2755 mov %rax, 40(%rsp) # save original %rsp 2756.Lpowerx5_body: 2757 2758 call __bn_sqrx8x_internal 2759 call __bn_postx4x_internal 2760 call __bn_sqrx8x_internal 2761 call __bn_postx4x_internal 2762 call __bn_sqrx8x_internal 2763 call __bn_postx4x_internal 2764 call __bn_sqrx8x_internal 2765 call __bn_postx4x_internal 2766 call __bn_sqrx8x_internal 2767 call __bn_postx4x_internal 2768 2769 mov %r10,$num # -num 2770 mov $aptr,$rptr 2771 movq %xmm2,$nptr 2772 movq %xmm4,$bptr 2773 mov 40(%rsp),%rax 2774 2775 call mulx4x_internal 2776 2777 mov 40(%rsp),%rsi # restore %rsp 2778 mov \$1,%rax 2779 2780 mov -48(%rsi),%r15 2781 mov -40(%rsi),%r14 2782 mov -32(%rsi),%r13 2783 mov -24(%rsi),%r12 2784 mov -16(%rsi),%rbp 2785 mov -8(%rsi),%rbx 2786 lea (%rsi),%rsp 2787.Lpowerx5_epilogue: 2788 ret 2789.size bn_powerx5,.-bn_powerx5 2790 2791.globl bn_sqrx8x_internal 2792.hidden bn_sqrx8x_internal 2793.type bn_sqrx8x_internal,\@abi-omnipotent 2794.align 32 2795bn_sqrx8x_internal: 2796__bn_sqrx8x_internal: 2797 ################################################################## 2798 # Squaring part: 2799 # 2800 # a) multiply-n-add everything but a[i]*a[i]; 2801 # b) shift result of a) by 1 to the left and accumulate 2802 # a[i]*a[i] products; 2803 # 2804 ################################################################## 2805 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2806 # a[1]a[0] 2807 # a[2]a[0] 2808 # a[3]a[0] 2809 # a[2]a[1] 2810 # a[3]a[1] 2811 # a[3]a[2] 2812 # 2813 # a[4]a[0] 2814 # a[5]a[0] 2815 # a[6]a[0] 2816 # a[7]a[0] 2817 # a[4]a[1] 2818 # a[5]a[1] 2819 # a[6]a[1] 2820 # a[7]a[1] 2821 # a[4]a[2] 2822 # a[5]a[2] 2823 # a[6]a[2] 2824 # a[7]a[2] 2825 # a[4]a[3] 2826 # a[5]a[3] 2827 # a[6]a[3] 2828 # a[7]a[3] 2829 # 2830 # a[5]a[4] 2831 # a[6]a[4] 2832 # a[7]a[4] 2833 # a[6]a[5] 2834 # a[7]a[5] 2835 # a[7]a[6] 2836 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2837___ 2838{ 2839my ($zero,$carry)=("%rbp","%rcx"); 2840my $aaptr=$zero; 2841$code.=<<___; 2842 lea 48+8(%rsp),$tptr 2843 lea ($aptr,$num),$aaptr 2844 mov $num,0+8(%rsp) # save $num 2845 mov $aaptr,8+8(%rsp) # 
save end of $aptr 2846 jmp .Lsqr8x_zero_start 2847 2848.align 32 2849.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2850.Lsqrx8x_zero: 2851 .byte 0x3e 2852 movdqa %xmm0,0*8($tptr) 2853 movdqa %xmm0,2*8($tptr) 2854 movdqa %xmm0,4*8($tptr) 2855 movdqa %xmm0,6*8($tptr) 2856.Lsqr8x_zero_start: # aligned at 32 2857 movdqa %xmm0,8*8($tptr) 2858 movdqa %xmm0,10*8($tptr) 2859 movdqa %xmm0,12*8($tptr) 2860 movdqa %xmm0,14*8($tptr) 2861 lea 16*8($tptr),$tptr 2862 sub \$64,$num 2863 jnz .Lsqrx8x_zero 2864 2865 mov 0*8($aptr),%rdx # a[0], modulo-scheduled 2866 #xor %r9,%r9 # t[1], ex-$num, zero already 2867 xor %r10,%r10 2868 xor %r11,%r11 2869 xor %r12,%r12 2870 xor %r13,%r13 2871 xor %r14,%r14 2872 xor %r15,%r15 2873 lea 48+8(%rsp),$tptr 2874 xor $zero,$zero # cf=0, cf=0 2875 jmp .Lsqrx8x_outer_loop 2876 2877.align 32 2878.Lsqrx8x_outer_loop: 2879 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 2880 adcx %r9,%r8 # a[1]*a[0]+=t[1] 2881 adox %rax,%r10 2882 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 2883 adcx %r10,%r9 2884 adox %rax,%r11 2885 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 2886 adcx %r11,%r10 2887 adox %rax,%r12 2888 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 2889 adcx %r12,%r11 2890 adox %rax,%r13 2891 mulx 5*8($aptr),%r12,%rax 2892 adcx %r13,%r12 2893 adox %rax,%r14 2894 mulx 6*8($aptr),%r13,%rax 2895 adcx %r14,%r13 2896 adox %r15,%rax 2897 mulx 7*8($aptr),%r14,%r15 2898 mov 1*8($aptr),%rdx # a[1] 2899 adcx %rax,%r14 2900 adox $zero,%r15 2901 adc 8*8($tptr),%r15 2902 mov %r8,1*8($tptr) # t[1] 2903 mov %r9,2*8($tptr) # t[2] 2904 sbb $carry,$carry # mov %cf,$carry 2905 xor $zero,$zero # cf=0, of=0 2906 2907 2908 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 2909 mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 2910 adcx %r10,%r8 2911 adox %rbx,%r9 2912 mulx 4*8($aptr),%r10,%rbx # ... 2913 adcx %r11,%r9 2914 adox %rax,%r10 2915 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 2916 adcx %r12,%r10 2917 adox %rbx,%r11 2918 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 2919 adcx %r13,%r11 2920 adox %r14,%r12 2921 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 2922 mov 2*8($aptr),%rdx # a[2] 2923 adcx %rax,%r12 2924 adox %rbx,%r13 2925 adcx %r15,%r13 2926 adox $zero,%r14 # of=0 2927 adcx $zero,%r14 # cf=0 2928 2929 mov %r8,3*8($tptr) # t[3] 2930 mov %r9,4*8($tptr) # t[4] 2931 2932 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 2933 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 2934 adcx %r10,%r8 2935 adox %rbx,%r9 2936 mulx 5*8($aptr),%r10,%rbx # ... 2937 adcx %r11,%r9 2938 adox %rax,%r10 2939 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 2940 adcx %r12,%r10 2941 adox %r13,%r11 2942 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 2943 .byte 0x3e 2944 mov 3*8($aptr),%rdx # a[3] 2945 adcx %rbx,%r11 2946 adox %rax,%r12 2947 adcx %r14,%r12 2948 mov %r8,5*8($tptr) # t[5] 2949 mov %r9,6*8($tptr) # t[6] 2950 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 2951 adox $zero,%r13 # of=0 2952 adcx $zero,%r13 # cf=0 2953 2954 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 2955 adcx %r10,%r8 2956 adox %rax,%r9 2957 mulx 6*8($aptr),%r10,%rax # ... 
2958 adcx %r11,%r9 2959 adox %r12,%r10 2960 mulx 7*8($aptr),%r11,%r12 2961 mov 4*8($aptr),%rdx # a[4] 2962 mov 5*8($aptr),%r14 # a[5] 2963 adcx %rbx,%r10 2964 adox %rax,%r11 2965 mov 6*8($aptr),%r15 # a[6] 2966 adcx %r13,%r11 2967 adox $zero,%r12 # of=0 2968 adcx $zero,%r12 # cf=0 2969 2970 mov %r8,7*8($tptr) # t[7] 2971 mov %r9,8*8($tptr) # t[8] 2972 2973 mulx %r14,%r9,%rax # a[5]*a[4] 2974 mov 7*8($aptr),%r8 # a[7] 2975 adcx %r10,%r9 2976 mulx %r15,%r10,%rbx # a[6]*a[4] 2977 adox %rax,%r10 2978 adcx %r11,%r10 2979 mulx %r8,%r11,%rax # a[7]*a[4] 2980 mov %r14,%rdx # a[5] 2981 adox %rbx,%r11 2982 adcx %r12,%r11 2983 #adox $zero,%rax # of=0 2984 adcx $zero,%rax # cf=0 2985 2986 mulx %r15,%r14,%rbx # a[6]*a[5] 2987 mulx %r8,%r12,%r13 # a[7]*a[5] 2988 mov %r15,%rdx # a[6] 2989 lea 8*8($aptr),$aptr 2990 adcx %r14,%r11 2991 adox %rbx,%r12 2992 adcx %rax,%r12 2993 adox $zero,%r13 2994 2995 .byte 0x67,0x67 2996 mulx %r8,%r8,%r14 # a[7]*a[6] 2997 adcx %r8,%r13 2998 adcx $zero,%r14 2999 3000 cmp 8+8(%rsp),$aptr 3001 je .Lsqrx8x_outer_break 3002 3003 neg $carry # mov $carry,%cf 3004 mov \$-8,%rcx 3005 mov $zero,%r15 3006 mov 8*8($tptr),%r8 3007 adcx 9*8($tptr),%r9 # +=t[9] 3008 adcx 10*8($tptr),%r10 # ... 3009 adcx 11*8($tptr),%r11 3010 adc 12*8($tptr),%r12 3011 adc 13*8($tptr),%r13 3012 adc 14*8($tptr),%r14 3013 adc 15*8($tptr),%r15 3014 lea ($aptr),$aaptr 3015 lea 2*64($tptr),$tptr 3016 sbb %rax,%rax # mov %cf,$carry 3017 3018 mov -64($aptr),%rdx # a[0] 3019 mov %rax,16+8(%rsp) # offload $carry 3020 mov $tptr,24+8(%rsp) 3021 3022 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 3023 xor %eax,%eax # cf=0, of=0 3024 jmp .Lsqrx8x_loop 3025 3026.align 32 3027.Lsqrx8x_loop: 3028 mov %r8,%rbx 3029 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 3030 adcx %rax,%rbx # +=t[8] 3031 adox %r9,%r8 3032 3033 mulx 1*8($aaptr),%rax,%r9 # ... 3034 adcx %rax,%r8 3035 adox %r10,%r9 3036 3037 mulx 2*8($aaptr),%rax,%r10 3038 adcx %rax,%r9 3039 adox %r11,%r10 3040 3041 mulx 3*8($aaptr),%rax,%r11 3042 adcx %rax,%r10 3043 adox %r12,%r11 3044 3045 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 3046 adcx %rax,%r11 3047 adox %r13,%r12 3048 3049 mulx 5*8($aaptr),%rax,%r13 3050 adcx %rax,%r12 3051 adox %r14,%r13 3052 3053 mulx 6*8($aaptr),%rax,%r14 3054 mov %rbx,($tptr,%rcx,8) # store t[8+i] 3055 mov \$0,%ebx 3056 adcx %rax,%r13 3057 adox %r15,%r14 3058 3059 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 3060 mov 8($aptr,%rcx,8),%rdx # a[i] 3061 adcx %rax,%r14 3062 adox %rbx,%r15 # %rbx is 0, of=0 3063 adcx %rbx,%r15 # cf=0 3064 3065 .byte 0x67 3066 inc %rcx # of=0 3067 jnz .Lsqrx8x_loop 3068 3069 lea 8*8($aaptr),$aaptr 3070 mov \$-8,%rcx 3071 cmp 8+8(%rsp),$aaptr # done? 
3072 je .Lsqrx8x_break 3073 3074 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3075 .byte 0x66 3076 mov -64($aptr),%rdx 3077 adcx 0*8($tptr),%r8 3078 adcx 1*8($tptr),%r9 3079 adc 2*8($tptr),%r10 3080 adc 3*8($tptr),%r11 3081 adc 4*8($tptr),%r12 3082 adc 5*8($tptr),%r13 3083 adc 6*8($tptr),%r14 3084 adc 7*8($tptr),%r15 3085 lea 8*8($tptr),$tptr 3086 .byte 0x67 3087 sbb %rax,%rax # mov %cf,%rax 3088 xor %ebx,%ebx # cf=0, of=0 3089 mov %rax,16+8(%rsp) # offload carry 3090 jmp .Lsqrx8x_loop 3091 3092.align 32 3093.Lsqrx8x_break: 3094 xor $zero,$zero 3095 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3096 adcx $zero,%r8 3097 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 3098 adcx $zero,%r9 3099 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 3100 adc \$0,%r10 3101 mov %r8,0*8($tptr) 3102 adc \$0,%r11 3103 adc \$0,%r12 3104 adc \$0,%r13 3105 adc \$0,%r14 3106 adc \$0,%r15 3107 cmp $carry,$tptr # cf=0, of=0 3108 je .Lsqrx8x_outer_loop 3109 3110 mov %r9,1*8($tptr) 3111 mov 1*8($carry),%r9 3112 mov %r10,2*8($tptr) 3113 mov 2*8($carry),%r10 3114 mov %r11,3*8($tptr) 3115 mov 3*8($carry),%r11 3116 mov %r12,4*8($tptr) 3117 mov 4*8($carry),%r12 3118 mov %r13,5*8($tptr) 3119 mov 5*8($carry),%r13 3120 mov %r14,6*8($tptr) 3121 mov 6*8($carry),%r14 3122 mov %r15,7*8($tptr) 3123 mov 7*8($carry),%r15 3124 mov $carry,$tptr 3125 jmp .Lsqrx8x_outer_loop 3126 3127.align 32 3128.Lsqrx8x_outer_break: 3129 mov %r9,9*8($tptr) # t[9] 3130 movq %xmm3,%rcx # -$num 3131 mov %r10,10*8($tptr) # ... 3132 mov %r11,11*8($tptr) 3133 mov %r12,12*8($tptr) 3134 mov %r13,13*8($tptr) 3135 mov %r14,14*8($tptr) 3136___ 3137}{ 3138my $i="%rcx"; 3139$code.=<<___; 3140 lea 48+8(%rsp),$tptr 3141 mov ($aptr,$i),%rdx # a[0] 3142 3143 mov 8($tptr),$A0[1] # t[1] 3144 xor $A0[0],$A0[0] # t[0], of=0, cf=0 3145 mov 0+8(%rsp),$num # restore $num 3146 adox $A0[1],$A0[1] 3147 mov 16($tptr),$A1[0] # t[2] # prefetch 3148 mov 24($tptr),$A1[1] # t[3] # prefetch 3149 #jmp .Lsqrx4x_shift_n_add # happens to be aligned 3150 3151.align 32 3152.Lsqrx4x_shift_n_add: 3153 mulx %rdx,%rax,%rbx 3154 adox $A1[0],$A1[0] 3155 adcx $A0[0],%rax 3156 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 3157 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch 3158 adox $A1[1],$A1[1] 3159 adcx $A0[1],%rbx 3160 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 3161 mov %rax,0($tptr) 3162 mov %rbx,8($tptr) 3163 3164 mulx %rdx,%rax,%rbx 3165 adox $A0[0],$A0[0] 3166 adcx $A1[0],%rax 3167 mov 16($aptr,$i),%rdx # a[i+2] # prefetch 3168 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 3169 adox $A0[1],$A0[1] 3170 adcx $A1[1],%rbx 3171 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 3172 mov %rax,16($tptr) 3173 mov %rbx,24($tptr) 3174 3175 mulx %rdx,%rax,%rbx 3176 adox $A1[0],$A1[0] 3177 adcx $A0[0],%rax 3178 mov 24($aptr,$i),%rdx # a[i+3] # prefetch 3179 lea 32($i),$i 3180 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 3181 adox $A1[1],$A1[1] 3182 adcx $A0[1],%rbx 3183 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 3184 mov %rax,32($tptr) 3185 mov %rbx,40($tptr) 3186 3187 mulx %rdx,%rax,%rbx 3188 adox $A0[0],$A0[0] 3189 adcx $A1[0],%rax 3190 jrcxz .Lsqrx4x_shift_n_add_break 3191 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 3192 adox $A0[1],$A0[1] 3193 adcx $A1[1],%rbx 3194 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 3195 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 3196 mov %rax,48($tptr) 3197 mov %rbx,56($tptr) 3198 lea 64($tptr),$tptr 3199 nop 3200 jmp .Lsqrx4x_shift_n_add 3201 3202.align 32 
3203.Lsqrx4x_shift_n_add_break: 3204 adcx $A1[1],%rbx 3205 mov %rax,48($tptr) 3206 mov %rbx,56($tptr) 3207 lea 64($tptr),$tptr # end of t[] buffer 3208___ 3209} 3210###################################################################### 3211# Montgomery reduction part, "word-by-word" algorithm. 3212# 3213# This new path is inspired by multiple submissions from Intel, by 3214# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 3215# Vinodh Gopal... 3216{ 3217my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 3218 3219$code.=<<___; 3220 movq %xmm2,$nptr 3221__bn_sqrx8x_reduction: 3222 xor %eax,%eax # initial top-most carry bit 3223 mov 32+8(%rsp),%rbx # n0 3224 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 3225 lea -8*8($nptr,$num),%rcx # end of n[] 3226 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 3227 mov %rcx, 0+8(%rsp) # save end of n[] 3228 mov $tptr,8+8(%rsp) # save end of t[] 3229 3230 lea 48+8(%rsp),$tptr # initial t[] window 3231 jmp .Lsqrx8x_reduction_loop 3232 3233.align 32 3234.Lsqrx8x_reduction_loop: 3235 mov 8*1($tptr),%r9 3236 mov 8*2($tptr),%r10 3237 mov 8*3($tptr),%r11 3238 mov 8*4($tptr),%r12 3239 mov %rdx,%r8 3240 imulq %rbx,%rdx # n0*a[i] 3241 mov 8*5($tptr),%r13 3242 mov 8*6($tptr),%r14 3243 mov 8*7($tptr),%r15 3244 mov %rax,24+8(%rsp) # store top-most carry bit 3245 3246 lea 8*8($tptr),$tptr 3247 xor $carry,$carry # cf=0,of=0 3248 mov \$-8,%rcx 3249 jmp .Lsqrx8x_reduce 3250 3251.align 32 3252.Lsqrx8x_reduce: 3253 mov %r8, %rbx 3254 mulx 8*0($nptr),%rax,%r8 # n[0] 3255 adcx %rbx,%rax # discarded 3256 adox %r9,%r8 3257 3258 mulx 8*1($nptr),%rbx,%r9 # n[1] 3259 adcx %rbx,%r8 3260 adox %r10,%r9 3261 3262 mulx 8*2($nptr),%rbx,%r10 3263 adcx %rbx,%r9 3264 adox %r11,%r10 3265 3266 mulx 8*3($nptr),%rbx,%r11 3267 adcx %rbx,%r10 3268 adox %r12,%r11 3269 3270 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 3271 mov %rdx,%rax 3272 mov %r8,%rdx 3273 adcx %rbx,%r11 3274 adox %r13,%r12 3275 3276 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3277 mov %rax,%rdx 3278 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3279 3280 mulx 8*5($nptr),%rax,%r13 3281 adcx %rax,%r12 3282 adox %r14,%r13 3283 3284 mulx 8*6($nptr),%rax,%r14 3285 adcx %rax,%r13 3286 adox %r15,%r14 3287 3288 mulx 8*7($nptr),%rax,%r15 3289 mov %rbx,%rdx 3290 adcx %rax,%r14 3291 adox $carry,%r15 # $carry is 0 3292 adcx $carry,%r15 # cf=0 3293 3294 .byte 0x67,0x67,0x67 3295 inc %rcx # of=0 3296 jnz .Lsqrx8x_reduce 3297 3298 mov $carry,%rax # xor %rax,%rax 3299 cmp 0+8(%rsp),$nptr # end of n[]? 
3300 jae .Lsqrx8x_no_tail 3301 3302 mov 48+8(%rsp),%rdx # pull n0*a[0] 3303 add 8*0($tptr),%r8 3304 lea 8*8($nptr),$nptr 3305 mov \$-8,%rcx 3306 adcx 8*1($tptr),%r9 3307 adcx 8*2($tptr),%r10 3308 adc 8*3($tptr),%r11 3309 adc 8*4($tptr),%r12 3310 adc 8*5($tptr),%r13 3311 adc 8*6($tptr),%r14 3312 adc 8*7($tptr),%r15 3313 lea 8*8($tptr),$tptr 3314 sbb %rax,%rax # top carry 3315 3316 xor $carry,$carry # of=0, cf=0 3317 mov %rax,16+8(%rsp) 3318 jmp .Lsqrx8x_tail 3319 3320.align 32 3321.Lsqrx8x_tail: 3322 mov %r8,%rbx 3323 mulx 8*0($nptr),%rax,%r8 3324 adcx %rax,%rbx 3325 adox %r9,%r8 3326 3327 mulx 8*1($nptr),%rax,%r9 3328 adcx %rax,%r8 3329 adox %r10,%r9 3330 3331 mulx 8*2($nptr),%rax,%r10 3332 adcx %rax,%r9 3333 adox %r11,%r10 3334 3335 mulx 8*3($nptr),%rax,%r11 3336 adcx %rax,%r10 3337 adox %r12,%r11 3338 3339 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 3340 adcx %rax,%r11 3341 adox %r13,%r12 3342 3343 mulx 8*5($nptr),%rax,%r13 3344 adcx %rax,%r12 3345 adox %r14,%r13 3346 3347 mulx 8*6($nptr),%rax,%r14 3348 adcx %rax,%r13 3349 adox %r15,%r14 3350 3351 mulx 8*7($nptr),%rax,%r15 3352 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3353 adcx %rax,%r14 3354 adox $carry,%r15 3355 mov %rbx,($tptr,%rcx,8) # save result 3356 mov %r8,%rbx 3357 adcx $carry,%r15 # cf=0 3358 3359 inc %rcx # of=0 3360 jnz .Lsqrx8x_tail 3361 3362 cmp 0+8(%rsp),$nptr # end of n[]? 3363 jae .Lsqrx8x_tail_done # break out of loop 3364 3365 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3366 mov 48+8(%rsp),%rdx # pull n0*a[0] 3367 lea 8*8($nptr),$nptr 3368 adc 8*0($tptr),%r8 3369 adc 8*1($tptr),%r9 3370 adc 8*2($tptr),%r10 3371 adc 8*3($tptr),%r11 3372 adc 8*4($tptr),%r12 3373 adc 8*5($tptr),%r13 3374 adc 8*6($tptr),%r14 3375 adc 8*7($tptr),%r15 3376 lea 8*8($tptr),$tptr 3377 sbb %rax,%rax 3378 sub \$8,%rcx # mov \$-8,%rcx 3379 3380 xor $carry,$carry # of=0, cf=0 3381 mov %rax,16+8(%rsp) 3382 jmp .Lsqrx8x_tail 3383 3384.align 32 3385.Lsqrx8x_tail_done: 3386 xor %rax,%rax 3387 add 24+8(%rsp),%r8 # can this overflow? 3388 adc \$0,%r9 3389 adc \$0,%r10 3390 adc \$0,%r11 3391 adc \$0,%r12 3392 adc \$0,%r13 3393 adc \$0,%r14 3394 adc \$0,%r15 3395 adc \$0,%rax 3396 3397 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3398.Lsqrx8x_no_tail: # %cf is 0 if jumped here 3399 adc 8*0($tptr),%r8 3400 movq %xmm3,%rcx 3401 adc 8*1($tptr),%r9 3402 mov 8*7($nptr),$carry 3403 movq %xmm2,$nptr # restore $nptr 3404 adc 8*2($tptr),%r10 3405 adc 8*3($tptr),%r11 3406 adc 8*4($tptr),%r12 3407 adc 8*5($tptr),%r13 3408 adc 8*6($tptr),%r14 3409 adc 8*7($tptr),%r15 3410 adc \$0,%rax # top-most carry 3411 3412 mov 32+8(%rsp),%rbx # n0 3413 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3414 3415 mov %r8,8*0($tptr) # store top 512 bits 3416 lea 8*8($tptr),%r8 # borrow %r8 3417 mov %r9,8*1($tptr) 3418 mov %r10,8*2($tptr) 3419 mov %r11,8*3($tptr) 3420 mov %r12,8*4($tptr) 3421 mov %r13,8*5($tptr) 3422 mov %r14,8*6($tptr) 3423 mov %r15,8*7($tptr) 3424 3425 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3426 cmp 8+8(%rsp),%r8 # end of t[]? 
3427 jb .Lsqrx8x_reduction_loop 3428 ret 3429.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3430___ 3431} 3432############################################################## 3433# Post-condition, 4x unrolled 3434# 3435{ 3436my ($rptr,$nptr)=("%rdx","%rbp"); 3437$code.=<<___; 3438.align 32 3439__bn_postx4x_internal: 3440 mov 8*0($nptr),%r12 3441 mov %rcx,%r10 # -$num 3442 mov %rcx,%r9 # -$num 3443 neg %rax 3444 sar \$3+2,%rcx 3445 #lea 48+8(%rsp,%r9),$tptr 3446 movq %xmm1,$rptr # restore $rptr 3447 movq %xmm1,$aptr # prepare for back-to-back call 3448 dec %r12 # so that after 'not' we get -n[0] 3449 mov 8*1($nptr),%r13 3450 xor %r8,%r8 3451 mov 8*2($nptr),%r14 3452 mov 8*3($nptr),%r15 3453 jmp .Lsqrx4x_sub_entry 3454 3455.align 16 3456.Lsqrx4x_sub: 3457 mov 8*0($nptr),%r12 3458 mov 8*1($nptr),%r13 3459 mov 8*2($nptr),%r14 3460 mov 8*3($nptr),%r15 3461.Lsqrx4x_sub_entry: 3462 andn %rax,%r12,%r12 3463 lea 8*4($nptr),$nptr 3464 andn %rax,%r13,%r13 3465 andn %rax,%r14,%r14 3466 andn %rax,%r15,%r15 3467 3468 neg %r8 # mov %r8,%cf 3469 adc 8*0($tptr),%r12 3470 adc 8*1($tptr),%r13 3471 adc 8*2($tptr),%r14 3472 adc 8*3($tptr),%r15 3473 mov %r12,8*0($rptr) 3474 lea 8*4($tptr),$tptr 3475 mov %r13,8*1($rptr) 3476 sbb %r8,%r8 # mov %cf,%r8 3477 mov %r14,8*2($rptr) 3478 mov %r15,8*3($rptr) 3479 lea 8*4($rptr),$rptr 3480 3481 inc %rcx 3482 jnz .Lsqrx4x_sub 3483 3484 neg %r9 # restore $num 3485 3486 ret 3487.size __bn_postx4x_internal,.-__bn_postx4x_internal 3488___ 3489} 3490}}} 3491{ 3492my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order 3493 ("%rdi","%esi","%rdx","%ecx"); # Unix order 3494my $out=$inp; 3495my $STRIDE=2**5*8; 3496my $N=$STRIDE/4; 3497 3498$code.=<<___; 3499.globl bn_get_bits5 3500.type bn_get_bits5,\@abi-omnipotent 3501.align 16 3502bn_get_bits5: 3503 lea 0($inp),%r10 3504 lea 1($inp),%r11 3505 mov $num,%ecx 3506 shr \$4,$num 3507 and \$15,%ecx 3508 lea -8(%ecx),%eax 3509 cmp \$11,%ecx 3510 cmova %r11,%r10 3511 cmova %eax,%ecx 3512 movzw (%r10,$num,2),%eax 3513 shrl %cl,%eax 3514 and \$31,%eax 3515 ret 3516.size bn_get_bits5,.-bn_get_bits5 3517 3518.globl bn_scatter5 3519.type bn_scatter5,\@abi-omnipotent 3520.align 16 3521bn_scatter5: 3522 cmp \$0, $num 3523 jz .Lscatter_epilogue 3524 lea ($tbl,$idx,8),$tbl 3525.Lscatter: 3526 mov ($inp),%rax 3527 lea 8($inp),$inp 3528 mov %rax,($tbl) 3529 lea 32*8($tbl),$tbl 3530 sub \$1,$num 3531 jnz .Lscatter 3532.Lscatter_epilogue: 3533 ret 3534.size bn_scatter5,.-bn_scatter5 3535 3536.globl bn_gather5 3537.type bn_gather5,\@abi-omnipotent 3538.align 32 3539bn_gather5: 3540.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases 3541 # I can't trust assembler to use specific encoding:-( 3542 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 3543 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp 3544 lea .Linc(%rip),%rax 3545 and \$-16,%rsp # shouldn't be formally required 3546 3547 movd $idx,%xmm5 3548 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 3549 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 3550 lea 128($tbl),%r11 # size optimization 3551 lea 128(%rsp),%rax # size optimization 3552 3553 pshufd \$0,%xmm5,%xmm5 # broadcast $idx 3554 movdqa %xmm1,%xmm4 3555 movdqa %xmm1,%xmm2 3556___ 3557######################################################################## 3558# calculate mask by comparing 0..31 to $idx and save result to stack 3559# 3560for($i=0;$i<$STRIDE/16;$i+=4) { 3561$code.=<<___; 3562 paddd %xmm0,%xmm1 3563 pcmpeqd %xmm5,%xmm0 # compare to 1,0 3564___ 3565$code.=<<___ if ($i); 
3566 movdqa %xmm3,`16*($i-1)-128`(%rax) 3567___ 3568$code.=<<___; 3569 movdqa %xmm4,%xmm3 3570 3571 paddd %xmm1,%xmm2 3572 pcmpeqd %xmm5,%xmm1 # compare to 3,2 3573 movdqa %xmm0,`16*($i+0)-128`(%rax) 3574 movdqa %xmm4,%xmm0 3575 3576 paddd %xmm2,%xmm3 3577 pcmpeqd %xmm5,%xmm2 # compare to 5,4 3578 movdqa %xmm1,`16*($i+1)-128`(%rax) 3579 movdqa %xmm4,%xmm1 3580 3581 paddd %xmm3,%xmm0 3582 pcmpeqd %xmm5,%xmm3 # compare to 7,6 3583 movdqa %xmm2,`16*($i+2)-128`(%rax) 3584 movdqa %xmm4,%xmm2 3585___ 3586} 3587$code.=<<___; 3588 movdqa %xmm3,`16*($i-1)-128`(%rax) 3589 jmp .Lgather 3590 3591.align 32 3592.Lgather: 3593 pxor %xmm4,%xmm4 3594 pxor %xmm5,%xmm5 3595___ 3596for($i=0;$i<$STRIDE/16;$i+=4) { 3597$code.=<<___; 3598 movdqa `16*($i+0)-128`(%r11),%xmm0 3599 movdqa `16*($i+1)-128`(%r11),%xmm1 3600 movdqa `16*($i+2)-128`(%r11),%xmm2 3601 pand `16*($i+0)-128`(%rax),%xmm0 3602 movdqa `16*($i+3)-128`(%r11),%xmm3 3603 pand `16*($i+1)-128`(%rax),%xmm1 3604 por %xmm0,%xmm4 3605 pand `16*($i+2)-128`(%rax),%xmm2 3606 por %xmm1,%xmm5 3607 pand `16*($i+3)-128`(%rax),%xmm3 3608 por %xmm2,%xmm4 3609 por %xmm3,%xmm5 3610___ 3611} 3612$code.=<<___; 3613 por %xmm5,%xmm4 3614 lea $STRIDE(%r11),%r11 3615 pshufd \$0x4e,%xmm4,%xmm0 3616 por %xmm4,%xmm0 3617 movq %xmm0,($out) # m0=bp[0] 3618 lea 8($out),$out 3619 sub \$1,$num 3620 jnz .Lgather 3621 3622 lea (%r10),%rsp 3623 ret 3624.LSEH_end_bn_gather5: 3625.size bn_gather5,.-bn_gather5 3626___ 3627} 3628$code.=<<___; 3629.align 64 3630.Linc: 3631 .long 0,0, 1,1 3632 .long 2,2, 2,2 3633.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3634___ 3635 3636# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3637# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3638if ($win64) { 3639$rec="%rcx"; 3640$frame="%rdx"; 3641$context="%r8"; 3642$disp="%r9"; 3643 3644$code.=<<___; 3645.extern __imp_RtlVirtualUnwind 3646.type mul_handler,\@abi-omnipotent 3647.align 16 3648mul_handler: 3649 push %rsi 3650 push %rdi 3651 push %rbx 3652 push %rbp 3653 push %r12 3654 push %r13 3655 push %r14 3656 push %r15 3657 pushfq 3658 sub \$64,%rsp 3659 3660 mov 120($context),%rax # pull context->Rax 3661 mov 248($context),%rbx # pull context->Rip 3662 3663 mov 8($disp),%rsi # disp->ImageBase 3664 mov 56($disp),%r11 # disp->HandlerData 3665 3666 mov 0(%r11),%r10d # HandlerData[0] 3667 lea (%rsi,%r10),%r10 # end of prologue label 3668 cmp %r10,%rbx # context->Rip<end of prologue label 3669 jb .Lcommon_seh_tail 3670 3671 mov 4(%r11),%r10d # HandlerData[1] 3672 lea (%rsi,%r10),%r10 # epilogue label 3673 cmp %r10,%rbx # context->Rip>=epilogue label 3674 jb .Lcommon_pop_regs 3675 3676 mov 152($context),%rax # pull context->Rsp 3677 3678 mov 8(%r11),%r10d # HandlerData[2] 3679 lea (%rsi,%r10),%r10 # epilogue label 3680 cmp %r10,%rbx # context->Rip>=epilogue label 3681 jae .Lcommon_seh_tail 3682 3683 lea .Lmul_epilogue(%rip),%r10 3684 cmp %r10,%rbx 3685 ja .Lbody_40 3686 3687 mov 192($context),%r10 # pull $num 3688 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 3689 3690 jmp .Lcommon_pop_regs 3691 3692.Lbody_40: 3693 mov 40(%rax),%rax # pull saved stack pointer 3694.Lcommon_pop_regs: 3695 mov -8(%rax),%rbx 3696 mov -16(%rax),%rbp 3697 mov -24(%rax),%r12 3698 mov -32(%rax),%r13 3699 mov -40(%rax),%r14 3700 mov -48(%rax),%r15 3701 mov %rbx,144($context) # restore context->Rbx 3702 mov %rbp,160($context) # restore context->Rbp 3703 mov %r12,216($context) # restore context->R12 3704 mov %r13,224($context) # restore context->R13 3705 mov 
%r14,232($context) # restore context->R14 3706 mov %r15,240($context) # restore context->R15 3707 3708.Lcommon_seh_tail: 3709 mov 8(%rax),%rdi 3710 mov 16(%rax),%rsi 3711 mov %rax,152($context) # restore context->Rsp 3712 mov %rsi,168($context) # restore context->Rsi 3713 mov %rdi,176($context) # restore context->Rdi 3714 3715 mov 40($disp),%rdi # disp->ContextRecord 3716 mov $context,%rsi # context 3717 mov \$154,%ecx # sizeof(CONTEXT) 3718 .long 0xa548f3fc # cld; rep movsq 3719 3720 mov $disp,%rsi 3721 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3722 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3723 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3724 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3725 mov 40(%rsi),%r10 # disp->ContextRecord 3726 lea 56(%rsi),%r11 # &disp->HandlerData 3727 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3728 mov %r10,32(%rsp) # arg5 3729 mov %r11,40(%rsp) # arg6 3730 mov %r12,48(%rsp) # arg7 3731 mov %rcx,56(%rsp) # arg8, (NULL) 3732 call *__imp_RtlVirtualUnwind(%rip) 3733 3734 mov \$1,%eax # ExceptionContinueSearch 3735 add \$64,%rsp 3736 popfq 3737 pop %r15 3738 pop %r14 3739 pop %r13 3740 pop %r12 3741 pop %rbp 3742 pop %rbx 3743 pop %rdi 3744 pop %rsi 3745 ret 3746.size mul_handler,.-mul_handler 3747 3748.section .pdata 3749.align 4 3750 .rva .LSEH_begin_bn_mul_mont_gather5 3751 .rva .LSEH_end_bn_mul_mont_gather5 3752 .rva .LSEH_info_bn_mul_mont_gather5 3753 3754 .rva .LSEH_begin_bn_mul4x_mont_gather5 3755 .rva .LSEH_end_bn_mul4x_mont_gather5 3756 .rva .LSEH_info_bn_mul4x_mont_gather5 3757 3758 .rva .LSEH_begin_bn_power5 3759 .rva .LSEH_end_bn_power5 3760 .rva .LSEH_info_bn_power5 3761 3762 .rva .LSEH_begin_bn_from_mont8x 3763 .rva .LSEH_end_bn_from_mont8x 3764 .rva .LSEH_info_bn_from_mont8x 3765___ 3766$code.=<<___ if ($addx); 3767 .rva .LSEH_begin_bn_mulx4x_mont_gather5 3768 .rva .LSEH_end_bn_mulx4x_mont_gather5 3769 .rva .LSEH_info_bn_mulx4x_mont_gather5 3770 3771 .rva .LSEH_begin_bn_powerx5 3772 .rva .LSEH_end_bn_powerx5 3773 .rva .LSEH_info_bn_powerx5 3774___ 3775$code.=<<___; 3776 .rva .LSEH_begin_bn_gather5 3777 .rva .LSEH_end_bn_gather5 3778 .rva .LSEH_info_bn_gather5 3779 3780.section .xdata 3781.align 8 3782.LSEH_info_bn_mul_mont_gather5: 3783 .byte 9,0,0,0 3784 .rva mul_handler 3785 .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] 3786.align 8 3787.LSEH_info_bn_mul4x_mont_gather5: 3788 .byte 9,0,0,0 3789 .rva mul_handler 3790 .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 3791.align 8 3792.LSEH_info_bn_power5: 3793 .byte 9,0,0,0 3794 .rva mul_handler 3795 .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] 3796.align 8 3797.LSEH_info_bn_from_mont8x: 3798 .byte 9,0,0,0 3799 .rva mul_handler 3800 .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] 3801___ 3802$code.=<<___ if ($addx); 3803.align 8 3804.LSEH_info_bn_mulx4x_mont_gather5: 3805 .byte 9,0,0,0 3806 .rva mul_handler 3807 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] 3808.align 8 3809.LSEH_info_bn_powerx5: 3810 .byte 9,0,0,0 3811 .rva mul_handler 3812 .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] 3813___ 3814$code.=<<___; 3815.align 8 3816.LSEH_info_bn_gather5: 3817 .byte 0x01,0x0b,0x03,0x0a 3818 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 3819 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) 3820.align 8 3821___ 3822} 3823 3824$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3825 3826print $code; 3827close STDOUT; 3828
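
# ====================================================================
# Editor-supplied reference sketches.  They are not used by the code
# generator above and are never called; they only restate, in plain
# Perl, two ideas the assembly implements.  The names mont_reduce_ref
# and gather5_ref are illustrative and exist nowhere else.
# ====================================================================

# Word-by-word Montgomery reduction, the operation __bn_sqrx8x_reduction
# performs: for each of the $words low words of t, a multiple of N is
# added so that that word becomes zero; dropping the zeroed low half and
# doing at most one final subtraction yields t*R^-1 mod N, where
# R = 2^($wbits*$words).  $n0 is -N^-1 mod 2^$wbits, as stored in
# BN_MONT_CTX, and the input must satisfy t < N*R.
use Math::BigInt;

sub mont_reduce_ref {
	my ($t_in, $N, $n0, $words, $wbits) = @_;
	my $t    = Math::BigInt->new($t_in);
	my $mod  = Math::BigInt->new($N);
	my $mask = Math::BigInt->new(1)->blsft($wbits)->bsub(1);
	for my $i (0 .. $words-1) {
		my $ti = $t->copy->brsft($wbits*$i)->band($mask);	# t[i]
		my $m  = $ti->bmul($n0)->band($mask);			# m = t[i]*n0 mod 2^wbits
		$t->badd($m->bmul($mod)->blsft($wbits*$i));		# t += m*N << (wbits*i)
	}
	$t->brsft($wbits*$words);			# divide by R; low words are now zero
	$t->bsub($mod) if $t->bcmp($mod) >= 0;		# conditional final subtraction
	return $t;					# == t_in * R^-1 mod N
}
# Pass large operands as strings (or Math::BigInt objects) so that no
# precision is lost when Math::BigInt->new() converts them.

# Cache-neutral selection, the idea behind bn_gather5: every one of the
# 32 candidate slots is read and masked, so the sequence of memory
# accesses does not depend on the secret index.  Shown for a single
# word; the real table interleaves the 32 values word by word at a
# 256-byte stride.
sub gather5_ref {
	my ($table, $idx) = @_;			# $table: ref to array of 32 values
	my $acc = 0;
	for my $i (0 .. 31) {
		my $mask = ($i == $idx) ? ~0 : 0;	# pcmpeqd analogue
		$acc |= $table->[$i] & $mask;		# pand/por accumulation
	}
	return $acc;
}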