#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such a manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
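# The interleaved layout is easiest to see in a rough C sketch (a
# hypothetical helper for illustration only, not part of this file or
# of bn_exp.c; BN_ULONG is assumed to be a 64-bit limb):
#
#	typedef unsigned long long BN_ULONG;
#
#	/* limb j of the power selected by idx, 0 <= idx < 32 */
#	BN_ULONG gather5_limb(const BN_ULONG *bp, int j, int idx)
#	{
#		return bp[j*32 + idx];	/* 32 == 2^5, the window size */
#	}
#
# A constant-time implementation must not index the table with the
# secret idx directly; the movq/pand/por sequences below instead read
# all cache lines covering the 32 candidate limbs and extract the
# selected one with masks.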
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$7,${num}d
	jnz	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
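	##############################################################
	# Each pass of .Louter below is one word-by-word Montgomery
	# step. A rough C model of a single pass (illustration only,
	# not part of the build; relies on the GCC/Clang
	# unsigned __int128 extension; n0 is -n^-1 mod 2^64 and tp
	# holds num+1 limbs, zeroed before the first pass):
	#
	#	typedef unsigned long long BN_ULONG;
	#	typedef unsigned __int128 u128;
	#
	#	static void mont_step(BN_ULONG *tp, const BN_ULONG *ap,
	#	                      const BN_ULONG *np, BN_ULONG bi,
	#	                      BN_ULONG n0, int num)
	#	{
	#		u128 ta = (u128)ap[0]*bi + tp[0];
	#		BN_ULONG m1 = (BN_ULONG)ta * n0;
	#		u128 tn = (u128)np[0]*m1 + (BN_ULONG)ta; /* low limb 0 */
	#		int j;
	#		for (j = 1; j < num; j++) {
	#			ta = (u128)ap[j]*bi + tp[j] + (BN_ULONG)(ta >> 64);
	#			tn = (u128)np[j]*m1 + (BN_ULONG)ta + (BN_ULONG)(tn >> 64);
	#			tp[j-1] = (BN_ULONG)tn;
	#		}
	#		tn = (tn >> 64) + (ta >> 64) + tp[num];
	#		tp[num-1] = (BN_ULONG)tn;
	#		tp[num]   = (BN_ULONG)(tn >> 64); /* upmost overflow bit */
	#	}
	#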
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
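	##############################################################
	# The tail below subtracts n into rp unconditionally and then
	# selects, branch-free, between tp and the difference
	# according to the borrow. A rough C model (illustration only,
	# not part of the build; BN_ULONG/u128 as in the sketch before
	# .Louter; borrow=0 with tp[num]=1 cannot occur because the
	# reduced value is below 2*n):
	#
	#	static void mont_final_sub(BN_ULONG *rp, BN_ULONG *tp,
	#	                           const BN_ULONG *np, int num)
	#	{
	#		BN_ULONG borrow = 0, mask;
	#		int i;
	#		for (i = 0; i < num; i++) {
	#			u128 d = (u128)tp[i] - np[i] - borrow;
	#			rp[i]  = (BN_ULONG)d;
	#			borrow = (BN_ULONG)(d >> 127);	/* 1 on borrow */
	#		}
	#		/* all-ones iff tp (with its overflow limb) < n */
	#		mask = 0 - (borrow - tp[num]);
	#		for (i = 0; i < num; i++) {
	#			rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
	#			tp[i] = 0;	/* zap temporary vector */
	#		}
	#	}
	#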
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d
	shl	\$3+2,%r10d		# 4*$num
	neg	$num			# -$num

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic do its magic. [excessive frame is allocated in order
	# to allow bn_from_mont8x to clear it.]
	#
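	# A rough C model of the placement logic below (illustration
	# only, not part of the build; nb stands for num in bytes):
	#
	#	#include <stddef.h>
	#	#include <stdint.h>
	#
	#	static unsigned char *place_frame(unsigned char *sp,
	#	                                  const unsigned char *ap,
	#	                                  size_t nb)
	#	{
	#		size_t frame = 2*nb + 64;
	#		size_t diff  = (size_t)(sp - frame - ap) & 4095;
	#
	#		if (diff > 4*nb) {	/* far apart already */
	#			sp -= frame;
	#			if (diff >= 4096 - frame)
	#				sp -= diff - (4096 - frame);
	#		} else {		/* give both the same */
	#			sp -= diff;	/* offset modulo 4096 */
	#			sp -= frame;
	#		}
	#		return (unsigned char *)((uintptr_t)sp &
	#		                         ~(uintptr_t)63);
	#	}
	#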
	lea	-64(%rsp,$num,2),%r11
	sub	$ap,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmul4xsp_alt
	sub	%r11,%rsp		# align with $ap
	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	lea	4096-64(,$num,2),%r10
	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lmul4xsp_done:
	and	\$-64,%rsp
	neg	$num

	mov	%rax,40(%rsp)
.Lmul4x_body:

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

.type	mul4x_internal,\@abi-omnipotent
.align	32
mul4x_internal:
	shl	\$5,$num
	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
	lea	256(%rdx,$num),%r13
	shr	\$5,$num		# restore $num
___
	$bp="%r12";
	$STRIDE=2**5*8;			# 5 is "window size"
	$N=$STRIDE/4;			# should match cache line size
	$tp=$i;
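# For the 4x paths the caller interleaves the modulus with zero limbs
# (see the August 2013 note above), which is why n[j] is addressed as
# 16*j off the np pointer. A rough C model of the layout (illustration
# only; npi is the interleaved vector actually passed in):
#
#	static void interleave_n(BN_ULONG *npi, const BN_ULONG *np,
#	                         int num)
#	{
#		int i;
#		for (i = 0; i < num; i++) {
#			npi[2*i]   = np[i];	/* byte offset 16*i */
#			npi[2*i+1] = 0;
#		}
#	}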
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	add	\$7,%r11
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7
	and	\$7,%r11

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	lea	$STRIDE($bp),$tp	# borrow $tp
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	.byte	0x67
	por	%xmm1,%xmm0
	movq	`0*$STRIDE/4-96`($tp),%xmm1
	.byte	0x67
	pand	%xmm7,%xmm3
	.byte	0x67
	por	%xmm2,%xmm0
	movq	`1*$STRIDE/4-96`($tp),%xmm2
	.byte	0x67
	pand	%xmm4,%xmm1
	.byte	0x67
	por	%xmm3,%xmm0
	movq	`2*$STRIDE/4-96`($tp),%xmm3

	movq	%xmm0,$m0		# m0=bp[0]
	movq	`3*$STRIDE/4-96`($tp),%xmm0
	mov	%r13,16+8(%rsp)		# save end of b[num]
	mov	$rp, 56+8(%rsp)		# save $rp

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax
	lea	($ap,$num),$ap		# end of a[num]
	neg	$num

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	pand	%xmm5,%xmm2
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	$A[0],$m1		# "tp[0]"*n0
	##############################################################
	# $tp is chosen so that writing to top-most element of the
	# vector occurs just "above" references to powers table,
	# "above" modulo cache-line size, which effectively precludes
	# possibility of memory disambiguation logic failure when
	# accessing the table.
	#
	lea	64+8(%rsp,%r11,8),$tp
	mov	%rdx,$A[1]

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	lea	2*$STRIDE($bp),$bp
	por	%xmm1,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap,$num),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap,$num),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4*8($num),$j		# j=4
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[1],($tp)
	mov	%rdx,$N[0]
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	16*0($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[1],($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	add	\$32,$j			# j+=4
	jnz	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$num),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[1]
	lea	($np,$num,2),$np	# rewind $np

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8($tp)

	jmp	.Louter4x

.align	32
.Louter4x:
	mov	($tp,$num),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3

	imulq	$A[0],$m1		# tp[0]*n0
	.byte	0x67
	mov	%rdx,$A[1]
	mov	$N[1],($tp)		# store upmost overflow bit

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	($tp,$num),$tp		# rewind $tp
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap,$num),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
	adc	\$0,%rdx
	add	8($tp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$num),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4*8($num),$j		# j=4
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	adc	\$0,%rdx
	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-16*1($np),%rax
	adc	\$0,%rdx
	add	-8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	16*0($np),%rax
	adc	\$0,%rdx
	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	16*1($np),%rax
	adc	\$0,%rdx
	add	8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap,$j),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	16*4($np),$np
	adc	\$0,%rdx
	mov	$N[0],-8($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	add	\$32,$j			# j+=4
	jnz	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16*2($np),%rax
	adc	\$0,%rdx
	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
	lea	32($tp),$tp
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32($tp)		# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	$m1,%rax
	mov	-16*1($np),$m1
	adc	\$0,%rdx
	add	-8($tp),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$num),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24($tp)		# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[i+1]
	mov	$N[1],-16($tp)		# tp[j-1]
	lea	($np,$num,2),$np	# rewind $np

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	($tp),$N[0]		# pull upmost overflow bit
	adc	\$0,$N[1]		# upmost overflow bit
	mov	$N[0],-8($tp)

	cmp	16+8(%rsp),$bp
	jb	.Louter4x
___
if (1) {
$code.=<<___;
	sub	$N[0],$m1		# compare top-most words
	adc	$j,$j			# $j is zero
	or	$j,$N[1]
	xor	\$1,$N[1]
	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
	mov	%r9,%rcx
	sar	\$3+2,%rcx		# cf=0
	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
	jmp	.Lsqr4x_sub
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
my $rp="%rdx";
$code.=<<___;
	xor	\$1,$N[1]
	lea	($tp,$num),$tp		# rewind $tp
	sar	\$5,$num		# cf=0
	lea	($np,$N[1],8),$np
	mov	56+8(%rsp),$rp		# restore $rp
	jmp	.Lsub4x

.align	32
.Lsub4x:
	.byte	0x66
	mov	8*0($tp),@ri[0]
	mov	8*1($tp),@ri[1]
	.byte	0x66
	sbb	16*0($np),@ri[0]
	mov	8*2($tp),@ri[2]
	sbb	16*1($np),@ri[1]
	mov	3*8($tp),@ri[3]
	lea	4*8($tp),$tp
	sbb	16*2($np),@ri[2]
	mov	@ri[0],8*0($rp)
	sbb	16*3($np),@ri[3]
	lea	16*4($np),$np
	mov	@ri[1],8*1($rp)
	mov	@ri[2],8*2($rp)
	mov	@ri[3],8*3($rp)
	lea	8*4($rp),$rp

	inc	$num
	jnz	.Lsub4x

	ret
___
}
$code.=<<___;
.size	mul4x_internal,.-mul4x_internal
___
}}}
{{{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___;
.globl	bn_power5
.type	bn_power5,\@function,6
.align	32
bn_power5:
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lpowerx5_enter
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic do its magic.
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwr_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lpwr_sp_done:
	and	\$-64,%rsp
	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lpower5_body:
	movq	$rptr,%xmm1		# save $rptr
	movq	$nptr,%xmm2		# save $nptr
	movq	%r10, %xmm3		# -$num
	movq	$bptr,%xmm4
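	##############################################################
	# Five back-to-back modular squarings followed by one
	# multiplication by a gathered power implement a 5-bit window
	# step of the constant-time exponentiation, conceptually
	# out = a^(2^5) * table[pwr] mod n, with everything kept in
	# Montgomery representation. A toy single-limb C model
	# (illustration only; valid for n below 2^32 so the products
	# cannot overflow):
	#
	#	unsigned long long power5_step(unsigned long long a,
	#	                               unsigned long long b,
	#	                               unsigned long long n)
	#	{
	#		int i;
	#		for (i = 0; i < 5; i++)
	#			a = (a * a) % n;	/* 5 squarings */
	#		return (a * b) % n;	/* times table[pwr] */
	#	}
	#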
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal

	movq	%xmm2,$nptr
	movq	%xmm4,$bptr
	mov	$aptr,$rptr
	mov	40(%rsp),%rax
	lea	32(%rsp),$n0

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lpower5_epilogue:
	ret
.size	bn_power5,.-bn_power5

.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,\@abi-omnipotent
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:
	##############################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	##############################################################
	# a[1]a[0]
	# a[2]a[0]
	# a[3]a[0]
	# a[2]a[1]
	# a[4]a[0]
	# a[3]a[1]
	# a[5]a[0]
	# a[4]a[1]
	# a[3]a[2]
	# a[6]a[0]
	# a[5]a[1]
	# a[4]a[2]
	# a[7]a[0]
	# a[6]a[1]
	# a[5]a[2]
	# a[4]a[3]
	# a[7]a[1]
	# a[6]a[2]
	# a[5]a[3]
	# a[7]a[2]
	# a[6]a[3]
	# a[5]a[4]
	# a[7]a[3]
	# a[6]a[4]
	# a[7]a[4]
	# a[6]a[5]
	# a[7]a[5]
	# a[7]a[6]
	# a[1]a[0]
	# a[2]a[0]
	# a[3]a[0]
	# a[4]a[0]
	# a[5]a[0]
	# a[6]a[0]
	# a[7]a[0]
	# a[2]a[1]
	# a[3]a[1]
	# a[4]a[1]
	# a[5]a[1]
	# a[6]a[1]
	# a[7]a[1]
	# a[3]a[2]
	# a[4]a[2]
	# a[5]a[2]
	# a[6]a[2]
	# a[7]a[2]
	# a[4]a[3]
	# a[5]a[3]
	# a[6]a[3]
	# a[7]a[3]
	# a[5]a[4]
	# a[6]a[4]
	# a[7]a[4]
	# a[6]a[5]
	# a[7]a[5]
	# a[7]a[6]
	# a[0]a[0]
	# a[1]a[1]
	# a[2]a[2]
	# a[3]a[3]
	# a[4]a[4]
	# a[5]a[5]
	# a[6]a[6]
	# a[7]a[7]
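	##############################################################
	# A rough C reference for the a)/b) strategy above
	# (illustration only, not part of the build; uses the
	# GCC/Clang unsigned __int128 extension):
	#
	#	typedef unsigned long long u64;
	#	typedef unsigned __int128 u128;
	#
	#	/* t += v * 2^(64*pos) */
	#	static void add_at(u64 *t, int pos, u128 v)
	#	{
	#		while (v) {
	#			v += t[pos];
	#			t[pos++] = (u64)v;
	#			v >>= 64;
	#		}
	#	}
	#
	#	/* t[0..2n-1] = a^2, t zeroed on entry */
	#	static void sqr_ref(u64 *t, const u64 *a, int n)
	#	{
	#		int i, j;
	#		u64 c = 0;
	#		for (i = 0; i < n; i++)		/* step a) */
	#			for (j = i+1; j < n; j++)
	#				add_at(t, i+j, (u128)a[i]*a[j]);
	#		for (i = 0; i < 2*n; i++) {	/* shift by 1 */
	#			u64 nc = t[i] >> 63;
	#			t[i] = (t[i] << 1) | c;
	#			c = nc;
	#		}
	#		for (i = 0; i < n; i++)		/* step b) */
	#			add_at(t, 2*i, (u128)a[i]*a[i]);
	#	}
	#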

	lea	32(%r10),$i		# $i=-($num-32)
	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]

	mov	$num,$j			# $j=$num

					# comments apply to $num==8 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	mov	%rax,$A0[0]		# a[1]*a[0]
	mov	$ai,%rax		# a[2]
	mov	%rdx,$A0[1]
	mov	$A0[0],-24($tptr,$i)	# t[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	adc	\$0,%rdx
	mov	$A0[1],-16($tptr,$i)	# t[2]
	mov	%rdx,$A0[0]


	mov	-8($aptr,$i),$ai	# a[3]
	mul	$a1			# a[2]*a[1]
	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
	mov	$ai,%rax
	mov	%rdx,$A1[1]

	lea	($i),$j
	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[3]
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	mov	($aptr,$j),$ai		# a[4]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	mov	$ai,%rax
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]

	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	mov	$ai,%rax		# a[3]
	mov	8($aptr,$j),$ai		# a[5]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]


	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	mov	$ai,%rax
	mov	$A0[1],($tptr,$j)	# t[4]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	mov	$ai,%rax
	mov	16($aptr,$j),$ai	# a[6]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]

	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
	mov	$ai,%rax
	mov	$A0[0],8($tptr,$j)	# t[5]
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]

	mul	$a0			# a[6]*a[2]
	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
	mov	$ai,%rax		# a[3]
	mov	24($aptr,$j),$ai	# a[7]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]


	mul	$a1			# a[6]*a[5]
	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
	mov	$ai,%rax
	mov	$A0[1],16($tptr,$j)	# t[6]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]
	lea	32($j),$j

	mul	$a0			# a[7]*a[4]
	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
	mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[7]

	cmp	\$0,$j
	jne	.Lsqr4x_1st

	mul	$a1			# a[7]*a[5]
	add	%rax,$A1[1]
	lea	16($i),$i
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[8]
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[9]
	jmp	.Lsqr4x_outer

.align	32
.Lsqr4x_outer:				# comments apply to $num==6 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	mov	-24($tptr,$i),$A0[0]	# t[1]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
	mov	$ai,%rax		# a[2]
	adc	\$0,%rdx
	mov	$A0[0],-24($tptr,$i)	# t[1]
	mov	%rdx,$A0[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	mov	$A0[1],-16($tptr,$i)	# t[2]

	xor	$A1[0],$A1[0]

	mov	-8($aptr,$i),$ai	# a[3]
	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	-8($tptr,$i),$A1[0]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	$A1[0],$A0[0]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$i)	# t[3]

	lea	($i),$j
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	mov	($aptr,$j),$ai		# a[4]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	mov	$ai,%rax
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]
	add	($tptr,$j),$A1[1]
	adc	\$0,$A1[0]

	.byte	0x67
	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	mov	$ai,%rax		# a[3]
	mov	8($aptr,$j),$ai		# a[5]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]

	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	mov	$A0[1],($tptr,$j)	# t[4]
	mov	$ai,%rax
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]
	add	8($tptr,$j),$A1[0]
	lea	16($j),$j		# j++
	adc	\$0,$A1[1]

	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	$A1[0],$A0[0]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below

	cmp	\$0,$j
	jne	.Lsqr4x_inner

	.byte	0x67
	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below

	add	\$16,$i
	jnz	.Lsqr4x_outer

					# comments apply to $num==4 case
	mov	-32($aptr),$a0		# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr),%rax		# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr),$ai		# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
	mov	$ai,%rax		# a[2]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	mov	$A0[0],-24($tptr)	# t[1]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
	mov	-8($aptr),$ai		# a[3]
	adc	\$0,$A0[0]

	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
	mov	$ai,%rax
	mov	$A0[1],-16($tptr)	# t[2]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr)	# t[3]

	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]
	mov	-16($aptr),%rax		# a[2]
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[4]
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[5]

	mul	$ai			# a[2]*a[3]
___
{
my ($shift,$carry)=($a0,$a1);
my @S=(@A1,$ai,$n0);
$code.=<<___;
	add	\$16,$i
	xor	$shift,$shift
	sub	$num,$i			# $i=16-$num
	xor	$carry,$carry

	add	$A1[0],%rax		# t[5]
	adc	\$0,%rdx
	mov	%rax,8($tptr)		# t[5]
	mov	%rdx,16($tptr)		# t[6]
	mov	$carry,24($tptr)	# t[7]

	mov	-16($aptr,$i),%rax	# a[0]
	lea	48+8(%rsp),$tptr
	xor	$A0[0],$A0[0]		# t[0]
	mov	8($tptr),$A0[1]		# t[1]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],8($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	lea	16($i),$i
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],-24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],-16($tptr)
	adc	%rdx,$S[3]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	mov	$S[3],-8($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],0($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],8($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	add	\$32,$i
	jnz	.Lsqr4x_shift_n_add

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	.byte	0x67
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr),%rax		# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
	mov	$S[1],-24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	adc	%rax,$S[2]
	adc	%rdx,$S[3]
	mov	$S[2],-16($tptr)
	mov	$S[3],-8($tptr)
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
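#
# A rough C model of the word-by-word reduction (illustration only,
# not part of the build; BN_ULONG/u128 as in the earlier sketches,
# n0 is -n^-1 mod 2^64, tp holds the 2*num-limb square):
#
#	static void mont_reduce(BN_ULONG *tp, const BN_ULONG *np,
#	                        BN_ULONG n0, int num)
#	{
#		BN_ULONG carry = 0;
#		int i, j;
#		for (i = 0; i < num; i++) {
#			BN_ULONG m = tp[i] * n0;
#			u128 t = (u128)np[0]*m + tp[i];	/* low limb 0 */
#			u128 top;
#			for (j = 1; j < num; j++) {
#				t = (u128)np[j]*m + tp[i+j]
#				  + (BN_ULONG)(t >> 64);
#				tp[i+j] = (BN_ULONG)t;
#			}
#			top = (u128)tp[i+num] + (BN_ULONG)(t >> 64) + carry;
#			tp[i+num] = (BN_ULONG)top;
#			carry = (BN_ULONG)(top >> 64);
#		}
#		/* result: tp[num..2*num-1] plus the top carry bit,
#		   subject to the final conditional subtraction */
#	}
#
# The assembly below does the same thing eight limbs at a time, with
# the eight reduction multipliers (n0*a[i]) stashed per pass for the
# tail loops.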
{
my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");

$code.=<<___;
	movq	%xmm2,$nptr
sqr8x_reduction:
	xor	%rax,%rax
	lea	($nptr,$num,2),%rcx	# end of n[]
	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
	mov	%rcx,0+8(%rsp)
	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
	mov	%rdx,8+8(%rsp)
	neg	$num
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	lea	($tptr,$num),$tptr	# start of current t[] window
	.byte	0x66
	mov	8*0($tptr),$m0
	mov	8*1($tptr),%r9
	mov	8*2($tptr),%r10
	mov	8*3($tptr),%r11
	mov	8*4($tptr),%r12
	mov	8*5($tptr),%r13
	mov	8*6($tptr),%r14
	mov	8*7($tptr),%r15
	mov	%rax,(%rdx)		# store top-most carry bit
	lea	8*8($tptr),$tptr

	.byte	0x67
	mov	$m0,%r8
	imulq	32+8(%rsp),$m0		# n0*a[0]
	mov	16*0($nptr),%rax	# n[0]
	mov	\$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	$m0
	mov	16*1($nptr),%rax	# n[1]
	neg	%r8
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	mov	16*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
	mov	%rdx,%r9
	adc	\$0,%r9

	mulq	$m0
	add	%rax,%r10
	mov	16*3($nptr),%rax
	adc	\$0,%rdx
	add	%r10,%r9
	mov	32+8(%rsp),$carry	# pull n0, borrow $carry
	mov	%rdx,%r10
	adc	\$0,%r10

	mulq	$m0
	add	%rax,%r11
	mov	16*4($nptr),%rax
	adc	\$0,%rdx
	imulq	%r8,$carry		# modulo-scheduled
	add	%r11,%r10
	mov	%rdx,%r11
	adc	\$0,%r11

	mulq	$m0
	add	%rax,%r12
	mov	16*5($nptr),%rax
	adc	\$0,%rdx
	add	%r12,%r11
	mov	%rdx,%r12
	adc	\$0,%r12

	mulq	$m0
	add	%rax,%r13
	mov	16*6($nptr),%rax
	adc	\$0,%rdx
	add	%r13,%r12
	mov	%rdx,%r13
	adc	\$0,%r13

	mulq	$m0
	add	%rax,%r14
	mov	16*7($nptr),%rax
	adc	\$0,%rdx
	add	%r14,%r13
	mov	%rdx,%r14
	adc	\$0,%r14

	mulq	$m0
	mov	$carry,$m0		# n0*a[i]
	add	%rax,%r15
	mov	16*0($nptr),%rax	# n[0]
	adc	\$0,%rdx
	add	%r15,%r14
	mov	%rdx,%r15
	adc	\$0,%r15

	dec	%ecx
	jnz	.L8x_reduce

	lea	16*8($nptr),$nptr
	xor	%rax,%rax
	mov	8+8(%rsp),%rdx		# pull end of t[]
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.L8x_no_tail

	.byte	0x66
	add	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	sbb	$carry,$carry		# top carry

	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
	mov	\$8,%ecx
	mov	16*0($nptr),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	$m0
	add	%rax,%r8
	mov	16*1($nptr),%rax
	mov	%r8,($tptr)		# save result
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	mov	16*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	lea	8($tptr),$tptr		# $tptr++
	mov	%rdx,%r9
	adc	\$0,%r9

	mulq	$m0
	add	%rax,%r10
	mov	16*3($nptr),%rax
	adc	\$0,%rdx
	add	%r10,%r9
	mov	%rdx,%r10
	adc	\$0,%r10

	mulq	$m0
	add	%rax,%r11
	mov	16*4($nptr),%rax
	adc	\$0,%rdx
	add	%r11,%r10
	mov	%rdx,%r11
	adc	\$0,%r11

	mulq	$m0
	add	%rax,%r12
	mov	16*5($nptr),%rax
	adc	\$0,%rdx
	add	%r12,%r11
	mov	%rdx,%r12
	adc	\$0,%r12

	mulq	$m0
	add	%rax,%r13
	mov	16*6($nptr),%rax
	adc	\$0,%rdx
	add	%r13,%r12
	mov	%rdx,%r13
	adc	\$0,%r13

	mulq	$m0
	add	%rax,%r14
	mov	16*7($nptr),%rax
	adc	\$0,%rdx
	add	%r14,%r13
	mov	%rdx,%r14
	adc	\$0,%r14

	mulq	$m0
	mov	48-16+8(%rsp,%rcx,8),$m0	# pull n0*a[i]
	add	%rax,%r15
	adc	\$0,%rdx
	add	%r15,%r14
	mov	16*0($nptr),%rax	# pull n[0]
	mov	%rdx,%r15
	adc	\$0,%r15

	dec	%ecx
	jnz	.L8x_tail

	lea	16*8($nptr),$nptr
	mov	8+8(%rsp),%rdx		# pull end of t[]
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.L8x_tail_done		# break out of loop

	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
	neg	$carry
	mov	8*0($nptr),%rax		# pull n[0]
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	sbb	$carry,$carry		# top carry

	mov	\$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	add	(%rdx),%r8		# can this overflow?
	xor	%rax,%rax

	neg	$carry
.L8x_no_tail:
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	adc	\$0,%rax		# top-most carry
	mov	-16($nptr),%rcx		# np[num-1]
	xor	$carry,$carry

	movq	%xmm2,$nptr		# restore $nptr

	mov	%r8,8*0($tptr)		# store top 512 bits
	mov	%r9,8*1($tptr)
	movq	%xmm3,$num		# $num is %r9, can't be moved upwards
	mov	%r10,8*2($tptr)
	mov	%r11,8*3($tptr)
	mov	%r12,8*4($tptr)
	mov	%r13,8*5($tptr)
	mov	%r14,8*6($tptr)
	mov	%r15,8*7($tptr)
	lea	8*8($tptr),$tptr

	cmp	%rdx,$tptr		# end of t[]?
	jb	.L8x_reduction_loop
___
}
##############################################################
# Post-condition, 4x unrolled
#
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
	#xor	%rsi,%rsi		# %rsi was $carry above
	sub	%r15,%rcx		# compare top-most words
	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
	adc	%rsi,%rsi
	mov	$num,%rcx
	or	%rsi,%rax
	movq	%xmm1,$rptr		# restore $rptr
	xor	\$1,%rax
	movq	%xmm1,$aptr		# prepare for back-to-back call
	lea	($nptr,%rax,8),$nptr
	sar	\$3+2,%rcx		# cf=0
	jmp	.Lsqr4x_sub

.align	32
.Lsqr4x_sub:
	.byte	0x66
	mov	8*0($tptr),%r12
	mov	8*1($tptr),%r13
	sbb	16*0($nptr),%r12
	mov	8*2($tptr),%r14
	sbb	16*1($nptr),%r13
	mov	8*3($tptr),%r15
	lea	8*4($tptr),$tptr
	sbb	16*2($nptr),%r14
	mov	%r12,8*0($rptr)
	sbb	16*3($nptr),%r15
	lea	16*4($nptr),$nptr
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx			# pass %cf
	jnz	.Lsqr4x_sub
___
}
$code.=<<___;
	mov	$num,%r10		# prepare for back-to-back call
	neg	$num			# restore $num
	ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
___
{
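# bn_from_montgomery computes ret = x/R mod n for R = 2^(64*num),
# i.e. it strips one Montgomery factor: the value is multiplied by 1
# (copied into the low half of t[] with a zeroed high half) and then
# run through the same word-by-word reduction. A rough C model
# (illustration only; mont_reduce as sketched above, MAX_LIMBS a
# hypothetical bound, final conditional subtraction omitted):
#
#	#define MAX_LIMBS 64
#
#	static void from_mont(BN_ULONG *ret, const BN_ULONG *x,
#	                      const BN_ULONG *np, BN_ULONG n0, int num)
#	{
#		BN_ULONG tp[2*MAX_LIMBS];
#		int i;
#		for (i = 0; i < num; i++)	tp[i] = x[i];
#		for (i = num; i < 2*num; i++)	tp[i] = 0;
#		mont_reduce(tp, np, n0, num);	/* "mul by 1" + reduce */
#		for (i = 0; i < num; i++)	ret[i] = tp[num+i];
#	}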
$code.=<<___;
.globl	bn_from_montgomery
.type	bn_from_montgomery,\@abi-omnipotent
.align	32
bn_from_montgomery:
	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
	jz	bn_from_mont8x
	xor	%eax,%eax
	ret
.size	bn_from_montgomery,.-bn_from_montgomery

.type	bn_from_mont8x,\@function,6
.align	32
bn_from_mont8x:
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic do its magic.
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lfrom_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lfrom_sp_done:
	and	\$-64,%rsp
	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lfrom_body:
	mov	$num,%r11
	lea	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	movdqu	($aptr),%xmm1
	movdqu	16($aptr),%xmm2
	movdqu	32($aptr),%xmm3
	movdqa	%xmm0,(%rax,$num)
	movdqu	48($aptr),%xmm4
	movdqa	%xmm0,16(%rax,$num)
	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea 64($aptr),$aptr
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,$num)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,$num)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	lea	64(%rax),%rax
	sub	\$64,%r11
	jnz	.Lmul_by_1

	movq	$rptr,%xmm1
	movq	$nptr,%xmm2
	.byte	0x67
	mov	$nptr,%rbp
	movq	%r10, %xmm3		# -num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	jne	.Lfrom_mont_nox

	lea	(%rax,$num),$rptr
	call	sqrx8x_reduction

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
___
$code.=<<___;
	call	sqr8x_reduction

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	movdqa	%xmm0,16*0(%rax)
	movdqa	%xmm0,16*1(%rax)
	movdqa	%xmm0,16*2(%rax)
	movdqa	%xmm0,16*3(%rax)
	lea	16*4(%rax),%rax
	sub	\$32,$num
	jnz	.Lfrom_mont_zero

	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lfrom_epilogue:
	ret
.size	bn_from_mont8x,.-bn_from_mont8x
___
}
}}}

if ($addx) {{{
my $bp="%rdx";	# restore original value

$code.=<<___;
.type	bn_mulx4x_mont_gather5,\@function,6
.align	32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num			# -$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers a[num], ret[num] and n[2*num]
	# (see bn_exp.c). this is done to allow memory disambiguation
	# logic do its magic. [excessive frame is allocated in order
	# to allow bn_from_mont8x to clear it.]
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$ap,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmulx4xsp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
	jmp	.Lmulx4xsp_done

.align	32
.Lmulx4xsp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lmulx4xsp_done:
	and	\$-64,%rsp		# ensure alignment
	##############################################################
	# Stack layout
	# +0	-num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	inner counter
	# +32	saved n0
	# +40	saved %rsp
	# +48
	# +56	saved rp
	# +64	tmp[num+1]
	#
	mov	$n0, 32(%rsp)		# save *n0
	mov	%rax,40(%rsp)		# save original %rsp
.Lmulx4x_body:
	call	mulx4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmulx4x_epilogue:
	ret
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

.type	mulx4x_internal,\@abi-omnipotent
.align	32
mulx4x_internal:
	.byte	0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00	# mov $num,8(%rsp)	# save -$num
	.byte	0x67
	neg	$num			# restore $num
	shl	\$5,$num
	lea	256($bp,$num),%r13
	shr	\$5+5,$num
	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
	sub	\$1,$num
	mov	%r13,16+8(%rsp)		# end of b[num]
	mov	$num,24+8(%rsp)		# inner counter
	mov	$rp, 56+8(%rsp)		# save $rp
___
my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
my $STRIDE=2**5*8;		# 5 is "window size"
my $N=$STRIDE/4;		# should match cache line size
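# The MULX/ADCX/ADOX path keeps two independent carry chains alive at
# once: MULX multiplies without touching flags, ADCX propagates only
# CF and ADOX only OF. A rough intrinsics sketch of a 4-limb
# multiply-accumulate in this style (illustration only, not part of
# the build; requires BMI2+ADX and <immintrin.h>; the compiler is
# free to map the two chains onto CF/OF as the hand-written code
# does):
#
#	#include <immintrin.h>
#	typedef unsigned long long u64;
#
#	/* t[0..4] += a[0..3]*b; t[4] is the freshly started top
#	   limb; the residual corner-case carry out of t[4] is
#	   ignored in this toy, the real code chains it onward with
#	   adcx/adox against zero */
#	static void madd4(u64 *t, const u64 *a, u64 b)
#	{
#		unsigned char c1 = 0, c2 = 0;
#		u64 lo, hi;
#		int i;
#		for (i = 0; i < 4; i++) {
#			lo = _mulx_u64(a[i], b, &hi);
#			c1 = _addcarryx_u64(c1, t[i],   lo, &t[i]);
#			c2 = _addcarryx_u64(c2, t[i+1], hi, &t[i+1]);
#		}
#		t[4] += c1;	/* fold the CF-chain carry */
#	}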
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	add	\$7,%r11
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7
	and	\$7,%r11

	movq	`0*$STRIDE/4-96`($bptr),%xmm0
	lea	$STRIDE($bptr),$tptr	# borrow $tptr
	movq	`1*$STRIDE/4-96`($bptr),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bptr),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bptr),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	movq	`0*$STRIDE/4-96`($tptr),%xmm1
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	movq	`1*$STRIDE/4-96`($tptr),%xmm2
	por	%xmm3,%xmm0
	.byte	0x67,0x67
	pand	%xmm4,%xmm1
	movq	`2*$STRIDE/4-96`($tptr),%xmm3

	movq	%xmm0,%rdx		# bp[0]
	movq	`3*$STRIDE/4-96`($tptr),%xmm0
	lea	2*$STRIDE($bptr),$bptr	# next &b[i]
	pand	%xmm5,%xmm2
	.byte	0x67,0x67
	pand	%xmm6,%xmm3
	##############################################################
	# $tptr is chosen so that writing to top-most element of the
	# vector occurs just "above" references to powers table,
	# "above" modulo cache-line size, which effectively precludes
	# possibility of memory disambiguation logic failure when
	# accessing the table.
	#
	lea	64+8*4+8(%rsp,%r11,8),$tptr

	mov	%rdx,$bi
	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
	add	%rax,%r11
	mulx	2*8($aptr),%rax,%r13	# ...
	adc	%rax,%r12
	adc	\$0,%r13
	mulx	3*8($aptr),%rax,%r14

	mov	$mi,%r15
	imulq	32+8(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0
	mov	$mi,%rdx

	por	%xmm2,%xmm1
	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	mov	$bptr,8+8(%rsp)		# off-load &b[i]
	por	%xmm1,%xmm0

	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea 4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*16($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*16($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*16($nptr),%rax,%r12
	mov	24+8(%rsp),$bptr	# counter value
	.byte	0x66
	mov	%r10,-8*4($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*16($nptr),%rax,%r15
	.byte	0x67,0x67
	mov	$bi,%rdx
	mov	%r11,-8*3($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	.byte	0x48,0x8d,0x89,0x40,0x00,0x00,0x00	# lea 4*16($nptr),$nptr
	mov	%r12,-8*2($tptr)
	#jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*16($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*16($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*16($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*16($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*16($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	8(%rsp),$num		# load -num
	movq	%xmm0,%rdx		# bp[1]
	adc	$zero,%r15		# modulo-scheduled
	lea	($aptr,$num),$aptr	# rewind $aptr
	add	%r15,%r14
	mov	8+8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,$zero		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	$zero,($tptr)		# save top-most carry
	lea	4*8($tptr,$num),$tptr	# rewind $tptr
	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	$zero,$zero		# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi		# +t[0]
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	mulx	3*8($aptr),%rdx,%r14
	adox	-2*8($tptr),%r12
	adcx	%rdx,%r13
	lea	($nptr,$num,2),$nptr	# rewind $nptr
	lea	4*8($aptr),$aptr
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	adox	$zero,%r14

	.byte	0x67
	mov	$mi,%r15
	imulq	32+8(%rsp),$mi		# "t[0]"*n0

	movq	`0*$STRIDE/4-96`($bptr),%xmm0
	.byte	0x67,0x67
	mov	$mi,%rdx
	movq	`1*$STRIDE/4-96`($bptr),%xmm1
	.byte	0x67
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bptr),%xmm2
	.byte	0x67
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bptr),%xmm3
	add	\$$STRIDE,$bptr		# next &b[i]
	.byte	0x67
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	xor	$zero,$zero		# cf=0, of=0
	mov	$bptr,8+8(%rsp)		# off-load &b[i]

	mulx	0*16($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*16($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*16($nptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*16($nptr),%rax,%r15
	mov	$bi,%rdx
	por	%xmm2,%xmm0
	mov	24+8(%rsp),$bptr	# counter value
	mov	%r10,-8*4($tptr)
	por	%xmm3,%xmm0
	adcx	%rax,%r12
	mov	%r11,-8*3($tptr)
	adox	$zero,%r15		# of=0
	mov	%r12,-8*2($tptr)
	lea	4*16($nptr),$nptr
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*16($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*16($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*16($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mov	%r11,-4*8($tptr)
	mulx	3*16($nptr),%rax,%r15
	mov	$bi,%rdx
	lea	4*16($nptr),$nptr
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0+8(%rsp),$num		# load -num
	movq	%xmm0,%rdx		# bp[i+1]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
	mov	8+8(%rsp),$bptr		# re-load &b[i]
	mov	16+8(%rsp),%r10
	adc	%r15,%r14
	lea	($aptr,$num),$aptr	# rewind $aptr
	adc	$zero,$zero		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	%r10,$bptr
	jb	.Lmulx4x_outer

	mov	-16($nptr),%r10
	xor	%r15,%r15
	sub	%r14,%r10		# compare top-most words
	adc	%r15,%r15
	or	%r15,$zero
	xor	\$1,$zero
	lea	($tptr,$num),%rdi	# rewind $tptr
	lea	($nptr,$num,2),$nptr	# rewind $nptr
	.byte	0x67,0x67
	sar	\$3+2,$num		# cf=0
	lea	($nptr,$zero,8),%rbp
	mov	56+8(%rsp),%rdx		# restore rp
	mov	$num,%rcx
	jmp	.Lsqrx4x_sub		# common post-condition
.size	mulx4x_internal,.-mulx4x_internal
___
}{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr);

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___;
.type	bn_powerx5,\@function,6
.align	32
bn_powerx5:
.Lpowerx5_enter:
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x28(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	.byte	0x67
	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10d		# 4*$num
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# ensure that stack frame doesn't alias with $aptr+4*$num
	# modulo 4096, which covers ret[num], am[num] and n[2*num]
	# (see bn_exp.c). this is done to allow the memory
	# disambiguation logic to do its magic.
	#
	lea	-64(%rsp,$num,2),%r11
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwrx_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
.Lpwrx_sp_done:
	and	\$-64,%rsp
	mov	$num,%r10
	neg	$num
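
	##############################################################
	# A hedged restatement of the invariant just established
	# (approximate C, for orientation only): with t denoting the
	# new stack frame and a the input vector,
	#
	#	dist = (a - t) & 4095;
	#	/* either dist == 0 (frame aligned with a), or   */
	#	/* dist > 4*num (frame well clear of a mod 4096) */
	#
	# so loads from t[] cannot be mistaken for loads from the
	# ret/am/n vectors by the memory disambiguation logic.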

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +16	intermediate carry bit
	# +24	top-most carry bit, used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	$nptr,%xmm2		# save $nptr
	movq	%r10, %xmm3		# -$num
	movq	$bptr,%xmm4
	mov	$n0, 32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_sqrx8x_internal
	call	__bn_sqrx8x_internal
	call	__bn_sqrx8x_internal
	call	__bn_sqrx8x_internal

	mov	%r10,$num		# -num
	mov	$aptr,$rptr
	movq	%xmm2,$nptr
	movq	%xmm4,$bptr
	mov	40(%rsp),%rax

	call	mulx4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lpowerx5_epilogue:
	ret
.size	bn_powerx5,.-bn_powerx5

.globl	bn_sqrx8x_internal
.hidden	bn_sqrx8x_internal
.type	bn_sqrx8x_internal,\@abi-omnipotent
.align	32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
	##################################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	##################################################################
	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
	#                                                     a[1]a[0]
	#                                                 a[2]a[0]
	#                                             a[3]a[0]
	#                                             a[2]a[1]
	#                                         a[3]a[1]
	#                                     a[3]a[2]
	#
	#                                         a[4]a[0]
	#                                     a[5]a[0]
	#                                 a[6]a[0]
	#                             a[7]a[0]
	#                                     a[4]a[1]
	#                                 a[5]a[1]
	#                             a[6]a[1]
	#                         a[7]a[1]
	#                                 a[4]a[2]
	#                             a[5]a[2]
	#                         a[6]a[2]
	#                     a[7]a[2]
	#                             a[4]a[3]
	#                         a[5]a[3]
	#                     a[6]a[3]
	#                 a[7]a[3]
	#
	#                     a[5]a[4]
	#                 a[6]a[4]
	#             a[7]a[4]
	#             a[6]a[5]
	#         a[7]a[5]
	#     a[7]a[6]
	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
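	##################################################################
	# Hedged C-level restatement of steps a) and b) above (purely
	# illustrative):
	#
	#	acc = 0;
	#	for (i = 0; i < num; i++)			// step a)
	#		for (j = i+1; j < num; j++)
	#			acc += a[i]*a[j] << 64*(i+j);	// off-diagonal
	#	acc *= 2;					// step b)
	#	for (i = 0; i < num; i++)
	#		acc += a[i]*a[i] << 64*2*i;		// diagonal
	#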
___
{
my ($zero,$carry)=("%rbp","%rcx");
my $aaptr=$zero;
$code.=<<___;
	lea	48+8(%rsp),$tptr
	lea	($aptr,$num),$aaptr
	mov	$num,0+8(%rsp)		# save $num
	mov	$aaptr,8+8(%rsp)	# save end of $aptr
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
	.byte	0x3e
	movdqa	%xmm0,0*8($tptr)
	movdqa	%xmm0,2*8($tptr)
	movdqa	%xmm0,4*8($tptr)
	movdqa	%xmm0,6*8($tptr)
.Lsqr8x_zero_start:			# aligned at 32
	movdqa	%xmm0,8*8($tptr)
	movdqa	%xmm0,10*8($tptr)
	movdqa	%xmm0,12*8($tptr)
	movdqa	%xmm0,14*8($tptr)
	lea	16*8($tptr),$tptr
	sub	\$64,$num
	jnz	.Lsqrx8x_zero

	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
	#xor	%r9,%r9			# t[1], ex-$num, zero already
	xor	%r10,%r10
	xor	%r11,%r11
	xor	%r12,%r12
	xor	%r13,%r13
	xor	%r14,%r14
	xor	%r15,%r15
	lea	48+8(%rsp),$tptr
	xor	$zero,$zero		# cf=0, of=0
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_loop:
	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
	adox	%rax,%r10
	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
	adcx	%r10,%r9
	adox	%rax,%r11
	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx 3*8($aptr),%r10,%rax	# ...
	adcx	%r11,%r10
	adox	%rax,%r12
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx 4*8($aptr),%r11,%rax
	adcx	%r12,%r11
	adox	%rax,%r13
	mulx	5*8($aptr),%r12,%rax
	adcx	%r13,%r12
	adox	%rax,%r14
	mulx	6*8($aptr),%r13,%rax
	adcx	%r14,%r13
	adox	%r15,%rax
	mulx	7*8($aptr),%r14,%r15
	mov	1*8($aptr),%rdx		# a[1]
	adcx	%rax,%r14
	adox	$zero,%r15
	adc	8*8($tptr),%r15
	mov	%r8,1*8($tptr)		# t[1]
	mov	%r9,2*8($tptr)		# t[2]
	sbb	$carry,$carry		# mov %cf,$carry
	xor	$zero,$zero		# cf=0, of=0

	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
	adcx	%r10,%r8
	adox	%rbx,%r9
	mulx	4*8($aptr),%r10,%rbx	# ...
	adcx	%r11,%r9
	adox	%rax,%r10
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx 5*8($aptr),%r11,%rax
	adcx	%r12,%r10
	adox	%rbx,%r11
	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx 6*8($aptr),%r12,%rbx
	adcx	%r13,%r11
	adox	%r14,%r12
	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx 7*8($aptr),%r13,%r14
	mov	2*8($aptr),%rdx		# a[2]
	adcx	%rax,%r12
	adox	%rbx,%r13
	adcx	%r15,%r13
	adox	$zero,%r14		# of=0
	adcx	$zero,%r14		# cf=0

	mov	%r8,3*8($tptr)		# t[3]
	mov	%r9,4*8($tptr)		# t[4]

	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
	adcx	%r10,%r8
	adox	%rbx,%r9
	mulx	5*8($aptr),%r10,%rbx	# ...
	adcx	%r11,%r9
	adox	%rax,%r10
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx 6*8($aptr),%r11,%rax
	adcx	%r12,%r10
	adox	%r13,%r11
	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx 7*8($aptr),%r12,%r13
	.byte	0x3e
	mov	3*8($aptr),%rdx		# a[3]
	adcx	%rbx,%r11
	adox	%rax,%r12
	adcx	%r14,%r12
	mov	%r8,5*8($tptr)		# t[5]
	mov	%r9,6*8($tptr)		# t[6]
	mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
	adox	$zero,%r13		# of=0
	adcx	$zero,%r13		# cf=0

	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
	adcx	%r10,%r8
	adox	%rax,%r9
	mulx	6*8($aptr),%r10,%rax	# ...
	adcx	%r11,%r9
	adox	%r12,%r10
	mulx	7*8($aptr),%r11,%r12
	mov	4*8($aptr),%rdx		# a[4]
	mov	5*8($aptr),%r14		# a[5]
	adcx	%rbx,%r10
	adox	%rax,%r11
	mov	6*8($aptr),%r15		# a[6]
	adcx	%r13,%r11
	adox	$zero,%r12		# of=0
	adcx	$zero,%r12		# cf=0

	mov	%r8,7*8($tptr)		# t[7]
	mov	%r9,8*8($tptr)		# t[8]

	mulx	%r14,%r9,%rax		# a[5]*a[4]
	mov	7*8($aptr),%r8		# a[7]
	adcx	%r10,%r9
	mulx	%r15,%r10,%rbx		# a[6]*a[4]
	adox	%rax,%r10
	adcx	%r11,%r10
	mulx	%r8,%r11,%rax		# a[7]*a[4]
	mov	%r14,%rdx		# a[5]
	adox	%rbx,%r11
	adcx	%r12,%r11
	#adox	$zero,%rax		# of=0
	adcx	$zero,%rax		# cf=0

	mulx	%r15,%r14,%rbx		# a[6]*a[5]
	mulx	%r8,%r12,%r13		# a[7]*a[5]
	mov	%r15,%rdx		# a[6]
	lea	8*8($aptr),$aptr
	adcx	%r14,%r11
	adox	%rbx,%r12
	adcx	%rax,%r12
	adox	$zero,%r13

	.byte	0x67,0x67
	mulx	%r8,%r8,%r14		# a[7]*a[6]
	adcx	%r8,%r13
	adcx	$zero,%r14

	cmp	8+8(%rsp),$aptr
	je	.Lsqrx8x_outer_break

	neg	$carry			# mov $carry,%cf
	mov	\$-8,%rcx
	mov	$zero,%r15
	mov	8*8($tptr),%r8
	adcx	9*8($tptr),%r9		# +=t[9]
	adcx	10*8($tptr),%r10	# ...
	adcx	11*8($tptr),%r11
	adc	12*8($tptr),%r12
	adc	13*8($tptr),%r13
	adc	14*8($tptr),%r14
	adc	15*8($tptr),%r15
	lea	($aptr),$aaptr
	lea	2*64($tptr),$tptr
	sbb	%rax,%rax		# mov %cf,$carry

	mov	-64($aptr),%rdx		# a[0]
	mov	%rax,16+8(%rsp)		# offload $carry
	mov	$tptr,24+8(%rsp)

	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
	xor	%eax,%eax		# cf=0, of=0
	jmp	.Lsqrx8x_loop
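
	##############################################################
	# Hedged sketch of the loop below (illustrative indices): each
	# pass multiplies the eight fresh words at aaptr by one earlier
	# word a[i] and adds the products into the current t[] window:
	#
	#	for (i = -8; i != 0; i++)	// %rcx
	#		t[i+8..i+15] += aa[0..7]*a[i];
	#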
.align	32
.Lsqrx8x_loop:
	mov	%r8,%rbx
	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
	adcx	%rax,%rbx		# +=t[8]
	adox	%r9,%r8

	mulx	1*8($aaptr),%rax,%r9	# ...
	adcx	%rax,%r8
	adox	%r10,%r9

	mulx	2*8($aaptr),%rax,%r10
	adcx	%rax,%r9
	adox	%r11,%r10

	mulx	3*8($aaptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 4*8($aaptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12

	mulx	5*8($aaptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	6*8($aaptr),%rax,%r14
	mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
	mov	\$0,%ebx
	adcx	%rax,%r13
	adox	%r15,%r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx 7*8($aaptr),%rax,%r15
	mov	8($aptr,%rcx,8),%rdx	# a[i]
	adcx	%rax,%r14
	adox	%rbx,%r15		# %rbx is 0, of=0
	adcx	%rbx,%r15		# cf=0

	.byte	0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_loop

	lea	8*8($aaptr),$aaptr
	mov	\$-8,%rcx
	cmp	8+8(%rsp),$aaptr	# done?
	je	.Lsqrx8x_break

	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
	.byte	0x66
	mov	-64($aptr),%rdx
	adcx	0*8($tptr),%r8
	adcx	1*8($tptr),%r9
	adc	2*8($tptr),%r10
	adc	3*8($tptr),%r11
	adc	4*8($tptr),%r12
	adc	5*8($tptr),%r13
	adc	6*8($tptr),%r14
	adc	7*8($tptr),%r15
	lea	8*8($tptr),$tptr
	.byte	0x67
	sbb	%rax,%rax		# mov %cf,%rax
	xor	%ebx,%ebx		# cf=0, of=0
	mov	%rax,16+8(%rsp)		# offload carry
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	sub	16+8(%rsp),%r8		# consume last carry
	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
	xor	%ebp,%ebp		# xor $zero,$zero
	mov	%r8,0*8($tptr)
	cmp	$carry,$tptr		# cf=0, of=0
	je	.Lsqrx8x_outer_loop

	mov	%r9,1*8($tptr)
	mov	1*8($carry),%r9
	mov	%r10,2*8($tptr)
	mov	2*8($carry),%r10
	mov	%r11,3*8($tptr)
	mov	3*8($carry),%r11
	mov	%r12,4*8($tptr)
	mov	4*8($carry),%r12
	mov	%r13,5*8($tptr)
	mov	5*8($carry),%r13
	mov	%r14,6*8($tptr)
	mov	6*8($carry),%r14
	mov	%r15,7*8($tptr)
	mov	7*8($carry),%r15
	mov	$carry,$tptr
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_break:
	mov	%r9,9*8($tptr)		# t[9]
	movq	%xmm3,%rcx		# -$num
	mov	%r10,10*8($tptr)	# ...
	mov	%r11,11*8($tptr)
	mov	%r12,12*8($tptr)
	mov	%r13,13*8($tptr)
	mov	%r14,14*8($tptr)
___
}{
my $i="%rcx";
$code.=<<___;
	lea	48+8(%rsp),$tptr
	mov	($aptr,$i),%rdx		# a[0]

	mov	8($tptr),$A0[1]		# t[1]
	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
	mov	0+8(%rsp),$num		# restore $num
	adox	$A0[1],$A0[1]
	mov	16($tptr),$A1[0]	# t[2]	# prefetch
	mov	24($tptr),$A1[1]	# t[3]	# prefetch
	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned

.align	32
.Lsqrx4x_shift_n_add:
	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov 8($aptr,$i),%rdx	# a[i+1]	# prefetch
	.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov 32($tptr),$A0[0]	# t[2*i+4]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	40($tptr),$A0[1]	# t[2*i+4+1]	# prefetch
	mov	%rax,0($tptr)
	mov	%rbx,8($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
	mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
	mov	%rax,16($tptr)
	mov	%rbx,24($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
	lea	32($i),$i
	mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
	mov	%rax,32($tptr)
	mov	%rbx,40($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	jrcxz	.Lsqrx4x_shift_n_add_break
	.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov 0($aptr,$i),%rdx	# a[i+4]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
	mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcx	$A1[1],%rbx
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr		# end of t[] buffer
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
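#
# For orientation, a hedged sketch of the word-by-word reduction
# performed below (standard Montgomery reduction; the C rendering is
# illustrative only):
#
#	for (i = 0; i < num; i++) {
#		m = t[i]*n0 mod 2^64;	# one imulq/mulx per word
#		t += (m*n) << 64*i;	# one .Lsqrx8x_reduce pass
#	}				# t[0..num-1] are now zero
#	t >>= 64*num;			# upper half is the result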
{
my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");

$code.=<<___;
	movq	%xmm2,$nptr
sqrx8x_reduction:
	xor	%eax,%eax		# initial top-most carry bit
	mov	32+8(%rsp),%rbx		# n0
	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
	lea	-128($nptr,$num,2),%rcx	# end of n[]
	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
	mov	%rcx,0+8(%rsp)		# save end of n[]
	mov	$tptr,8+8(%rsp)		# save end of t[]

	lea	48+8(%rsp),$tptr	# initial t[] window
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	mov	8*1($tptr),%r9
	mov	8*2($tptr),%r10
	mov	8*3($tptr),%r11
	mov	8*4($tptr),%r12
	mov	%rdx,%r8
	imulq	%rbx,%rdx		# n0*a[i]
	mov	8*5($tptr),%r13
	mov	8*6($tptr),%r14
	mov	8*7($tptr),%r15
	mov	%rax,24+8(%rsp)		# store top-most carry bit

	lea	8*8($tptr),$tptr
	xor	$carry,$carry		# cf=0, of=0
	mov	\$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	mov	%r8, %rbx
	mulx	16*0($nptr),%rax,%r8	# n[0]
	adcx	%rbx,%rax		# discarded
	adox	%r9,%r8

	mulx	16*1($nptr),%rbx,%r9	# n[1]
	adcx	%rbx,%r8
	adox	%r10,%r9

	mulx	16*2($nptr),%rbx,%r10
	adcx	%rbx,%r9
	adox	%r11,%r10

	mulx	16*3($nptr),%rbx,%r11
	adcx	%rbx,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx 16*4($nptr),%rbx,%r12
	mov	%rdx,%rax
	mov	%r8,%rdx
	adcx	%rbx,%r11
	adox	%r13,%r12

	mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
	mov	%rax,%rdx
	mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]

	mulx	16*5($nptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	16*6($nptr),%rax,%r14
	adcx	%rax,%r13
	adox	%r15,%r14

	mulx	16*7($nptr),%rax,%r15
	mov	%rbx,%rdx
	adcx	%rax,%r14
	adox	$carry,%r15		# $carry is 0
	adcx	$carry,%r15		# cf=0

	.byte	0x67,0x67,0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_reduce

	mov	$carry,%rax		# xor %rax,%rax
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_no_tail

	mov	48+8(%rsp),%rdx		# pull n0*a[0]
	add	8*0($tptr),%r8
	lea	16*8($nptr),$nptr
	mov	\$-8,%rcx
	adcx	8*1($tptr),%r9
	adcx	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	%rax,%rax		# top carry

	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	mov	%r8,%rbx
	mulx	16*0($nptr),%rax,%r8
	adcx	%rax,%rbx
	adox	%r9,%r8

	mulx	16*1($nptr),%rax,%r9
	adcx	%rax,%r8
	adox	%r10,%r9

	mulx	16*2($nptr),%rax,%r10
	adcx	%rax,%r9
	adox	%r11,%r10

	mulx	16*3($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx 16*4($nptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12

	mulx	16*5($nptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	16*6($nptr),%rax,%r14
	adcx	%rax,%r13
	adox	%r15,%r14

	mulx	16*7($nptr),%rax,%r15
	mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
	adcx	%rax,%r14
	adox	$carry,%r15
	mov	%rbx,($tptr,%rcx,8)	# save result
	mov	%r8,%rbx
	adcx	$carry,%r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Lsqrx8x_tail

	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_tail_done	# break out of loop

	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
	mov	48+8(%rsp),%rdx		# pull n0*a[0]
	lea	16*8($nptr),$nptr
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	%rax,%rax
	sub	\$8,%rcx		# mov \$-8,%rcx

	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	add	24+8(%rsp),%r8		# can this overflow?
	mov	$carry,%rax		# xor %rax,%rax

	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
	adc	8*0($tptr),%r8
	movq	%xmm3,%rcx
	adc	8*1($tptr),%r9
	mov	16*7($nptr),$carry
	movq	%xmm2,$nptr		# restore $nptr
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	adc	%rax,%rax		# top-most carry

	mov	32+8(%rsp),%rbx		# n0
	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"

	mov	%r8,8*0($tptr)		# store top 512 bits
	lea	8*8($tptr),%r8		# borrow %r8
	mov	%r9,8*1($tptr)
	mov	%r10,8*2($tptr)
	mov	%r11,8*3($tptr)
	mov	%r12,8*4($tptr)
	mov	%r13,8*5($tptr)
	mov	%r14,8*6($tptr)
	mov	%r15,8*7($tptr)

	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
	cmp	8+8(%rsp),%r8		# end of t[]?
	jb	.Lsqrx8x_reduction_loop
___
}
##############################################################
# Post-condition, 4x unrolled
#
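# For orientation, a hedged sketch of this step (illustrative): after
# reduction t satisfies t < 2*n, so the result is t or t-n. Because
# the modulus is passed interleaved with zeros (see the August 2013
# note at the top of this file), nudging nptr by one word selects, in
# constant time, whether the sbb chain subtracts n[j] or 0:
#
#	sub = borrow of the top-word compare;	# effectively t >= n ?
#	p   = nptr, possibly nudged 8 bytes;	# points at n[] or at 0s
#	for (j = 0; j < num; j++)
#		r[j] = t[j] - p[2*j];		# borrow-propagating sbb
#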
{
my ($rptr,$nptr)=("%rdx","%rbp");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
	xor	%rbx,%rbx
	sub	%r15,%rsi		# compare top-most words
	adc	%rbx,%rbx
	mov	%rcx,%r10		# -$num
	.byte	0x67
	or	%rbx,%rax
	.byte	0x67
	mov	%rcx,%r9		# -$num
	xor	\$1,%rax
	sar	\$3+2,%rcx		# cf=0
	#lea	48+8(%rsp,%r9),$tptr
	lea	($nptr,%rax,8),$nptr
	movq	%xmm1,$rptr		# restore $rptr
	movq	%xmm1,$aptr		# prepare for back-to-back call
	jmp	.Lsqrx4x_sub

.align	32
.Lsqrx4x_sub:
	.byte	0x66
	mov	8*0($tptr),%r12
	mov	8*1($tptr),%r13
	sbb	16*0($nptr),%r12
	mov	8*2($tptr),%r14
	sbb	16*1($nptr),%r13
	mov	8*3($tptr),%r15
	lea	8*4($tptr),$tptr
	sbb	16*2($nptr),%r14
	mov	%r12,8*0($rptr)
	sbb	16*3($nptr),%r15
	lea	16*4($nptr),$nptr
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx
	jnz	.Lsqrx4x_sub
___
}
$code.=<<___;
	neg	%r9			# restore $num

	ret
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") :	# Win64 order
			       ("%rdi","%esi","%rdx","%ecx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_get_bits5
.type	bn_get_bits5,\@abi-omnipotent
.align	16
bn_get_bits5:
	lea	0($inp),%r10
	lea	1($inp),%r11
	mov	$num,%ecx
	shr	\$4,$num
	and	\$15,%ecx
	lea	-8(%ecx),%eax
	cmp	\$11,%ecx
	cmova	%r11,%r10
	cmova	%eax,%ecx
	movzw	(%r10,$num,2),%eax
	shrl	%cl,%eax
	and	\$31,%eax
	ret
.size	bn_get_bits5,.-bn_get_bits5

.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5
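
##############################################################
# bn_gather5 below is the constant-time companion of bn_scatter5.
# Hedged sketch (illustrative; indices in 8-byte words, off below 8):
# elements are scattered with stride 32*8, the low bits of the index
# pick a position within a cache line, and the four masks loaded from
# .Lmagic_masks, exactly one of which is all-ones, pick among four
# candidate cache lines. Every candidate line is touched on every
# iteration, so the memory access pattern is independent of the
# secret index:
#
#	for (j = 0; j < num; j++, tbl += 32)
#		out[j] = (tbl[off+0]  & mask0) | (tbl[off+8]  & mask1)
#		       | (tbl[off+16] & mask2) | (tbl[off+24] & mask3);
#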
.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
___
$code.=<<___ if ($win64);
.LSEH_begin_bn_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10	#movaps	%xmm7,0x10(%rsp)
___
$code.=<<___;
	mov	$idx,%r11d
	shr	\$`log($N/8)/log(2)`,$idx
	and	\$`$N/8-1`,%r11
	not	$idx
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
	movq	16(%rax,$idx,8),%xmm6	# denoted by the idx argument
	movq	24(%rax,$idx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	`0*$STRIDE/4-128`($tbl),%xmm0
	movq	`1*$STRIDE/4-128`($tbl),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-128`($tbl),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-128`($tbl),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	.byte	0x67,0x67
	por	%xmm2,%xmm0
	lea	$STRIDE($tbl),$tbl
	por	%xmm3,%xmm0

	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	lea	0x28(%rsp),%rsp
___
$code.=<<___;
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
}
$code.=<<___;
.align	64
.Lmagic_masks:
	.long	0,0, 0,0, 0,0, -1,-1
	.long	0,0, 0,0, 0,0,  0,0
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	.Lmul_epilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lbody_40

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	jmp	.Lbody_proceed

.Lbody_40:
	mov	40(%rax),%rax		# pull saved stack pointer
.Lbody_proceed:

	movaps	-88(%rax),%xmm0
	movaps	-72(%rax),%xmm1

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	movups	%xmm0,512($context)	# restore context->Xmm6
	movups	%xmm1,528($context)	# restore context->Xmm7

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)/8, in quadwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_power5
	.rva	.LSEH_end_bn_power5
	.rva	.LSEH_info_bn_power5

	.rva	.LSEH_begin_bn_from_mont8x
	.rva	.LSEH_end_bn_from_mont8x
	.rva	.LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
	.rva	.LSEH_end_bn_mulx4x_mont_gather5
	.rva	.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0d,0x05,0x00
	.byte	0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
	.byte	0x04,0x42,0x00,0x00	#sub	rsp,0x28
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;