#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to the powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# modifying bn_exp.c.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# the branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such a manner that b[0] is $bp[idx],
				# b[1] is $bp[2^5+idx], etc.
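#
# As a reading aid, the constant-time gather performed below can be
# modelled by the following rough C sketch (purely illustrative; the
# helper name, types and loop structure are made up for this comment
# and are not part of bn_exp.c):
#
#	/* The table holds 2^5 pre-computed powers, interleaved so that
#	 * limb j of power i sits at table[j*32 + i]. Every entry is
#	 * touched on every lookup, so the memory access pattern does
#	 * not depend on the secret index. */
#	BN_ULONG gather5(const BN_ULONG *table, int j, unsigned int idx)
#	{
#	    BN_ULONG r = 0;
#	    for (unsigned int i = 0; i < 32; i++) {
#	        BN_ULONG mask = 0 - (BN_ULONG)(i == idx); /* all-ones or 0 */
#	        r |= table[j*32 + i] & mask;
#	    }
#	    return r;
#	}
#
# The assembly amortizes this by computing all 32 comparison masks once
# per call (the .Linc/pcmpeqd block) and by pulling an entire 256-byte
# column per b[i] with pand/por.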
79$lo0="%r10"; 80$hi0="%r11"; 81$hi1="%r13"; 82$i="%r14"; 83$j="%r15"; 84$m0="%rbx"; 85$m1="%rbp"; 86 87$code=<<___; 88.text 89 90.extern OPENSSL_ia32cap_P 91 92.globl bn_mul_mont_gather5 93.type bn_mul_mont_gather5,\@function,6 94.align 64 95bn_mul_mont_gather5: 96.cfi_startproc 97 mov ${num}d,${num}d 98 mov %rsp,%rax 99.cfi_def_cfa_register %rax 100 test \$7,${num}d 101 jnz .Lmul_enter 102___ 103$code.=<<___ if ($addx); 104 mov OPENSSL_ia32cap_P+8(%rip),%r11d 105___ 106$code.=<<___; 107 jmp .Lmul4x_enter 108 109.align 16 110.Lmul_enter: 111 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument 112 push %rbx 113.cfi_push %rbx 114 push %rbp 115.cfi_push %rbp 116 push %r12 117.cfi_push %r12 118 push %r13 119.cfi_push %r13 120 push %r14 121.cfi_push %r14 122 push %r15 123.cfi_push %r15 124 125 neg $num 126 mov %rsp,%r11 127 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) 128 neg $num # restore $num 129 and \$-1024,%r10 # minimize TLB usage 130 131 # An OS-agnostic version of __chkstk. 132 # 133 # Some OSes (Windows) insist on stack being "wired" to 134 # physical memory in strictly sequential manner, i.e. if stack 135 # allocation spans two pages, then reference to farmost one can 136 # be punishable by SEGV. But page walking can do good even on 137 # other OSes, because it guarantees that villain thread hits 138 # the guard page before it can make damage to innocent one... 139 sub %r10,%r11 140 and \$-4096,%r11 141 lea (%r10,%r11),%rsp 142 mov (%rsp),%r11 143 cmp %r10,%rsp 144 ja .Lmul_page_walk 145 jmp .Lmul_page_walk_done 146 147.Lmul_page_walk: 148 lea -4096(%rsp),%rsp 149 mov (%rsp),%r11 150 cmp %r10,%rsp 151 ja .Lmul_page_walk 152.Lmul_page_walk_done: 153 154 lea .Linc(%rip),%r10 155 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 156.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 157.Lmul_body: 158 159 lea 128($bp),%r12 # reassign $bp (+size optimization) 160___ 161 $bp="%r12"; 162 $STRIDE=2**5*8; # 5 is "window size" 163 $N=$STRIDE/4; # should match cache line size 164$code.=<<___; 165 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 166 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 167 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) 168 and \$-16,%r10 169 170 pshufd \$0,%xmm5,%xmm5 # broadcast index 171 movdqa %xmm1,%xmm4 172 movdqa %xmm1,%xmm2 173___ 174######################################################################## 175# calculate mask by comparing 0..31 to index and save result to stack 176# 177$code.=<<___; 178 paddd %xmm0,%xmm1 179 pcmpeqd %xmm5,%xmm0 # compare to 1,0 180 .byte 0x67 181 movdqa %xmm4,%xmm3 182___ 183for($k=0;$k<$STRIDE/16-4;$k+=4) { 184$code.=<<___; 185 paddd %xmm1,%xmm2 186 pcmpeqd %xmm5,%xmm1 # compare to 3,2 187 movdqa %xmm0,`16*($k+0)+112`(%r10) 188 movdqa %xmm4,%xmm0 189 190 paddd %xmm2,%xmm3 191 pcmpeqd %xmm5,%xmm2 # compare to 5,4 192 movdqa %xmm1,`16*($k+1)+112`(%r10) 193 movdqa %xmm4,%xmm1 194 195 paddd %xmm3,%xmm0 196 pcmpeqd %xmm5,%xmm3 # compare to 7,6 197 movdqa %xmm2,`16*($k+2)+112`(%r10) 198 movdqa %xmm4,%xmm2 199 200 paddd %xmm0,%xmm1 201 pcmpeqd %xmm5,%xmm0 202 movdqa %xmm3,`16*($k+3)+112`(%r10) 203 movdqa %xmm4,%xmm3 204___ 205} 206$code.=<<___; # last iteration can be optimized 207 paddd %xmm1,%xmm2 208 pcmpeqd %xmm5,%xmm1 209 movdqa %xmm0,`16*($k+0)+112`(%r10) 210 211 paddd %xmm2,%xmm3 212 .byte 0x67 213 pcmpeqd %xmm5,%xmm2 214 movdqa %xmm1,`16*($k+1)+112`(%r10) 215 216 pcmpeqd %xmm5,%xmm3 217 movdqa %xmm2,`16*($k+2)+112`(%r10) 218 pand `16*($k+0)-128`($bp),%xmm0 # while 
it's still in register 219 220 pand `16*($k+1)-128`($bp),%xmm1 221 pand `16*($k+2)-128`($bp),%xmm2 222 movdqa %xmm3,`16*($k+3)+112`(%r10) 223 pand `16*($k+3)-128`($bp),%xmm3 224 por %xmm2,%xmm0 225 por %xmm3,%xmm1 226___ 227for($k=0;$k<$STRIDE/16-4;$k+=4) { 228$code.=<<___; 229 movdqa `16*($k+0)-128`($bp),%xmm4 230 movdqa `16*($k+1)-128`($bp),%xmm5 231 movdqa `16*($k+2)-128`($bp),%xmm2 232 pand `16*($k+0)+112`(%r10),%xmm4 233 movdqa `16*($k+3)-128`($bp),%xmm3 234 pand `16*($k+1)+112`(%r10),%xmm5 235 por %xmm4,%xmm0 236 pand `16*($k+2)+112`(%r10),%xmm2 237 por %xmm5,%xmm1 238 pand `16*($k+3)+112`(%r10),%xmm3 239 por %xmm2,%xmm0 240 por %xmm3,%xmm1 241___ 242} 243$code.=<<___; 244 por %xmm1,%xmm0 245 pshufd \$0x4e,%xmm0,%xmm1 246 por %xmm1,%xmm0 247 lea $STRIDE($bp),$bp 248 movq %xmm0,$m0 # m0=bp[0] 249 250 mov ($n0),$n0 # pull n0[0] value 251 mov ($ap),%rax 252 253 xor $i,$i # i=0 254 xor $j,$j # j=0 255 256 mov $n0,$m1 257 mulq $m0 # ap[0]*bp[0] 258 mov %rax,$lo0 259 mov ($np),%rax 260 261 imulq $lo0,$m1 # "tp[0]"*n0 262 mov %rdx,$hi0 263 264 mulq $m1 # np[0]*m1 265 add %rax,$lo0 # discarded 266 mov 8($ap),%rax 267 adc \$0,%rdx 268 mov %rdx,$hi1 269 270 lea 1($j),$j # j++ 271 jmp .L1st_enter 272 273.align 16 274.L1st: 275 add %rax,$hi1 276 mov ($ap,$j,8),%rax 277 adc \$0,%rdx 278 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 279 mov $lo0,$hi0 280 adc \$0,%rdx 281 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 282 mov %rdx,$hi1 283 284.L1st_enter: 285 mulq $m0 # ap[j]*bp[0] 286 add %rax,$hi0 287 mov ($np,$j,8),%rax 288 adc \$0,%rdx 289 lea 1($j),$j # j++ 290 mov %rdx,$lo0 291 292 mulq $m1 # np[j]*m1 293 cmp $num,$j 294 jne .L1st # note that upon exit $j==$num, so 295 # they can be used interchangeably 296 297 add %rax,$hi1 298 adc \$0,%rdx 299 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 300 adc \$0,%rdx 301 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 302 mov %rdx,$hi1 303 mov $lo0,$hi0 304 305 xor %rdx,%rdx 306 add $hi0,$hi1 307 adc \$0,%rdx 308 mov $hi1,-8(%rsp,$num,8) 309 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 310 311 lea 1($i),$i # i++ 312 jmp .Louter 313.align 16 314.Louter: 315 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 316 and \$-16,%rdx 317 pxor %xmm4,%xmm4 318 pxor %xmm5,%xmm5 319___ 320for($k=0;$k<$STRIDE/16;$k+=4) { 321$code.=<<___; 322 movdqa `16*($k+0)-128`($bp),%xmm0 323 movdqa `16*($k+1)-128`($bp),%xmm1 324 movdqa `16*($k+2)-128`($bp),%xmm2 325 movdqa `16*($k+3)-128`($bp),%xmm3 326 pand `16*($k+0)-128`(%rdx),%xmm0 327 pand `16*($k+1)-128`(%rdx),%xmm1 328 por %xmm0,%xmm4 329 pand `16*($k+2)-128`(%rdx),%xmm2 330 por %xmm1,%xmm5 331 pand `16*($k+3)-128`(%rdx),%xmm3 332 por %xmm2,%xmm4 333 por %xmm3,%xmm5 334___ 335} 336$code.=<<___; 337 por %xmm5,%xmm4 338 pshufd \$0x4e,%xmm4,%xmm0 339 por %xmm4,%xmm0 340 lea $STRIDE($bp),$bp 341 342 mov ($ap),%rax # ap[0] 343 movq %xmm0,$m0 # m0=bp[i] 344 345 xor $j,$j # j=0 346 mov $n0,$m1 347 mov (%rsp),$lo0 348 349 mulq $m0 # ap[0]*bp[i] 350 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 351 mov ($np),%rax 352 adc \$0,%rdx 353 354 imulq $lo0,$m1 # tp[0]*n0 355 mov %rdx,$hi0 356 357 mulq $m1 # np[0]*m1 358 add %rax,$lo0 # discarded 359 mov 8($ap),%rax 360 adc \$0,%rdx 361 mov 8(%rsp),$lo0 # tp[1] 362 mov %rdx,$hi1 363 364 lea 1($j),$j # j++ 365 jmp .Linner_enter 366 367.align 16 368.Linner: 369 add %rax,$hi1 370 mov ($ap,$j,8),%rax 371 adc \$0,%rdx 372 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 373 mov (%rsp,$j,8),$lo0 374 adc \$0,%rdx 375 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 376 mov %rdx,$hi1 377 378.Linner_enter: 379 mulq $m0 # ap[j]*bp[i] 380 add 
%rax,$hi0 381 mov ($np,$j,8),%rax 382 adc \$0,%rdx 383 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 384 mov %rdx,$hi0 385 adc \$0,$hi0 386 lea 1($j),$j # j++ 387 388 mulq $m1 # np[j]*m1 389 cmp $num,$j 390 jne .Linner # note that upon exit $j==$num, so 391 # they can be used interchangeably 392 add %rax,$hi1 393 adc \$0,%rdx 394 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 395 mov (%rsp,$num,8),$lo0 396 adc \$0,%rdx 397 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 398 mov %rdx,$hi1 399 400 xor %rdx,%rdx 401 add $hi0,$hi1 402 adc \$0,%rdx 403 add $lo0,$hi1 # pull upmost overflow bit 404 adc \$0,%rdx 405 mov $hi1,-8(%rsp,$num,8) 406 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 407 408 lea 1($i),$i # i++ 409 cmp $num,$i 410 jb .Louter 411 412 xor $i,$i # i=0 and clear CF! 413 mov (%rsp),%rax # tp[0] 414 lea (%rsp),$ap # borrow ap for tp 415 mov $num,$j # j=num 416 jmp .Lsub 417.align 16 418.Lsub: sbb ($np,$i,8),%rax 419 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 420 mov 8($ap,$i,8),%rax # tp[i+1] 421 lea 1($i),$i # i++ 422 dec $j # doesn't affect CF! 423 jnz .Lsub 424 425 sbb \$0,%rax # handle upmost overflow bit 426 mov \$-1,%rbx 427 xor %rax,%rbx 428 xor $i,$i 429 mov $num,$j # j=num 430 431.Lcopy: # conditional copy 432 mov ($rp,$i,8),%rcx 433 mov (%rsp,$i,8),%rdx 434 and %rbx,%rcx 435 and %rax,%rdx 436 mov $i,(%rsp,$i,8) # zap temporary vector 437 or %rcx,%rdx 438 mov %rdx,($rp,$i,8) # rp[i]=tp[i] 439 lea 1($i),$i 440 sub \$1,$j 441 jnz .Lcopy 442 443 mov 8(%rsp,$num,8),%rsi # restore %rsp 444.cfi_def_cfa %rsi,8 445 mov \$1,%rax 446 447 mov -48(%rsi),%r15 448.cfi_restore %r15 449 mov -40(%rsi),%r14 450.cfi_restore %r14 451 mov -32(%rsi),%r13 452.cfi_restore %r13 453 mov -24(%rsi),%r12 454.cfi_restore %r12 455 mov -16(%rsi),%rbp 456.cfi_restore %rbp 457 mov -8(%rsi),%rbx 458.cfi_restore %rbx 459 lea (%rsi),%rsp 460.cfi_def_cfa_register %rsp 461.Lmul_epilogue: 462 ret 463.cfi_endproc 464.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 465___ 466{{{ 467my @A=("%r10","%r11"); 468my @N=("%r13","%rdi"); 469$code.=<<___; 470.type bn_mul4x_mont_gather5,\@function,6 471.align 32 472bn_mul4x_mont_gather5: 473.cfi_startproc 474 .byte 0x67 475 mov %rsp,%rax 476.cfi_def_cfa_register %rax 477.Lmul4x_enter: 478___ 479$code.=<<___ if ($addx); 480 and \$0x80108,%r11d 481 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 482 je .Lmulx4x_enter 483___ 484$code.=<<___; 485 push %rbx 486.cfi_push %rbx 487 push %rbp 488.cfi_push %rbp 489 push %r12 490.cfi_push %r12 491 push %r13 492.cfi_push %r13 493 push %r14 494.cfi_push %r14 495 push %r15 496.cfi_push %r15 497.Lmul4x_prologue: 498 499 .byte 0x67 500 shl \$3,${num}d # convert $num to bytes 501 lea ($num,$num,2),%r10 # 3*$num in bytes 502 neg $num # -$num 503 504 ############################################################## 505 # Ensure that stack frame doesn't alias with $rptr+3*$num 506 # modulo 4096, which covers ret[num], am[num] and n[num] 507 # (see bn_exp.c). This is done to allow memory disambiguation 508 # logic do its magic. [Extra [num] is allocated in order 509 # to align with bn_power5's frame, which is cleansed after 510 # completing exponentiation. Extra 256 bytes is for power mask 511 # calculated from 7th argument, the index.] 
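	# A rough C model of the frame adjustment performed below
	# (purely illustrative; the variable names are made up and the
	# corner case handled by .Lmul4xsp_alt is only sketched):
	#
	#	frame = rsp - (2*num*8 + 256 + 64);  /* prospective alloca   */
	#	dist  = (frame - rp) & 4095;         /* page-offset distance */
	#	if (dist <= 3*num*8)                 /* would overlap mod 4K */
	#	    frame -= dist;                   /* align frame with rp  */
	#	else if (dist >= 4096 - (2*num*8 + 256 + 64))
	#	    frame -= dist - (4096 - (2*num*8 + 256 + 64));
	#	frame &= ~63;                        /* 64-byte alignment    */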
512 # 513 lea -320(%rsp,$num,2),%r11 514 mov %rsp,%rbp 515 sub $rp,%r11 516 and \$4095,%r11 517 cmp %r11,%r10 518 jb .Lmul4xsp_alt 519 sub %r11,%rbp # align with $rp 520 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 521 jmp .Lmul4xsp_done 522 523.align 32 524.Lmul4xsp_alt: 525 lea 4096-320(,$num,2),%r10 526 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 527 sub %r10,%r11 528 mov \$0,%r10 529 cmovc %r10,%r11 530 sub %r11,%rbp 531.Lmul4xsp_done: 532 and \$-64,%rbp 533 mov %rsp,%r11 534 sub %rbp,%r11 535 and \$-4096,%r11 536 lea (%rbp,%r11),%rsp 537 mov (%rsp),%r10 538 cmp %rbp,%rsp 539 ja .Lmul4x_page_walk 540 jmp .Lmul4x_page_walk_done 541 542.Lmul4x_page_walk: 543 lea -4096(%rsp),%rsp 544 mov (%rsp),%r10 545 cmp %rbp,%rsp 546 ja .Lmul4x_page_walk 547.Lmul4x_page_walk_done: 548 549 neg $num 550 551 mov %rax,40(%rsp) 552.cfi_cfa_expression %rsp+40,deref,+8 553.Lmul4x_body: 554 555 call mul4x_internal 556 557 mov 40(%rsp),%rsi # restore %rsp 558.cfi_def_cfa %rsi,8 559 mov \$1,%rax 560 561 mov -48(%rsi),%r15 562.cfi_restore %r15 563 mov -40(%rsi),%r14 564.cfi_restore %r14 565 mov -32(%rsi),%r13 566.cfi_restore %r13 567 mov -24(%rsi),%r12 568.cfi_restore %r12 569 mov -16(%rsi),%rbp 570.cfi_restore %rbp 571 mov -8(%rsi),%rbx 572.cfi_restore %rbx 573 lea (%rsi),%rsp 574.cfi_def_cfa_register %rsp 575.Lmul4x_epilogue: 576 ret 577.cfi_endproc 578.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 579 580.type mul4x_internal,\@abi-omnipotent 581.align 32 582mul4x_internal: 583 shl \$5,$num # $num was in bytes 584 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 585 lea .Linc(%rip),%rax 586 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 587 shr \$5,$num # restore $num 588___ 589 $bp="%r12"; 590 $STRIDE=2**5*8; # 5 is "window size" 591 $N=$STRIDE/4; # should match cache line size 592 $tp=$i; 593$code.=<<___; 594 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 595 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 596 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 597 lea 128(%rdx),$bp # size optimization 598 599 pshufd \$0,%xmm5,%xmm5 # broadcast index 600 movdqa %xmm1,%xmm4 601 .byte 0x67,0x67 602 movdqa %xmm1,%xmm2 603___ 604######################################################################## 605# calculate mask by comparing 0..31 to index and save result to stack 606# 607$code.=<<___; 608 paddd %xmm0,%xmm1 609 pcmpeqd %xmm5,%xmm0 # compare to 1,0 610 .byte 0x67 611 movdqa %xmm4,%xmm3 612___ 613for($i=0;$i<$STRIDE/16-4;$i+=4) { 614$code.=<<___; 615 paddd %xmm1,%xmm2 616 pcmpeqd %xmm5,%xmm1 # compare to 3,2 617 movdqa %xmm0,`16*($i+0)+112`(%r10) 618 movdqa %xmm4,%xmm0 619 620 paddd %xmm2,%xmm3 621 pcmpeqd %xmm5,%xmm2 # compare to 5,4 622 movdqa %xmm1,`16*($i+1)+112`(%r10) 623 movdqa %xmm4,%xmm1 624 625 paddd %xmm3,%xmm0 626 pcmpeqd %xmm5,%xmm3 # compare to 7,6 627 movdqa %xmm2,`16*($i+2)+112`(%r10) 628 movdqa %xmm4,%xmm2 629 630 paddd %xmm0,%xmm1 631 pcmpeqd %xmm5,%xmm0 632 movdqa %xmm3,`16*($i+3)+112`(%r10) 633 movdqa %xmm4,%xmm3 634___ 635} 636$code.=<<___; # last iteration can be optimized 637 paddd %xmm1,%xmm2 638 pcmpeqd %xmm5,%xmm1 639 movdqa %xmm0,`16*($i+0)+112`(%r10) 640 641 paddd %xmm2,%xmm3 642 .byte 0x67 643 pcmpeqd %xmm5,%xmm2 644 movdqa %xmm1,`16*($i+1)+112`(%r10) 645 646 pcmpeqd %xmm5,%xmm3 647 movdqa %xmm2,`16*($i+2)+112`(%r10) 648 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 649 650 pand `16*($i+1)-128`($bp),%xmm1 651 pand `16*($i+2)-128`($bp),%xmm2 652 
movdqa %xmm3,`16*($i+3)+112`(%r10) 653 pand `16*($i+3)-128`($bp),%xmm3 654 por %xmm2,%xmm0 655 por %xmm3,%xmm1 656___ 657for($i=0;$i<$STRIDE/16-4;$i+=4) { 658$code.=<<___; 659 movdqa `16*($i+0)-128`($bp),%xmm4 660 movdqa `16*($i+1)-128`($bp),%xmm5 661 movdqa `16*($i+2)-128`($bp),%xmm2 662 pand `16*($i+0)+112`(%r10),%xmm4 663 movdqa `16*($i+3)-128`($bp),%xmm3 664 pand `16*($i+1)+112`(%r10),%xmm5 665 por %xmm4,%xmm0 666 pand `16*($i+2)+112`(%r10),%xmm2 667 por %xmm5,%xmm1 668 pand `16*($i+3)+112`(%r10),%xmm3 669 por %xmm2,%xmm0 670 por %xmm3,%xmm1 671___ 672} 673$code.=<<___; 674 por %xmm1,%xmm0 675 pshufd \$0x4e,%xmm0,%xmm1 676 por %xmm1,%xmm0 677 lea $STRIDE($bp),$bp 678 movq %xmm0,$m0 # m0=bp[0] 679 680 mov %r13,16+8(%rsp) # save end of b[num] 681 mov $rp, 56+8(%rsp) # save $rp 682 683 mov ($n0),$n0 # pull n0[0] value 684 mov ($ap),%rax 685 lea ($ap,$num),$ap # end of a[num] 686 neg $num 687 688 mov $n0,$m1 689 mulq $m0 # ap[0]*bp[0] 690 mov %rax,$A[0] 691 mov ($np),%rax 692 693 imulq $A[0],$m1 # "tp[0]"*n0 694 lea 64+8(%rsp),$tp 695 mov %rdx,$A[1] 696 697 mulq $m1 # np[0]*m1 698 add %rax,$A[0] # discarded 699 mov 8($ap,$num),%rax 700 adc \$0,%rdx 701 mov %rdx,$N[1] 702 703 mulq $m0 704 add %rax,$A[1] 705 mov 8*1($np),%rax 706 adc \$0,%rdx 707 mov %rdx,$A[0] 708 709 mulq $m1 710 add %rax,$N[1] 711 mov 16($ap,$num),%rax 712 adc \$0,%rdx 713 add $A[1],$N[1] 714 lea 4*8($num),$j # j=4 715 lea 8*4($np),$np 716 adc \$0,%rdx 717 mov $N[1],($tp) 718 mov %rdx,$N[0] 719 jmp .L1st4x 720 721.align 32 722.L1st4x: 723 mulq $m0 # ap[j]*bp[0] 724 add %rax,$A[0] 725 mov -8*2($np),%rax 726 lea 32($tp),$tp 727 adc \$0,%rdx 728 mov %rdx,$A[1] 729 730 mulq $m1 # np[j]*m1 731 add %rax,$N[0] 732 mov -8($ap,$j),%rax 733 adc \$0,%rdx 734 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 735 adc \$0,%rdx 736 mov $N[0],-24($tp) # tp[j-1] 737 mov %rdx,$N[1] 738 739 mulq $m0 # ap[j]*bp[0] 740 add %rax,$A[1] 741 mov -8*1($np),%rax 742 adc \$0,%rdx 743 mov %rdx,$A[0] 744 745 mulq $m1 # np[j]*m1 746 add %rax,$N[1] 747 mov ($ap,$j),%rax 748 adc \$0,%rdx 749 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 750 adc \$0,%rdx 751 mov $N[1],-16($tp) # tp[j-1] 752 mov %rdx,$N[0] 753 754 mulq $m0 # ap[j]*bp[0] 755 add %rax,$A[0] 756 mov 8*0($np),%rax 757 adc \$0,%rdx 758 mov %rdx,$A[1] 759 760 mulq $m1 # np[j]*m1 761 add %rax,$N[0] 762 mov 8($ap,$j),%rax 763 adc \$0,%rdx 764 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 765 adc \$0,%rdx 766 mov $N[0],-8($tp) # tp[j-1] 767 mov %rdx,$N[1] 768 769 mulq $m0 # ap[j]*bp[0] 770 add %rax,$A[1] 771 mov 8*1($np),%rax 772 adc \$0,%rdx 773 mov %rdx,$A[0] 774 775 mulq $m1 # np[j]*m1 776 add %rax,$N[1] 777 mov 16($ap,$j),%rax 778 adc \$0,%rdx 779 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 780 lea 8*4($np),$np 781 adc \$0,%rdx 782 mov $N[1],($tp) # tp[j-1] 783 mov %rdx,$N[0] 784 785 add \$32,$j # j+=4 786 jnz .L1st4x 787 788 mulq $m0 # ap[j]*bp[0] 789 add %rax,$A[0] 790 mov -8*2($np),%rax 791 lea 32($tp),$tp 792 adc \$0,%rdx 793 mov %rdx,$A[1] 794 795 mulq $m1 # np[j]*m1 796 add %rax,$N[0] 797 mov -8($ap),%rax 798 adc \$0,%rdx 799 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 800 adc \$0,%rdx 801 mov $N[0],-24($tp) # tp[j-1] 802 mov %rdx,$N[1] 803 804 mulq $m0 # ap[j]*bp[0] 805 add %rax,$A[1] 806 mov -8*1($np),%rax 807 adc \$0,%rdx 808 mov %rdx,$A[0] 809 810 mulq $m1 # np[j]*m1 811 add %rax,$N[1] 812 mov ($ap,$num),%rax # ap[0] 813 adc \$0,%rdx 814 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 815 adc \$0,%rdx 816 mov $N[1],-16($tp) # tp[j-1] 817 mov %rdx,$N[0] 818 819 lea ($np,$num),$np # rewind $np 820 821 xor $N[1],$N[1] 822 add 
$A[0],$N[0] 823 adc \$0,$N[1] 824 mov $N[0],-8($tp) 825 826 jmp .Louter4x 827 828.align 32 829.Louter4x: 830 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 831 pxor %xmm4,%xmm4 832 pxor %xmm5,%xmm5 833___ 834for($i=0;$i<$STRIDE/16;$i+=4) { 835$code.=<<___; 836 movdqa `16*($i+0)-128`($bp),%xmm0 837 movdqa `16*($i+1)-128`($bp),%xmm1 838 movdqa `16*($i+2)-128`($bp),%xmm2 839 movdqa `16*($i+3)-128`($bp),%xmm3 840 pand `16*($i+0)-128`(%rdx),%xmm0 841 pand `16*($i+1)-128`(%rdx),%xmm1 842 por %xmm0,%xmm4 843 pand `16*($i+2)-128`(%rdx),%xmm2 844 por %xmm1,%xmm5 845 pand `16*($i+3)-128`(%rdx),%xmm3 846 por %xmm2,%xmm4 847 por %xmm3,%xmm5 848___ 849} 850$code.=<<___; 851 por %xmm5,%xmm4 852 pshufd \$0x4e,%xmm4,%xmm0 853 por %xmm4,%xmm0 854 lea $STRIDE($bp),$bp 855 movq %xmm0,$m0 # m0=bp[i] 856 857 mov ($tp,$num),$A[0] 858 mov $n0,$m1 859 mulq $m0 # ap[0]*bp[i] 860 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 861 mov ($np),%rax 862 adc \$0,%rdx 863 864 imulq $A[0],$m1 # tp[0]*n0 865 mov %rdx,$A[1] 866 mov $N[1],($tp) # store upmost overflow bit 867 868 lea ($tp,$num),$tp # rewind $tp 869 870 mulq $m1 # np[0]*m1 871 add %rax,$A[0] # "$N[0]", discarded 872 mov 8($ap,$num),%rax 873 adc \$0,%rdx 874 mov %rdx,$N[1] 875 876 mulq $m0 # ap[j]*bp[i] 877 add %rax,$A[1] 878 mov 8*1($np),%rax 879 adc \$0,%rdx 880 add 8($tp),$A[1] # +tp[1] 881 adc \$0,%rdx 882 mov %rdx,$A[0] 883 884 mulq $m1 # np[j]*m1 885 add %rax,$N[1] 886 mov 16($ap,$num),%rax 887 adc \$0,%rdx 888 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 889 lea 4*8($num),$j # j=4 890 lea 8*4($np),$np 891 adc \$0,%rdx 892 mov %rdx,$N[0] 893 jmp .Linner4x 894 895.align 32 896.Linner4x: 897 mulq $m0 # ap[j]*bp[i] 898 add %rax,$A[0] 899 mov -8*2($np),%rax 900 adc \$0,%rdx 901 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 902 lea 32($tp),$tp 903 adc \$0,%rdx 904 mov %rdx,$A[1] 905 906 mulq $m1 # np[j]*m1 907 add %rax,$N[0] 908 mov -8($ap,$j),%rax 909 adc \$0,%rdx 910 add $A[0],$N[0] 911 adc \$0,%rdx 912 mov $N[1],-32($tp) # tp[j-1] 913 mov %rdx,$N[1] 914 915 mulq $m0 # ap[j]*bp[i] 916 add %rax,$A[1] 917 mov -8*1($np),%rax 918 adc \$0,%rdx 919 add -8($tp),$A[1] 920 adc \$0,%rdx 921 mov %rdx,$A[0] 922 923 mulq $m1 # np[j]*m1 924 add %rax,$N[1] 925 mov ($ap,$j),%rax 926 adc \$0,%rdx 927 add $A[1],$N[1] 928 adc \$0,%rdx 929 mov $N[0],-24($tp) # tp[j-1] 930 mov %rdx,$N[0] 931 932 mulq $m0 # ap[j]*bp[i] 933 add %rax,$A[0] 934 mov 8*0($np),%rax 935 adc \$0,%rdx 936 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 937 adc \$0,%rdx 938 mov %rdx,$A[1] 939 940 mulq $m1 # np[j]*m1 941 add %rax,$N[0] 942 mov 8($ap,$j),%rax 943 adc \$0,%rdx 944 add $A[0],$N[0] 945 adc \$0,%rdx 946 mov $N[1],-16($tp) # tp[j-1] 947 mov %rdx,$N[1] 948 949 mulq $m0 # ap[j]*bp[i] 950 add %rax,$A[1] 951 mov 8*1($np),%rax 952 adc \$0,%rdx 953 add 8($tp),$A[1] 954 adc \$0,%rdx 955 mov %rdx,$A[0] 956 957 mulq $m1 # np[j]*m1 958 add %rax,$N[1] 959 mov 16($ap,$j),%rax 960 adc \$0,%rdx 961 add $A[1],$N[1] 962 lea 8*4($np),$np 963 adc \$0,%rdx 964 mov $N[0],-8($tp) # tp[j-1] 965 mov %rdx,$N[0] 966 967 add \$32,$j # j+=4 968 jnz .Linner4x 969 970 mulq $m0 # ap[j]*bp[i] 971 add %rax,$A[0] 972 mov -8*2($np),%rax 973 adc \$0,%rdx 974 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 975 lea 32($tp),$tp 976 adc \$0,%rdx 977 mov %rdx,$A[1] 978 979 mulq $m1 # np[j]*m1 980 add %rax,$N[0] 981 mov -8($ap),%rax 982 adc \$0,%rdx 983 add $A[0],$N[0] 984 adc \$0,%rdx 985 mov $N[1],-32($tp) # tp[j-1] 986 mov %rdx,$N[1] 987 988 mulq $m0 # ap[j]*bp[i] 989 add %rax,$A[1] 990 mov $m1,%rax 991 mov -8*1($np),$m1 992 adc \$0,%rdx 993 add 
-8($tp),$A[1] 994 adc \$0,%rdx 995 mov %rdx,$A[0] 996 997 mulq $m1 # np[j]*m1 998 add %rax,$N[1] 999 mov ($ap,$num),%rax # ap[0] 1000 adc \$0,%rdx 1001 add $A[1],$N[1] 1002 adc \$0,%rdx 1003 mov $N[0],-24($tp) # tp[j-1] 1004 mov %rdx,$N[0] 1005 1006 mov $N[1],-16($tp) # tp[j-1] 1007 lea ($np,$num),$np # rewind $np 1008 1009 xor $N[1],$N[1] 1010 add $A[0],$N[0] 1011 adc \$0,$N[1] 1012 add ($tp),$N[0] # pull upmost overflow bit 1013 adc \$0,$N[1] # upmost overflow bit 1014 mov $N[0],-8($tp) 1015 1016 cmp 16+8(%rsp),$bp 1017 jb .Louter4x 1018___ 1019if (1) { 1020$code.=<<___; 1021 xor %rax,%rax 1022 sub $N[0],$m1 # compare top-most words 1023 adc $j,$j # $j is zero 1024 or $j,$N[1] 1025 sub $N[1],%rax # %rax=-$N[1] 1026 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 1027 mov ($np),%r12 1028 lea ($np),%rbp # nptr in .sqr4x_sub 1029 mov %r9,%rcx 1030 sar \$3+2,%rcx 1031 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 1032 dec %r12 # so that after 'not' we get -n[0] 1033 xor %r10,%r10 1034 mov 8*1(%rbp),%r13 1035 mov 8*2(%rbp),%r14 1036 mov 8*3(%rbp),%r15 1037 jmp .Lsqr4x_sub_entry 1038___ 1039} else { 1040my @ri=("%rax",$bp,$m0,$m1); 1041my $rp="%rdx"; 1042$code.=<<___ 1043 xor \$1,$N[1] 1044 lea ($tp,$num),$tp # rewind $tp 1045 sar \$5,$num # cf=0 1046 lea ($np,$N[1],8),$np 1047 mov 56+8(%rsp),$rp # restore $rp 1048 jmp .Lsub4x 1049 1050.align 32 1051.Lsub4x: 1052 .byte 0x66 1053 mov 8*0($tp),@ri[0] 1054 mov 8*1($tp),@ri[1] 1055 .byte 0x66 1056 sbb 16*0($np),@ri[0] 1057 mov 8*2($tp),@ri[2] 1058 sbb 16*1($np),@ri[1] 1059 mov 3*8($tp),@ri[3] 1060 lea 4*8($tp),$tp 1061 sbb 16*2($np),@ri[2] 1062 mov @ri[0],8*0($rp) 1063 sbb 16*3($np),@ri[3] 1064 lea 16*4($np),$np 1065 mov @ri[1],8*1($rp) 1066 mov @ri[2],8*2($rp) 1067 mov @ri[3],8*3($rp) 1068 lea 8*4($rp),$rp 1069 1070 inc $num 1071 jnz .Lsub4x 1072 1073 ret 1074___ 1075} 1076$code.=<<___; 1077.size mul4x_internal,.-mul4x_internal 1078___ 1079}}} 1080{{{ 1081###################################################################### 1082# void bn_power5( 1083my $rptr="%rdi"; # BN_ULONG *rptr, 1084my $aptr="%rsi"; # const BN_ULONG *aptr, 1085my $bptr="%rdx"; # const void *table, 1086my $nptr="%rcx"; # const BN_ULONG *nptr, 1087my $n0 ="%r8"; # const BN_ULONG *n0); 1088my $num ="%r9"; # int num, has to be divisible by 8 1089 # int pwr 1090 1091my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1092my @A0=("%r10","%r11"); 1093my @A1=("%r12","%r13"); 1094my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1095 1096$code.=<<___; 1097.globl bn_power5 1098.type bn_power5,\@function,6 1099.align 32 1100bn_power5: 1101.cfi_startproc 1102 mov %rsp,%rax 1103.cfi_def_cfa_register %rax 1104___ 1105$code.=<<___ if ($addx); 1106 mov OPENSSL_ia32cap_P+8(%rip),%r11d 1107 and \$0x80108,%r11d 1108 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1109 je .Lpowerx5_enter 1110___ 1111$code.=<<___; 1112 push %rbx 1113.cfi_push %rbx 1114 push %rbp 1115.cfi_push %rbp 1116 push %r12 1117.cfi_push %r12 1118 push %r13 1119.cfi_push %r13 1120 push %r14 1121.cfi_push %r14 1122 push %r15 1123.cfi_push %r15 1124.Lpower5_prologue: 1125 1126 shl \$3,${num}d # convert $num to bytes 1127 lea ($num,$num,2),%r10d # 3*$num 1128 neg $num 1129 mov ($n0),$n0 # *n0 1130 1131 ############################################################## 1132 # Ensure that stack frame doesn't alias with $rptr+3*$num 1133 # modulo 4096, which covers ret[num], am[num] and n[num] 1134 # (see bn_exp.c). This is done to allow memory disambiguation 1135 # logic do its magic. 
[Extra 256 bytes is for power mask 1136 # calculated from 7th argument, the index.] 1137 # 1138 lea -320(%rsp,$num,2),%r11 1139 mov %rsp,%rbp 1140 sub $rptr,%r11 1141 and \$4095,%r11 1142 cmp %r11,%r10 1143 jb .Lpwr_sp_alt 1144 sub %r11,%rbp # align with $aptr 1145 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1146 jmp .Lpwr_sp_done 1147 1148.align 32 1149.Lpwr_sp_alt: 1150 lea 4096-320(,$num,2),%r10 1151 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1152 sub %r10,%r11 1153 mov \$0,%r10 1154 cmovc %r10,%r11 1155 sub %r11,%rbp 1156.Lpwr_sp_done: 1157 and \$-64,%rbp 1158 mov %rsp,%r11 1159 sub %rbp,%r11 1160 and \$-4096,%r11 1161 lea (%rbp,%r11),%rsp 1162 mov (%rsp),%r10 1163 cmp %rbp,%rsp 1164 ja .Lpwr_page_walk 1165 jmp .Lpwr_page_walk_done 1166 1167.Lpwr_page_walk: 1168 lea -4096(%rsp),%rsp 1169 mov (%rsp),%r10 1170 cmp %rbp,%rsp 1171 ja .Lpwr_page_walk 1172.Lpwr_page_walk_done: 1173 1174 mov $num,%r10 1175 neg $num 1176 1177 ############################################################## 1178 # Stack layout 1179 # 1180 # +0 saved $num, used in reduction section 1181 # +8 &t[2*$num], used in reduction section 1182 # +32 saved *n0 1183 # +40 saved %rsp 1184 # +48 t[2*$num] 1185 # 1186 mov $n0, 32(%rsp) 1187 mov %rax, 40(%rsp) # save original %rsp 1188.cfi_cfa_expression %rsp+40,deref,+8 1189.Lpower5_body: 1190 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1191 movq $nptr,%xmm2 # save $nptr 1192 movq %r10, %xmm3 # -$num, used in sqr8x 1193 movq $bptr,%xmm4 1194 1195 call __bn_sqr8x_internal 1196 call __bn_post4x_internal 1197 call __bn_sqr8x_internal 1198 call __bn_post4x_internal 1199 call __bn_sqr8x_internal 1200 call __bn_post4x_internal 1201 call __bn_sqr8x_internal 1202 call __bn_post4x_internal 1203 call __bn_sqr8x_internal 1204 call __bn_post4x_internal 1205 1206 movq %xmm2,$nptr 1207 movq %xmm4,$bptr 1208 mov $aptr,$rptr 1209 mov 40(%rsp),%rax 1210 lea 32(%rsp),$n0 1211 1212 call mul4x_internal 1213 1214 mov 40(%rsp),%rsi # restore %rsp 1215.cfi_def_cfa %rsi,8 1216 mov \$1,%rax 1217 mov -48(%rsi),%r15 1218.cfi_restore %r15 1219 mov -40(%rsi),%r14 1220.cfi_restore %r14 1221 mov -32(%rsi),%r13 1222.cfi_restore %r13 1223 mov -24(%rsi),%r12 1224.cfi_restore %r12 1225 mov -16(%rsi),%rbp 1226.cfi_restore %rbp 1227 mov -8(%rsi),%rbx 1228.cfi_restore %rbx 1229 lea (%rsi),%rsp 1230.cfi_def_cfa_register %rsp 1231.Lpower5_epilogue: 1232 ret 1233.cfi_endproc 1234.size bn_power5,.-bn_power5 1235 1236.globl bn_sqr8x_internal 1237.hidden bn_sqr8x_internal 1238.type bn_sqr8x_internal,\@abi-omnipotent 1239.align 32 1240bn_sqr8x_internal: 1241__bn_sqr8x_internal: 1242 ############################################################## 1243 # Squaring part: 1244 # 1245 # a) multiply-n-add everything but a[i]*a[i]; 1246 # b) shift result of a) by 1 to the left and accumulate 1247 # a[i]*a[i] products; 1248 # 1249 ############################################################## 1250 # a[1]a[0] 1251 # a[2]a[0] 1252 # a[3]a[0] 1253 # a[2]a[1] 1254 # a[4]a[0] 1255 # a[3]a[1] 1256 # a[5]a[0] 1257 # a[4]a[1] 1258 # a[3]a[2] 1259 # a[6]a[0] 1260 # a[5]a[1] 1261 # a[4]a[2] 1262 # a[7]a[0] 1263 # a[6]a[1] 1264 # a[5]a[2] 1265 # a[4]a[3] 1266 # a[7]a[1] 1267 # a[6]a[2] 1268 # a[5]a[3] 1269 # a[7]a[2] 1270 # a[6]a[3] 1271 # a[5]a[4] 1272 # a[7]a[3] 1273 # a[6]a[4] 1274 # a[7]a[4] 1275 # a[6]a[5] 1276 # a[7]a[5] 1277 # a[7]a[6] 1278 # a[1]a[0] 1279 # a[2]a[0] 1280 # a[3]a[0] 1281 # a[4]a[0] 1282 # a[5]a[0] 1283 # a[6]a[0] 1284 # a[7]a[0] 1285 # a[2]a[1] 1286 # a[3]a[1] 1287 # a[4]a[1] 1288 # 
a[5]a[1] 1289 # a[6]a[1] 1290 # a[7]a[1] 1291 # a[3]a[2] 1292 # a[4]a[2] 1293 # a[5]a[2] 1294 # a[6]a[2] 1295 # a[7]a[2] 1296 # a[4]a[3] 1297 # a[5]a[3] 1298 # a[6]a[3] 1299 # a[7]a[3] 1300 # a[5]a[4] 1301 # a[6]a[4] 1302 # a[7]a[4] 1303 # a[6]a[5] 1304 # a[7]a[5] 1305 # a[7]a[6] 1306 # a[0]a[0] 1307 # a[1]a[1] 1308 # a[2]a[2] 1309 # a[3]a[3] 1310 # a[4]a[4] 1311 # a[5]a[5] 1312 # a[6]a[6] 1313 # a[7]a[7] 1314 1315 lea 32(%r10),$i # $i=-($num-32) 1316 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1317 1318 mov $num,$j # $j=$num 1319 1320 # comments apply to $num==8 case 1321 mov -32($aptr,$i),$a0 # a[0] 1322 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1323 mov -24($aptr,$i),%rax # a[1] 1324 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1325 mov -16($aptr,$i),$ai # a[2] 1326 mov %rax,$a1 1327 1328 mul $a0 # a[1]*a[0] 1329 mov %rax,$A0[0] # a[1]*a[0] 1330 mov $ai,%rax # a[2] 1331 mov %rdx,$A0[1] 1332 mov $A0[0],-24($tptr,$i) # t[1] 1333 1334 mul $a0 # a[2]*a[0] 1335 add %rax,$A0[1] 1336 mov $ai,%rax 1337 adc \$0,%rdx 1338 mov $A0[1],-16($tptr,$i) # t[2] 1339 mov %rdx,$A0[0] 1340 1341 1342 mov -8($aptr,$i),$ai # a[3] 1343 mul $a1 # a[2]*a[1] 1344 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1345 mov $ai,%rax 1346 mov %rdx,$A1[1] 1347 1348 lea ($i),$j 1349 mul $a0 # a[3]*a[0] 1350 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1351 mov $ai,%rax 1352 mov %rdx,$A0[1] 1353 adc \$0,$A0[1] 1354 add $A1[0],$A0[0] 1355 adc \$0,$A0[1] 1356 mov $A0[0],-8($tptr,$j) # t[3] 1357 jmp .Lsqr4x_1st 1358 1359.align 32 1360.Lsqr4x_1st: 1361 mov ($aptr,$j),$ai # a[4] 1362 mul $a1 # a[3]*a[1] 1363 add %rax,$A1[1] # a[3]*a[1]+t[4] 1364 mov $ai,%rax 1365 mov %rdx,$A1[0] 1366 adc \$0,$A1[0] 1367 1368 mul $a0 # a[4]*a[0] 1369 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1370 mov $ai,%rax # a[3] 1371 mov 8($aptr,$j),$ai # a[5] 1372 mov %rdx,$A0[0] 1373 adc \$0,$A0[0] 1374 add $A1[1],$A0[1] 1375 adc \$0,$A0[0] 1376 1377 1378 mul $a1 # a[4]*a[3] 1379 add %rax,$A1[0] # a[4]*a[3]+t[5] 1380 mov $ai,%rax 1381 mov $A0[1],($tptr,$j) # t[4] 1382 mov %rdx,$A1[1] 1383 adc \$0,$A1[1] 1384 1385 mul $a0 # a[5]*a[2] 1386 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1387 mov $ai,%rax 1388 mov 16($aptr,$j),$ai # a[6] 1389 mov %rdx,$A0[1] 1390 adc \$0,$A0[1] 1391 add $A1[0],$A0[0] 1392 adc \$0,$A0[1] 1393 1394 mul $a1 # a[5]*a[3] 1395 add %rax,$A1[1] # a[5]*a[3]+t[6] 1396 mov $ai,%rax 1397 mov $A0[0],8($tptr,$j) # t[5] 1398 mov %rdx,$A1[0] 1399 adc \$0,$A1[0] 1400 1401 mul $a0 # a[6]*a[2] 1402 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1403 mov $ai,%rax # a[3] 1404 mov 24($aptr,$j),$ai # a[7] 1405 mov %rdx,$A0[0] 1406 adc \$0,$A0[0] 1407 add $A1[1],$A0[1] 1408 adc \$0,$A0[0] 1409 1410 1411 mul $a1 # a[6]*a[5] 1412 add %rax,$A1[0] # a[6]*a[5]+t[7] 1413 mov $ai,%rax 1414 mov $A0[1],16($tptr,$j) # t[6] 1415 mov %rdx,$A1[1] 1416 adc \$0,$A1[1] 1417 lea 32($j),$j 1418 1419 mul $a0 # a[7]*a[4] 1420 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1421 mov $ai,%rax 1422 mov %rdx,$A0[1] 1423 adc \$0,$A0[1] 1424 add $A1[0],$A0[0] 1425 adc \$0,$A0[1] 1426 mov $A0[0],-8($tptr,$j) # t[7] 1427 1428 cmp \$0,$j 1429 jne .Lsqr4x_1st 1430 1431 mul $a1 # a[7]*a[5] 1432 add %rax,$A1[1] 1433 lea 16($i),$i 1434 adc \$0,%rdx 1435 add $A0[1],$A1[1] 1436 adc \$0,%rdx 1437 1438 mov $A1[1],($tptr) # t[8] 1439 mov %rdx,$A1[0] 1440 mov %rdx,8($tptr) # t[9] 1441 jmp .Lsqr4x_outer 1442 1443.align 32 1444.Lsqr4x_outer: # comments apply to $num==6 case 1445 mov -32($aptr,$i),$a0 # a[0] 1446 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, 
&tp[2*$num] 1447 mov -24($aptr,$i),%rax # a[1] 1448 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1449 mov -16($aptr,$i),$ai # a[2] 1450 mov %rax,$a1 1451 1452 mul $a0 # a[1]*a[0] 1453 mov -24($tptr,$i),$A0[0] # t[1] 1454 add %rax,$A0[0] # a[1]*a[0]+t[1] 1455 mov $ai,%rax # a[2] 1456 adc \$0,%rdx 1457 mov $A0[0],-24($tptr,$i) # t[1] 1458 mov %rdx,$A0[1] 1459 1460 mul $a0 # a[2]*a[0] 1461 add %rax,$A0[1] 1462 mov $ai,%rax 1463 adc \$0,%rdx 1464 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1465 mov %rdx,$A0[0] 1466 adc \$0,$A0[0] 1467 mov $A0[1],-16($tptr,$i) # t[2] 1468 1469 xor $A1[0],$A1[0] 1470 1471 mov -8($aptr,$i),$ai # a[3] 1472 mul $a1 # a[2]*a[1] 1473 add %rax,$A1[0] # a[2]*a[1]+t[3] 1474 mov $ai,%rax 1475 adc \$0,%rdx 1476 add -8($tptr,$i),$A1[0] 1477 mov %rdx,$A1[1] 1478 adc \$0,$A1[1] 1479 1480 mul $a0 # a[3]*a[0] 1481 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1482 mov $ai,%rax 1483 adc \$0,%rdx 1484 add $A1[0],$A0[0] 1485 mov %rdx,$A0[1] 1486 adc \$0,$A0[1] 1487 mov $A0[0],-8($tptr,$i) # t[3] 1488 1489 lea ($i),$j 1490 jmp .Lsqr4x_inner 1491 1492.align 32 1493.Lsqr4x_inner: 1494 mov ($aptr,$j),$ai # a[4] 1495 mul $a1 # a[3]*a[1] 1496 add %rax,$A1[1] # a[3]*a[1]+t[4] 1497 mov $ai,%rax 1498 mov %rdx,$A1[0] 1499 adc \$0,$A1[0] 1500 add ($tptr,$j),$A1[1] 1501 adc \$0,$A1[0] 1502 1503 .byte 0x67 1504 mul $a0 # a[4]*a[0] 1505 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1506 mov $ai,%rax # a[3] 1507 mov 8($aptr,$j),$ai # a[5] 1508 mov %rdx,$A0[0] 1509 adc \$0,$A0[0] 1510 add $A1[1],$A0[1] 1511 adc \$0,$A0[0] 1512 1513 mul $a1 # a[4]*a[3] 1514 add %rax,$A1[0] # a[4]*a[3]+t[5] 1515 mov $A0[1],($tptr,$j) # t[4] 1516 mov $ai,%rax 1517 mov %rdx,$A1[1] 1518 adc \$0,$A1[1] 1519 add 8($tptr,$j),$A1[0] 1520 lea 16($j),$j # j++ 1521 adc \$0,$A1[1] 1522 1523 mul $a0 # a[5]*a[2] 1524 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1525 mov $ai,%rax 1526 adc \$0,%rdx 1527 add $A1[0],$A0[0] 1528 mov %rdx,$A0[1] 1529 adc \$0,$A0[1] 1530 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1531 1532 cmp \$0,$j 1533 jne .Lsqr4x_inner 1534 1535 .byte 0x67 1536 mul $a1 # a[5]*a[3] 1537 add %rax,$A1[1] 1538 adc \$0,%rdx 1539 add $A0[1],$A1[1] 1540 adc \$0,%rdx 1541 1542 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1543 mov %rdx,$A1[0] 1544 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1545 1546 add \$16,$i 1547 jnz .Lsqr4x_outer 1548 1549 # comments apply to $num==4 case 1550 mov -32($aptr),$a0 # a[0] 1551 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1552 mov -24($aptr),%rax # a[1] 1553 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1554 mov -16($aptr),$ai # a[2] 1555 mov %rax,$a1 1556 1557 mul $a0 # a[1]*a[0] 1558 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1559 mov $ai,%rax # a[2] 1560 mov %rdx,$A0[1] 1561 adc \$0,$A0[1] 1562 1563 mul $a0 # a[2]*a[0] 1564 add %rax,$A0[1] 1565 mov $ai,%rax 1566 mov $A0[0],-24($tptr) # t[1] 1567 mov %rdx,$A0[0] 1568 adc \$0,$A0[0] 1569 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1570 mov -8($aptr),$ai # a[3] 1571 adc \$0,$A0[0] 1572 1573 mul $a1 # a[2]*a[1] 1574 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1575 mov $ai,%rax 1576 mov $A0[1],-16($tptr) # t[2] 1577 mov %rdx,$A1[1] 1578 adc \$0,$A1[1] 1579 1580 mul $a0 # a[3]*a[0] 1581 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1582 mov $ai,%rax 1583 mov %rdx,$A0[1] 1584 adc \$0,$A0[1] 1585 add $A1[0],$A0[0] 1586 adc \$0,$A0[1] 1587 mov $A0[0],-8($tptr) # t[3] 1588 1589 mul $a1 # a[3]*a[1] 1590 add %rax,$A1[1] 1591 mov -16($aptr),%rax # a[2] 1592 adc \$0,%rdx 
1593 add $A0[1],$A1[1] 1594 adc \$0,%rdx 1595 1596 mov $A1[1],($tptr) # t[4] 1597 mov %rdx,$A1[0] 1598 mov %rdx,8($tptr) # t[5] 1599 1600 mul $ai # a[2]*a[3] 1601___ 1602{ 1603my ($shift,$carry)=($a0,$a1); 1604my @S=(@A1,$ai,$n0); 1605$code.=<<___; 1606 add \$16,$i 1607 xor $shift,$shift 1608 sub $num,$i # $i=16-$num 1609 xor $carry,$carry 1610 1611 add $A1[0],%rax # t[5] 1612 adc \$0,%rdx 1613 mov %rax,8($tptr) # t[5] 1614 mov %rdx,16($tptr) # t[6] 1615 mov $carry,24($tptr) # t[7] 1616 1617 mov -16($aptr,$i),%rax # a[0] 1618 lea 48+8(%rsp),$tptr 1619 xor $A0[0],$A0[0] # t[0] 1620 mov 8($tptr),$A0[1] # t[1] 1621 1622 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1623 shr \$63,$A0[0] 1624 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1625 shr \$63,$A0[1] 1626 or $A0[0],$S[1] # | t[2*i]>>63 1627 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1628 mov $A0[1],$shift # shift=t[2*i+1]>>63 1629 mul %rax # a[i]*a[i] 1630 neg $carry # mov $carry,cf 1631 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1632 adc %rax,$S[0] 1633 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1634 mov $S[0],($tptr) 1635 adc %rdx,$S[1] 1636 1637 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1638 mov $S[1],8($tptr) 1639 sbb $carry,$carry # mov cf,$carry 1640 shr \$63,$A0[0] 1641 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1642 shr \$63,$A0[1] 1643 or $A0[0],$S[3] # | t[2*i]>>63 1644 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1645 mov $A0[1],$shift # shift=t[2*i+1]>>63 1646 mul %rax # a[i]*a[i] 1647 neg $carry # mov $carry,cf 1648 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1649 adc %rax,$S[2] 1650 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1651 mov $S[2],16($tptr) 1652 adc %rdx,$S[3] 1653 lea 16($i),$i 1654 mov $S[3],24($tptr) 1655 sbb $carry,$carry # mov cf,$carry 1656 lea 64($tptr),$tptr 1657 jmp .Lsqr4x_shift_n_add 1658 1659.align 32 1660.Lsqr4x_shift_n_add: 1661 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1662 shr \$63,$A0[0] 1663 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1664 shr \$63,$A0[1] 1665 or $A0[0],$S[1] # | t[2*i]>>63 1666 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1667 mov $A0[1],$shift # shift=t[2*i+1]>>63 1668 mul %rax # a[i]*a[i] 1669 neg $carry # mov $carry,cf 1670 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1671 adc %rax,$S[0] 1672 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1673 mov $S[0],-32($tptr) 1674 adc %rdx,$S[1] 1675 1676 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1677 mov $S[1],-24($tptr) 1678 sbb $carry,$carry # mov cf,$carry 1679 shr \$63,$A0[0] 1680 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1681 shr \$63,$A0[1] 1682 or $A0[0],$S[3] # | t[2*i]>>63 1683 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1684 mov $A0[1],$shift # shift=t[2*i+1]>>63 1685 mul %rax # a[i]*a[i] 1686 neg $carry # mov $carry,cf 1687 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1688 adc %rax,$S[2] 1689 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1690 mov $S[2],-16($tptr) 1691 adc %rdx,$S[3] 1692 1693 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1694 mov $S[3],-8($tptr) 1695 sbb $carry,$carry # mov cf,$carry 1696 shr \$63,$A0[0] 1697 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1698 shr \$63,$A0[1] 1699 or $A0[0],$S[1] # | t[2*i]>>63 1700 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1701 mov $A0[1],$shift # shift=t[2*i+1]>>63 1702 mul %rax # a[i]*a[i] 1703 neg $carry # mov $carry,cf 1704 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1705 adc %rax,$S[0] 1706 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1707 mov $S[0],0($tptr) 1708 adc %rdx,$S[1] 1709 1710 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1711 mov $S[1],8($tptr) 1712 sbb $carry,$carry # mov 
cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	add	\$32,$i
	jnz	.Lsqr4x_shift_n_add

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	.byte	0x67
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr),%rax		# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
	mov	$S[1],-24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	adc	%rax,$S[2]
	adc	%rdx,$S[3]
	mov	$S[2],-16($tptr)
	mov	$S[3],-8($tptr)
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
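#
# For orientation, the reduction implemented below is the textbook
# word-by-word Montgomery reduction; in rough C (illustrative only,
# mul_add() is a made-up helper that returns the low limb and updates
# the carry):
#
#	/* t[] holds the 2*num-limb product, n0 = -n^-1 mod 2^64 */
#	for (int i = 0; i < num; i++) {
#	    BN_ULONG carry = 0, m = t[i] * n0;   /* multiplier, mod 2^64 */
#	    for (int j = 0; j < num; j++)        /* t += m*n << 64*i     */
#	        t[i+j] = mul_add(t[i+j], m, n[j], &carry);
#	    t[i+num] += carry;                   /* top-word carry tracking
#	                                            omitted in this sketch */
#	}
#	/* result is t[num..2*num-1], minus n if it is still >= n */
#
# The code below unrolls this eight limbs of n[] at a time (.L8x_reduce
# and .L8x_tail), keeping the running eight-limb window in %r8-%r15 and
# stashing the per-limb multipliers ("n0*a[i]" in the comments) on the
# stack for the tail passes.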
1769{ 1770my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1771 1772$code.=<<___; 1773 movq %xmm2,$nptr 1774__bn_sqr8x_reduction: 1775 xor %rax,%rax 1776 lea ($nptr,$num),%rcx # end of n[] 1777 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1778 mov %rcx,0+8(%rsp) 1779 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1780 mov %rdx,8+8(%rsp) 1781 neg $num 1782 jmp .L8x_reduction_loop 1783 1784.align 32 1785.L8x_reduction_loop: 1786 lea ($tptr,$num),$tptr # start of current t[] window 1787 .byte 0x66 1788 mov 8*0($tptr),$m0 1789 mov 8*1($tptr),%r9 1790 mov 8*2($tptr),%r10 1791 mov 8*3($tptr),%r11 1792 mov 8*4($tptr),%r12 1793 mov 8*5($tptr),%r13 1794 mov 8*6($tptr),%r14 1795 mov 8*7($tptr),%r15 1796 mov %rax,(%rdx) # store top-most carry bit 1797 lea 8*8($tptr),$tptr 1798 1799 .byte 0x67 1800 mov $m0,%r8 1801 imulq 32+8(%rsp),$m0 # n0*a[0] 1802 mov 8*0($nptr),%rax # n[0] 1803 mov \$8,%ecx 1804 jmp .L8x_reduce 1805 1806.align 32 1807.L8x_reduce: 1808 mulq $m0 1809 mov 8*1($nptr),%rax # n[1] 1810 neg %r8 1811 mov %rdx,%r8 1812 adc \$0,%r8 1813 1814 mulq $m0 1815 add %rax,%r9 1816 mov 8*2($nptr),%rax 1817 adc \$0,%rdx 1818 add %r9,%r8 1819 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1820 mov %rdx,%r9 1821 adc \$0,%r9 1822 1823 mulq $m0 1824 add %rax,%r10 1825 mov 8*3($nptr),%rax 1826 adc \$0,%rdx 1827 add %r10,%r9 1828 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1829 mov %rdx,%r10 1830 adc \$0,%r10 1831 1832 mulq $m0 1833 add %rax,%r11 1834 mov 8*4($nptr),%rax 1835 adc \$0,%rdx 1836 imulq %r8,$carry # modulo-scheduled 1837 add %r11,%r10 1838 mov %rdx,%r11 1839 adc \$0,%r11 1840 1841 mulq $m0 1842 add %rax,%r12 1843 mov 8*5($nptr),%rax 1844 adc \$0,%rdx 1845 add %r12,%r11 1846 mov %rdx,%r12 1847 adc \$0,%r12 1848 1849 mulq $m0 1850 add %rax,%r13 1851 mov 8*6($nptr),%rax 1852 adc \$0,%rdx 1853 add %r13,%r12 1854 mov %rdx,%r13 1855 adc \$0,%r13 1856 1857 mulq $m0 1858 add %rax,%r14 1859 mov 8*7($nptr),%rax 1860 adc \$0,%rdx 1861 add %r14,%r13 1862 mov %rdx,%r14 1863 adc \$0,%r14 1864 1865 mulq $m0 1866 mov $carry,$m0 # n0*a[i] 1867 add %rax,%r15 1868 mov 8*0($nptr),%rax # n[0] 1869 adc \$0,%rdx 1870 add %r15,%r14 1871 mov %rdx,%r15 1872 adc \$0,%r15 1873 1874 dec %ecx 1875 jnz .L8x_reduce 1876 1877 lea 8*8($nptr),$nptr 1878 xor %rax,%rax 1879 mov 8+8(%rsp),%rdx # pull end of t[] 1880 cmp 0+8(%rsp),$nptr # end of n[]? 
1881 jae .L8x_no_tail 1882 1883 .byte 0x66 1884 add 8*0($tptr),%r8 1885 adc 8*1($tptr),%r9 1886 adc 8*2($tptr),%r10 1887 adc 8*3($tptr),%r11 1888 adc 8*4($tptr),%r12 1889 adc 8*5($tptr),%r13 1890 adc 8*6($tptr),%r14 1891 adc 8*7($tptr),%r15 1892 sbb $carry,$carry # top carry 1893 1894 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1895 mov \$8,%ecx 1896 mov 8*0($nptr),%rax 1897 jmp .L8x_tail 1898 1899.align 32 1900.L8x_tail: 1901 mulq $m0 1902 add %rax,%r8 1903 mov 8*1($nptr),%rax 1904 mov %r8,($tptr) # save result 1905 mov %rdx,%r8 1906 adc \$0,%r8 1907 1908 mulq $m0 1909 add %rax,%r9 1910 mov 8*2($nptr),%rax 1911 adc \$0,%rdx 1912 add %r9,%r8 1913 lea 8($tptr),$tptr # $tptr++ 1914 mov %rdx,%r9 1915 adc \$0,%r9 1916 1917 mulq $m0 1918 add %rax,%r10 1919 mov 8*3($nptr),%rax 1920 adc \$0,%rdx 1921 add %r10,%r9 1922 mov %rdx,%r10 1923 adc \$0,%r10 1924 1925 mulq $m0 1926 add %rax,%r11 1927 mov 8*4($nptr),%rax 1928 adc \$0,%rdx 1929 add %r11,%r10 1930 mov %rdx,%r11 1931 adc \$0,%r11 1932 1933 mulq $m0 1934 add %rax,%r12 1935 mov 8*5($nptr),%rax 1936 adc \$0,%rdx 1937 add %r12,%r11 1938 mov %rdx,%r12 1939 adc \$0,%r12 1940 1941 mulq $m0 1942 add %rax,%r13 1943 mov 8*6($nptr),%rax 1944 adc \$0,%rdx 1945 add %r13,%r12 1946 mov %rdx,%r13 1947 adc \$0,%r13 1948 1949 mulq $m0 1950 add %rax,%r14 1951 mov 8*7($nptr),%rax 1952 adc \$0,%rdx 1953 add %r14,%r13 1954 mov %rdx,%r14 1955 adc \$0,%r14 1956 1957 mulq $m0 1958 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1959 add %rax,%r15 1960 adc \$0,%rdx 1961 add %r15,%r14 1962 mov 8*0($nptr),%rax # pull n[0] 1963 mov %rdx,%r15 1964 adc \$0,%r15 1965 1966 dec %ecx 1967 jnz .L8x_tail 1968 1969 lea 8*8($nptr),$nptr 1970 mov 8+8(%rsp),%rdx # pull end of t[] 1971 cmp 0+8(%rsp),$nptr # end of n[]? 1972 jae .L8x_tail_done # break out of loop 1973 1974 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1975 neg $carry 1976 mov 8*0($nptr),%rax # pull n[0] 1977 adc 8*0($tptr),%r8 1978 adc 8*1($tptr),%r9 1979 adc 8*2($tptr),%r10 1980 adc 8*3($tptr),%r11 1981 adc 8*4($tptr),%r12 1982 adc 8*5($tptr),%r13 1983 adc 8*6($tptr),%r14 1984 adc 8*7($tptr),%r15 1985 sbb $carry,$carry # top carry 1986 1987 mov \$8,%ecx 1988 jmp .L8x_tail 1989 1990.align 32 1991.L8x_tail_done: 1992 xor %rax,%rax 1993 add (%rdx),%r8 # can this overflow? 1994 adc \$0,%r9 1995 adc \$0,%r10 1996 adc \$0,%r11 1997 adc \$0,%r12 1998 adc \$0,%r13 1999 adc \$0,%r14 2000 adc \$0,%r15 2001 adc \$0,%rax 2002 2003 neg $carry 2004.L8x_no_tail: 2005 adc 8*0($tptr),%r8 2006 adc 8*1($tptr),%r9 2007 adc 8*2($tptr),%r10 2008 adc 8*3($tptr),%r11 2009 adc 8*4($tptr),%r12 2010 adc 8*5($tptr),%r13 2011 adc 8*6($tptr),%r14 2012 adc 8*7($tptr),%r15 2013 adc \$0,%rax # top-most carry 2014 mov -8($nptr),%rcx # np[num-1] 2015 xor $carry,$carry 2016 2017 movq %xmm2,$nptr # restore $nptr 2018 2019 mov %r8,8*0($tptr) # store top 512 bits 2020 mov %r9,8*1($tptr) 2021 movq %xmm3,$num # $num is %r9, can't be moved upwards 2022 mov %r10,8*2($tptr) 2023 mov %r11,8*3($tptr) 2024 mov %r12,8*4($tptr) 2025 mov %r13,8*5($tptr) 2026 mov %r14,8*6($tptr) 2027 mov %r15,8*7($tptr) 2028 lea 8*8($tptr),$tptr 2029 2030 cmp %rdx,$tptr # end of t[]? 
	jb	.L8x_reduction_loop
	ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
___
}
##############################################################
# Post-condition, 4x unrolled
#
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
.type	__bn_post4x_internal,\@abi-omnipotent
.align	32
__bn_post4x_internal:
	mov	8*0($nptr),%r12
	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
	mov	$num,%rcx
	movq	%xmm1,$rptr		# restore $rptr
	neg	%rax
	movq	%xmm1,$aptr		# prepare for back-to-back call
	sar	\$3+2,%rcx
	dec	%r12			# so that after 'not' we get -n[0]
	xor	%r10,%r10
	mov	8*1($nptr),%r13
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	mov	8*0($nptr),%r12
	mov	8*1($nptr),%r13
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
.Lsqr4x_sub_entry:
	lea	8*4($nptr),$nptr
	not	%r12
	not	%r13
	not	%r14
	not	%r15
	and	%rax,%r12
	and	%rax,%r13
	and	%rax,%r14
	and	%rax,%r15

	neg	%r10			# mov %r10,%cf
	adc	8*0($tptr),%r12
	adc	8*1($tptr),%r13
	adc	8*2($tptr),%r14
	adc	8*3($tptr),%r15
	mov	%r12,8*0($rptr)
	lea	8*4($tptr),$tptr
	mov	%r13,8*1($rptr)
	sbb	%r10,%r10		# mov %cf,%r10
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx			# pass %cf
	jnz	.Lsqr4x_sub

	mov	$num,%r10		# prepare for back-to-back call
	neg	$num			# restore $num
	ret
.size	__bn_post4x_internal,.-__bn_post4x_internal
___
}
{
$code.=<<___;
.globl	bn_from_montgomery
.type	bn_from_montgomery,\@abi-omnipotent
.align	32
bn_from_montgomery:
	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
	jz	bn_from_mont8x
	xor	%eax,%eax
	ret
.size	bn_from_montgomery,.-bn_from_montgomery

.type	bn_from_mont8x,\@function,6
.align	32
bn_from_mont8x:
.cfi_startproc
	.byte	0x67
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lfrom_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# Ensure that stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). The stack is allocated so that it aligns with
	# bn_power5's frame, and as bn_from_montgomery happens to be the
	# last operation, we use the opportunity to cleanse it.
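	#
	# In bignum terms, what follows computes a/R mod n for
	# R = 2^(64*num), i.e. it strips one Montgomery factor: a[] is
	# copied into the low half of t[] and the high half is zeroed
	# (the .Lmul_by_1 loop), then a single Montgomery reduction of
	# that 2*num-limb value is run.  A rough C outline (names are
	# illustrative, not actual helpers):
	#
	#	memcpy(t, a, num*8);
	#	memset(t + num, 0, num*8);
	#	mont_reduce(t, n, n0, num);   /* sqr8x/sqrx8x reduction */
	#	/* result: t[num..2*num-1], conditionally minus n       */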
2142 # 2143 lea -320(%rsp,$num,2),%r11 2144 mov %rsp,%rbp 2145 sub $rptr,%r11 2146 and \$4095,%r11 2147 cmp %r11,%r10 2148 jb .Lfrom_sp_alt 2149 sub %r11,%rbp # align with $aptr 2150 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2151 jmp .Lfrom_sp_done 2152 2153.align 32 2154.Lfrom_sp_alt: 2155 lea 4096-320(,$num,2),%r10 2156 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2157 sub %r10,%r11 2158 mov \$0,%r10 2159 cmovc %r10,%r11 2160 sub %r11,%rbp 2161.Lfrom_sp_done: 2162 and \$-64,%rbp 2163 mov %rsp,%r11 2164 sub %rbp,%r11 2165 and \$-4096,%r11 2166 lea (%rbp,%r11),%rsp 2167 mov (%rsp),%r10 2168 cmp %rbp,%rsp 2169 ja .Lfrom_page_walk 2170 jmp .Lfrom_page_walk_done 2171 2172.Lfrom_page_walk: 2173 lea -4096(%rsp),%rsp 2174 mov (%rsp),%r10 2175 cmp %rbp,%rsp 2176 ja .Lfrom_page_walk 2177.Lfrom_page_walk_done: 2178 2179 mov $num,%r10 2180 neg $num 2181 2182 ############################################################## 2183 # Stack layout 2184 # 2185 # +0 saved $num, used in reduction section 2186 # +8 &t[2*$num], used in reduction section 2187 # +32 saved *n0 2188 # +40 saved %rsp 2189 # +48 t[2*$num] 2190 # 2191 mov $n0, 32(%rsp) 2192 mov %rax, 40(%rsp) # save original %rsp 2193.cfi_cfa_expression %rsp+40,deref,+8 2194.Lfrom_body: 2195 mov $num,%r11 2196 lea 48(%rsp),%rax 2197 pxor %xmm0,%xmm0 2198 jmp .Lmul_by_1 2199 2200.align 32 2201.Lmul_by_1: 2202 movdqu ($aptr),%xmm1 2203 movdqu 16($aptr),%xmm2 2204 movdqu 32($aptr),%xmm3 2205 movdqa %xmm0,(%rax,$num) 2206 movdqu 48($aptr),%xmm4 2207 movdqa %xmm0,16(%rax,$num) 2208 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 2209 movdqa %xmm1,(%rax) 2210 movdqa %xmm0,32(%rax,$num) 2211 movdqa %xmm2,16(%rax) 2212 movdqa %xmm0,48(%rax,$num) 2213 movdqa %xmm3,32(%rax) 2214 movdqa %xmm4,48(%rax) 2215 lea 64(%rax),%rax 2216 sub \$64,%r11 2217 jnz .Lmul_by_1 2218 2219 movq $rptr,%xmm1 2220 movq $nptr,%xmm2 2221 .byte 0x67 2222 mov $nptr,%rbp 2223 movq %r10, %xmm3 # -num 2224___ 2225$code.=<<___ if ($addx); 2226 mov OPENSSL_ia32cap_P+8(%rip),%r11d 2227 and \$0x80108,%r11d 2228 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 2229 jne .Lfrom_mont_nox 2230 2231 lea (%rax,$num),$rptr 2232 call __bn_sqrx8x_reduction 2233 call __bn_postx4x_internal 2234 2235 pxor %xmm0,%xmm0 2236 lea 48(%rsp),%rax 2237 jmp .Lfrom_mont_zero 2238 2239.align 32 2240.Lfrom_mont_nox: 2241___ 2242$code.=<<___; 2243 call __bn_sqr8x_reduction 2244 call __bn_post4x_internal 2245 2246 pxor %xmm0,%xmm0 2247 lea 48(%rsp),%rax 2248 jmp .Lfrom_mont_zero 2249 2250.align 32 2251.Lfrom_mont_zero: 2252 mov 40(%rsp),%rsi # restore %rsp 2253.cfi_def_cfa %rsi,8 2254 movdqa %xmm0,16*0(%rax) 2255 movdqa %xmm0,16*1(%rax) 2256 movdqa %xmm0,16*2(%rax) 2257 movdqa %xmm0,16*3(%rax) 2258 lea 16*4(%rax),%rax 2259 sub \$32,$num 2260 jnz .Lfrom_mont_zero 2261 2262 mov \$1,%rax 2263 mov -48(%rsi),%r15 2264.cfi_restore %r15 2265 mov -40(%rsi),%r14 2266.cfi_restore %r14 2267 mov -32(%rsi),%r13 2268.cfi_restore %r13 2269 mov -24(%rsi),%r12 2270.cfi_restore %r12 2271 mov -16(%rsi),%rbp 2272.cfi_restore %rbp 2273 mov -8(%rsi),%rbx 2274.cfi_restore %rbx 2275 lea (%rsi),%rsp 2276.cfi_def_cfa_register %rsp 2277.Lfrom_epilogue: 2278 ret 2279.cfi_endproc 2280.size bn_from_mont8x,.-bn_from_mont8x 2281___ 2282} 2283}}} 2284 2285if ($addx) {{{ 2286my $bp="%rdx"; # restore original value 2287 2288$code.=<<___; 2289.type bn_mulx4x_mont_gather5,\@function,6 2290.align 32 2291bn_mulx4x_mont_gather5: 2292.cfi_startproc 2293 mov %rsp,%rax 2294.cfi_def_cfa_register %rax 
2295.Lmulx4x_enter: 2296 push %rbx 2297.cfi_push %rbx 2298 push %rbp 2299.cfi_push %rbp 2300 push %r12 2301.cfi_push %r12 2302 push %r13 2303.cfi_push %r13 2304 push %r14 2305.cfi_push %r14 2306 push %r15 2307.cfi_push %r15 2308.Lmulx4x_prologue: 2309 2310 shl \$3,${num}d # convert $num to bytes 2311 lea ($num,$num,2),%r10 # 3*$num in bytes 2312 neg $num # -$num 2313 mov ($n0),$n0 # *n0 2314 2315 ############################################################## 2316 # Ensure that stack frame doesn't alias with $rptr+3*$num 2317 # modulo 4096, which covers ret[num], am[num] and n[num] 2318 # (see bn_exp.c). This is done to allow memory disambiguation 2319 # logic do its magic. [Extra [num] is allocated in order 2320 # to align with bn_power5's frame, which is cleansed after 2321 # completing exponentiation. Extra 256 bytes is for power mask 2322 # calculated from 7th argument, the index.] 2323 # 2324 lea -320(%rsp,$num,2),%r11 2325 mov %rsp,%rbp 2326 sub $rp,%r11 2327 and \$4095,%r11 2328 cmp %r11,%r10 2329 jb .Lmulx4xsp_alt 2330 sub %r11,%rbp # align with $aptr 2331 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2332 jmp .Lmulx4xsp_done 2333 2334.Lmulx4xsp_alt: 2335 lea 4096-320(,$num,2),%r10 2336 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2337 sub %r10,%r11 2338 mov \$0,%r10 2339 cmovc %r10,%r11 2340 sub %r11,%rbp 2341.Lmulx4xsp_done: 2342 and \$-64,%rbp # ensure alignment 2343 mov %rsp,%r11 2344 sub %rbp,%r11 2345 and \$-4096,%r11 2346 lea (%rbp,%r11),%rsp 2347 mov (%rsp),%r10 2348 cmp %rbp,%rsp 2349 ja .Lmulx4x_page_walk 2350 jmp .Lmulx4x_page_walk_done 2351 2352.Lmulx4x_page_walk: 2353 lea -4096(%rsp),%rsp 2354 mov (%rsp),%r10 2355 cmp %rbp,%rsp 2356 ja .Lmulx4x_page_walk 2357.Lmulx4x_page_walk_done: 2358 2359 ############################################################## 2360 # Stack layout 2361 # +0 -num 2362 # +8 off-loaded &b[i] 2363 # +16 end of b[num] 2364 # +24 inner counter 2365 # +32 saved n0 2366 # +40 saved %rsp 2367 # +48 2368 # +56 saved rp 2369 # +64 tmp[num+1] 2370 # 2371 mov $n0, 32(%rsp) # save *n0 2372 mov %rax,40(%rsp) # save original %rsp 2373.cfi_cfa_expression %rsp+40,deref,+8 2374.Lmulx4x_body: 2375 call mulx4x_internal 2376 2377 mov 40(%rsp),%rsi # restore %rsp 2378.cfi_def_cfa %rsi,8 2379 mov \$1,%rax 2380 2381 mov -48(%rsi),%r15 2382.cfi_restore %r15 2383 mov -40(%rsi),%r14 2384.cfi_restore %r14 2385 mov -32(%rsi),%r13 2386.cfi_restore %r13 2387 mov -24(%rsi),%r12 2388.cfi_restore %r12 2389 mov -16(%rsi),%rbp 2390.cfi_restore %rbp 2391 mov -8(%rsi),%rbx 2392.cfi_restore %rbx 2393 lea (%rsi),%rsp 2394.cfi_def_cfa_register %rsp 2395.Lmulx4x_epilogue: 2396 ret 2397.cfi_endproc 2398.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2399 2400.type mulx4x_internal,\@abi-omnipotent 2401.align 32 2402mulx4x_internal: 2403 mov $num,8(%rsp) # save -$num (it was in bytes) 2404 mov $num,%r10 2405 neg $num # restore $num 2406 shl \$5,$num 2407 neg %r10 # restore $num 2408 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2409 shr \$5+5,$num 2410 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2411 sub \$1,$num 2412 lea .Linc(%rip),%rax 2413 mov %r13,16+8(%rsp) # end of b[num] 2414 mov $num,24+8(%rsp) # inner counter 2415 mov $rp, 56+8(%rsp) # save $rp 2416___ 2417my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2418 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2419my $rptr=$bptr; 2420my $STRIDE=2**5*8; # 5 is "window size" 2421my $N=$STRIDE/4; # should match cache line size 2422$code.=<<___; 2423 
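	# The code below relies on the BMI2/ADX extensions probed for at
	# the top of the file: mulx takes its multiplicand implicitly from
	# %rdx and writes the 128-bit product to two registers without
	# touching the flags, while adcx and adox add-with-carry through
	# CF and OF respectively.  That lets the a[]*b[i] chain and the
	# n[]*m chain run as two interleaved carry chains, schematically
	# (register names here are placeholders, not the allocation used
	# in this file):
	#
	#	mulx	8*0(ap),lo,hi		# product, flags untouched
	#	adcx	lo,acc0			# carry chain 1, via CF
	#	adox	hi,acc1			# carry chain 2, via OF
	#
	# instead of serializing both additions through a single carry flag.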
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 2424 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2425 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) 2426 lea 128($bp),$bptr # size optimization 2427 2428 pshufd \$0,%xmm5,%xmm5 # broadcast index 2429 movdqa %xmm1,%xmm4 2430 .byte 0x67 2431 movdqa %xmm1,%xmm2 2432___ 2433######################################################################## 2434# calculate mask by comparing 0..31 to index and save result to stack 2435# 2436$code.=<<___; 2437 .byte 0x67 2438 paddd %xmm0,%xmm1 2439 pcmpeqd %xmm5,%xmm0 # compare to 1,0 2440 movdqa %xmm4,%xmm3 2441___ 2442for($i=0;$i<$STRIDE/16-4;$i+=4) { 2443$code.=<<___; 2444 paddd %xmm1,%xmm2 2445 pcmpeqd %xmm5,%xmm1 # compare to 3,2 2446 movdqa %xmm0,`16*($i+0)+112`(%r10) 2447 movdqa %xmm4,%xmm0 2448 2449 paddd %xmm2,%xmm3 2450 pcmpeqd %xmm5,%xmm2 # compare to 5,4 2451 movdqa %xmm1,`16*($i+1)+112`(%r10) 2452 movdqa %xmm4,%xmm1 2453 2454 paddd %xmm3,%xmm0 2455 pcmpeqd %xmm5,%xmm3 # compare to 7,6 2456 movdqa %xmm2,`16*($i+2)+112`(%r10) 2457 movdqa %xmm4,%xmm2 2458 2459 paddd %xmm0,%xmm1 2460 pcmpeqd %xmm5,%xmm0 2461 movdqa %xmm3,`16*($i+3)+112`(%r10) 2462 movdqa %xmm4,%xmm3 2463___ 2464} 2465$code.=<<___; # last iteration can be optimized 2466 .byte 0x67 2467 paddd %xmm1,%xmm2 2468 pcmpeqd %xmm5,%xmm1 2469 movdqa %xmm0,`16*($i+0)+112`(%r10) 2470 2471 paddd %xmm2,%xmm3 2472 pcmpeqd %xmm5,%xmm2 2473 movdqa %xmm1,`16*($i+1)+112`(%r10) 2474 2475 pcmpeqd %xmm5,%xmm3 2476 movdqa %xmm2,`16*($i+2)+112`(%r10) 2477 2478 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register 2479 pand `16*($i+1)-128`($bptr),%xmm1 2480 pand `16*($i+2)-128`($bptr),%xmm2 2481 movdqa %xmm3,`16*($i+3)+112`(%r10) 2482 pand `16*($i+3)-128`($bptr),%xmm3 2483 por %xmm2,%xmm0 2484 por %xmm3,%xmm1 2485___ 2486for($i=0;$i<$STRIDE/16-4;$i+=4) { 2487$code.=<<___; 2488 movdqa `16*($i+0)-128`($bptr),%xmm4 2489 movdqa `16*($i+1)-128`($bptr),%xmm5 2490 movdqa `16*($i+2)-128`($bptr),%xmm2 2491 pand `16*($i+0)+112`(%r10),%xmm4 2492 movdqa `16*($i+3)-128`($bptr),%xmm3 2493 pand `16*($i+1)+112`(%r10),%xmm5 2494 por %xmm4,%xmm0 2495 pand `16*($i+2)+112`(%r10),%xmm2 2496 por %xmm5,%xmm1 2497 pand `16*($i+3)+112`(%r10),%xmm3 2498 por %xmm2,%xmm0 2499 por %xmm3,%xmm1 2500___ 2501} 2502$code.=<<___; 2503 pxor %xmm1,%xmm0 2504 pshufd \$0x4e,%xmm0,%xmm1 2505 por %xmm1,%xmm0 2506 lea $STRIDE($bptr),$bptr 2507 movq %xmm0,%rdx # bp[0] 2508 lea 64+8*4+8(%rsp),$tptr 2509 2510 mov %rdx,$bi 2511 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2512 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2513 add %rax,%r11 2514 mulx 2*8($aptr),%rax,%r13 # ... 
2515 adc %rax,%r12 2516 adc \$0,%r13 2517 mulx 3*8($aptr),%rax,%r14 2518 2519 mov $mi,%r15 2520 imulq 32+8(%rsp),$mi # "t[0]"*n0 2521 xor $zero,$zero # cf=0, of=0 2522 mov $mi,%rdx 2523 2524 mov $bptr,8+8(%rsp) # off-load &b[i] 2525 2526 lea 4*8($aptr),$aptr 2527 adcx %rax,%r13 2528 adcx $zero,%r14 # cf=0 2529 2530 mulx 0*8($nptr),%rax,%r10 2531 adcx %rax,%r15 # discarded 2532 adox %r11,%r10 2533 mulx 1*8($nptr),%rax,%r11 2534 adcx %rax,%r10 2535 adox %r12,%r11 2536 mulx 2*8($nptr),%rax,%r12 2537 mov 24+8(%rsp),$bptr # counter value 2538 mov %r10,-8*4($tptr) 2539 adcx %rax,%r11 2540 adox %r13,%r12 2541 mulx 3*8($nptr),%rax,%r15 2542 mov $bi,%rdx 2543 mov %r11,-8*3($tptr) 2544 adcx %rax,%r12 2545 adox $zero,%r15 # of=0 2546 lea 4*8($nptr),$nptr 2547 mov %r12,-8*2($tptr) 2548 jmp .Lmulx4x_1st 2549 2550.align 32 2551.Lmulx4x_1st: 2552 adcx $zero,%r15 # cf=0, modulo-scheduled 2553 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2554 adcx %r14,%r10 2555 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2556 adcx %rax,%r11 2557 mulx 2*8($aptr),%r12,%rax # ... 2558 adcx %r14,%r12 2559 mulx 3*8($aptr),%r13,%r14 2560 .byte 0x67,0x67 2561 mov $mi,%rdx 2562 adcx %rax,%r13 2563 adcx $zero,%r14 # cf=0 2564 lea 4*8($aptr),$aptr 2565 lea 4*8($tptr),$tptr 2566 2567 adox %r15,%r10 2568 mulx 0*8($nptr),%rax,%r15 2569 adcx %rax,%r10 2570 adox %r15,%r11 2571 mulx 1*8($nptr),%rax,%r15 2572 adcx %rax,%r11 2573 adox %r15,%r12 2574 mulx 2*8($nptr),%rax,%r15 2575 mov %r10,-5*8($tptr) 2576 adcx %rax,%r12 2577 mov %r11,-4*8($tptr) 2578 adox %r15,%r13 2579 mulx 3*8($nptr),%rax,%r15 2580 mov $bi,%rdx 2581 mov %r12,-3*8($tptr) 2582 adcx %rax,%r13 2583 adox $zero,%r15 2584 lea 4*8($nptr),$nptr 2585 mov %r13,-2*8($tptr) 2586 2587 dec $bptr # of=0, pass cf 2588 jnz .Lmulx4x_1st 2589 2590 mov 8(%rsp),$num # load -num 2591 adc $zero,%r15 # modulo-scheduled 2592 lea ($aptr,$num),$aptr # rewind $aptr 2593 add %r15,%r14 2594 mov 8+8(%rsp),$bptr # re-load &b[i] 2595 adc $zero,$zero # top-most carry 2596 mov %r14,-1*8($tptr) 2597 jmp .Lmulx4x_outer 2598 2599.align 32 2600.Lmulx4x_outer: 2601 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) 2602 pxor %xmm4,%xmm4 2603 .byte 0x67,0x67 2604 pxor %xmm5,%xmm5 2605___ 2606for($i=0;$i<$STRIDE/16;$i+=4) { 2607$code.=<<___; 2608 movdqa `16*($i+0)-128`($bptr),%xmm0 2609 movdqa `16*($i+1)-128`($bptr),%xmm1 2610 movdqa `16*($i+2)-128`($bptr),%xmm2 2611 pand `16*($i+0)+256`(%r10),%xmm0 2612 movdqa `16*($i+3)-128`($bptr),%xmm3 2613 pand `16*($i+1)+256`(%r10),%xmm1 2614 por %xmm0,%xmm4 2615 pand `16*($i+2)+256`(%r10),%xmm2 2616 por %xmm1,%xmm5 2617 pand `16*($i+3)+256`(%r10),%xmm3 2618 por %xmm2,%xmm4 2619 por %xmm3,%xmm5 2620___ 2621} 2622$code.=<<___; 2623 por %xmm5,%xmm4 2624 pshufd \$0x4e,%xmm4,%xmm0 2625 por %xmm4,%xmm0 2626 lea $STRIDE($bptr),$bptr 2627 movq %xmm0,%rdx # m0=bp[i] 2628 2629 mov $zero,($tptr) # save top-most carry 2630 lea 4*8($tptr,$num),$tptr # rewind $tptr 2631 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2632 xor $zero,$zero # cf=0, of=0 2633 mov %rdx,$bi 2634 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2635 adox -4*8($tptr),$mi # +t[0] 2636 adcx %r14,%r11 2637 mulx 2*8($aptr),%r15,%r13 # ... 
2638 adox -3*8($tptr),%r11 2639 adcx %r15,%r12 2640 mulx 3*8($aptr),%rdx,%r14 2641 adox -2*8($tptr),%r12 2642 adcx %rdx,%r13 2643 lea ($nptr,$num),$nptr # rewind $nptr 2644 lea 4*8($aptr),$aptr 2645 adox -1*8($tptr),%r13 2646 adcx $zero,%r14 2647 adox $zero,%r14 2648 2649 mov $mi,%r15 2650 imulq 32+8(%rsp),$mi # "t[0]"*n0 2651 2652 mov $mi,%rdx 2653 xor $zero,$zero # cf=0, of=0 2654 mov $bptr,8+8(%rsp) # off-load &b[i] 2655 2656 mulx 0*8($nptr),%rax,%r10 2657 adcx %rax,%r15 # discarded 2658 adox %r11,%r10 2659 mulx 1*8($nptr),%rax,%r11 2660 adcx %rax,%r10 2661 adox %r12,%r11 2662 mulx 2*8($nptr),%rax,%r12 2663 adcx %rax,%r11 2664 adox %r13,%r12 2665 mulx 3*8($nptr),%rax,%r15 2666 mov $bi,%rdx 2667 mov 24+8(%rsp),$bptr # counter value 2668 mov %r10,-8*4($tptr) 2669 adcx %rax,%r12 2670 mov %r11,-8*3($tptr) 2671 adox $zero,%r15 # of=0 2672 mov %r12,-8*2($tptr) 2673 lea 4*8($nptr),$nptr 2674 jmp .Lmulx4x_inner 2675 2676.align 32 2677.Lmulx4x_inner: 2678 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2679 adcx $zero,%r15 # cf=0, modulo-scheduled 2680 adox %r14,%r10 2681 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2682 adcx 0*8($tptr),%r10 2683 adox %rax,%r11 2684 mulx 2*8($aptr),%r12,%rax # ... 2685 adcx 1*8($tptr),%r11 2686 adox %r14,%r12 2687 mulx 3*8($aptr),%r13,%r14 2688 mov $mi,%rdx 2689 adcx 2*8($tptr),%r12 2690 adox %rax,%r13 2691 adcx 3*8($tptr),%r13 2692 adox $zero,%r14 # of=0 2693 lea 4*8($aptr),$aptr 2694 lea 4*8($tptr),$tptr 2695 adcx $zero,%r14 # cf=0 2696 2697 adox %r15,%r10 2698 mulx 0*8($nptr),%rax,%r15 2699 adcx %rax,%r10 2700 adox %r15,%r11 2701 mulx 1*8($nptr),%rax,%r15 2702 adcx %rax,%r11 2703 adox %r15,%r12 2704 mulx 2*8($nptr),%rax,%r15 2705 mov %r10,-5*8($tptr) 2706 adcx %rax,%r12 2707 adox %r15,%r13 2708 mov %r11,-4*8($tptr) 2709 mulx 3*8($nptr),%rax,%r15 2710 mov $bi,%rdx 2711 lea 4*8($nptr),$nptr 2712 mov %r12,-3*8($tptr) 2713 adcx %rax,%r13 2714 adox $zero,%r15 2715 mov %r13,-2*8($tptr) 2716 2717 dec $bptr # of=0, pass cf 2718 jnz .Lmulx4x_inner 2719 2720 mov 0+8(%rsp),$num # load -num 2721 adc $zero,%r15 # modulo-scheduled 2722 sub 0*8($tptr),$bptr # pull top-most carry to %cf 2723 mov 8+8(%rsp),$bptr # re-load &b[i] 2724 mov 16+8(%rsp),%r10 2725 adc %r15,%r14 2726 lea ($aptr,$num),$aptr # rewind $aptr 2727 adc $zero,$zero # top-most carry 2728 mov %r14,-1*8($tptr) 2729 2730 cmp %r10,$bptr 2731 jb .Lmulx4x_outer 2732 2733 mov -8($nptr),%r10 2734 mov $zero,%r8 2735 mov ($nptr,$num),%r12 2736 lea ($nptr,$num),%rbp # rewind $nptr 2737 mov $num,%rcx 2738 lea ($tptr,$num),%rdi # rewind $tptr 2739 xor %eax,%eax 2740 xor %r15,%r15 2741 sub %r14,%r10 # compare top-most words 2742 adc %r15,%r15 2743 or %r15,%r8 2744 sar \$3+2,%rcx 2745 sub %r8,%rax # %rax=-%r8 2746 mov 56+8(%rsp),%rdx # restore rp 2747 dec %r12 # so that after 'not' we get -n[0] 2748 mov 8*1(%rbp),%r13 2749 xor %r8,%r8 2750 mov 8*2(%rbp),%r14 2751 mov 8*3(%rbp),%r15 2752 jmp .Lsqrx4x_sub_entry # common post-condition 2753.size mulx4x_internal,.-mulx4x_internal 2754___ 2755}{ 2756###################################################################### 2757# void bn_power5( 2758my $rptr="%rdi"; # BN_ULONG *rptr, 2759my $aptr="%rsi"; # const BN_ULONG *aptr, 2760my $bptr="%rdx"; # const void *table, 2761my $nptr="%rcx"; # const BN_ULONG *nptr, 2762my $n0 ="%r8"; # const BN_ULONG *n0); 2763my $num ="%r9"; # int num, has to be divisible by 8 2764 # int pwr); 2765 2766my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 2767my @A0=("%r10","%r11"); 2768my @A1=("%r12","%r13"); 2769my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 2770 2771$code.=<<___; 
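# bn_powerx5 is the MULX/AD*X counterpart of bn_power5: it handles one
# 5-bit window of the exponent by performing five Montgomery squarings
# back to back and finishing with a single gathered Montgomery
# multiplication.  Roughly (an illustrative sketch only):
#
#	for (i = 0; i < 5; i++)
#		r = r * r * 2^(-64*num) mod n;		# __bn_sqrx8x_internal
#	r = r * tbl[idx] * 2^(-64*num) mod n;		# mulx4x_internal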
2772.type bn_powerx5,\@function,6 2773.align 32 2774bn_powerx5: 2775.cfi_startproc 2776 mov %rsp,%rax 2777.cfi_def_cfa_register %rax 2778.Lpowerx5_enter: 2779 push %rbx 2780.cfi_push %rbx 2781 push %rbp 2782.cfi_push %rbp 2783 push %r12 2784.cfi_push %r12 2785 push %r13 2786.cfi_push %r13 2787 push %r14 2788.cfi_push %r14 2789 push %r15 2790.cfi_push %r15 2791.Lpowerx5_prologue: 2792 2793 shl \$3,${num}d # convert $num to bytes 2794 lea ($num,$num,2),%r10 # 3*$num in bytes 2795 neg $num 2796 mov ($n0),$n0 # *n0 2797 2798 ############################################################## 2799 # Ensure that stack frame doesn't alias with $rptr+3*$num 2800 # modulo 4096, which covers ret[num], am[num] and n[num] 2801 # (see bn_exp.c). This is done to allow memory disambiguation 2802 # logic do its magic. [Extra 256 bytes is for power mask 2803 # calculated from 7th argument, the index.] 2804 # 2805 lea -320(%rsp,$num,2),%r11 2806 mov %rsp,%rbp 2807 sub $rptr,%r11 2808 and \$4095,%r11 2809 cmp %r11,%r10 2810 jb .Lpwrx_sp_alt 2811 sub %r11,%rbp # align with $aptr 2812 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2813 jmp .Lpwrx_sp_done 2814 2815.align 32 2816.Lpwrx_sp_alt: 2817 lea 4096-320(,$num,2),%r10 2818 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) 2819 sub %r10,%r11 2820 mov \$0,%r10 2821 cmovc %r10,%r11 2822 sub %r11,%rbp 2823.Lpwrx_sp_done: 2824 and \$-64,%rbp 2825 mov %rsp,%r11 2826 sub %rbp,%r11 2827 and \$-4096,%r11 2828 lea (%rbp,%r11),%rsp 2829 mov (%rsp),%r10 2830 cmp %rbp,%rsp 2831 ja .Lpwrx_page_walk 2832 jmp .Lpwrx_page_walk_done 2833 2834.Lpwrx_page_walk: 2835 lea -4096(%rsp),%rsp 2836 mov (%rsp),%r10 2837 cmp %rbp,%rsp 2838 ja .Lpwrx_page_walk 2839.Lpwrx_page_walk_done: 2840 2841 mov $num,%r10 2842 neg $num 2843 2844 ############################################################## 2845 # Stack layout 2846 # 2847 # +0 saved $num, used in reduction section 2848 # +8 &t[2*$num], used in reduction section 2849 # +16 intermediate carry bit 2850 # +24 top-most carry bit, used in reduction section 2851 # +32 saved *n0 2852 # +40 saved %rsp 2853 # +48 t[2*$num] 2854 # 2855 pxor %xmm0,%xmm0 2856 movq $rptr,%xmm1 # save $rptr 2857 movq $nptr,%xmm2 # save $nptr 2858 movq %r10, %xmm3 # -$num 2859 movq $bptr,%xmm4 2860 mov $n0, 32(%rsp) 2861 mov %rax, 40(%rsp) # save original %rsp 2862.cfi_cfa_expression %rsp+40,deref,+8 2863.Lpowerx5_body: 2864 2865 call __bn_sqrx8x_internal 2866 call __bn_postx4x_internal 2867 call __bn_sqrx8x_internal 2868 call __bn_postx4x_internal 2869 call __bn_sqrx8x_internal 2870 call __bn_postx4x_internal 2871 call __bn_sqrx8x_internal 2872 call __bn_postx4x_internal 2873 call __bn_sqrx8x_internal 2874 call __bn_postx4x_internal 2875 2876 mov %r10,$num # -num 2877 mov $aptr,$rptr 2878 movq %xmm2,$nptr 2879 movq %xmm4,$bptr 2880 mov 40(%rsp),%rax 2881 2882 call mulx4x_internal 2883 2884 mov 40(%rsp),%rsi # restore %rsp 2885.cfi_def_cfa %rsi,8 2886 mov \$1,%rax 2887 2888 mov -48(%rsi),%r15 2889.cfi_restore %r15 2890 mov -40(%rsi),%r14 2891.cfi_restore %r14 2892 mov -32(%rsi),%r13 2893.cfi_restore %r13 2894 mov -24(%rsi),%r12 2895.cfi_restore %r12 2896 mov -16(%rsi),%rbp 2897.cfi_restore %rbp 2898 mov -8(%rsi),%rbx 2899.cfi_restore %rbx 2900 lea (%rsi),%rsp 2901.cfi_def_cfa_register %rsp 2902.Lpowerx5_epilogue: 2903 ret 2904.cfi_endproc 2905.size bn_powerx5,.-bn_powerx5 2906 2907.globl bn_sqrx8x_internal 2908.hidden bn_sqrx8x_internal 2909.type bn_sqrx8x_internal,\@abi-omnipotent 2910.align 32 2911bn_sqrx8x_internal: 2912__bn_sqrx8x_internal: 
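# The block comment below outlines the strategy; in rough C terms the two
# passes amount to (an illustrative sketch only):
#
#	# a) accumulate all cross products a[i]*a[j], i < j
#	for (i = 0; i < num; i++)
#		for (j = i + 1; j < num; j++)
#			t[i+j] += a[i]*a[j];		# with carries
#	# b) double the accumulated sum and fold in the squares
#	for (i = 0; i < num; i++)
#		t[2*i+1:2*i] = 2*t[2*i+1:2*i] + a[i]*a[i];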
##################################################################
# Squaring part:
#
# a) multiply-n-add everything but a[i]*a[i];
# b) shift result of a) by 1 to the left and accumulate
#    a[i]*a[i] products;
#
##################################################################
# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
#	a[1]a[0]
#	a[2]a[0]
#	a[3]a[0]
#	a[2]a[1]
#	a[3]a[1]
#	a[3]a[2]
#
#	a[4]a[0]
#	a[5]a[0]
#	a[6]a[0]
#	a[7]a[0]
#	a[4]a[1]
#	a[5]a[1]
#	a[6]a[1]
#	a[7]a[1]
#	a[4]a[2]
#	a[5]a[2]
#	a[6]a[2]
#	a[7]a[2]
#	a[4]a[3]
#	a[5]a[3]
#	a[6]a[3]
#	a[7]a[3]
#
#	a[5]a[4]
#	a[6]a[4]
#	a[7]a[4]
#	a[6]a[5]
#	a[7]a[5]
#	a[7]a[6]
# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
___
{
my ($zero,$carry)=("%rbp","%rcx");
my $aaptr=$zero;
$code.=<<___;
	lea	48+8(%rsp),$tptr
	lea	($aptr,$num),$aaptr
	mov	$num,0+8(%rsp)		# save $num
	mov	$aaptr,8+8(%rsp)	# save end of $aptr
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
	.byte	0x3e
	movdqa	%xmm0,0*8($tptr)
	movdqa	%xmm0,2*8($tptr)
	movdqa	%xmm0,4*8($tptr)
	movdqa	%xmm0,6*8($tptr)
.Lsqr8x_zero_start:			# aligned at 32
	movdqa	%xmm0,8*8($tptr)
	movdqa	%xmm0,10*8($tptr)
	movdqa	%xmm0,12*8($tptr)
	movdqa	%xmm0,14*8($tptr)
	lea	16*8($tptr),$tptr
	sub	\$64,$num
	jnz	.Lsqrx8x_zero

	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
	#xor	%r9,%r9			# t[1], ex-$num, zero already
	xor	%r10,%r10
	xor	%r11,%r11
	xor	%r12,%r12
	xor	%r13,%r13
	xor	%r14,%r14
	xor	%r15,%r15
	lea	48+8(%rsp),$tptr
	xor	$zero,$zero		# cf=0, of=0
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_loop:
	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
	adox	%rax,%r10
	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
	adcx	%r10,%r9
	adox	%rax,%r11
	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
	adcx	%r11,%r10
	adox	%rax,%r12
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
	adcx	%r12,%r11
	adox	%rax,%r13
	mulx	5*8($aptr),%r12,%rax
	adcx	%r13,%r12
	adox	%rax,%r14
	mulx	6*8($aptr),%r13,%rax
	adcx	%r14,%r13
	adox	%r15,%rax
	mulx	7*8($aptr),%r14,%r15
	mov	1*8($aptr),%rdx		# a[1]
	adcx	%rax,%r14
	adox	$zero,%r15
	adc	8*8($tptr),%r15
	mov	%r8,1*8($tptr)		# t[1]
	mov	%r9,2*8($tptr)		# t[2]
	sbb	$carry,$carry		# mov %cf,$carry
	xor	$zero,$zero		# cf=0, of=0


	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
	adcx	%r10,%r8
	adox	%rbx,%r9
	mulx	4*8($aptr),%r10,%rbx	# ...
3029 adcx %r11,%r9 3030 adox %rax,%r10 3031 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 3032 adcx %r12,%r10 3033 adox %rbx,%r11 3034 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 3035 adcx %r13,%r11 3036 adox %r14,%r12 3037 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 3038 mov 2*8($aptr),%rdx # a[2] 3039 adcx %rax,%r12 3040 adox %rbx,%r13 3041 adcx %r15,%r13 3042 adox $zero,%r14 # of=0 3043 adcx $zero,%r14 # cf=0 3044 3045 mov %r8,3*8($tptr) # t[3] 3046 mov %r9,4*8($tptr) # t[4] 3047 3048 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 3049 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 3050 adcx %r10,%r8 3051 adox %rbx,%r9 3052 mulx 5*8($aptr),%r10,%rbx # ... 3053 adcx %r11,%r9 3054 adox %rax,%r10 3055 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 3056 adcx %r12,%r10 3057 adox %r13,%r11 3058 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 3059 .byte 0x3e 3060 mov 3*8($aptr),%rdx # a[3] 3061 adcx %rbx,%r11 3062 adox %rax,%r12 3063 adcx %r14,%r12 3064 mov %r8,5*8($tptr) # t[5] 3065 mov %r9,6*8($tptr) # t[6] 3066 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 3067 adox $zero,%r13 # of=0 3068 adcx $zero,%r13 # cf=0 3069 3070 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 3071 adcx %r10,%r8 3072 adox %rax,%r9 3073 mulx 6*8($aptr),%r10,%rax # ... 3074 adcx %r11,%r9 3075 adox %r12,%r10 3076 mulx 7*8($aptr),%r11,%r12 3077 mov 4*8($aptr),%rdx # a[4] 3078 mov 5*8($aptr),%r14 # a[5] 3079 adcx %rbx,%r10 3080 adox %rax,%r11 3081 mov 6*8($aptr),%r15 # a[6] 3082 adcx %r13,%r11 3083 adox $zero,%r12 # of=0 3084 adcx $zero,%r12 # cf=0 3085 3086 mov %r8,7*8($tptr) # t[7] 3087 mov %r9,8*8($tptr) # t[8] 3088 3089 mulx %r14,%r9,%rax # a[5]*a[4] 3090 mov 7*8($aptr),%r8 # a[7] 3091 adcx %r10,%r9 3092 mulx %r15,%r10,%rbx # a[6]*a[4] 3093 adox %rax,%r10 3094 adcx %r11,%r10 3095 mulx %r8,%r11,%rax # a[7]*a[4] 3096 mov %r14,%rdx # a[5] 3097 adox %rbx,%r11 3098 adcx %r12,%r11 3099 #adox $zero,%rax # of=0 3100 adcx $zero,%rax # cf=0 3101 3102 mulx %r15,%r14,%rbx # a[6]*a[5] 3103 mulx %r8,%r12,%r13 # a[7]*a[5] 3104 mov %r15,%rdx # a[6] 3105 lea 8*8($aptr),$aptr 3106 adcx %r14,%r11 3107 adox %rbx,%r12 3108 adcx %rax,%r12 3109 adox $zero,%r13 3110 3111 .byte 0x67,0x67 3112 mulx %r8,%r8,%r14 # a[7]*a[6] 3113 adcx %r8,%r13 3114 adcx $zero,%r14 3115 3116 cmp 8+8(%rsp),$aptr 3117 je .Lsqrx8x_outer_break 3118 3119 neg $carry # mov $carry,%cf 3120 mov \$-8,%rcx 3121 mov $zero,%r15 3122 mov 8*8($tptr),%r8 3123 adcx 9*8($tptr),%r9 # +=t[9] 3124 adcx 10*8($tptr),%r10 # ... 3125 adcx 11*8($tptr),%r11 3126 adc 12*8($tptr),%r12 3127 adc 13*8($tptr),%r13 3128 adc 14*8($tptr),%r14 3129 adc 15*8($tptr),%r15 3130 lea ($aptr),$aaptr 3131 lea 2*64($tptr),$tptr 3132 sbb %rax,%rax # mov %cf,$carry 3133 3134 mov -64($aptr),%rdx # a[0] 3135 mov %rax,16+8(%rsp) # offload $carry 3136 mov $tptr,24+8(%rsp) 3137 3138 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 3139 xor %eax,%eax # cf=0, of=0 3140 jmp .Lsqrx8x_loop 3141 3142.align 32 3143.Lsqrx8x_loop: 3144 mov %r8,%rbx 3145 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 3146 adcx %rax,%rbx # +=t[8] 3147 adox %r9,%r8 3148 3149 mulx 1*8($aaptr),%rax,%r9 # ... 
3150 adcx %rax,%r8 3151 adox %r10,%r9 3152 3153 mulx 2*8($aaptr),%rax,%r10 3154 adcx %rax,%r9 3155 adox %r11,%r10 3156 3157 mulx 3*8($aaptr),%rax,%r11 3158 adcx %rax,%r10 3159 adox %r12,%r11 3160 3161 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 3162 adcx %rax,%r11 3163 adox %r13,%r12 3164 3165 mulx 5*8($aaptr),%rax,%r13 3166 adcx %rax,%r12 3167 adox %r14,%r13 3168 3169 mulx 6*8($aaptr),%rax,%r14 3170 mov %rbx,($tptr,%rcx,8) # store t[8+i] 3171 mov \$0,%ebx 3172 adcx %rax,%r13 3173 adox %r15,%r14 3174 3175 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 3176 mov 8($aptr,%rcx,8),%rdx # a[i] 3177 adcx %rax,%r14 3178 adox %rbx,%r15 # %rbx is 0, of=0 3179 adcx %rbx,%r15 # cf=0 3180 3181 .byte 0x67 3182 inc %rcx # of=0 3183 jnz .Lsqrx8x_loop 3184 3185 lea 8*8($aaptr),$aaptr 3186 mov \$-8,%rcx 3187 cmp 8+8(%rsp),$aaptr # done? 3188 je .Lsqrx8x_break 3189 3190 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3191 .byte 0x66 3192 mov -64($aptr),%rdx 3193 adcx 0*8($tptr),%r8 3194 adcx 1*8($tptr),%r9 3195 adc 2*8($tptr),%r10 3196 adc 3*8($tptr),%r11 3197 adc 4*8($tptr),%r12 3198 adc 5*8($tptr),%r13 3199 adc 6*8($tptr),%r14 3200 adc 7*8($tptr),%r15 3201 lea 8*8($tptr),$tptr 3202 .byte 0x67 3203 sbb %rax,%rax # mov %cf,%rax 3204 xor %ebx,%ebx # cf=0, of=0 3205 mov %rax,16+8(%rsp) # offload carry 3206 jmp .Lsqrx8x_loop 3207 3208.align 32 3209.Lsqrx8x_break: 3210 xor $zero,$zero 3211 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3212 adcx $zero,%r8 3213 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 3214 adcx $zero,%r9 3215 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 3216 adc \$0,%r10 3217 mov %r8,0*8($tptr) 3218 adc \$0,%r11 3219 adc \$0,%r12 3220 adc \$0,%r13 3221 adc \$0,%r14 3222 adc \$0,%r15 3223 cmp $carry,$tptr # cf=0, of=0 3224 je .Lsqrx8x_outer_loop 3225 3226 mov %r9,1*8($tptr) 3227 mov 1*8($carry),%r9 3228 mov %r10,2*8($tptr) 3229 mov 2*8($carry),%r10 3230 mov %r11,3*8($tptr) 3231 mov 3*8($carry),%r11 3232 mov %r12,4*8($tptr) 3233 mov 4*8($carry),%r12 3234 mov %r13,5*8($tptr) 3235 mov 5*8($carry),%r13 3236 mov %r14,6*8($tptr) 3237 mov 6*8($carry),%r14 3238 mov %r15,7*8($tptr) 3239 mov 7*8($carry),%r15 3240 mov $carry,$tptr 3241 jmp .Lsqrx8x_outer_loop 3242 3243.align 32 3244.Lsqrx8x_outer_break: 3245 mov %r9,9*8($tptr) # t[9] 3246 movq %xmm3,%rcx # -$num 3247 mov %r10,10*8($tptr) # ... 
	mov	%r11,11*8($tptr)
	mov	%r12,12*8($tptr)
	mov	%r13,13*8($tptr)
	mov	%r14,14*8($tptr)
___
}{
my $i="%rcx";
$code.=<<___;
	lea	48+8(%rsp),$tptr
	mov	($aptr,$i),%rdx		# a[0]

	mov	8($tptr),$A0[1]		# t[1]
	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
	mov	0+8(%rsp),$num		# restore $num
	adox	$A0[1],$A0[1]
	mov	16($tptr),$A1[0]	# t[2]	# prefetch
	mov	24($tptr),$A1[1]	# t[3]	# prefetch
	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned

.align	32
.Lsqrx4x_shift_n_add:
	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
	.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	40($tptr),$A0[1]	# t[2*i+4+1]	# prefetch
	mov	%rax,0($tptr)
	mov	%rbx,8($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
	mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
	mov	%rax,16($tptr)
	mov	%rbx,24($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
	lea	32($i),$i
	mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
	mov	%rax,32($tptr)
	mov	%rbx,40($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	jrcxz	.Lsqrx4x_shift_n_add_break
	.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
	mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcx	$A1[1],%rbx
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr		# end of t[] buffer
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
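# In rough C terms the word-by-word reduction below amounts to (an
# illustrative sketch only):
#
#	for (i = 0; i < num; i++) {
#		m = t[i] * n0 mod 2^64;
#		t[i .. i+num] += m * n[0 .. num-1];	# t[i] becomes 0
#	}
#	# t[num .. 2*num-1] now holds the result, possibly still >= n;
#	# the post-condition routine below then subtracts n in constant
#	# time when that is the case.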
3332{ 3333my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 3334 3335$code.=<<___; 3336 movq %xmm2,$nptr 3337__bn_sqrx8x_reduction: 3338 xor %eax,%eax # initial top-most carry bit 3339 mov 32+8(%rsp),%rbx # n0 3340 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 3341 lea -8*8($nptr,$num),%rcx # end of n[] 3342 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 3343 mov %rcx, 0+8(%rsp) # save end of n[] 3344 mov $tptr,8+8(%rsp) # save end of t[] 3345 3346 lea 48+8(%rsp),$tptr # initial t[] window 3347 jmp .Lsqrx8x_reduction_loop 3348 3349.align 32 3350.Lsqrx8x_reduction_loop: 3351 mov 8*1($tptr),%r9 3352 mov 8*2($tptr),%r10 3353 mov 8*3($tptr),%r11 3354 mov 8*4($tptr),%r12 3355 mov %rdx,%r8 3356 imulq %rbx,%rdx # n0*a[i] 3357 mov 8*5($tptr),%r13 3358 mov 8*6($tptr),%r14 3359 mov 8*7($tptr),%r15 3360 mov %rax,24+8(%rsp) # store top-most carry bit 3361 3362 lea 8*8($tptr),$tptr 3363 xor $carry,$carry # cf=0,of=0 3364 mov \$-8,%rcx 3365 jmp .Lsqrx8x_reduce 3366 3367.align 32 3368.Lsqrx8x_reduce: 3369 mov %r8, %rbx 3370 mulx 8*0($nptr),%rax,%r8 # n[0] 3371 adcx %rbx,%rax # discarded 3372 adox %r9,%r8 3373 3374 mulx 8*1($nptr),%rbx,%r9 # n[1] 3375 adcx %rbx,%r8 3376 adox %r10,%r9 3377 3378 mulx 8*2($nptr),%rbx,%r10 3379 adcx %rbx,%r9 3380 adox %r11,%r10 3381 3382 mulx 8*3($nptr),%rbx,%r11 3383 adcx %rbx,%r10 3384 adox %r12,%r11 3385 3386 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 3387 mov %rdx,%rax 3388 mov %r8,%rdx 3389 adcx %rbx,%r11 3390 adox %r13,%r12 3391 3392 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3393 mov %rax,%rdx 3394 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3395 3396 mulx 8*5($nptr),%rax,%r13 3397 adcx %rax,%r12 3398 adox %r14,%r13 3399 3400 mulx 8*6($nptr),%rax,%r14 3401 adcx %rax,%r13 3402 adox %r15,%r14 3403 3404 mulx 8*7($nptr),%rax,%r15 3405 mov %rbx,%rdx 3406 adcx %rax,%r14 3407 adox $carry,%r15 # $carry is 0 3408 adcx $carry,%r15 # cf=0 3409 3410 .byte 0x67,0x67,0x67 3411 inc %rcx # of=0 3412 jnz .Lsqrx8x_reduce 3413 3414 mov $carry,%rax # xor %rax,%rax 3415 cmp 0+8(%rsp),$nptr # end of n[]? 3416 jae .Lsqrx8x_no_tail 3417 3418 mov 48+8(%rsp),%rdx # pull n0*a[0] 3419 add 8*0($tptr),%r8 3420 lea 8*8($nptr),$nptr 3421 mov \$-8,%rcx 3422 adcx 8*1($tptr),%r9 3423 adcx 8*2($tptr),%r10 3424 adc 8*3($tptr),%r11 3425 adc 8*4($tptr),%r12 3426 adc 8*5($tptr),%r13 3427 adc 8*6($tptr),%r14 3428 adc 8*7($tptr),%r15 3429 lea 8*8($tptr),$tptr 3430 sbb %rax,%rax # top carry 3431 3432 xor $carry,$carry # of=0, cf=0 3433 mov %rax,16+8(%rsp) 3434 jmp .Lsqrx8x_tail 3435 3436.align 32 3437.Lsqrx8x_tail: 3438 mov %r8,%rbx 3439 mulx 8*0($nptr),%rax,%r8 3440 adcx %rax,%rbx 3441 adox %r9,%r8 3442 3443 mulx 8*1($nptr),%rax,%r9 3444 adcx %rax,%r8 3445 adox %r10,%r9 3446 3447 mulx 8*2($nptr),%rax,%r10 3448 adcx %rax,%r9 3449 adox %r11,%r10 3450 3451 mulx 8*3($nptr),%rax,%r11 3452 adcx %rax,%r10 3453 adox %r12,%r11 3454 3455 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 3456 adcx %rax,%r11 3457 adox %r13,%r12 3458 3459 mulx 8*5($nptr),%rax,%r13 3460 adcx %rax,%r12 3461 adox %r14,%r13 3462 3463 mulx 8*6($nptr),%rax,%r14 3464 adcx %rax,%r13 3465 adox %r15,%r14 3466 3467 mulx 8*7($nptr),%rax,%r15 3468 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3469 adcx %rax,%r14 3470 adox $carry,%r15 3471 mov %rbx,($tptr,%rcx,8) # save result 3472 mov %r8,%rbx 3473 adcx $carry,%r15 # cf=0 3474 3475 inc %rcx # of=0 3476 jnz .Lsqrx8x_tail 3477 3478 cmp 0+8(%rsp),$nptr # end of n[]? 
3479 jae .Lsqrx8x_tail_done # break out of loop 3480 3481 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3482 mov 48+8(%rsp),%rdx # pull n0*a[0] 3483 lea 8*8($nptr),$nptr 3484 adc 8*0($tptr),%r8 3485 adc 8*1($tptr),%r9 3486 adc 8*2($tptr),%r10 3487 adc 8*3($tptr),%r11 3488 adc 8*4($tptr),%r12 3489 adc 8*5($tptr),%r13 3490 adc 8*6($tptr),%r14 3491 adc 8*7($tptr),%r15 3492 lea 8*8($tptr),$tptr 3493 sbb %rax,%rax 3494 sub \$8,%rcx # mov \$-8,%rcx 3495 3496 xor $carry,$carry # of=0, cf=0 3497 mov %rax,16+8(%rsp) 3498 jmp .Lsqrx8x_tail 3499 3500.align 32 3501.Lsqrx8x_tail_done: 3502 xor %rax,%rax 3503 add 24+8(%rsp),%r8 # can this overflow? 3504 adc \$0,%r9 3505 adc \$0,%r10 3506 adc \$0,%r11 3507 adc \$0,%r12 3508 adc \$0,%r13 3509 adc \$0,%r14 3510 adc \$0,%r15 3511 adc \$0,%rax 3512 3513 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3514.Lsqrx8x_no_tail: # %cf is 0 if jumped here 3515 adc 8*0($tptr),%r8 3516 movq %xmm3,%rcx 3517 adc 8*1($tptr),%r9 3518 mov 8*7($nptr),$carry 3519 movq %xmm2,$nptr # restore $nptr 3520 adc 8*2($tptr),%r10 3521 adc 8*3($tptr),%r11 3522 adc 8*4($tptr),%r12 3523 adc 8*5($tptr),%r13 3524 adc 8*6($tptr),%r14 3525 adc 8*7($tptr),%r15 3526 adc \$0,%rax # top-most carry 3527 3528 mov 32+8(%rsp),%rbx # n0 3529 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3530 3531 mov %r8,8*0($tptr) # store top 512 bits 3532 lea 8*8($tptr),%r8 # borrow %r8 3533 mov %r9,8*1($tptr) 3534 mov %r10,8*2($tptr) 3535 mov %r11,8*3($tptr) 3536 mov %r12,8*4($tptr) 3537 mov %r13,8*5($tptr) 3538 mov %r14,8*6($tptr) 3539 mov %r15,8*7($tptr) 3540 3541 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3542 cmp 8+8(%rsp),%r8 # end of t[]? 3543 jb .Lsqrx8x_reduction_loop 3544 ret 3545.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3546___ 3547} 3548############################################################## 3549# Post-condition, 4x unrolled 3550# 3551{ 3552my ($rptr,$nptr)=("%rdx","%rbp"); 3553$code.=<<___; 3554.align 32 3555__bn_postx4x_internal: 3556 mov 8*0($nptr),%r12 3557 mov %rcx,%r10 # -$num 3558 mov %rcx,%r9 # -$num 3559 neg %rax 3560 sar \$3+2,%rcx 3561 #lea 48+8(%rsp,%r9),$tptr 3562 movq %xmm1,$rptr # restore $rptr 3563 movq %xmm1,$aptr # prepare for back-to-back call 3564 dec %r12 # so that after 'not' we get -n[0] 3565 mov 8*1($nptr),%r13 3566 xor %r8,%r8 3567 mov 8*2($nptr),%r14 3568 mov 8*3($nptr),%r15 3569 jmp .Lsqrx4x_sub_entry 3570 3571.align 16 3572.Lsqrx4x_sub: 3573 mov 8*0($nptr),%r12 3574 mov 8*1($nptr),%r13 3575 mov 8*2($nptr),%r14 3576 mov 8*3($nptr),%r15 3577.Lsqrx4x_sub_entry: 3578 andn %rax,%r12,%r12 3579 lea 8*4($nptr),$nptr 3580 andn %rax,%r13,%r13 3581 andn %rax,%r14,%r14 3582 andn %rax,%r15,%r15 3583 3584 neg %r8 # mov %r8,%cf 3585 adc 8*0($tptr),%r12 3586 adc 8*1($tptr),%r13 3587 adc 8*2($tptr),%r14 3588 adc 8*3($tptr),%r15 3589 mov %r12,8*0($rptr) 3590 lea 8*4($tptr),$tptr 3591 mov %r13,8*1($rptr) 3592 sbb %r8,%r8 # mov %cf,%r8 3593 mov %r14,8*2($rptr) 3594 mov %r15,8*3($rptr) 3595 lea 8*4($rptr),$rptr 3596 3597 inc %rcx 3598 jnz .Lsqrx4x_sub 3599 3600 neg %r9 # restore $num 3601 3602 ret 3603.size __bn_postx4x_internal,.-__bn_postx4x_internal 3604___ 3605} 3606}}} 3607{ 3608my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order 3609 ("%rdi","%esi","%rdx","%ecx"); # Unix order 3610my $out=$inp; 3611my $STRIDE=2**5*8; 3612my $N=$STRIDE/4; 3613 3614$code.=<<___; 3615.globl bn_get_bits5 3616.type bn_get_bits5,\@abi-omnipotent 3617.align 16 3618bn_get_bits5: 3619 lea 0($inp),%r10 3620 lea 1($inp),%r11 3621 mov $num,%ecx 3622 shr \$4,$num 3623 and 
\$15,%ecx 3624 lea -8(%ecx),%eax 3625 cmp \$11,%ecx 3626 cmova %r11,%r10 3627 cmova %eax,%ecx 3628 movzw (%r10,$num,2),%eax 3629 shrl %cl,%eax 3630 and \$31,%eax 3631 ret 3632.size bn_get_bits5,.-bn_get_bits5 3633 3634.globl bn_scatter5 3635.type bn_scatter5,\@abi-omnipotent 3636.align 16 3637bn_scatter5: 3638 cmp \$0, $num 3639 jz .Lscatter_epilogue 3640 lea ($tbl,$idx,8),$tbl 3641.Lscatter: 3642 mov ($inp),%rax 3643 lea 8($inp),$inp 3644 mov %rax,($tbl) 3645 lea 32*8($tbl),$tbl 3646 sub \$1,$num 3647 jnz .Lscatter 3648.Lscatter_epilogue: 3649 ret 3650.size bn_scatter5,.-bn_scatter5 3651 3652.globl bn_gather5 3653.type bn_gather5,\@abi-omnipotent 3654.align 32 3655bn_gather5: 3656.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases 3657 # I can't trust assembler to use specific encoding:-( 3658 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 3659 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp 3660 lea .Linc(%rip),%rax 3661 and \$-16,%rsp # shouldn't be formally required 3662 3663 movd $idx,%xmm5 3664 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 3665 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 3666 lea 128($tbl),%r11 # size optimization 3667 lea 128(%rsp),%rax # size optimization 3668 3669 pshufd \$0,%xmm5,%xmm5 # broadcast $idx 3670 movdqa %xmm1,%xmm4 3671 movdqa %xmm1,%xmm2 3672___ 3673######################################################################## 3674# calculate mask by comparing 0..31 to $idx and save result to stack 3675# 3676for($i=0;$i<$STRIDE/16;$i+=4) { 3677$code.=<<___; 3678 paddd %xmm0,%xmm1 3679 pcmpeqd %xmm5,%xmm0 # compare to 1,0 3680___ 3681$code.=<<___ if ($i); 3682 movdqa %xmm3,`16*($i-1)-128`(%rax) 3683___ 3684$code.=<<___; 3685 movdqa %xmm4,%xmm3 3686 3687 paddd %xmm1,%xmm2 3688 pcmpeqd %xmm5,%xmm1 # compare to 3,2 3689 movdqa %xmm0,`16*($i+0)-128`(%rax) 3690 movdqa %xmm4,%xmm0 3691 3692 paddd %xmm2,%xmm3 3693 pcmpeqd %xmm5,%xmm2 # compare to 5,4 3694 movdqa %xmm1,`16*($i+1)-128`(%rax) 3695 movdqa %xmm4,%xmm1 3696 3697 paddd %xmm3,%xmm0 3698 pcmpeqd %xmm5,%xmm3 # compare to 7,6 3699 movdqa %xmm2,`16*($i+2)-128`(%rax) 3700 movdqa %xmm4,%xmm2 3701___ 3702} 3703$code.=<<___; 3704 movdqa %xmm3,`16*($i-1)-128`(%rax) 3705 jmp .Lgather 3706 3707.align 32 3708.Lgather: 3709 pxor %xmm4,%xmm4 3710 pxor %xmm5,%xmm5 3711___ 3712for($i=0;$i<$STRIDE/16;$i+=4) { 3713$code.=<<___; 3714 movdqa `16*($i+0)-128`(%r11),%xmm0 3715 movdqa `16*($i+1)-128`(%r11),%xmm1 3716 movdqa `16*($i+2)-128`(%r11),%xmm2 3717 pand `16*($i+0)-128`(%rax),%xmm0 3718 movdqa `16*($i+3)-128`(%r11),%xmm3 3719 pand `16*($i+1)-128`(%rax),%xmm1 3720 por %xmm0,%xmm4 3721 pand `16*($i+2)-128`(%rax),%xmm2 3722 por %xmm1,%xmm5 3723 pand `16*($i+3)-128`(%rax),%xmm3 3724 por %xmm2,%xmm4 3725 por %xmm3,%xmm5 3726___ 3727} 3728$code.=<<___; 3729 por %xmm5,%xmm4 3730 lea $STRIDE(%r11),%r11 3731 pshufd \$0x4e,%xmm4,%xmm0 3732 por %xmm4,%xmm0 3733 movq %xmm0,($out) # m0=bp[0] 3734 lea 8($out),$out 3735 sub \$1,$num 3736 jnz .Lgather 3737 3738 lea (%r10),%rsp 3739 ret 3740.LSEH_end_bn_gather5: 3741.size bn_gather5,.-bn_gather5 3742___ 3743} 3744$code.=<<___; 3745.align 64 3746.Linc: 3747 .long 0,0, 1,1 3748 .long 2,2, 2,2 3749.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3750___ 3751 3752# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3753# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3754if ($win64) { 3755$rec="%rcx"; 3756$frame="%rdx"; 3757$context="%r8"; 3758$disp="%r9"; 3759 3760$code.=<<___; 
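# mul_handler below is the Win64 language-specific handler shared by the
# .pdata/.xdata records at the end of the file.  Roughly: it compares
# context->Rip against the prologue/body/epilogue labels recorded in
# HandlerData[], recovers the caller's stack pointer (from the word the
# prologue stashed on the stack once a frame exists, or directly if the
# exception hit before anything was allocated), restores the six saved
# non-volatile registers from just below that pointer, and then lets
# RtlVirtualUnwind continue the unwind with the adjusted context.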
3761.extern __imp_RtlVirtualUnwind 3762.type mul_handler,\@abi-omnipotent 3763.align 16 3764mul_handler: 3765 push %rsi 3766 push %rdi 3767 push %rbx 3768 push %rbp 3769 push %r12 3770 push %r13 3771 push %r14 3772 push %r15 3773 pushfq 3774 sub \$64,%rsp 3775 3776 mov 120($context),%rax # pull context->Rax 3777 mov 248($context),%rbx # pull context->Rip 3778 3779 mov 8($disp),%rsi # disp->ImageBase 3780 mov 56($disp),%r11 # disp->HandlerData 3781 3782 mov 0(%r11),%r10d # HandlerData[0] 3783 lea (%rsi,%r10),%r10 # end of prologue label 3784 cmp %r10,%rbx # context->Rip<end of prologue label 3785 jb .Lcommon_seh_tail 3786 3787 mov 4(%r11),%r10d # HandlerData[1] 3788 lea (%rsi,%r10),%r10 # beginning of body label 3789 cmp %r10,%rbx # context->Rip<body label 3790 jb .Lcommon_pop_regs 3791 3792 mov 152($context),%rax # pull context->Rsp 3793 3794 mov 8(%r11),%r10d # HandlerData[2] 3795 lea (%rsi,%r10),%r10 # epilogue label 3796 cmp %r10,%rbx # context->Rip>=epilogue label 3797 jae .Lcommon_seh_tail 3798 3799 lea .Lmul_epilogue(%rip),%r10 3800 cmp %r10,%rbx 3801 ja .Lbody_40 3802 3803 mov 192($context),%r10 # pull $num 3804 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 3805 3806 jmp .Lcommon_pop_regs 3807 3808.Lbody_40: 3809 mov 40(%rax),%rax # pull saved stack pointer 3810.Lcommon_pop_regs: 3811 mov -8(%rax),%rbx 3812 mov -16(%rax),%rbp 3813 mov -24(%rax),%r12 3814 mov -32(%rax),%r13 3815 mov -40(%rax),%r14 3816 mov -48(%rax),%r15 3817 mov %rbx,144($context) # restore context->Rbx 3818 mov %rbp,160($context) # restore context->Rbp 3819 mov %r12,216($context) # restore context->R12 3820 mov %r13,224($context) # restore context->R13 3821 mov %r14,232($context) # restore context->R14 3822 mov %r15,240($context) # restore context->R15 3823 3824.Lcommon_seh_tail: 3825 mov 8(%rax),%rdi 3826 mov 16(%rax),%rsi 3827 mov %rax,152($context) # restore context->Rsp 3828 mov %rsi,168($context) # restore context->Rsi 3829 mov %rdi,176($context) # restore context->Rdi 3830 3831 mov 40($disp),%rdi # disp->ContextRecord 3832 mov $context,%rsi # context 3833 mov \$154,%ecx # sizeof(CONTEXT) 3834 .long 0xa548f3fc # cld; rep movsq 3835 3836 mov $disp,%rsi 3837 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3838 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3839 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3840 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3841 mov 40(%rsi),%r10 # disp->ContextRecord 3842 lea 56(%rsi),%r11 # &disp->HandlerData 3843 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3844 mov %r10,32(%rsp) # arg5 3845 mov %r11,40(%rsp) # arg6 3846 mov %r12,48(%rsp) # arg7 3847 mov %rcx,56(%rsp) # arg8, (NULL) 3848 call *__imp_RtlVirtualUnwind(%rip) 3849 3850 mov \$1,%eax # ExceptionContinueSearch 3851 add \$64,%rsp 3852 popfq 3853 pop %r15 3854 pop %r14 3855 pop %r13 3856 pop %r12 3857 pop %rbp 3858 pop %rbx 3859 pop %rdi 3860 pop %rsi 3861 ret 3862.size mul_handler,.-mul_handler 3863 3864.section .pdata 3865.align 4 3866 .rva .LSEH_begin_bn_mul_mont_gather5 3867 .rva .LSEH_end_bn_mul_mont_gather5 3868 .rva .LSEH_info_bn_mul_mont_gather5 3869 3870 .rva .LSEH_begin_bn_mul4x_mont_gather5 3871 .rva .LSEH_end_bn_mul4x_mont_gather5 3872 .rva .LSEH_info_bn_mul4x_mont_gather5 3873 3874 .rva .LSEH_begin_bn_power5 3875 .rva .LSEH_end_bn_power5 3876 .rva .LSEH_info_bn_power5 3877 3878 .rva .LSEH_begin_bn_from_mont8x 3879 .rva .LSEH_end_bn_from_mont8x 3880 .rva .LSEH_info_bn_from_mont8x 3881___ 3882$code.=<<___ if ($addx); 3883 .rva .LSEH_begin_bn_mulx4x_mont_gather5 3884 .rva .LSEH_end_bn_mulx4x_mont_gather5 3885 .rva 
.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue	# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;