#! /usr/bin/env perl
# Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scattering/gathering can be tuned without
# modifying bn_exp.c.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# the branch prediction unit. For input lengths that are multiples of
# 8, the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such a manner that b[0] is $bp[idx],
				# b[1] is $bp[2^5+idx], etc.
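# The interleaving above is what makes a cache-neutral gather possible:
# word j of the selected power lives at $bp[j*2^5 + idx], and the code
# reads all 2^5 candidates for every word, keeping only the one whose
# slot number matches idx. As a rough reference-C sketch (hypothetical
# helper, shown only to illustrate the layout; a C compiler would not
# necessarily keep this branch-free, whereas the SSE2 code below is):
#
#	static BN_ULONG gather_word(const BN_ULONG *table, int j, int idx)
#	{
#		BN_ULONG r = 0;
#		int k;
#
#		for (k = 0; k < 32; k++) {	/* touch every slot	*/
#			BN_ULONG mask = (BN_ULONG)0 - (BN_ULONG)(k == idx);
#			r |= table[j * 32 + k] & mask;	/* keep slot idx */
#		}
#		return r;
#	}
#
# The assembly performs the same selection 16 bytes at a time with
# pcmpeqd/pand/por, so the memory access pattern is independent of idx.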
79$lo0="%r10"; 80$hi0="%r11"; 81$hi1="%r13"; 82$i="%r14"; 83$j="%r15"; 84$m0="%rbx"; 85$m1="%rbp"; 86 87$code=<<___; 88.text 89 90.extern OPENSSL_ia32cap_P 91 92.globl bn_mul_mont_gather5 93.type bn_mul_mont_gather5,\@function,6 94.align 64 95bn_mul_mont_gather5: 96.cfi_startproc 97 mov ${num}d,${num}d 98 mov %rsp,%rax 99.cfi_def_cfa_register %rax 100 test \$7,${num}d 101 jnz .Lmul_enter 102___ 103$code.=<<___ if ($addx); 104 mov OPENSSL_ia32cap_P+8(%rip),%r11d 105___ 106$code.=<<___; 107 jmp .Lmul4x_enter 108 109.align 16 110.Lmul_enter: 111 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument 112 push %rbx 113.cfi_push %rbx 114 push %rbp 115.cfi_push %rbp 116 push %r12 117.cfi_push %r12 118 push %r13 119.cfi_push %r13 120 push %r14 121.cfi_push %r14 122 push %r15 123.cfi_push %r15 124 125 neg $num 126 mov %rsp,%r11 127 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) 128 neg $num # restore $num 129 and \$-1024,%r10 # minimize TLB usage 130 131 # An OS-agnostic version of __chkstk. 132 # 133 # Some OSes (Windows) insist on stack being "wired" to 134 # physical memory in strictly sequential manner, i.e. if stack 135 # allocation spans two pages, then reference to farmost one can 136 # be punishable by SEGV. But page walking can do good even on 137 # other OSes, because it guarantees that villain thread hits 138 # the guard page before it can make damage to innocent one... 139 sub %r10,%r11 140 and \$-4096,%r11 141 lea (%r10,%r11),%rsp 142 mov (%rsp),%r11 143 cmp %r10,%rsp 144 ja .Lmul_page_walk 145 jmp .Lmul_page_walk_done 146 147.Lmul_page_walk: 148 lea -4096(%rsp),%rsp 149 mov (%rsp),%r11 150 cmp %r10,%rsp 151 ja .Lmul_page_walk 152.Lmul_page_walk_done: 153 154 lea .Linc(%rip),%r10 155 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 156.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 157.Lmul_body: 158 159 lea 128($bp),%r12 # reassign $bp (+size optimization) 160___ 161 $bp="%r12"; 162 $STRIDE=2**5*8; # 5 is "window size" 163 $N=$STRIDE/4; # should match cache line size 164$code.=<<___; 165 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 166 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 167 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) 168 and \$-16,%r10 169 170 pshufd \$0,%xmm5,%xmm5 # broadcast index 171 movdqa %xmm1,%xmm4 172 movdqa %xmm1,%xmm2 173___ 174######################################################################## 175# calculate mask by comparing 0..31 to index and save result to stack 176# 177$code.=<<___; 178 paddd %xmm0,%xmm1 179 pcmpeqd %xmm5,%xmm0 # compare to 1,0 180 .byte 0x67 181 movdqa %xmm4,%xmm3 182___ 183for($k=0;$k<$STRIDE/16-4;$k+=4) { 184$code.=<<___; 185 paddd %xmm1,%xmm2 186 pcmpeqd %xmm5,%xmm1 # compare to 3,2 187 movdqa %xmm0,`16*($k+0)+112`(%r10) 188 movdqa %xmm4,%xmm0 189 190 paddd %xmm2,%xmm3 191 pcmpeqd %xmm5,%xmm2 # compare to 5,4 192 movdqa %xmm1,`16*($k+1)+112`(%r10) 193 movdqa %xmm4,%xmm1 194 195 paddd %xmm3,%xmm0 196 pcmpeqd %xmm5,%xmm3 # compare to 7,6 197 movdqa %xmm2,`16*($k+2)+112`(%r10) 198 movdqa %xmm4,%xmm2 199 200 paddd %xmm0,%xmm1 201 pcmpeqd %xmm5,%xmm0 202 movdqa %xmm3,`16*($k+3)+112`(%r10) 203 movdqa %xmm4,%xmm3 204___ 205} 206$code.=<<___; # last iteration can be optimized 207 paddd %xmm1,%xmm2 208 pcmpeqd %xmm5,%xmm1 209 movdqa %xmm0,`16*($k+0)+112`(%r10) 210 211 paddd %xmm2,%xmm3 212 .byte 0x67 213 pcmpeqd %xmm5,%xmm2 214 movdqa %xmm1,`16*($k+1)+112`(%r10) 215 216 pcmpeqd %xmm5,%xmm3 217 movdqa %xmm2,`16*($k+2)+112`(%r10) 218 pand `16*($k+0)-128`($bp),%xmm0 # while 
it's still in register 219 220 pand `16*($k+1)-128`($bp),%xmm1 221 pand `16*($k+2)-128`($bp),%xmm2 222 movdqa %xmm3,`16*($k+3)+112`(%r10) 223 pand `16*($k+3)-128`($bp),%xmm3 224 por %xmm2,%xmm0 225 por %xmm3,%xmm1 226___ 227for($k=0;$k<$STRIDE/16-4;$k+=4) { 228$code.=<<___; 229 movdqa `16*($k+0)-128`($bp),%xmm4 230 movdqa `16*($k+1)-128`($bp),%xmm5 231 movdqa `16*($k+2)-128`($bp),%xmm2 232 pand `16*($k+0)+112`(%r10),%xmm4 233 movdqa `16*($k+3)-128`($bp),%xmm3 234 pand `16*($k+1)+112`(%r10),%xmm5 235 por %xmm4,%xmm0 236 pand `16*($k+2)+112`(%r10),%xmm2 237 por %xmm5,%xmm1 238 pand `16*($k+3)+112`(%r10),%xmm3 239 por %xmm2,%xmm0 240 por %xmm3,%xmm1 241___ 242} 243$code.=<<___; 244 por %xmm1,%xmm0 245 pshufd \$0x4e,%xmm0,%xmm1 246 por %xmm1,%xmm0 247 lea $STRIDE($bp),$bp 248 movq %xmm0,$m0 # m0=bp[0] 249 250 mov ($n0),$n0 # pull n0[0] value 251 mov ($ap),%rax 252 253 xor $i,$i # i=0 254 xor $j,$j # j=0 255 256 mov $n0,$m1 257 mulq $m0 # ap[0]*bp[0] 258 mov %rax,$lo0 259 mov ($np),%rax 260 261 imulq $lo0,$m1 # "tp[0]"*n0 262 mov %rdx,$hi0 263 264 mulq $m1 # np[0]*m1 265 add %rax,$lo0 # discarded 266 mov 8($ap),%rax 267 adc \$0,%rdx 268 mov %rdx,$hi1 269 270 lea 1($j),$j # j++ 271 jmp .L1st_enter 272 273.align 16 274.L1st: 275 add %rax,$hi1 276 mov ($ap,$j,8),%rax 277 adc \$0,%rdx 278 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 279 mov $lo0,$hi0 280 adc \$0,%rdx 281 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 282 mov %rdx,$hi1 283 284.L1st_enter: 285 mulq $m0 # ap[j]*bp[0] 286 add %rax,$hi0 287 mov ($np,$j,8),%rax 288 adc \$0,%rdx 289 lea 1($j),$j # j++ 290 mov %rdx,$lo0 291 292 mulq $m1 # np[j]*m1 293 cmp $num,$j 294 jne .L1st # note that upon exit $j==$num, so 295 # they can be used interchangeably 296 297 add %rax,$hi1 298 adc \$0,%rdx 299 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 300 adc \$0,%rdx 301 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 302 mov %rdx,$hi1 303 mov $lo0,$hi0 304 305 xor %rdx,%rdx 306 add $hi0,$hi1 307 adc \$0,%rdx 308 mov $hi1,-8(%rsp,$num,8) 309 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 310 311 lea 1($i),$i # i++ 312 jmp .Louter 313.align 16 314.Louter: 315 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 316 and \$-16,%rdx 317 pxor %xmm4,%xmm4 318 pxor %xmm5,%xmm5 319___ 320for($k=0;$k<$STRIDE/16;$k+=4) { 321$code.=<<___; 322 movdqa `16*($k+0)-128`($bp),%xmm0 323 movdqa `16*($k+1)-128`($bp),%xmm1 324 movdqa `16*($k+2)-128`($bp),%xmm2 325 movdqa `16*($k+3)-128`($bp),%xmm3 326 pand `16*($k+0)-128`(%rdx),%xmm0 327 pand `16*($k+1)-128`(%rdx),%xmm1 328 por %xmm0,%xmm4 329 pand `16*($k+2)-128`(%rdx),%xmm2 330 por %xmm1,%xmm5 331 pand `16*($k+3)-128`(%rdx),%xmm3 332 por %xmm2,%xmm4 333 por %xmm3,%xmm5 334___ 335} 336$code.=<<___; 337 por %xmm5,%xmm4 338 pshufd \$0x4e,%xmm4,%xmm0 339 por %xmm4,%xmm0 340 lea $STRIDE($bp),$bp 341 342 mov ($ap),%rax # ap[0] 343 movq %xmm0,$m0 # m0=bp[i] 344 345 xor $j,$j # j=0 346 mov $n0,$m1 347 mov (%rsp),$lo0 348 349 mulq $m0 # ap[0]*bp[i] 350 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 351 mov ($np),%rax 352 adc \$0,%rdx 353 354 imulq $lo0,$m1 # tp[0]*n0 355 mov %rdx,$hi0 356 357 mulq $m1 # np[0]*m1 358 add %rax,$lo0 # discarded 359 mov 8($ap),%rax 360 adc \$0,%rdx 361 mov 8(%rsp),$lo0 # tp[1] 362 mov %rdx,$hi1 363 364 lea 1($j),$j # j++ 365 jmp .Linner_enter 366 367.align 16 368.Linner: 369 add %rax,$hi1 370 mov ($ap,$j,8),%rax 371 adc \$0,%rdx 372 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 373 mov (%rsp,$j,8),$lo0 374 adc \$0,%rdx 375 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 376 mov %rdx,$hi1 377 378.Linner_enter: 379 mulq $m0 # ap[j]*bp[i] 380 add 
%rax,$hi0 381 mov ($np,$j,8),%rax 382 adc \$0,%rdx 383 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 384 mov %rdx,$hi0 385 adc \$0,$hi0 386 lea 1($j),$j # j++ 387 388 mulq $m1 # np[j]*m1 389 cmp $num,$j 390 jne .Linner # note that upon exit $j==$num, so 391 # they can be used interchangeably 392 add %rax,$hi1 393 adc \$0,%rdx 394 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 395 mov (%rsp,$num,8),$lo0 396 adc \$0,%rdx 397 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 398 mov %rdx,$hi1 399 400 xor %rdx,%rdx 401 add $hi0,$hi1 402 adc \$0,%rdx 403 add $lo0,$hi1 # pull upmost overflow bit 404 adc \$0,%rdx 405 mov $hi1,-8(%rsp,$num,8) 406 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 407 408 lea 1($i),$i # i++ 409 cmp $num,$i 410 jb .Louter 411 412 xor $i,$i # i=0 and clear CF! 413 mov (%rsp),%rax # tp[0] 414 lea (%rsp),$ap # borrow ap for tp 415 mov $num,$j # j=num 416 jmp .Lsub 417.align 16 418.Lsub: sbb ($np,$i,8),%rax 419 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 420 mov 8($ap,$i,8),%rax # tp[i+1] 421 lea 1($i),$i # i++ 422 dec $j # doesn't affect CF! 423 jnz .Lsub 424 425 sbb \$0,%rax # handle upmost overflow bit 426 mov \$-1,%rbx 427 xor %rax,%rbx 428 xor $i,$i 429 mov $num,$j # j=num 430 431.Lcopy: # conditional copy 432 mov ($rp,$i,8),%rcx 433 mov (%rsp,$i,8),%rdx 434 and %rbx,%rcx 435 and %rax,%rdx 436 mov $i,(%rsp,$i,8) # zap temporary vector 437 or %rcx,%rdx 438 mov %rdx,($rp,$i,8) # rp[i]=tp[i] 439 lea 1($i),$i 440 sub \$1,$j 441 jnz .Lcopy 442 443 mov 8(%rsp,$num,8),%rsi # restore %rsp 444.cfi_def_cfa %rsi,8 445 mov \$1,%rax 446 447 mov -48(%rsi),%r15 448.cfi_restore %r15 449 mov -40(%rsi),%r14 450.cfi_restore %r14 451 mov -32(%rsi),%r13 452.cfi_restore %r13 453 mov -24(%rsi),%r12 454.cfi_restore %r12 455 mov -16(%rsi),%rbp 456.cfi_restore %rbp 457 mov -8(%rsi),%rbx 458.cfi_restore %rbx 459 lea (%rsi),%rsp 460.cfi_def_cfa_register %rsp 461.Lmul_epilogue: 462 ret 463.cfi_endproc 464.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 465___ 466{{{ 467my @A=("%r10","%r11"); 468my @N=("%r13","%rdi"); 469$code.=<<___; 470.type bn_mul4x_mont_gather5,\@function,6 471.align 32 472bn_mul4x_mont_gather5: 473.cfi_startproc 474 .byte 0x67 475 mov %rsp,%rax 476.cfi_def_cfa_register %rax 477.Lmul4x_enter: 478___ 479$code.=<<___ if ($addx); 480 and \$0x80108,%r11d 481 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 482 je .Lmulx4x_enter 483___ 484$code.=<<___; 485 push %rbx 486.cfi_push %rbx 487 push %rbp 488.cfi_push %rbp 489 push %r12 490.cfi_push %r12 491 push %r13 492.cfi_push %r13 493 push %r14 494.cfi_push %r14 495 push %r15 496.cfi_push %r15 497.Lmul4x_prologue: 498 499 .byte 0x67 500 shl \$3,${num}d # convert $num to bytes 501 lea ($num,$num,2),%r10 # 3*$num in bytes 502 neg $num # -$num 503 504 ############################################################## 505 # Ensure that stack frame doesn't alias with $rptr+3*$num 506 # modulo 4096, which covers ret[num], am[num] and n[num] 507 # (see bn_exp.c). This is done to allow memory disambiguation 508 # logic do its magic. [Extra [num] is allocated in order 509 # to align with bn_power5's frame, which is cleansed after 510 # completing exponentiation. Extra 256 bytes is for power mask 511 # calculated from 7th argument, the index.] 
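	# The pointer juggling below implements, roughly, the following
	# (an illustrative C-like sketch with num counted in words; the
	# exact register-level sequence differs between the two paths):
	#
	#	frame = rsp - (2*num*8 + 320);	/* prospective alloca	*/
	#	off   = (frame - rp) & 4095;	/* distance mod 4KB	*/
	#	if (off <= 3*num*8)
	#		frame -= off;		/* align with rp	*/
	#	else
	#		frame -= max(0, off - (4096 - (2*num*8 + 320)));
	#	frame &= ~63;			/* 64-byte alignment	*/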
512 # 513 lea -320(%rsp,$num,2),%r11 514 mov %rsp,%rbp 515 sub $rp,%r11 516 and \$4095,%r11 517 cmp %r11,%r10 518 jb .Lmul4xsp_alt 519 sub %r11,%rbp # align with $rp 520 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 521 jmp .Lmul4xsp_done 522 523.align 32 524.Lmul4xsp_alt: 525 lea 4096-320(,$num,2),%r10 526 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 527 sub %r10,%r11 528 mov \$0,%r10 529 cmovc %r10,%r11 530 sub %r11,%rbp 531.Lmul4xsp_done: 532 and \$-64,%rbp 533 mov %rsp,%r11 534 sub %rbp,%r11 535 and \$-4096,%r11 536 lea (%rbp,%r11),%rsp 537 mov (%rsp),%r10 538 cmp %rbp,%rsp 539 ja .Lmul4x_page_walk 540 jmp .Lmul4x_page_walk_done 541 542.Lmul4x_page_walk: 543 lea -4096(%rsp),%rsp 544 mov (%rsp),%r10 545 cmp %rbp,%rsp 546 ja .Lmul4x_page_walk 547.Lmul4x_page_walk_done: 548 549 neg $num 550 551 mov %rax,40(%rsp) 552.cfi_cfa_expression %rsp+40,deref,+8 553.Lmul4x_body: 554 555 call mul4x_internal 556 557 mov 40(%rsp),%rsi # restore %rsp 558.cfi_def_cfa %rsi,8 559 mov \$1,%rax 560 561 mov -48(%rsi),%r15 562.cfi_restore %r15 563 mov -40(%rsi),%r14 564.cfi_restore %r14 565 mov -32(%rsi),%r13 566.cfi_restore %r13 567 mov -24(%rsi),%r12 568.cfi_restore %r12 569 mov -16(%rsi),%rbp 570.cfi_restore %rbp 571 mov -8(%rsi),%rbx 572.cfi_restore %rbx 573 lea (%rsi),%rsp 574.cfi_def_cfa_register %rsp 575.Lmul4x_epilogue: 576 ret 577.cfi_endproc 578.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 579 580.type mul4x_internal,\@abi-omnipotent 581.align 32 582mul4x_internal: 583 shl \$5,$num # $num was in bytes 584 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 585 lea .Linc(%rip),%rax 586 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 587 shr \$5,$num # restore $num 588___ 589 $bp="%r12"; 590 $STRIDE=2**5*8; # 5 is "window size" 591 $N=$STRIDE/4; # should match cache line size 592 $tp=$i; 593$code.=<<___; 594 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 595 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 596 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 597 lea 128(%rdx),$bp # size optimization 598 599 pshufd \$0,%xmm5,%xmm5 # broadcast index 600 movdqa %xmm1,%xmm4 601 .byte 0x67,0x67 602 movdqa %xmm1,%xmm2 603___ 604######################################################################## 605# calculate mask by comparing 0..31 to index and save result to stack 606# 607$code.=<<___; 608 paddd %xmm0,%xmm1 609 pcmpeqd %xmm5,%xmm0 # compare to 1,0 610 .byte 0x67 611 movdqa %xmm4,%xmm3 612___ 613for($i=0;$i<$STRIDE/16-4;$i+=4) { 614$code.=<<___; 615 paddd %xmm1,%xmm2 616 pcmpeqd %xmm5,%xmm1 # compare to 3,2 617 movdqa %xmm0,`16*($i+0)+112`(%r10) 618 movdqa %xmm4,%xmm0 619 620 paddd %xmm2,%xmm3 621 pcmpeqd %xmm5,%xmm2 # compare to 5,4 622 movdqa %xmm1,`16*($i+1)+112`(%r10) 623 movdqa %xmm4,%xmm1 624 625 paddd %xmm3,%xmm0 626 pcmpeqd %xmm5,%xmm3 # compare to 7,6 627 movdqa %xmm2,`16*($i+2)+112`(%r10) 628 movdqa %xmm4,%xmm2 629 630 paddd %xmm0,%xmm1 631 pcmpeqd %xmm5,%xmm0 632 movdqa %xmm3,`16*($i+3)+112`(%r10) 633 movdqa %xmm4,%xmm3 634___ 635} 636$code.=<<___; # last iteration can be optimized 637 paddd %xmm1,%xmm2 638 pcmpeqd %xmm5,%xmm1 639 movdqa %xmm0,`16*($i+0)+112`(%r10) 640 641 paddd %xmm2,%xmm3 642 .byte 0x67 643 pcmpeqd %xmm5,%xmm2 644 movdqa %xmm1,`16*($i+1)+112`(%r10) 645 646 pcmpeqd %xmm5,%xmm3 647 movdqa %xmm2,`16*($i+2)+112`(%r10) 648 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 649 650 pand `16*($i+1)-128`($bp),%xmm1 651 pand `16*($i+2)-128`($bp),%xmm2 652 
movdqa %xmm3,`16*($i+3)+112`(%r10) 653 pand `16*($i+3)-128`($bp),%xmm3 654 por %xmm2,%xmm0 655 por %xmm3,%xmm1 656___ 657for($i=0;$i<$STRIDE/16-4;$i+=4) { 658$code.=<<___; 659 movdqa `16*($i+0)-128`($bp),%xmm4 660 movdqa `16*($i+1)-128`($bp),%xmm5 661 movdqa `16*($i+2)-128`($bp),%xmm2 662 pand `16*($i+0)+112`(%r10),%xmm4 663 movdqa `16*($i+3)-128`($bp),%xmm3 664 pand `16*($i+1)+112`(%r10),%xmm5 665 por %xmm4,%xmm0 666 pand `16*($i+2)+112`(%r10),%xmm2 667 por %xmm5,%xmm1 668 pand `16*($i+3)+112`(%r10),%xmm3 669 por %xmm2,%xmm0 670 por %xmm3,%xmm1 671___ 672} 673$code.=<<___; 674 por %xmm1,%xmm0 675 pshufd \$0x4e,%xmm0,%xmm1 676 por %xmm1,%xmm0 677 lea $STRIDE($bp),$bp 678 movq %xmm0,$m0 # m0=bp[0] 679 680 mov %r13,16+8(%rsp) # save end of b[num] 681 mov $rp, 56+8(%rsp) # save $rp 682 683 mov ($n0),$n0 # pull n0[0] value 684 mov ($ap),%rax 685 lea ($ap,$num),$ap # end of a[num] 686 neg $num 687 688 mov $n0,$m1 689 mulq $m0 # ap[0]*bp[0] 690 mov %rax,$A[0] 691 mov ($np),%rax 692 693 imulq $A[0],$m1 # "tp[0]"*n0 694 lea 64+8(%rsp),$tp 695 mov %rdx,$A[1] 696 697 mulq $m1 # np[0]*m1 698 add %rax,$A[0] # discarded 699 mov 8($ap,$num),%rax 700 adc \$0,%rdx 701 mov %rdx,$N[1] 702 703 mulq $m0 704 add %rax,$A[1] 705 mov 8*1($np),%rax 706 adc \$0,%rdx 707 mov %rdx,$A[0] 708 709 mulq $m1 710 add %rax,$N[1] 711 mov 16($ap,$num),%rax 712 adc \$0,%rdx 713 add $A[1],$N[1] 714 lea 4*8($num),$j # j=4 715 lea 8*4($np),$np 716 adc \$0,%rdx 717 mov $N[1],($tp) 718 mov %rdx,$N[0] 719 jmp .L1st4x 720 721.align 32 722.L1st4x: 723 mulq $m0 # ap[j]*bp[0] 724 add %rax,$A[0] 725 mov -8*2($np),%rax 726 lea 32($tp),$tp 727 adc \$0,%rdx 728 mov %rdx,$A[1] 729 730 mulq $m1 # np[j]*m1 731 add %rax,$N[0] 732 mov -8($ap,$j),%rax 733 adc \$0,%rdx 734 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 735 adc \$0,%rdx 736 mov $N[0],-24($tp) # tp[j-1] 737 mov %rdx,$N[1] 738 739 mulq $m0 # ap[j]*bp[0] 740 add %rax,$A[1] 741 mov -8*1($np),%rax 742 adc \$0,%rdx 743 mov %rdx,$A[0] 744 745 mulq $m1 # np[j]*m1 746 add %rax,$N[1] 747 mov ($ap,$j),%rax 748 adc \$0,%rdx 749 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 750 adc \$0,%rdx 751 mov $N[1],-16($tp) # tp[j-1] 752 mov %rdx,$N[0] 753 754 mulq $m0 # ap[j]*bp[0] 755 add %rax,$A[0] 756 mov 8*0($np),%rax 757 adc \$0,%rdx 758 mov %rdx,$A[1] 759 760 mulq $m1 # np[j]*m1 761 add %rax,$N[0] 762 mov 8($ap,$j),%rax 763 adc \$0,%rdx 764 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 765 adc \$0,%rdx 766 mov $N[0],-8($tp) # tp[j-1] 767 mov %rdx,$N[1] 768 769 mulq $m0 # ap[j]*bp[0] 770 add %rax,$A[1] 771 mov 8*1($np),%rax 772 adc \$0,%rdx 773 mov %rdx,$A[0] 774 775 mulq $m1 # np[j]*m1 776 add %rax,$N[1] 777 mov 16($ap,$j),%rax 778 adc \$0,%rdx 779 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 780 lea 8*4($np),$np 781 adc \$0,%rdx 782 mov $N[1],($tp) # tp[j-1] 783 mov %rdx,$N[0] 784 785 add \$32,$j # j+=4 786 jnz .L1st4x 787 788 mulq $m0 # ap[j]*bp[0] 789 add %rax,$A[0] 790 mov -8*2($np),%rax 791 lea 32($tp),$tp 792 adc \$0,%rdx 793 mov %rdx,$A[1] 794 795 mulq $m1 # np[j]*m1 796 add %rax,$N[0] 797 mov -8($ap),%rax 798 adc \$0,%rdx 799 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 800 adc \$0,%rdx 801 mov $N[0],-24($tp) # tp[j-1] 802 mov %rdx,$N[1] 803 804 mulq $m0 # ap[j]*bp[0] 805 add %rax,$A[1] 806 mov -8*1($np),%rax 807 adc \$0,%rdx 808 mov %rdx,$A[0] 809 810 mulq $m1 # np[j]*m1 811 add %rax,$N[1] 812 mov ($ap,$num),%rax # ap[0] 813 adc \$0,%rdx 814 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 815 adc \$0,%rdx 816 mov $N[1],-16($tp) # tp[j-1] 817 mov %rdx,$N[0] 818 819 lea ($np,$num),$np # rewind $np 820 821 xor $N[1],$N[1] 822 add 
$A[0],$N[0] 823 adc \$0,$N[1] 824 mov $N[0],-8($tp) 825 826 jmp .Louter4x 827 828.align 32 829.Louter4x: 830 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 831 pxor %xmm4,%xmm4 832 pxor %xmm5,%xmm5 833___ 834for($i=0;$i<$STRIDE/16;$i+=4) { 835$code.=<<___; 836 movdqa `16*($i+0)-128`($bp),%xmm0 837 movdqa `16*($i+1)-128`($bp),%xmm1 838 movdqa `16*($i+2)-128`($bp),%xmm2 839 movdqa `16*($i+3)-128`($bp),%xmm3 840 pand `16*($i+0)-128`(%rdx),%xmm0 841 pand `16*($i+1)-128`(%rdx),%xmm1 842 por %xmm0,%xmm4 843 pand `16*($i+2)-128`(%rdx),%xmm2 844 por %xmm1,%xmm5 845 pand `16*($i+3)-128`(%rdx),%xmm3 846 por %xmm2,%xmm4 847 por %xmm3,%xmm5 848___ 849} 850$code.=<<___; 851 por %xmm5,%xmm4 852 pshufd \$0x4e,%xmm4,%xmm0 853 por %xmm4,%xmm0 854 lea $STRIDE($bp),$bp 855 movq %xmm0,$m0 # m0=bp[i] 856 857 mov ($tp,$num),$A[0] 858 mov $n0,$m1 859 mulq $m0 # ap[0]*bp[i] 860 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 861 mov ($np),%rax 862 adc \$0,%rdx 863 864 imulq $A[0],$m1 # tp[0]*n0 865 mov %rdx,$A[1] 866 mov $N[1],($tp) # store upmost overflow bit 867 868 lea ($tp,$num),$tp # rewind $tp 869 870 mulq $m1 # np[0]*m1 871 add %rax,$A[0] # "$N[0]", discarded 872 mov 8($ap,$num),%rax 873 adc \$0,%rdx 874 mov %rdx,$N[1] 875 876 mulq $m0 # ap[j]*bp[i] 877 add %rax,$A[1] 878 mov 8*1($np),%rax 879 adc \$0,%rdx 880 add 8($tp),$A[1] # +tp[1] 881 adc \$0,%rdx 882 mov %rdx,$A[0] 883 884 mulq $m1 # np[j]*m1 885 add %rax,$N[1] 886 mov 16($ap,$num),%rax 887 adc \$0,%rdx 888 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 889 lea 4*8($num),$j # j=4 890 lea 8*4($np),$np 891 adc \$0,%rdx 892 mov %rdx,$N[0] 893 jmp .Linner4x 894 895.align 32 896.Linner4x: 897 mulq $m0 # ap[j]*bp[i] 898 add %rax,$A[0] 899 mov -8*2($np),%rax 900 adc \$0,%rdx 901 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 902 lea 32($tp),$tp 903 adc \$0,%rdx 904 mov %rdx,$A[1] 905 906 mulq $m1 # np[j]*m1 907 add %rax,$N[0] 908 mov -8($ap,$j),%rax 909 adc \$0,%rdx 910 add $A[0],$N[0] 911 adc \$0,%rdx 912 mov $N[1],-32($tp) # tp[j-1] 913 mov %rdx,$N[1] 914 915 mulq $m0 # ap[j]*bp[i] 916 add %rax,$A[1] 917 mov -8*1($np),%rax 918 adc \$0,%rdx 919 add -8($tp),$A[1] 920 adc \$0,%rdx 921 mov %rdx,$A[0] 922 923 mulq $m1 # np[j]*m1 924 add %rax,$N[1] 925 mov ($ap,$j),%rax 926 adc \$0,%rdx 927 add $A[1],$N[1] 928 adc \$0,%rdx 929 mov $N[0],-24($tp) # tp[j-1] 930 mov %rdx,$N[0] 931 932 mulq $m0 # ap[j]*bp[i] 933 add %rax,$A[0] 934 mov 8*0($np),%rax 935 adc \$0,%rdx 936 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 937 adc \$0,%rdx 938 mov %rdx,$A[1] 939 940 mulq $m1 # np[j]*m1 941 add %rax,$N[0] 942 mov 8($ap,$j),%rax 943 adc \$0,%rdx 944 add $A[0],$N[0] 945 adc \$0,%rdx 946 mov $N[1],-16($tp) # tp[j-1] 947 mov %rdx,$N[1] 948 949 mulq $m0 # ap[j]*bp[i] 950 add %rax,$A[1] 951 mov 8*1($np),%rax 952 adc \$0,%rdx 953 add 8($tp),$A[1] 954 adc \$0,%rdx 955 mov %rdx,$A[0] 956 957 mulq $m1 # np[j]*m1 958 add %rax,$N[1] 959 mov 16($ap,$j),%rax 960 adc \$0,%rdx 961 add $A[1],$N[1] 962 lea 8*4($np),$np 963 adc \$0,%rdx 964 mov $N[0],-8($tp) # tp[j-1] 965 mov %rdx,$N[0] 966 967 add \$32,$j # j+=4 968 jnz .Linner4x 969 970 mulq $m0 # ap[j]*bp[i] 971 add %rax,$A[0] 972 mov -8*2($np),%rax 973 adc \$0,%rdx 974 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 975 lea 32($tp),$tp 976 adc \$0,%rdx 977 mov %rdx,$A[1] 978 979 mulq $m1 # np[j]*m1 980 add %rax,$N[0] 981 mov -8($ap),%rax 982 adc \$0,%rdx 983 add $A[0],$N[0] 984 adc \$0,%rdx 985 mov $N[1],-32($tp) # tp[j-1] 986 mov %rdx,$N[1] 987 988 mulq $m0 # ap[j]*bp[i] 989 add %rax,$A[1] 990 mov $m1,%rax 991 mov -8*1($np),$m1 992 adc \$0,%rdx 993 add 
-8($tp),$A[1] 994 adc \$0,%rdx 995 mov %rdx,$A[0] 996 997 mulq $m1 # np[j]*m1 998 add %rax,$N[1] 999 mov ($ap,$num),%rax # ap[0] 1000 adc \$0,%rdx 1001 add $A[1],$N[1] 1002 adc \$0,%rdx 1003 mov $N[0],-24($tp) # tp[j-1] 1004 mov %rdx,$N[0] 1005 1006 mov $N[1],-16($tp) # tp[j-1] 1007 lea ($np,$num),$np # rewind $np 1008 1009 xor $N[1],$N[1] 1010 add $A[0],$N[0] 1011 adc \$0,$N[1] 1012 add ($tp),$N[0] # pull upmost overflow bit 1013 adc \$0,$N[1] # upmost overflow bit 1014 mov $N[0],-8($tp) 1015 1016 cmp 16+8(%rsp),$bp 1017 jb .Louter4x 1018___ 1019if (1) { 1020$code.=<<___; 1021 xor %rax,%rax 1022 sub $N[0],$m1 # compare top-most words 1023 adc $j,$j # $j is zero 1024 or $j,$N[1] 1025 sub $N[1],%rax # %rax=-$N[1] 1026 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 1027 mov ($np),%r12 1028 lea ($np),%rbp # nptr in .sqr4x_sub 1029 mov %r9,%rcx 1030 sar \$3+2,%rcx 1031 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 1032 dec %r12 # so that after 'not' we get -n[0] 1033 xor %r10,%r10 1034 mov 8*1(%rbp),%r13 1035 mov 8*2(%rbp),%r14 1036 mov 8*3(%rbp),%r15 1037 jmp .Lsqr4x_sub_entry 1038___ 1039} else { 1040my @ri=("%rax",$bp,$m0,$m1); 1041my $rp="%rdx"; 1042$code.=<<___ 1043 xor \$1,$N[1] 1044 lea ($tp,$num),$tp # rewind $tp 1045 sar \$5,$num # cf=0 1046 lea ($np,$N[1],8),$np 1047 mov 56+8(%rsp),$rp # restore $rp 1048 jmp .Lsub4x 1049 1050.align 32 1051.Lsub4x: 1052 .byte 0x66 1053 mov 8*0($tp),@ri[0] 1054 mov 8*1($tp),@ri[1] 1055 .byte 0x66 1056 sbb 16*0($np),@ri[0] 1057 mov 8*2($tp),@ri[2] 1058 sbb 16*1($np),@ri[1] 1059 mov 3*8($tp),@ri[3] 1060 lea 4*8($tp),$tp 1061 sbb 16*2($np),@ri[2] 1062 mov @ri[0],8*0($rp) 1063 sbb 16*3($np),@ri[3] 1064 lea 16*4($np),$np 1065 mov @ri[1],8*1($rp) 1066 mov @ri[2],8*2($rp) 1067 mov @ri[3],8*3($rp) 1068 lea 8*4($rp),$rp 1069 1070 inc $num 1071 jnz .Lsub4x 1072 1073 ret 1074___ 1075} 1076$code.=<<___; 1077.size mul4x_internal,.-mul4x_internal 1078___ 1079}}} 1080{{{ 1081###################################################################### 1082# void bn_power5( 1083my $rptr="%rdi"; # BN_ULONG *rptr, 1084my $aptr="%rsi"; # const BN_ULONG *aptr, 1085my $bptr="%rdx"; # const void *table, 1086my $nptr="%rcx"; # const BN_ULONG *nptr, 1087my $n0 ="%r8"; # const BN_ULONG *n0); 1088my $num ="%r9"; # int num, has to be divisible by 8 1089 # int pwr 1090 1091my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1092my @A0=("%r10","%r11"); 1093my @A1=("%r12","%r13"); 1094my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1095 1096$code.=<<___; 1097.globl bn_power5 1098.type bn_power5,\@function,6 1099.align 32 1100bn_power5: 1101.cfi_startproc 1102 mov %rsp,%rax 1103.cfi_def_cfa_register %rax 1104___ 1105$code.=<<___ if ($addx); 1106 mov OPENSSL_ia32cap_P+8(%rip),%r11d 1107 and \$0x80108,%r11d 1108 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1109 je .Lpowerx5_enter 1110___ 1111$code.=<<___; 1112 push %rbx 1113.cfi_push %rbx 1114 push %rbp 1115.cfi_push %rbp 1116 push %r12 1117.cfi_push %r12 1118 push %r13 1119.cfi_push %r13 1120 push %r14 1121.cfi_push %r14 1122 push %r15 1123.cfi_push %r15 1124.Lpower5_prologue: 1125 1126 shl \$3,${num}d # convert $num to bytes 1127 lea ($num,$num,2),%r10d # 3*$num 1128 neg $num 1129 mov ($n0),$n0 # *n0 1130 1131 ############################################################## 1132 # Ensure that stack frame doesn't alias with $rptr+3*$num 1133 # modulo 4096, which covers ret[num], am[num] and n[num] 1134 # (see bn_exp.c). This is done to allow memory disambiguation 1135 # logic do its magic. 
[Extra 256 bytes is for power mask 1136 # calculated from 7th argument, the index.] 1137 # 1138 lea -320(%rsp,$num,2),%r11 1139 mov %rsp,%rbp 1140 sub $rptr,%r11 1141 and \$4095,%r11 1142 cmp %r11,%r10 1143 jb .Lpwr_sp_alt 1144 sub %r11,%rbp # align with $aptr 1145 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1146 jmp .Lpwr_sp_done 1147 1148.align 32 1149.Lpwr_sp_alt: 1150 lea 4096-320(,$num,2),%r10 1151 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1152 sub %r10,%r11 1153 mov \$0,%r10 1154 cmovc %r10,%r11 1155 sub %r11,%rbp 1156.Lpwr_sp_done: 1157 and \$-64,%rbp 1158 mov %rsp,%r11 1159 sub %rbp,%r11 1160 and \$-4096,%r11 1161 lea (%rbp,%r11),%rsp 1162 mov (%rsp),%r10 1163 cmp %rbp,%rsp 1164 ja .Lpwr_page_walk 1165 jmp .Lpwr_page_walk_done 1166 1167.Lpwr_page_walk: 1168 lea -4096(%rsp),%rsp 1169 mov (%rsp),%r10 1170 cmp %rbp,%rsp 1171 ja .Lpwr_page_walk 1172.Lpwr_page_walk_done: 1173 1174 mov $num,%r10 1175 neg $num 1176 1177 ############################################################## 1178 # Stack layout 1179 # 1180 # +0 saved $num, used in reduction section 1181 # +8 &t[2*$num], used in reduction section 1182 # +32 saved *n0 1183 # +40 saved %rsp 1184 # +48 t[2*$num] 1185 # 1186 mov $n0, 32(%rsp) 1187 mov %rax, 40(%rsp) # save original %rsp 1188.cfi_cfa_expression %rsp+40,deref,+8 1189.Lpower5_body: 1190 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1191 movq $nptr,%xmm2 # save $nptr 1192 movq %r10, %xmm3 # -$num, used in sqr8x 1193 movq $bptr,%xmm4 1194 1195 call __bn_sqr8x_internal 1196 call __bn_post4x_internal 1197 call __bn_sqr8x_internal 1198 call __bn_post4x_internal 1199 call __bn_sqr8x_internal 1200 call __bn_post4x_internal 1201 call __bn_sqr8x_internal 1202 call __bn_post4x_internal 1203 call __bn_sqr8x_internal 1204 call __bn_post4x_internal 1205 1206 movq %xmm2,$nptr 1207 movq %xmm4,$bptr 1208 mov $aptr,$rptr 1209 mov 40(%rsp),%rax 1210 lea 32(%rsp),$n0 1211 1212 call mul4x_internal 1213 1214 mov 40(%rsp),%rsi # restore %rsp 1215.cfi_def_cfa %rsi,8 1216 mov \$1,%rax 1217 mov -48(%rsi),%r15 1218.cfi_restore %r15 1219 mov -40(%rsi),%r14 1220.cfi_restore %r14 1221 mov -32(%rsi),%r13 1222.cfi_restore %r13 1223 mov -24(%rsi),%r12 1224.cfi_restore %r12 1225 mov -16(%rsi),%rbp 1226.cfi_restore %rbp 1227 mov -8(%rsi),%rbx 1228.cfi_restore %rbx 1229 lea (%rsi),%rsp 1230.cfi_def_cfa_register %rsp 1231.Lpower5_epilogue: 1232 ret 1233.cfi_endproc 1234.size bn_power5,.-bn_power5 1235 1236.globl bn_sqr8x_internal 1237.hidden bn_sqr8x_internal 1238.type bn_sqr8x_internal,\@abi-omnipotent 1239.align 32 1240bn_sqr8x_internal: 1241__bn_sqr8x_internal: 1242 ############################################################## 1243 # Squaring part: 1244 # 1245 # a) multiply-n-add everything but a[i]*a[i]; 1246 # b) shift result of a) by 1 to the left and accumulate 1247 # a[i]*a[i] products; 1248 # 1249 ############################################################## 1250 # a[1]a[0] 1251 # a[2]a[0] 1252 # a[3]a[0] 1253 # a[2]a[1] 1254 # a[4]a[0] 1255 # a[3]a[1] 1256 # a[5]a[0] 1257 # a[4]a[1] 1258 # a[3]a[2] 1259 # a[6]a[0] 1260 # a[5]a[1] 1261 # a[4]a[2] 1262 # a[7]a[0] 1263 # a[6]a[1] 1264 # a[5]a[2] 1265 # a[4]a[3] 1266 # a[7]a[1] 1267 # a[6]a[2] 1268 # a[5]a[3] 1269 # a[7]a[2] 1270 # a[6]a[3] 1271 # a[5]a[4] 1272 # a[7]a[3] 1273 # a[6]a[4] 1274 # a[7]a[4] 1275 # a[6]a[5] 1276 # a[7]a[5] 1277 # a[7]a[6] 1278 # a[1]a[0] 1279 # a[2]a[0] 1280 # a[3]a[0] 1281 # a[4]a[0] 1282 # a[5]a[0] 1283 # a[6]a[0] 1284 # a[7]a[0] 1285 # a[2]a[1] 1286 # a[3]a[1] 1287 # a[4]a[1] 1288 # 
a[5]a[1] 1289 # a[6]a[1] 1290 # a[7]a[1] 1291 # a[3]a[2] 1292 # a[4]a[2] 1293 # a[5]a[2] 1294 # a[6]a[2] 1295 # a[7]a[2] 1296 # a[4]a[3] 1297 # a[5]a[3] 1298 # a[6]a[3] 1299 # a[7]a[3] 1300 # a[5]a[4] 1301 # a[6]a[4] 1302 # a[7]a[4] 1303 # a[6]a[5] 1304 # a[7]a[5] 1305 # a[7]a[6] 1306 # a[0]a[0] 1307 # a[1]a[1] 1308 # a[2]a[2] 1309 # a[3]a[3] 1310 # a[4]a[4] 1311 # a[5]a[5] 1312 # a[6]a[6] 1313 # a[7]a[7] 1314 1315 lea 32(%r10),$i # $i=-($num-32) 1316 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1317 1318 mov $num,$j # $j=$num 1319 1320 # comments apply to $num==8 case 1321 mov -32($aptr,$i),$a0 # a[0] 1322 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1323 mov -24($aptr,$i),%rax # a[1] 1324 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1325 mov -16($aptr,$i),$ai # a[2] 1326 mov %rax,$a1 1327 1328 mul $a0 # a[1]*a[0] 1329 mov %rax,$A0[0] # a[1]*a[0] 1330 mov $ai,%rax # a[2] 1331 mov %rdx,$A0[1] 1332 mov $A0[0],-24($tptr,$i) # t[1] 1333 1334 mul $a0 # a[2]*a[0] 1335 add %rax,$A0[1] 1336 mov $ai,%rax 1337 adc \$0,%rdx 1338 mov $A0[1],-16($tptr,$i) # t[2] 1339 mov %rdx,$A0[0] 1340 1341 1342 mov -8($aptr,$i),$ai # a[3] 1343 mul $a1 # a[2]*a[1] 1344 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1345 mov $ai,%rax 1346 mov %rdx,$A1[1] 1347 1348 lea ($i),$j 1349 mul $a0 # a[3]*a[0] 1350 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1351 mov $ai,%rax 1352 mov %rdx,$A0[1] 1353 adc \$0,$A0[1] 1354 add $A1[0],$A0[0] 1355 adc \$0,$A0[1] 1356 mov $A0[0],-8($tptr,$j) # t[3] 1357 jmp .Lsqr4x_1st 1358 1359.align 32 1360.Lsqr4x_1st: 1361 mov ($aptr,$j),$ai # a[4] 1362 mul $a1 # a[3]*a[1] 1363 add %rax,$A1[1] # a[3]*a[1]+t[4] 1364 mov $ai,%rax 1365 mov %rdx,$A1[0] 1366 adc \$0,$A1[0] 1367 1368 mul $a0 # a[4]*a[0] 1369 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1370 mov $ai,%rax # a[3] 1371 mov 8($aptr,$j),$ai # a[5] 1372 mov %rdx,$A0[0] 1373 adc \$0,$A0[0] 1374 add $A1[1],$A0[1] 1375 adc \$0,$A0[0] 1376 1377 1378 mul $a1 # a[4]*a[3] 1379 add %rax,$A1[0] # a[4]*a[3]+t[5] 1380 mov $ai,%rax 1381 mov $A0[1],($tptr,$j) # t[4] 1382 mov %rdx,$A1[1] 1383 adc \$0,$A1[1] 1384 1385 mul $a0 # a[5]*a[2] 1386 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1387 mov $ai,%rax 1388 mov 16($aptr,$j),$ai # a[6] 1389 mov %rdx,$A0[1] 1390 adc \$0,$A0[1] 1391 add $A1[0],$A0[0] 1392 adc \$0,$A0[1] 1393 1394 mul $a1 # a[5]*a[3] 1395 add %rax,$A1[1] # a[5]*a[3]+t[6] 1396 mov $ai,%rax 1397 mov $A0[0],8($tptr,$j) # t[5] 1398 mov %rdx,$A1[0] 1399 adc \$0,$A1[0] 1400 1401 mul $a0 # a[6]*a[2] 1402 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1403 mov $ai,%rax # a[3] 1404 mov 24($aptr,$j),$ai # a[7] 1405 mov %rdx,$A0[0] 1406 adc \$0,$A0[0] 1407 add $A1[1],$A0[1] 1408 adc \$0,$A0[0] 1409 1410 1411 mul $a1 # a[6]*a[5] 1412 add %rax,$A1[0] # a[6]*a[5]+t[7] 1413 mov $ai,%rax 1414 mov $A0[1],16($tptr,$j) # t[6] 1415 mov %rdx,$A1[1] 1416 adc \$0,$A1[1] 1417 lea 32($j),$j 1418 1419 mul $a0 # a[7]*a[4] 1420 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1421 mov $ai,%rax 1422 mov %rdx,$A0[1] 1423 adc \$0,$A0[1] 1424 add $A1[0],$A0[0] 1425 adc \$0,$A0[1] 1426 mov $A0[0],-8($tptr,$j) # t[7] 1427 1428 cmp \$0,$j 1429 jne .Lsqr4x_1st 1430 1431 mul $a1 # a[7]*a[5] 1432 add %rax,$A1[1] 1433 lea 16($i),$i 1434 adc \$0,%rdx 1435 add $A0[1],$A1[1] 1436 adc \$0,%rdx 1437 1438 mov $A1[1],($tptr) # t[8] 1439 mov %rdx,$A1[0] 1440 mov %rdx,8($tptr) # t[9] 1441 jmp .Lsqr4x_outer 1442 1443.align 32 1444.Lsqr4x_outer: # comments apply to $num==6 case 1445 mov -32($aptr,$i),$a0 # a[0] 1446 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, 
&tp[2*$num] 1447 mov -24($aptr,$i),%rax # a[1] 1448 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1449 mov -16($aptr,$i),$ai # a[2] 1450 mov %rax,$a1 1451 1452 mul $a0 # a[1]*a[0] 1453 mov -24($tptr,$i),$A0[0] # t[1] 1454 add %rax,$A0[0] # a[1]*a[0]+t[1] 1455 mov $ai,%rax # a[2] 1456 adc \$0,%rdx 1457 mov $A0[0],-24($tptr,$i) # t[1] 1458 mov %rdx,$A0[1] 1459 1460 mul $a0 # a[2]*a[0] 1461 add %rax,$A0[1] 1462 mov $ai,%rax 1463 adc \$0,%rdx 1464 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1465 mov %rdx,$A0[0] 1466 adc \$0,$A0[0] 1467 mov $A0[1],-16($tptr,$i) # t[2] 1468 1469 xor $A1[0],$A1[0] 1470 1471 mov -8($aptr,$i),$ai # a[3] 1472 mul $a1 # a[2]*a[1] 1473 add %rax,$A1[0] # a[2]*a[1]+t[3] 1474 mov $ai,%rax 1475 adc \$0,%rdx 1476 add -8($tptr,$i),$A1[0] 1477 mov %rdx,$A1[1] 1478 adc \$0,$A1[1] 1479 1480 mul $a0 # a[3]*a[0] 1481 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1482 mov $ai,%rax 1483 adc \$0,%rdx 1484 add $A1[0],$A0[0] 1485 mov %rdx,$A0[1] 1486 adc \$0,$A0[1] 1487 mov $A0[0],-8($tptr,$i) # t[3] 1488 1489 lea ($i),$j 1490 jmp .Lsqr4x_inner 1491 1492.align 32 1493.Lsqr4x_inner: 1494 mov ($aptr,$j),$ai # a[4] 1495 mul $a1 # a[3]*a[1] 1496 add %rax,$A1[1] # a[3]*a[1]+t[4] 1497 mov $ai,%rax 1498 mov %rdx,$A1[0] 1499 adc \$0,$A1[0] 1500 add ($tptr,$j),$A1[1] 1501 adc \$0,$A1[0] 1502 1503 .byte 0x67 1504 mul $a0 # a[4]*a[0] 1505 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1506 mov $ai,%rax # a[3] 1507 mov 8($aptr,$j),$ai # a[5] 1508 mov %rdx,$A0[0] 1509 adc \$0,$A0[0] 1510 add $A1[1],$A0[1] 1511 adc \$0,$A0[0] 1512 1513 mul $a1 # a[4]*a[3] 1514 add %rax,$A1[0] # a[4]*a[3]+t[5] 1515 mov $A0[1],($tptr,$j) # t[4] 1516 mov $ai,%rax 1517 mov %rdx,$A1[1] 1518 adc \$0,$A1[1] 1519 add 8($tptr,$j),$A1[0] 1520 lea 16($j),$j # j++ 1521 adc \$0,$A1[1] 1522 1523 mul $a0 # a[5]*a[2] 1524 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1525 mov $ai,%rax 1526 adc \$0,%rdx 1527 add $A1[0],$A0[0] 1528 mov %rdx,$A0[1] 1529 adc \$0,$A0[1] 1530 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1531 1532 cmp \$0,$j 1533 jne .Lsqr4x_inner 1534 1535 .byte 0x67 1536 mul $a1 # a[5]*a[3] 1537 add %rax,$A1[1] 1538 adc \$0,%rdx 1539 add $A0[1],$A1[1] 1540 adc \$0,%rdx 1541 1542 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1543 mov %rdx,$A1[0] 1544 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1545 1546 add \$16,$i 1547 jnz .Lsqr4x_outer 1548 1549 # comments apply to $num==4 case 1550 mov -32($aptr),$a0 # a[0] 1551 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1552 mov -24($aptr),%rax # a[1] 1553 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1554 mov -16($aptr),$ai # a[2] 1555 mov %rax,$a1 1556 1557 mul $a0 # a[1]*a[0] 1558 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1559 mov $ai,%rax # a[2] 1560 mov %rdx,$A0[1] 1561 adc \$0,$A0[1] 1562 1563 mul $a0 # a[2]*a[0] 1564 add %rax,$A0[1] 1565 mov $ai,%rax 1566 mov $A0[0],-24($tptr) # t[1] 1567 mov %rdx,$A0[0] 1568 adc \$0,$A0[0] 1569 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1570 mov -8($aptr),$ai # a[3] 1571 adc \$0,$A0[0] 1572 1573 mul $a1 # a[2]*a[1] 1574 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1575 mov $ai,%rax 1576 mov $A0[1],-16($tptr) # t[2] 1577 mov %rdx,$A1[1] 1578 adc \$0,$A1[1] 1579 1580 mul $a0 # a[3]*a[0] 1581 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1582 mov $ai,%rax 1583 mov %rdx,$A0[1] 1584 adc \$0,$A0[1] 1585 add $A1[0],$A0[0] 1586 adc \$0,$A0[1] 1587 mov $A0[0],-8($tptr) # t[3] 1588 1589 mul $a1 # a[3]*a[1] 1590 add %rax,$A1[1] 1591 mov -16($aptr),%rax # a[2] 1592 adc \$0,%rdx 
1593 add $A0[1],$A1[1] 1594 adc \$0,%rdx 1595 1596 mov $A1[1],($tptr) # t[4] 1597 mov %rdx,$A1[0] 1598 mov %rdx,8($tptr) # t[5] 1599 1600 mul $ai # a[2]*a[3] 1601___ 1602{ 1603my ($shift,$carry)=($a0,$a1); 1604my @S=(@A1,$ai,$n0); 1605$code.=<<___; 1606 add \$16,$i 1607 xor $shift,$shift 1608 sub $num,$i # $i=16-$num 1609 xor $carry,$carry 1610 1611 add $A1[0],%rax # t[5] 1612 adc \$0,%rdx 1613 mov %rax,8($tptr) # t[5] 1614 mov %rdx,16($tptr) # t[6] 1615 mov $carry,24($tptr) # t[7] 1616 1617 mov -16($aptr,$i),%rax # a[0] 1618 lea 48+8(%rsp),$tptr 1619 xor $A0[0],$A0[0] # t[0] 1620 mov 8($tptr),$A0[1] # t[1] 1621 1622 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1623 shr \$63,$A0[0] 1624 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1625 shr \$63,$A0[1] 1626 or $A0[0],$S[1] # | t[2*i]>>63 1627 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1628 mov $A0[1],$shift # shift=t[2*i+1]>>63 1629 mul %rax # a[i]*a[i] 1630 neg $carry # mov $carry,cf 1631 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1632 adc %rax,$S[0] 1633 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1634 mov $S[0],($tptr) 1635 adc %rdx,$S[1] 1636 1637 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1638 mov $S[1],8($tptr) 1639 sbb $carry,$carry # mov cf,$carry 1640 shr \$63,$A0[0] 1641 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1642 shr \$63,$A0[1] 1643 or $A0[0],$S[3] # | t[2*i]>>63 1644 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1645 mov $A0[1],$shift # shift=t[2*i+1]>>63 1646 mul %rax # a[i]*a[i] 1647 neg $carry # mov $carry,cf 1648 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1649 adc %rax,$S[2] 1650 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1651 mov $S[2],16($tptr) 1652 adc %rdx,$S[3] 1653 lea 16($i),$i 1654 mov $S[3],24($tptr) 1655 sbb $carry,$carry # mov cf,$carry 1656 lea 64($tptr),$tptr 1657 jmp .Lsqr4x_shift_n_add 1658 1659.align 32 1660.Lsqr4x_shift_n_add: 1661 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1662 shr \$63,$A0[0] 1663 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1664 shr \$63,$A0[1] 1665 or $A0[0],$S[1] # | t[2*i]>>63 1666 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1667 mov $A0[1],$shift # shift=t[2*i+1]>>63 1668 mul %rax # a[i]*a[i] 1669 neg $carry # mov $carry,cf 1670 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1671 adc %rax,$S[0] 1672 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1673 mov $S[0],-32($tptr) 1674 adc %rdx,$S[1] 1675 1676 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1677 mov $S[1],-24($tptr) 1678 sbb $carry,$carry # mov cf,$carry 1679 shr \$63,$A0[0] 1680 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1681 shr \$63,$A0[1] 1682 or $A0[0],$S[3] # | t[2*i]>>63 1683 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1684 mov $A0[1],$shift # shift=t[2*i+1]>>63 1685 mul %rax # a[i]*a[i] 1686 neg $carry # mov $carry,cf 1687 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1688 adc %rax,$S[2] 1689 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1690 mov $S[2],-16($tptr) 1691 adc %rdx,$S[3] 1692 1693 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1694 mov $S[3],-8($tptr) 1695 sbb $carry,$carry # mov cf,$carry 1696 shr \$63,$A0[0] 1697 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1698 shr \$63,$A0[1] 1699 or $A0[0],$S[1] # | t[2*i]>>63 1700 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1701 mov $A0[1],$shift # shift=t[2*i+1]>>63 1702 mul %rax # a[i]*a[i] 1703 neg $carry # mov $carry,cf 1704 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1705 adc %rax,$S[0] 1706 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1707 mov $S[0],0($tptr) 1708 adc %rdx,$S[1] 1709 1710 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1711 mov $S[1],8($tptr) 1712 sbb $carry,$carry # mov 
cf,$carry 1713 shr \$63,$A0[0] 1714 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1715 shr \$63,$A0[1] 1716 or $A0[0],$S[3] # | t[2*i]>>63 1717 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1718 mov $A0[1],$shift # shift=t[2*i+1]>>63 1719 mul %rax # a[i]*a[i] 1720 neg $carry # mov $carry,cf 1721 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1722 adc %rax,$S[2] 1723 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1724 mov $S[2],16($tptr) 1725 adc %rdx,$S[3] 1726 mov $S[3],24($tptr) 1727 sbb $carry,$carry # mov cf,$carry 1728 lea 64($tptr),$tptr 1729 add \$32,$i 1730 jnz .Lsqr4x_shift_n_add 1731 1732 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1733 .byte 0x67 1734 shr \$63,$A0[0] 1735 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1736 shr \$63,$A0[1] 1737 or $A0[0],$S[1] # | t[2*i]>>63 1738 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1739 mov $A0[1],$shift # shift=t[2*i+1]>>63 1740 mul %rax # a[i]*a[i] 1741 neg $carry # mov $carry,cf 1742 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1743 adc %rax,$S[0] 1744 mov -8($aptr),%rax # a[i+1] # prefetch 1745 mov $S[0],-32($tptr) 1746 adc %rdx,$S[1] 1747 1748 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1749 mov $S[1],-24($tptr) 1750 sbb $carry,$carry # mov cf,$carry 1751 shr \$63,$A0[0] 1752 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1753 shr \$63,$A0[1] 1754 or $A0[0],$S[3] # | t[2*i]>>63 1755 mul %rax # a[i]*a[i] 1756 neg $carry # mov $carry,cf 1757 adc %rax,$S[2] 1758 adc %rdx,$S[3] 1759 mov $S[2],-16($tptr) 1760 mov $S[3],-8($tptr) 1761___ 1762} 1763###################################################################### 1764# Montgomery reduction part, "word-by-word" algorithm. 1765# 1766# This new path is inspired by multiple submissions from Intel, by 1767# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1768# Vinodh Gopal... 
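# As a point of reference, the classical word-by-word reduction looks
# roughly like this in C-style pseudocode (illustrative only; "u128"
# stands for a double-width type, and the code below differs in that it
# walks n[] eight words per pass and keeps the top-most carry aside):
#
#	for (i = 0; i < num; i++) {
#		m = t[i] * n0;			/* mod 2^64		*/
#		carry = 0;
#		for (j = 0; j < num; j++) {
#			u128 w = (u128)m * n[j] + t[i+j] + carry;
#			t[i+j] = (BN_ULONG)w;	/* low word, t[i] -> 0	*/
#			carry  = (BN_ULONG)(w >> 64);
#		}
#		t[i+num] += carry;		/* may carry further	*/
#	}
#	/* result is t[num..2*num-1]; subtract n once if it is >= n */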
1769{ 1770my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1771 1772$code.=<<___; 1773 movq %xmm2,$nptr 1774__bn_sqr8x_reduction: 1775 xor %rax,%rax 1776 lea ($nptr,$num),%rcx # end of n[] 1777 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1778 mov %rcx,0+8(%rsp) 1779 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1780 mov %rdx,8+8(%rsp) 1781 neg $num 1782 jmp .L8x_reduction_loop 1783 1784.align 32 1785.L8x_reduction_loop: 1786 lea ($tptr,$num),$tptr # start of current t[] window 1787 .byte 0x66 1788 mov 8*0($tptr),$m0 1789 mov 8*1($tptr),%r9 1790 mov 8*2($tptr),%r10 1791 mov 8*3($tptr),%r11 1792 mov 8*4($tptr),%r12 1793 mov 8*5($tptr),%r13 1794 mov 8*6($tptr),%r14 1795 mov 8*7($tptr),%r15 1796 mov %rax,(%rdx) # store top-most carry bit 1797 lea 8*8($tptr),$tptr 1798 1799 .byte 0x67 1800 mov $m0,%r8 1801 imulq 32+8(%rsp),$m0 # n0*a[0] 1802 mov 8*0($nptr),%rax # n[0] 1803 mov \$8,%ecx 1804 jmp .L8x_reduce 1805 1806.align 32 1807.L8x_reduce: 1808 mulq $m0 1809 mov 8*1($nptr),%rax # n[1] 1810 neg %r8 1811 mov %rdx,%r8 1812 adc \$0,%r8 1813 1814 mulq $m0 1815 add %rax,%r9 1816 mov 8*2($nptr),%rax 1817 adc \$0,%rdx 1818 add %r9,%r8 1819 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1820 mov %rdx,%r9 1821 adc \$0,%r9 1822 1823 mulq $m0 1824 add %rax,%r10 1825 mov 8*3($nptr),%rax 1826 adc \$0,%rdx 1827 add %r10,%r9 1828 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1829 mov %rdx,%r10 1830 adc \$0,%r10 1831 1832 mulq $m0 1833 add %rax,%r11 1834 mov 8*4($nptr),%rax 1835 adc \$0,%rdx 1836 imulq %r8,$carry # modulo-scheduled 1837 add %r11,%r10 1838 mov %rdx,%r11 1839 adc \$0,%r11 1840 1841 mulq $m0 1842 add %rax,%r12 1843 mov 8*5($nptr),%rax 1844 adc \$0,%rdx 1845 add %r12,%r11 1846 mov %rdx,%r12 1847 adc \$0,%r12 1848 1849 mulq $m0 1850 add %rax,%r13 1851 mov 8*6($nptr),%rax 1852 adc \$0,%rdx 1853 add %r13,%r12 1854 mov %rdx,%r13 1855 adc \$0,%r13 1856 1857 mulq $m0 1858 add %rax,%r14 1859 mov 8*7($nptr),%rax 1860 adc \$0,%rdx 1861 add %r14,%r13 1862 mov %rdx,%r14 1863 adc \$0,%r14 1864 1865 mulq $m0 1866 mov $carry,$m0 # n0*a[i] 1867 add %rax,%r15 1868 mov 8*0($nptr),%rax # n[0] 1869 adc \$0,%rdx 1870 add %r15,%r14 1871 mov %rdx,%r15 1872 adc \$0,%r15 1873 1874 dec %ecx 1875 jnz .L8x_reduce 1876 1877 lea 8*8($nptr),$nptr 1878 xor %rax,%rax 1879 mov 8+8(%rsp),%rdx # pull end of t[] 1880 cmp 0+8(%rsp),$nptr # end of n[]? 
1881 jae .L8x_no_tail 1882 1883 .byte 0x66 1884 add 8*0($tptr),%r8 1885 adc 8*1($tptr),%r9 1886 adc 8*2($tptr),%r10 1887 adc 8*3($tptr),%r11 1888 adc 8*4($tptr),%r12 1889 adc 8*5($tptr),%r13 1890 adc 8*6($tptr),%r14 1891 adc 8*7($tptr),%r15 1892 sbb $carry,$carry # top carry 1893 1894 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1895 mov \$8,%ecx 1896 mov 8*0($nptr),%rax 1897 jmp .L8x_tail 1898 1899.align 32 1900.L8x_tail: 1901 mulq $m0 1902 add %rax,%r8 1903 mov 8*1($nptr),%rax 1904 mov %r8,($tptr) # save result 1905 mov %rdx,%r8 1906 adc \$0,%r8 1907 1908 mulq $m0 1909 add %rax,%r9 1910 mov 8*2($nptr),%rax 1911 adc \$0,%rdx 1912 add %r9,%r8 1913 lea 8($tptr),$tptr # $tptr++ 1914 mov %rdx,%r9 1915 adc \$0,%r9 1916 1917 mulq $m0 1918 add %rax,%r10 1919 mov 8*3($nptr),%rax 1920 adc \$0,%rdx 1921 add %r10,%r9 1922 mov %rdx,%r10 1923 adc \$0,%r10 1924 1925 mulq $m0 1926 add %rax,%r11 1927 mov 8*4($nptr),%rax 1928 adc \$0,%rdx 1929 add %r11,%r10 1930 mov %rdx,%r11 1931 adc \$0,%r11 1932 1933 mulq $m0 1934 add %rax,%r12 1935 mov 8*5($nptr),%rax 1936 adc \$0,%rdx 1937 add %r12,%r11 1938 mov %rdx,%r12 1939 adc \$0,%r12 1940 1941 mulq $m0 1942 add %rax,%r13 1943 mov 8*6($nptr),%rax 1944 adc \$0,%rdx 1945 add %r13,%r12 1946 mov %rdx,%r13 1947 adc \$0,%r13 1948 1949 mulq $m0 1950 add %rax,%r14 1951 mov 8*7($nptr),%rax 1952 adc \$0,%rdx 1953 add %r14,%r13 1954 mov %rdx,%r14 1955 adc \$0,%r14 1956 1957 mulq $m0 1958 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1959 add %rax,%r15 1960 adc \$0,%rdx 1961 add %r15,%r14 1962 mov 8*0($nptr),%rax # pull n[0] 1963 mov %rdx,%r15 1964 adc \$0,%r15 1965 1966 dec %ecx 1967 jnz .L8x_tail 1968 1969 lea 8*8($nptr),$nptr 1970 mov 8+8(%rsp),%rdx # pull end of t[] 1971 cmp 0+8(%rsp),$nptr # end of n[]? 1972 jae .L8x_tail_done # break out of loop 1973 1974 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1975 neg $carry 1976 mov 8*0($nptr),%rax # pull n[0] 1977 adc 8*0($tptr),%r8 1978 adc 8*1($tptr),%r9 1979 adc 8*2($tptr),%r10 1980 adc 8*3($tptr),%r11 1981 adc 8*4($tptr),%r12 1982 adc 8*5($tptr),%r13 1983 adc 8*6($tptr),%r14 1984 adc 8*7($tptr),%r15 1985 sbb $carry,$carry # top carry 1986 1987 mov \$8,%ecx 1988 jmp .L8x_tail 1989 1990.align 32 1991.L8x_tail_done: 1992 xor %rax,%rax 1993 add (%rdx),%r8 # can this overflow? 1994 adc \$0,%r9 1995 adc \$0,%r10 1996 adc \$0,%r11 1997 adc \$0,%r12 1998 adc \$0,%r13 1999 adc \$0,%r14 2000 adc \$0,%r15 2001 adc \$0,%rax 2002 2003 neg $carry 2004.L8x_no_tail: 2005 adc 8*0($tptr),%r8 2006 adc 8*1($tptr),%r9 2007 adc 8*2($tptr),%r10 2008 adc 8*3($tptr),%r11 2009 adc 8*4($tptr),%r12 2010 adc 8*5($tptr),%r13 2011 adc 8*6($tptr),%r14 2012 adc 8*7($tptr),%r15 2013 adc \$0,%rax # top-most carry 2014 mov -8($nptr),%rcx # np[num-1] 2015 xor $carry,$carry 2016 2017 movq %xmm2,$nptr # restore $nptr 2018 2019 mov %r8,8*0($tptr) # store top 512 bits 2020 mov %r9,8*1($tptr) 2021 movq %xmm3,$num # $num is %r9, can't be moved upwards 2022 mov %r10,8*2($tptr) 2023 mov %r11,8*3($tptr) 2024 mov %r12,8*4($tptr) 2025 mov %r13,8*5($tptr) 2026 mov %r14,8*6($tptr) 2027 mov %r15,8*7($tptr) 2028 lea 8*8($tptr),$tptr 2029 2030 cmp %rdx,$tptr # end of t[]? 
2031 jb .L8x_reduction_loop 2032 ret 2033.size bn_sqr8x_internal,.-bn_sqr8x_internal 2034___ 2035} 2036############################################################## 2037# Post-condition, 4x unrolled 2038# 2039{ 2040my ($tptr,$nptr)=("%rbx","%rbp"); 2041$code.=<<___; 2042.type __bn_post4x_internal,\@abi-omnipotent 2043.align 32 2044__bn_post4x_internal: 2045 mov 8*0($nptr),%r12 2046 lea (%rdi,$num),$tptr # %rdi was $tptr above 2047 mov $num,%rcx 2048 movq %xmm1,$rptr # restore $rptr 2049 neg %rax 2050 movq %xmm1,$aptr # prepare for back-to-back call 2051 sar \$3+2,%rcx 2052 dec %r12 # so that after 'not' we get -n[0] 2053 xor %r10,%r10 2054 mov 8*1($nptr),%r13 2055 mov 8*2($nptr),%r14 2056 mov 8*3($nptr),%r15 2057 jmp .Lsqr4x_sub_entry 2058 2059.align 16 2060.Lsqr4x_sub: 2061 mov 8*0($nptr),%r12 2062 mov 8*1($nptr),%r13 2063 mov 8*2($nptr),%r14 2064 mov 8*3($nptr),%r15 2065.Lsqr4x_sub_entry: 2066 lea 8*4($nptr),$nptr 2067 not %r12 2068 not %r13 2069 not %r14 2070 not %r15 2071 and %rax,%r12 2072 and %rax,%r13 2073 and %rax,%r14 2074 and %rax,%r15 2075 2076 neg %r10 # mov %r10,%cf 2077 adc 8*0($tptr),%r12 2078 adc 8*1($tptr),%r13 2079 adc 8*2($tptr),%r14 2080 adc 8*3($tptr),%r15 2081 mov %r12,8*0($rptr) 2082 lea 8*4($tptr),$tptr 2083 mov %r13,8*1($rptr) 2084 sbb %r10,%r10 # mov %cf,%r10 2085 mov %r14,8*2($rptr) 2086 mov %r15,8*3($rptr) 2087 lea 8*4($rptr),$rptr 2088 2089 inc %rcx # pass %cf 2090 jnz .Lsqr4x_sub 2091 2092 mov $num,%r10 # prepare for back-to-back call 2093 neg $num # restore $num 2094 ret 2095.size __bn_post4x_internal,.-__bn_post4x_internal 2096___ 2097} 2098{ 2099$code.=<<___; 2100.globl bn_from_montgomery 2101.type bn_from_montgomery,\@abi-omnipotent 2102.align 32 2103bn_from_montgomery: 2104 testl \$7,`($win64?"48(%rsp)":"%r9d")` 2105 jz bn_from_mont8x 2106 xor %eax,%eax 2107 ret 2108.size bn_from_montgomery,.-bn_from_montgomery 2109 2110.type bn_from_mont8x,\@function,6 2111.align 32 2112bn_from_mont8x: 2113.cfi_startproc 2114 .byte 0x67 2115 mov %rsp,%rax 2116.cfi_def_cfa_register %rax 2117 push %rbx 2118.cfi_push %rbx 2119 push %rbp 2120.cfi_push %rbp 2121 push %r12 2122.cfi_push %r12 2123 push %r13 2124.cfi_push %r13 2125 push %r14 2126.cfi_push %r14 2127 push %r15 2128.cfi_push %r15 2129.Lfrom_prologue: 2130 2131 shl \$3,${num}d # convert $num to bytes 2132 lea ($num,$num,2),%r10 # 3*$num in bytes 2133 neg $num 2134 mov ($n0),$n0 # *n0 2135 2136 ############################################################## 2137 # Ensure that stack frame doesn't alias with $rptr+3*$num 2138 # modulo 4096, which covers ret[num], am[num] and n[num] 2139 # (see bn_exp.c). The stack is allocated to aligned with 2140 # bn_power5's frame, and as bn_from_montgomery happens to be 2141 # last operation, we use the opportunity to cleanse it. 
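	# For reference, what this routine computes: with R = 2^(64*num),
	#
	#	rp[] = ap[] * R^-1 mod n[]
	#
	# i.e. a Montgomery multiplication by 1.  .Lmul_by_1 below copies
	# ap[] into t[] and zeroes the upper half, after which a single
	# reduction pass (__bn_sqr8x_reduction, or its MULX twin) plus the
	# conditional-subtract post-condition produces the result.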
2142 # 2143 lea -320(%rsp,$num,2),%r11 2144 mov %rsp,%rbp 2145 sub $rptr,%r11 2146 and \$4095,%r11 2147 cmp %r11,%r10 2148 jb .Lfrom_sp_alt 2149 sub %r11,%rbp # align with $aptr 2150 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2151 jmp .Lfrom_sp_done 2152 2153.align 32 2154.Lfrom_sp_alt: 2155 lea 4096-320(,$num,2),%r10 2156 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2157 sub %r10,%r11 2158 mov \$0,%r10 2159 cmovc %r10,%r11 2160 sub %r11,%rbp 2161.Lfrom_sp_done: 2162 and \$-64,%rbp 2163 mov %rsp,%r11 2164 sub %rbp,%r11 2165 and \$-4096,%r11 2166 lea (%rbp,%r11),%rsp 2167 mov (%rsp),%r10 2168 cmp %rbp,%rsp 2169 ja .Lfrom_page_walk 2170 jmp .Lfrom_page_walk_done 2171 2172.Lfrom_page_walk: 2173 lea -4096(%rsp),%rsp 2174 mov (%rsp),%r10 2175 cmp %rbp,%rsp 2176 ja .Lfrom_page_walk 2177.Lfrom_page_walk_done: 2178 2179 mov $num,%r10 2180 neg $num 2181 2182 ############################################################## 2183 # Stack layout 2184 # 2185 # +0 saved $num, used in reduction section 2186 # +8 &t[2*$num], used in reduction section 2187 # +32 saved *n0 2188 # +40 saved %rsp 2189 # +48 t[2*$num] 2190 # 2191 mov $n0, 32(%rsp) 2192 mov %rax, 40(%rsp) # save original %rsp 2193.cfi_cfa_expression %rsp+40,deref,+8 2194.Lfrom_body: 2195 mov $num,%r11 2196 lea 48(%rsp),%rax 2197 pxor %xmm0,%xmm0 2198 jmp .Lmul_by_1 2199 2200.align 32 2201.Lmul_by_1: 2202 movdqu ($aptr),%xmm1 2203 movdqu 16($aptr),%xmm2 2204 movdqu 32($aptr),%xmm3 2205 movdqa %xmm0,(%rax,$num) 2206 movdqu 48($aptr),%xmm4 2207 movdqa %xmm0,16(%rax,$num) 2208 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 2209 movdqa %xmm1,(%rax) 2210 movdqa %xmm0,32(%rax,$num) 2211 movdqa %xmm2,16(%rax) 2212 movdqa %xmm0,48(%rax,$num) 2213 movdqa %xmm3,32(%rax) 2214 movdqa %xmm4,48(%rax) 2215 lea 64(%rax),%rax 2216 sub \$64,%r11 2217 jnz .Lmul_by_1 2218 2219 movq $rptr,%xmm1 2220 movq $nptr,%xmm2 2221 .byte 0x67 2222 mov $nptr,%rbp 2223 movq %r10, %xmm3 # -num 2224___ 2225$code.=<<___ if ($addx); 2226 mov OPENSSL_ia32cap_P+8(%rip),%r11d 2227 and \$0x80108,%r11d 2228 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 2229 jne .Lfrom_mont_nox 2230 2231 lea (%rax,$num),$rptr 2232 call __bn_sqrx8x_reduction 2233 call __bn_postx4x_internal 2234 2235 pxor %xmm0,%xmm0 2236 lea 48(%rsp),%rax 2237 jmp .Lfrom_mont_zero 2238 2239.align 32 2240.Lfrom_mont_nox: 2241___ 2242$code.=<<___; 2243 call __bn_sqr8x_reduction 2244 call __bn_post4x_internal 2245 2246 pxor %xmm0,%xmm0 2247 lea 48(%rsp),%rax 2248 jmp .Lfrom_mont_zero 2249 2250.align 32 2251.Lfrom_mont_zero: 2252 mov 40(%rsp),%rsi # restore %rsp 2253.cfi_def_cfa %rsi,8 2254 movdqa %xmm0,16*0(%rax) 2255 movdqa %xmm0,16*1(%rax) 2256 movdqa %xmm0,16*2(%rax) 2257 movdqa %xmm0,16*3(%rax) 2258 lea 16*4(%rax),%rax 2259 sub \$32,$num 2260 jnz .Lfrom_mont_zero 2261 2262 mov \$1,%rax 2263 mov -48(%rsi),%r15 2264.cfi_restore %r15 2265 mov -40(%rsi),%r14 2266.cfi_restore %r14 2267 mov -32(%rsi),%r13 2268.cfi_restore %r13 2269 mov -24(%rsi),%r12 2270.cfi_restore %r12 2271 mov -16(%rsi),%rbp 2272.cfi_restore %rbp 2273 mov -8(%rsi),%rbx 2274.cfi_restore %rbx 2275 lea (%rsi),%rsp 2276.cfi_def_cfa_register %rsp 2277.Lfrom_epilogue: 2278 ret 2279.cfi_endproc 2280.size bn_from_mont8x,.-bn_from_mont8x 2281___ 2282} 2283}}} 2284 2285if ($addx) {{{ 2286my $bp="%rdx"; # restore original value 2287 2288$code.=<<___; 2289.type bn_mulx4x_mont_gather5,\@function,6 2290.align 32 2291bn_mulx4x_mont_gather5: 2292.cfi_startproc 2293 mov %rsp,%rax 2294.cfi_def_cfa_register %rax 
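	# The MULX/AD*X path relies on mulx leaving the flags untouched
	# while adcx and adox carry through CF and OF respectively, e.g.
	#
	#	mulx	8*0($nptr),%rax,%r10	# %r10:%rax = %rdx*n[0], no flags
	#	adcx	%rax,%r15		# accumulate on the CF chain
	#	adox	%r11,%r10		# accumulate on the OF chain
	#
	# so the two multiply-accumulate streams (a[]*b[i] and n[]*m) can
	# proceed as independent carry chains instead of serializing on a
	# single carry flag.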
2295.Lmulx4x_enter: 2296 push %rbx 2297.cfi_push %rbx 2298 push %rbp 2299.cfi_push %rbp 2300 push %r12 2301.cfi_push %r12 2302 push %r13 2303.cfi_push %r13 2304 push %r14 2305.cfi_push %r14 2306 push %r15 2307.cfi_push %r15 2308.Lmulx4x_prologue: 2309 2310 shl \$3,${num}d # convert $num to bytes 2311 lea ($num,$num,2),%r10 # 3*$num in bytes 2312 neg $num # -$num 2313 mov ($n0),$n0 # *n0 2314 2315 ############################################################## 2316 # Ensure that stack frame doesn't alias with $rptr+3*$num 2317 # modulo 4096, which covers ret[num], am[num] and n[num] 2318 # (see bn_exp.c). This is done to allow memory disambiguation 2319 # logic do its magic. [Extra [num] is allocated in order 2320 # to align with bn_power5's frame, which is cleansed after 2321 # completing exponentiation. Extra 256 bytes is for power mask 2322 # calculated from 7th argument, the index.] 2323 # 2324 lea -320(%rsp,$num,2),%r11 2325 mov %rsp,%rbp 2326 sub $rp,%r11 2327 and \$4095,%r11 2328 cmp %r11,%r10 2329 jb .Lmulx4xsp_alt 2330 sub %r11,%rbp # align with $aptr 2331 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2332 jmp .Lmulx4xsp_done 2333 2334.Lmulx4xsp_alt: 2335 lea 4096-320(,$num,2),%r10 2336 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2337 sub %r10,%r11 2338 mov \$0,%r10 2339 cmovc %r10,%r11 2340 sub %r11,%rbp 2341.Lmulx4xsp_done: 2342 and \$-64,%rbp # ensure alignment 2343 mov %rsp,%r11 2344 sub %rbp,%r11 2345 and \$-4096,%r11 2346 lea (%rbp,%r11),%rsp 2347 mov (%rsp),%r10 2348 cmp %rbp,%rsp 2349 ja .Lmulx4x_page_walk 2350 jmp .Lmulx4x_page_walk_done 2351 2352.Lmulx4x_page_walk: 2353 lea -4096(%rsp),%rsp 2354 mov (%rsp),%r10 2355 cmp %rbp,%rsp 2356 ja .Lmulx4x_page_walk 2357.Lmulx4x_page_walk_done: 2358 2359 ############################################################## 2360 # Stack layout 2361 # +0 -num 2362 # +8 off-loaded &b[i] 2363 # +16 end of b[num] 2364 # +24 inner counter 2365 # +32 saved n0 2366 # +40 saved %rsp 2367 # +48 2368 # +56 saved rp 2369 # +64 tmp[num+1] 2370 # 2371 mov $n0, 32(%rsp) # save *n0 2372 mov %rax,40(%rsp) # save original %rsp 2373.cfi_cfa_expression %rsp+40,deref,+8 2374.Lmulx4x_body: 2375 call mulx4x_internal 2376 2377 mov 40(%rsp),%rsi # restore %rsp 2378.cfi_def_cfa %rsi,8 2379 mov \$1,%rax 2380 2381 mov -48(%rsi),%r15 2382.cfi_restore %r15 2383 mov -40(%rsi),%r14 2384.cfi_restore %r14 2385 mov -32(%rsi),%r13 2386.cfi_restore %r13 2387 mov -24(%rsi),%r12 2388.cfi_restore %r12 2389 mov -16(%rsi),%rbp 2390.cfi_restore %rbp 2391 mov -8(%rsi),%rbx 2392.cfi_restore %rbx 2393 lea (%rsi),%rsp 2394.cfi_def_cfa_register %rsp 2395.Lmulx4x_epilogue: 2396 ret 2397.cfi_endproc 2398.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2399 2400.type mulx4x_internal,\@abi-omnipotent 2401.align 32 2402mulx4x_internal: 2403 mov $num,8(%rsp) # save -$num (it was in bytes) 2404 mov $num,%r10 2405 neg $num # restore $num 2406 shl \$5,$num 2407 neg %r10 # restore $num 2408 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2409 shr \$5+5,$num 2410 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2411 sub \$1,$num 2412 lea .Linc(%rip),%rax 2413 mov %r13,16+8(%rsp) # end of b[num] 2414 mov $num,24+8(%rsp) # inner counter 2415 mov $rp, 56+8(%rsp) # save $rp 2416___ 2417my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2418 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2419my $rptr=$bptr; 2420my $STRIDE=2**5*8; # 5 is "window size" 2421my $N=$STRIDE/4; # should match cache line size 2422$code.=<<___; 2423 
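	# The block below builds a 256-byte selection mask at 112(%r10):
	# for k = 0..31 it stores an 8-byte lane that is all-ones when k
	# equals the requested index and zero otherwise, stepping a counter
	# with paddd and comparing it against the broadcast index in %xmm5
	# with pcmpeqd.  The gather loops then AND the powers table against
	# this mask and OR the lanes together, so every table entry is read
	# no matter which index was asked for.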
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 2424 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2425 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) 2426 lea 128($bp),$bptr # size optimization 2427 2428 pshufd \$0,%xmm5,%xmm5 # broadcast index 2429 movdqa %xmm1,%xmm4 2430 .byte 0x67 2431 movdqa %xmm1,%xmm2 2432___ 2433######################################################################## 2434# calculate mask by comparing 0..31 to index and save result to stack 2435# 2436$code.=<<___; 2437 .byte 0x67 2438 paddd %xmm0,%xmm1 2439 pcmpeqd %xmm5,%xmm0 # compare to 1,0 2440 movdqa %xmm4,%xmm3 2441___ 2442for($i=0;$i<$STRIDE/16-4;$i+=4) { 2443$code.=<<___; 2444 paddd %xmm1,%xmm2 2445 pcmpeqd %xmm5,%xmm1 # compare to 3,2 2446 movdqa %xmm0,`16*($i+0)+112`(%r10) 2447 movdqa %xmm4,%xmm0 2448 2449 paddd %xmm2,%xmm3 2450 pcmpeqd %xmm5,%xmm2 # compare to 5,4 2451 movdqa %xmm1,`16*($i+1)+112`(%r10) 2452 movdqa %xmm4,%xmm1 2453 2454 paddd %xmm3,%xmm0 2455 pcmpeqd %xmm5,%xmm3 # compare to 7,6 2456 movdqa %xmm2,`16*($i+2)+112`(%r10) 2457 movdqa %xmm4,%xmm2 2458 2459 paddd %xmm0,%xmm1 2460 pcmpeqd %xmm5,%xmm0 2461 movdqa %xmm3,`16*($i+3)+112`(%r10) 2462 movdqa %xmm4,%xmm3 2463___ 2464} 2465$code.=<<___; # last iteration can be optimized 2466 .byte 0x67 2467 paddd %xmm1,%xmm2 2468 pcmpeqd %xmm5,%xmm1 2469 movdqa %xmm0,`16*($i+0)+112`(%r10) 2470 2471 paddd %xmm2,%xmm3 2472 pcmpeqd %xmm5,%xmm2 2473 movdqa %xmm1,`16*($i+1)+112`(%r10) 2474 2475 pcmpeqd %xmm5,%xmm3 2476 movdqa %xmm2,`16*($i+2)+112`(%r10) 2477 2478 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register 2479 pand `16*($i+1)-128`($bptr),%xmm1 2480 pand `16*($i+2)-128`($bptr),%xmm2 2481 movdqa %xmm3,`16*($i+3)+112`(%r10) 2482 pand `16*($i+3)-128`($bptr),%xmm3 2483 por %xmm2,%xmm0 2484 por %xmm3,%xmm1 2485___ 2486for($i=0;$i<$STRIDE/16-4;$i+=4) { 2487$code.=<<___; 2488 movdqa `16*($i+0)-128`($bptr),%xmm4 2489 movdqa `16*($i+1)-128`($bptr),%xmm5 2490 movdqa `16*($i+2)-128`($bptr),%xmm2 2491 pand `16*($i+0)+112`(%r10),%xmm4 2492 movdqa `16*($i+3)-128`($bptr),%xmm3 2493 pand `16*($i+1)+112`(%r10),%xmm5 2494 por %xmm4,%xmm0 2495 pand `16*($i+2)+112`(%r10),%xmm2 2496 por %xmm5,%xmm1 2497 pand `16*($i+3)+112`(%r10),%xmm3 2498 por %xmm2,%xmm0 2499 por %xmm3,%xmm1 2500___ 2501} 2502$code.=<<___; 2503 pxor %xmm1,%xmm0 2504 pshufd \$0x4e,%xmm0,%xmm1 2505 por %xmm1,%xmm0 2506 lea $STRIDE($bptr),$bptr 2507 movq %xmm0,%rdx # bp[0] 2508 lea 64+8*4+8(%rsp),$tptr 2509 2510 mov %rdx,$bi 2511 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2512 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2513 add %rax,%r11 2514 mulx 2*8($aptr),%rax,%r13 # ... 
2515 adc %rax,%r12 2516 adc \$0,%r13 2517 mulx 3*8($aptr),%rax,%r14 2518 2519 mov $mi,%r15 2520 imulq 32+8(%rsp),$mi # "t[0]"*n0 2521 xor $zero,$zero # cf=0, of=0 2522 mov $mi,%rdx 2523 2524 mov $bptr,8+8(%rsp) # off-load &b[i] 2525 2526 lea 4*8($aptr),$aptr 2527 adcx %rax,%r13 2528 adcx $zero,%r14 # cf=0 2529 2530 mulx 0*8($nptr),%rax,%r10 2531 adcx %rax,%r15 # discarded 2532 adox %r11,%r10 2533 mulx 1*8($nptr),%rax,%r11 2534 adcx %rax,%r10 2535 adox %r12,%r11 2536 mulx 2*8($nptr),%rax,%r12 2537 mov 24+8(%rsp),$bptr # counter value 2538 mov %r10,-8*4($tptr) 2539 adcx %rax,%r11 2540 adox %r13,%r12 2541 mulx 3*8($nptr),%rax,%r15 2542 mov $bi,%rdx 2543 mov %r11,-8*3($tptr) 2544 adcx %rax,%r12 2545 adox $zero,%r15 # of=0 2546 lea 4*8($nptr),$nptr 2547 mov %r12,-8*2($tptr) 2548 jmp .Lmulx4x_1st 2549 2550.align 32 2551.Lmulx4x_1st: 2552 adcx $zero,%r15 # cf=0, modulo-scheduled 2553 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2554 adcx %r14,%r10 2555 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2556 adcx %rax,%r11 2557 mulx 2*8($aptr),%r12,%rax # ... 2558 adcx %r14,%r12 2559 mulx 3*8($aptr),%r13,%r14 2560 .byte 0x67,0x67 2561 mov $mi,%rdx 2562 adcx %rax,%r13 2563 adcx $zero,%r14 # cf=0 2564 lea 4*8($aptr),$aptr 2565 lea 4*8($tptr),$tptr 2566 2567 adox %r15,%r10 2568 mulx 0*8($nptr),%rax,%r15 2569 adcx %rax,%r10 2570 adox %r15,%r11 2571 mulx 1*8($nptr),%rax,%r15 2572 adcx %rax,%r11 2573 adox %r15,%r12 2574 mulx 2*8($nptr),%rax,%r15 2575 mov %r10,-5*8($tptr) 2576 adcx %rax,%r12 2577 mov %r11,-4*8($tptr) 2578 adox %r15,%r13 2579 mulx 3*8($nptr),%rax,%r15 2580 mov $bi,%rdx 2581 mov %r12,-3*8($tptr) 2582 adcx %rax,%r13 2583 adox $zero,%r15 2584 lea 4*8($nptr),$nptr 2585 mov %r13,-2*8($tptr) 2586 2587 dec $bptr # of=0, pass cf 2588 jnz .Lmulx4x_1st 2589 2590 mov 8(%rsp),$num # load -num 2591 adc $zero,%r15 # modulo-scheduled 2592 lea ($aptr,$num),$aptr # rewind $aptr 2593 add %r15,%r14 2594 mov 8+8(%rsp),$bptr # re-load &b[i] 2595 adc $zero,$zero # top-most carry 2596 mov %r14,-1*8($tptr) 2597 jmp .Lmulx4x_outer 2598 2599.align 32 2600.Lmulx4x_outer: 2601 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) 2602 pxor %xmm4,%xmm4 2603 .byte 0x67,0x67 2604 pxor %xmm5,%xmm5 2605___ 2606for($i=0;$i<$STRIDE/16;$i+=4) { 2607$code.=<<___; 2608 movdqa `16*($i+0)-128`($bptr),%xmm0 2609 movdqa `16*($i+1)-128`($bptr),%xmm1 2610 movdqa `16*($i+2)-128`($bptr),%xmm2 2611 pand `16*($i+0)+256`(%r10),%xmm0 2612 movdqa `16*($i+3)-128`($bptr),%xmm3 2613 pand `16*($i+1)+256`(%r10),%xmm1 2614 por %xmm0,%xmm4 2615 pand `16*($i+2)+256`(%r10),%xmm2 2616 por %xmm1,%xmm5 2617 pand `16*($i+3)+256`(%r10),%xmm3 2618 por %xmm2,%xmm4 2619 por %xmm3,%xmm5 2620___ 2621} 2622$code.=<<___; 2623 por %xmm5,%xmm4 2624 pshufd \$0x4e,%xmm4,%xmm0 2625 por %xmm4,%xmm0 2626 lea $STRIDE($bptr),$bptr 2627 movq %xmm0,%rdx # m0=bp[i] 2628 2629 mov $zero,($tptr) # save top-most carry 2630 lea 4*8($tptr,$num),$tptr # rewind $tptr 2631 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2632 xor $zero,$zero # cf=0, of=0 2633 mov %rdx,$bi 2634 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2635 adox -4*8($tptr),$mi # +t[0] 2636 adcx %r14,%r11 2637 mulx 2*8($aptr),%r15,%r13 # ... 
2638 adox -3*8($tptr),%r11 2639 adcx %r15,%r12 2640 mulx 3*8($aptr),%rdx,%r14 2641 adox -2*8($tptr),%r12 2642 adcx %rdx,%r13 2643 lea ($nptr,$num),$nptr # rewind $nptr 2644 lea 4*8($aptr),$aptr 2645 adox -1*8($tptr),%r13 2646 adcx $zero,%r14 2647 adox $zero,%r14 2648 2649 mov $mi,%r15 2650 imulq 32+8(%rsp),$mi # "t[0]"*n0 2651 2652 mov $mi,%rdx 2653 xor $zero,$zero # cf=0, of=0 2654 mov $bptr,8+8(%rsp) # off-load &b[i] 2655 2656 mulx 0*8($nptr),%rax,%r10 2657 adcx %rax,%r15 # discarded 2658 adox %r11,%r10 2659 mulx 1*8($nptr),%rax,%r11 2660 adcx %rax,%r10 2661 adox %r12,%r11 2662 mulx 2*8($nptr),%rax,%r12 2663 adcx %rax,%r11 2664 adox %r13,%r12 2665 mulx 3*8($nptr),%rax,%r15 2666 mov $bi,%rdx 2667 mov 24+8(%rsp),$bptr # counter value 2668 mov %r10,-8*4($tptr) 2669 adcx %rax,%r12 2670 mov %r11,-8*3($tptr) 2671 adox $zero,%r15 # of=0 2672 mov %r12,-8*2($tptr) 2673 lea 4*8($nptr),$nptr 2674 jmp .Lmulx4x_inner 2675 2676.align 32 2677.Lmulx4x_inner: 2678 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2679 adcx $zero,%r15 # cf=0, modulo-scheduled 2680 adox %r14,%r10 2681 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2682 adcx 0*8($tptr),%r10 2683 adox %rax,%r11 2684 mulx 2*8($aptr),%r12,%rax # ... 2685 adcx 1*8($tptr),%r11 2686 adox %r14,%r12 2687 mulx 3*8($aptr),%r13,%r14 2688 mov $mi,%rdx 2689 adcx 2*8($tptr),%r12 2690 adox %rax,%r13 2691 adcx 3*8($tptr),%r13 2692 adox $zero,%r14 # of=0 2693 lea 4*8($aptr),$aptr 2694 lea 4*8($tptr),$tptr 2695 adcx $zero,%r14 # cf=0 2696 2697 adox %r15,%r10 2698 mulx 0*8($nptr),%rax,%r15 2699 adcx %rax,%r10 2700 adox %r15,%r11 2701 mulx 1*8($nptr),%rax,%r15 2702 adcx %rax,%r11 2703 adox %r15,%r12 2704 mulx 2*8($nptr),%rax,%r15 2705 mov %r10,-5*8($tptr) 2706 adcx %rax,%r12 2707 adox %r15,%r13 2708 mov %r11,-4*8($tptr) 2709 mulx 3*8($nptr),%rax,%r15 2710 mov $bi,%rdx 2711 lea 4*8($nptr),$nptr 2712 mov %r12,-3*8($tptr) 2713 adcx %rax,%r13 2714 adox $zero,%r15 2715 mov %r13,-2*8($tptr) 2716 2717 dec $bptr # of=0, pass cf 2718 jnz .Lmulx4x_inner 2719 2720 mov 0+8(%rsp),$num # load -num 2721 adc $zero,%r15 # modulo-scheduled 2722 sub 0*8($tptr),$bptr # pull top-most carry to %cf 2723 mov 8+8(%rsp),$bptr # re-load &b[i] 2724 mov 16+8(%rsp),%r10 2725 adc %r15,%r14 2726 lea ($aptr,$num),$aptr # rewind $aptr 2727 adc $zero,$zero # top-most carry 2728 mov %r14,-1*8($tptr) 2729 2730 cmp %r10,$bptr 2731 jb .Lmulx4x_outer 2732 2733 mov -8($nptr),%r10 2734 mov $zero,%r8 2735 mov ($nptr,$num),%r12 2736 lea ($nptr,$num),%rbp # rewind $nptr 2737 mov $num,%rcx 2738 lea ($tptr,$num),%rdi # rewind $tptr 2739 xor %eax,%eax 2740 xor %r15,%r15 2741 sub %r14,%r10 # compare top-most words 2742 adc %r15,%r15 2743 or %r15,%r8 2744 sar \$3+2,%rcx 2745 sub %r8,%rax # %rax=-%r8 2746 mov 56+8(%rsp),%rdx # restore rp 2747 dec %r12 # so that after 'not' we get -n[0] 2748 mov 8*1(%rbp),%r13 2749 xor %r8,%r8 2750 mov 8*2(%rbp),%r14 2751 mov 8*3(%rbp),%r15 2752 jmp .Lsqrx4x_sub_entry # common post-condition 2753.size mulx4x_internal,.-mulx4x_internal 2754___ 2755}{ 2756###################################################################### 2757# void bn_power5( 2758my $rptr="%rdi"; # BN_ULONG *rptr, 2759my $aptr="%rsi"; # const BN_ULONG *aptr, 2760my $bptr="%rdx"; # const void *table, 2761my $nptr="%rcx"; # const BN_ULONG *nptr, 2762my $n0 ="%r8"; # const BN_ULONG *n0); 2763my $num ="%r9"; # int num, has to be divisible by 8 2764 # int pwr); 2765 2766my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 2767my @A0=("%r10","%r11"); 2768my @A1=("%r12","%r13"); 2769my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 2770 2771$code.=<<___; 
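___
########################################################################
# bn_powerx5 below (like bn_power5 earlier) performs one window step of
# the fixed 5-bit-window exponentiation driven from bn_exp.c: five
# back-to-back modular squarings followed by one multiplication with the
# table entry selected by the window index.  Ignoring the Montgomery
# representation, the arithmetic skeleton is (reference model only,
# never called here):
use Math::BigInt;
sub power5_step_ref {
    my ($r, $g, $n) = @_;                    # Math::BigInt; $g = gathered power of a
    for (1 .. 5) { $r = ($r * $r) % $n }     # __bn_sqrx8x_internal + post-condition, x5
    return ($r * $g) % $n;                   # mulx4x_internal
}
$code.=<<___;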
2772.type bn_powerx5,\@function,6 2773.align 32 2774bn_powerx5: 2775.cfi_startproc 2776 mov %rsp,%rax 2777.cfi_def_cfa_register %rax 2778.Lpowerx5_enter: 2779 push %rbx 2780.cfi_push %rbx 2781 push %rbp 2782.cfi_push %rbp 2783 push %r12 2784.cfi_push %r12 2785 push %r13 2786.cfi_push %r13 2787 push %r14 2788.cfi_push %r14 2789 push %r15 2790.cfi_push %r15 2791.Lpowerx5_prologue: 2792 2793 shl \$3,${num}d # convert $num to bytes 2794 lea ($num,$num,2),%r10 # 3*$num in bytes 2795 neg $num 2796 mov ($n0),$n0 # *n0 2797 2798 ############################################################## 2799 # Ensure that stack frame doesn't alias with $rptr+3*$num 2800 # modulo 4096, which covers ret[num], am[num] and n[num] 2801 # (see bn_exp.c). This is done to allow memory disambiguation 2802 # logic do its magic. [Extra 256 bytes is for power mask 2803 # calculated from 7th argument, the index.] 2804 # 2805 lea -320(%rsp,$num,2),%r11 2806 mov %rsp,%rbp 2807 sub $rptr,%r11 2808 and \$4095,%r11 2809 cmp %r11,%r10 2810 jb .Lpwrx_sp_alt 2811 sub %r11,%rbp # align with $aptr 2812 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2813 jmp .Lpwrx_sp_done 2814 2815.align 32 2816.Lpwrx_sp_alt: 2817 lea 4096-320(,$num,2),%r10 2818 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) 2819 sub %r10,%r11 2820 mov \$0,%r10 2821 cmovc %r10,%r11 2822 sub %r11,%rbp 2823.Lpwrx_sp_done: 2824 and \$-64,%rbp 2825 mov %rsp,%r11 2826 sub %rbp,%r11 2827 and \$-4096,%r11 2828 lea (%rbp,%r11),%rsp 2829 mov (%rsp),%r10 2830 cmp %rbp,%rsp 2831 ja .Lpwrx_page_walk 2832 jmp .Lpwrx_page_walk_done 2833 2834.Lpwrx_page_walk: 2835 lea -4096(%rsp),%rsp 2836 mov (%rsp),%r10 2837 cmp %rbp,%rsp 2838 ja .Lpwrx_page_walk 2839.Lpwrx_page_walk_done: 2840 2841 mov $num,%r10 2842 neg $num 2843 2844 ############################################################## 2845 # Stack layout 2846 # 2847 # +0 saved $num, used in reduction section 2848 # +8 &t[2*$num], used in reduction section 2849 # +16 intermediate carry bit 2850 # +24 top-most carry bit, used in reduction section 2851 # +32 saved *n0 2852 # +40 saved %rsp 2853 # +48 t[2*$num] 2854 # 2855 pxor %xmm0,%xmm0 2856 movq $rptr,%xmm1 # save $rptr 2857 movq $nptr,%xmm2 # save $nptr 2858 movq %r10, %xmm3 # -$num 2859 movq $bptr,%xmm4 2860 mov $n0, 32(%rsp) 2861 mov %rax, 40(%rsp) # save original %rsp 2862.cfi_cfa_expression %rsp+40,deref,+8 2863.Lpowerx5_body: 2864 2865 call __bn_sqrx8x_internal 2866 call __bn_postx4x_internal 2867 call __bn_sqrx8x_internal 2868 call __bn_postx4x_internal 2869 call __bn_sqrx8x_internal 2870 call __bn_postx4x_internal 2871 call __bn_sqrx8x_internal 2872 call __bn_postx4x_internal 2873 call __bn_sqrx8x_internal 2874 call __bn_postx4x_internal 2875 2876 mov %r10,$num # -num 2877 mov $aptr,$rptr 2878 movq %xmm2,$nptr 2879 movq %xmm4,$bptr 2880 mov 40(%rsp),%rax 2881 2882 call mulx4x_internal 2883 2884 mov 40(%rsp),%rsi # restore %rsp 2885.cfi_def_cfa %rsi,8 2886 mov \$1,%rax 2887 2888 mov -48(%rsi),%r15 2889.cfi_restore %r15 2890 mov -40(%rsi),%r14 2891.cfi_restore %r14 2892 mov -32(%rsi),%r13 2893.cfi_restore %r13 2894 mov -24(%rsi),%r12 2895.cfi_restore %r12 2896 mov -16(%rsi),%rbp 2897.cfi_restore %rbp 2898 mov -8(%rsi),%rbx 2899.cfi_restore %rbx 2900 lea (%rsi),%rsp 2901.cfi_def_cfa_register %rsp 2902.Lpowerx5_epilogue: 2903 ret 2904.cfi_endproc 2905.size bn_powerx5,.-bn_powerx5 2906 2907.globl bn_sqrx8x_internal 2908.hidden bn_sqrx8x_internal 2909.type bn_sqrx8x_internal,\@abi-omnipotent 2910.align 32 2911bn_sqrx8x_internal: 2912__bn_sqrx8x_internal: 
2913.cfi_startproc 2914 ################################################################## 2915 # Squaring part: 2916 # 2917 # a) multiply-n-add everything but a[i]*a[i]; 2918 # b) shift result of a) by 1 to the left and accumulate 2919 # a[i]*a[i] products; 2920 # 2921 ################################################################## 2922 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2923 # a[1]a[0] 2924 # a[2]a[0] 2925 # a[3]a[0] 2926 # a[2]a[1] 2927 # a[3]a[1] 2928 # a[3]a[2] 2929 # 2930 # a[4]a[0] 2931 # a[5]a[0] 2932 # a[6]a[0] 2933 # a[7]a[0] 2934 # a[4]a[1] 2935 # a[5]a[1] 2936 # a[6]a[1] 2937 # a[7]a[1] 2938 # a[4]a[2] 2939 # a[5]a[2] 2940 # a[6]a[2] 2941 # a[7]a[2] 2942 # a[4]a[3] 2943 # a[5]a[3] 2944 # a[6]a[3] 2945 # a[7]a[3] 2946 # 2947 # a[5]a[4] 2948 # a[6]a[4] 2949 # a[7]a[4] 2950 # a[6]a[5] 2951 # a[7]a[5] 2952 # a[7]a[6] 2953 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2954___ 2955{ 2956my ($zero,$carry)=("%rbp","%rcx"); 2957my $aaptr=$zero; 2958$code.=<<___; 2959 lea 48+8(%rsp),$tptr 2960 lea ($aptr,$num),$aaptr 2961 mov $num,0+8(%rsp) # save $num 2962 mov $aaptr,8+8(%rsp) # save end of $aptr 2963 jmp .Lsqr8x_zero_start 2964 2965.align 32 2966.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2967.Lsqrx8x_zero: 2968 .byte 0x3e 2969 movdqa %xmm0,0*8($tptr) 2970 movdqa %xmm0,2*8($tptr) 2971 movdqa %xmm0,4*8($tptr) 2972 movdqa %xmm0,6*8($tptr) 2973.Lsqr8x_zero_start: # aligned at 32 2974 movdqa %xmm0,8*8($tptr) 2975 movdqa %xmm0,10*8($tptr) 2976 movdqa %xmm0,12*8($tptr) 2977 movdqa %xmm0,14*8($tptr) 2978 lea 16*8($tptr),$tptr 2979 sub \$64,$num 2980 jnz .Lsqrx8x_zero 2981 2982 mov 0*8($aptr),%rdx # a[0], modulo-scheduled 2983 #xor %r9,%r9 # t[1], ex-$num, zero already 2984 xor %r10,%r10 2985 xor %r11,%r11 2986 xor %r12,%r12 2987 xor %r13,%r13 2988 xor %r14,%r14 2989 xor %r15,%r15 2990 lea 48+8(%rsp),$tptr 2991 xor $zero,$zero # cf=0, cf=0 2992 jmp .Lsqrx8x_outer_loop 2993 2994.align 32 2995.Lsqrx8x_outer_loop: 2996 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 2997 adcx %r9,%r8 # a[1]*a[0]+=t[1] 2998 adox %rax,%r10 2999 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 3000 adcx %r10,%r9 3001 adox %rax,%r11 3002 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 3003 adcx %r11,%r10 3004 adox %rax,%r12 3005 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 3006 adcx %r12,%r11 3007 adox %rax,%r13 3008 mulx 5*8($aptr),%r12,%rax 3009 adcx %r13,%r12 3010 adox %rax,%r14 3011 mulx 6*8($aptr),%r13,%rax 3012 adcx %r14,%r13 3013 adox %r15,%rax 3014 mulx 7*8($aptr),%r14,%r15 3015 mov 1*8($aptr),%rdx # a[1] 3016 adcx %rax,%r14 3017 adox $zero,%r15 3018 adc 8*8($tptr),%r15 3019 mov %r8,1*8($tptr) # t[1] 3020 mov %r9,2*8($tptr) # t[2] 3021 sbb $carry,$carry # mov %cf,$carry 3022 xor $zero,$zero # cf=0, of=0 3023 3024 3025 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 3026 mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 3027 adcx %r10,%r8 3028 adox %rbx,%r9 3029 mulx 4*8($aptr),%r10,%rbx # ... 
3030 adcx %r11,%r9 3031 adox %rax,%r10 3032 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 3033 adcx %r12,%r10 3034 adox %rbx,%r11 3035 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 3036 adcx %r13,%r11 3037 adox %r14,%r12 3038 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 3039 mov 2*8($aptr),%rdx # a[2] 3040 adcx %rax,%r12 3041 adox %rbx,%r13 3042 adcx %r15,%r13 3043 adox $zero,%r14 # of=0 3044 adcx $zero,%r14 # cf=0 3045 3046 mov %r8,3*8($tptr) # t[3] 3047 mov %r9,4*8($tptr) # t[4] 3048 3049 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 3050 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 3051 adcx %r10,%r8 3052 adox %rbx,%r9 3053 mulx 5*8($aptr),%r10,%rbx # ... 3054 adcx %r11,%r9 3055 adox %rax,%r10 3056 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 3057 adcx %r12,%r10 3058 adox %r13,%r11 3059 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 3060 .byte 0x3e 3061 mov 3*8($aptr),%rdx # a[3] 3062 adcx %rbx,%r11 3063 adox %rax,%r12 3064 adcx %r14,%r12 3065 mov %r8,5*8($tptr) # t[5] 3066 mov %r9,6*8($tptr) # t[6] 3067 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 3068 adox $zero,%r13 # of=0 3069 adcx $zero,%r13 # cf=0 3070 3071 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 3072 adcx %r10,%r8 3073 adox %rax,%r9 3074 mulx 6*8($aptr),%r10,%rax # ... 3075 adcx %r11,%r9 3076 adox %r12,%r10 3077 mulx 7*8($aptr),%r11,%r12 3078 mov 4*8($aptr),%rdx # a[4] 3079 mov 5*8($aptr),%r14 # a[5] 3080 adcx %rbx,%r10 3081 adox %rax,%r11 3082 mov 6*8($aptr),%r15 # a[6] 3083 adcx %r13,%r11 3084 adox $zero,%r12 # of=0 3085 adcx $zero,%r12 # cf=0 3086 3087 mov %r8,7*8($tptr) # t[7] 3088 mov %r9,8*8($tptr) # t[8] 3089 3090 mulx %r14,%r9,%rax # a[5]*a[4] 3091 mov 7*8($aptr),%r8 # a[7] 3092 adcx %r10,%r9 3093 mulx %r15,%r10,%rbx # a[6]*a[4] 3094 adox %rax,%r10 3095 adcx %r11,%r10 3096 mulx %r8,%r11,%rax # a[7]*a[4] 3097 mov %r14,%rdx # a[5] 3098 adox %rbx,%r11 3099 adcx %r12,%r11 3100 #adox $zero,%rax # of=0 3101 adcx $zero,%rax # cf=0 3102 3103 mulx %r15,%r14,%rbx # a[6]*a[5] 3104 mulx %r8,%r12,%r13 # a[7]*a[5] 3105 mov %r15,%rdx # a[6] 3106 lea 8*8($aptr),$aptr 3107 adcx %r14,%r11 3108 adox %rbx,%r12 3109 adcx %rax,%r12 3110 adox $zero,%r13 3111 3112 .byte 0x67,0x67 3113 mulx %r8,%r8,%r14 # a[7]*a[6] 3114 adcx %r8,%r13 3115 adcx $zero,%r14 3116 3117 cmp 8+8(%rsp),$aptr 3118 je .Lsqrx8x_outer_break 3119 3120 neg $carry # mov $carry,%cf 3121 mov \$-8,%rcx 3122 mov $zero,%r15 3123 mov 8*8($tptr),%r8 3124 adcx 9*8($tptr),%r9 # +=t[9] 3125 adcx 10*8($tptr),%r10 # ... 3126 adcx 11*8($tptr),%r11 3127 adc 12*8($tptr),%r12 3128 adc 13*8($tptr),%r13 3129 adc 14*8($tptr),%r14 3130 adc 15*8($tptr),%r15 3131 lea ($aptr),$aaptr 3132 lea 2*64($tptr),$tptr 3133 sbb %rax,%rax # mov %cf,$carry 3134 3135 mov -64($aptr),%rdx # a[0] 3136 mov %rax,16+8(%rsp) # offload $carry 3137 mov $tptr,24+8(%rsp) 3138 3139 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 3140 xor %eax,%eax # cf=0, of=0 3141 jmp .Lsqrx8x_loop 3142 3143.align 32 3144.Lsqrx8x_loop: 3145 mov %r8,%rbx 3146 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 3147 adcx %rax,%rbx # +=t[8] 3148 adox %r9,%r8 3149 3150 mulx 1*8($aaptr),%rax,%r9 # ... 
3151 adcx %rax,%r8 3152 adox %r10,%r9 3153 3154 mulx 2*8($aaptr),%rax,%r10 3155 adcx %rax,%r9 3156 adox %r11,%r10 3157 3158 mulx 3*8($aaptr),%rax,%r11 3159 adcx %rax,%r10 3160 adox %r12,%r11 3161 3162 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 3163 adcx %rax,%r11 3164 adox %r13,%r12 3165 3166 mulx 5*8($aaptr),%rax,%r13 3167 adcx %rax,%r12 3168 adox %r14,%r13 3169 3170 mulx 6*8($aaptr),%rax,%r14 3171 mov %rbx,($tptr,%rcx,8) # store t[8+i] 3172 mov \$0,%ebx 3173 adcx %rax,%r13 3174 adox %r15,%r14 3175 3176 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 3177 mov 8($aptr,%rcx,8),%rdx # a[i] 3178 adcx %rax,%r14 3179 adox %rbx,%r15 # %rbx is 0, of=0 3180 adcx %rbx,%r15 # cf=0 3181 3182 .byte 0x67 3183 inc %rcx # of=0 3184 jnz .Lsqrx8x_loop 3185 3186 lea 8*8($aaptr),$aaptr 3187 mov \$-8,%rcx 3188 cmp 8+8(%rsp),$aaptr # done? 3189 je .Lsqrx8x_break 3190 3191 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3192 .byte 0x66 3193 mov -64($aptr),%rdx 3194 adcx 0*8($tptr),%r8 3195 adcx 1*8($tptr),%r9 3196 adc 2*8($tptr),%r10 3197 adc 3*8($tptr),%r11 3198 adc 4*8($tptr),%r12 3199 adc 5*8($tptr),%r13 3200 adc 6*8($tptr),%r14 3201 adc 7*8($tptr),%r15 3202 lea 8*8($tptr),$tptr 3203 .byte 0x67 3204 sbb %rax,%rax # mov %cf,%rax 3205 xor %ebx,%ebx # cf=0, of=0 3206 mov %rax,16+8(%rsp) # offload carry 3207 jmp .Lsqrx8x_loop 3208 3209.align 32 3210.Lsqrx8x_break: 3211 xor $zero,$zero 3212 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3213 adcx $zero,%r8 3214 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 3215 adcx $zero,%r9 3216 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 3217 adc \$0,%r10 3218 mov %r8,0*8($tptr) 3219 adc \$0,%r11 3220 adc \$0,%r12 3221 adc \$0,%r13 3222 adc \$0,%r14 3223 adc \$0,%r15 3224 cmp $carry,$tptr # cf=0, of=0 3225 je .Lsqrx8x_outer_loop 3226 3227 mov %r9,1*8($tptr) 3228 mov 1*8($carry),%r9 3229 mov %r10,2*8($tptr) 3230 mov 2*8($carry),%r10 3231 mov %r11,3*8($tptr) 3232 mov 3*8($carry),%r11 3233 mov %r12,4*8($tptr) 3234 mov 4*8($carry),%r12 3235 mov %r13,5*8($tptr) 3236 mov 5*8($carry),%r13 3237 mov %r14,6*8($tptr) 3238 mov 6*8($carry),%r14 3239 mov %r15,7*8($tptr) 3240 mov 7*8($carry),%r15 3241 mov $carry,$tptr 3242 jmp .Lsqrx8x_outer_loop 3243 3244.align 32 3245.Lsqrx8x_outer_break: 3246 mov %r9,9*8($tptr) # t[9] 3247 movq %xmm3,%rcx # -$num 3248 mov %r10,10*8($tptr) # ... 
3249 mov %r11,11*8($tptr) 3250 mov %r12,12*8($tptr) 3251 mov %r13,13*8($tptr) 3252 mov %r14,14*8($tptr) 3253___ 3254}{ 3255my $i="%rcx"; 3256$code.=<<___; 3257 lea 48+8(%rsp),$tptr 3258 mov ($aptr,$i),%rdx # a[0] 3259 3260 mov 8($tptr),$A0[1] # t[1] 3261 xor $A0[0],$A0[0] # t[0], of=0, cf=0 3262 mov 0+8(%rsp),$num # restore $num 3263 adox $A0[1],$A0[1] 3264 mov 16($tptr),$A1[0] # t[2] # prefetch 3265 mov 24($tptr),$A1[1] # t[3] # prefetch 3266 #jmp .Lsqrx4x_shift_n_add # happens to be aligned 3267 3268.align 32 3269.Lsqrx4x_shift_n_add: 3270 mulx %rdx,%rax,%rbx 3271 adox $A1[0],$A1[0] 3272 adcx $A0[0],%rax 3273 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 3274 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch 3275 adox $A1[1],$A1[1] 3276 adcx $A0[1],%rbx 3277 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 3278 mov %rax,0($tptr) 3279 mov %rbx,8($tptr) 3280 3281 mulx %rdx,%rax,%rbx 3282 adox $A0[0],$A0[0] 3283 adcx $A1[0],%rax 3284 mov 16($aptr,$i),%rdx # a[i+2] # prefetch 3285 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 3286 adox $A0[1],$A0[1] 3287 adcx $A1[1],%rbx 3288 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 3289 mov %rax,16($tptr) 3290 mov %rbx,24($tptr) 3291 3292 mulx %rdx,%rax,%rbx 3293 adox $A1[0],$A1[0] 3294 adcx $A0[0],%rax 3295 mov 24($aptr,$i),%rdx # a[i+3] # prefetch 3296 lea 32($i),$i 3297 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 3298 adox $A1[1],$A1[1] 3299 adcx $A0[1],%rbx 3300 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 3301 mov %rax,32($tptr) 3302 mov %rbx,40($tptr) 3303 3304 mulx %rdx,%rax,%rbx 3305 adox $A0[0],$A0[0] 3306 adcx $A1[0],%rax 3307 jrcxz .Lsqrx4x_shift_n_add_break 3308 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 3309 adox $A0[1],$A0[1] 3310 adcx $A1[1],%rbx 3311 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 3312 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 3313 mov %rax,48($tptr) 3314 mov %rbx,56($tptr) 3315 lea 64($tptr),$tptr 3316 nop 3317 jmp .Lsqrx4x_shift_n_add 3318 3319.align 32 3320.Lsqrx4x_shift_n_add_break: 3321 adcx $A1[1],%rbx 3322 mov %rax,48($tptr) 3323 mov %rbx,56($tptr) 3324 lea 64($tptr),$tptr # end of t[] buffer 3325___ 3326} 3327###################################################################### 3328# Montgomery reduction part, "word-by-word" algorithm. 3329# 3330# This new path is inspired by multiple submissions from Intel, by 3331# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 3332# Vinodh Gopal... 
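########################################################################
# Two Math::BigInt reference models, illustrative only and never called.
#
# The squaring code above follows the schedule described at the top of
# bn_sqrx8x_internal: accumulate all cross products a[i]*a[j], i<j,
# double the result (the .Lsqrx4x_shift_n_add pass), then add the
# diagonal a[i]^2 terms.
use Math::BigInt;
sub sqrx8x_ref {
    my ($a, $nlimbs) = @_;                   # $a: array ref of Math::BigInt limbs
    my $acc = Math::BigInt->new(0);
    for my $i (0 .. $nlimbs-1) {
        for my $j ($i+1 .. $nlimbs-1) {
            $acc += ($a->[$i] * $a->[$j]) << (64*($i+$j));   # cross products
        }
    }
    $acc *= 2;                               # the one-bit left shift
    for my $i (0 .. $nlimbs-1) {
        $acc += ($a->[$i] * $a->[$i]) << (128*$i);           # diagonal
    }
    return $acc;
}
#
# The code below is word-by-word Montgomery reduction, assuming
# $n0 == -n^(-1) mod 2^64 (as stored in BN_MONT_CTX) and
# $t < n*2^(64*$nlimbs), so that a single final subtraction suffices.
sub mont_reduce_ref {
    my ($t, $n, $n0, $nlimbs) = @_;          # Math::BigInt arguments
    my $B = Math::BigInt->new(1) << 64;
    for (1 .. $nlimbs) {
        my $m = (($t % $B) * $n0) % $B;      # m = t[0]*n0 mod 2^64
        $t = ($t + $m * $n) >> 64;           # low limb cancels, exact shift
    }
    $t -= $n if $t >= $n;                    # conditional subtraction (post-condition)
    return $t;                               # t*2^(-64*$nlimbs) mod n
}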
3333{ 3334my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 3335 3336$code.=<<___; 3337 movq %xmm2,$nptr 3338__bn_sqrx8x_reduction: 3339 xor %eax,%eax # initial top-most carry bit 3340 mov 32+8(%rsp),%rbx # n0 3341 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 3342 lea -8*8($nptr,$num),%rcx # end of n[] 3343 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 3344 mov %rcx, 0+8(%rsp) # save end of n[] 3345 mov $tptr,8+8(%rsp) # save end of t[] 3346 3347 lea 48+8(%rsp),$tptr # initial t[] window 3348 jmp .Lsqrx8x_reduction_loop 3349 3350.align 32 3351.Lsqrx8x_reduction_loop: 3352 mov 8*1($tptr),%r9 3353 mov 8*2($tptr),%r10 3354 mov 8*3($tptr),%r11 3355 mov 8*4($tptr),%r12 3356 mov %rdx,%r8 3357 imulq %rbx,%rdx # n0*a[i] 3358 mov 8*5($tptr),%r13 3359 mov 8*6($tptr),%r14 3360 mov 8*7($tptr),%r15 3361 mov %rax,24+8(%rsp) # store top-most carry bit 3362 3363 lea 8*8($tptr),$tptr 3364 xor $carry,$carry # cf=0,of=0 3365 mov \$-8,%rcx 3366 jmp .Lsqrx8x_reduce 3367 3368.align 32 3369.Lsqrx8x_reduce: 3370 mov %r8, %rbx 3371 mulx 8*0($nptr),%rax,%r8 # n[0] 3372 adcx %rbx,%rax # discarded 3373 adox %r9,%r8 3374 3375 mulx 8*1($nptr),%rbx,%r9 # n[1] 3376 adcx %rbx,%r8 3377 adox %r10,%r9 3378 3379 mulx 8*2($nptr),%rbx,%r10 3380 adcx %rbx,%r9 3381 adox %r11,%r10 3382 3383 mulx 8*3($nptr),%rbx,%r11 3384 adcx %rbx,%r10 3385 adox %r12,%r11 3386 3387 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 3388 mov %rdx,%rax 3389 mov %r8,%rdx 3390 adcx %rbx,%r11 3391 adox %r13,%r12 3392 3393 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3394 mov %rax,%rdx 3395 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3396 3397 mulx 8*5($nptr),%rax,%r13 3398 adcx %rax,%r12 3399 adox %r14,%r13 3400 3401 mulx 8*6($nptr),%rax,%r14 3402 adcx %rax,%r13 3403 adox %r15,%r14 3404 3405 mulx 8*7($nptr),%rax,%r15 3406 mov %rbx,%rdx 3407 adcx %rax,%r14 3408 adox $carry,%r15 # $carry is 0 3409 adcx $carry,%r15 # cf=0 3410 3411 .byte 0x67,0x67,0x67 3412 inc %rcx # of=0 3413 jnz .Lsqrx8x_reduce 3414 3415 mov $carry,%rax # xor %rax,%rax 3416 cmp 0+8(%rsp),$nptr # end of n[]? 3417 jae .Lsqrx8x_no_tail 3418 3419 mov 48+8(%rsp),%rdx # pull n0*a[0] 3420 add 8*0($tptr),%r8 3421 lea 8*8($nptr),$nptr 3422 mov \$-8,%rcx 3423 adcx 8*1($tptr),%r9 3424 adcx 8*2($tptr),%r10 3425 adc 8*3($tptr),%r11 3426 adc 8*4($tptr),%r12 3427 adc 8*5($tptr),%r13 3428 adc 8*6($tptr),%r14 3429 adc 8*7($tptr),%r15 3430 lea 8*8($tptr),$tptr 3431 sbb %rax,%rax # top carry 3432 3433 xor $carry,$carry # of=0, cf=0 3434 mov %rax,16+8(%rsp) 3435 jmp .Lsqrx8x_tail 3436 3437.align 32 3438.Lsqrx8x_tail: 3439 mov %r8,%rbx 3440 mulx 8*0($nptr),%rax,%r8 3441 adcx %rax,%rbx 3442 adox %r9,%r8 3443 3444 mulx 8*1($nptr),%rax,%r9 3445 adcx %rax,%r8 3446 adox %r10,%r9 3447 3448 mulx 8*2($nptr),%rax,%r10 3449 adcx %rax,%r9 3450 adox %r11,%r10 3451 3452 mulx 8*3($nptr),%rax,%r11 3453 adcx %rax,%r10 3454 adox %r12,%r11 3455 3456 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 3457 adcx %rax,%r11 3458 adox %r13,%r12 3459 3460 mulx 8*5($nptr),%rax,%r13 3461 adcx %rax,%r12 3462 adox %r14,%r13 3463 3464 mulx 8*6($nptr),%rax,%r14 3465 adcx %rax,%r13 3466 adox %r15,%r14 3467 3468 mulx 8*7($nptr),%rax,%r15 3469 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3470 adcx %rax,%r14 3471 adox $carry,%r15 3472 mov %rbx,($tptr,%rcx,8) # save result 3473 mov %r8,%rbx 3474 adcx $carry,%r15 # cf=0 3475 3476 inc %rcx # of=0 3477 jnz .Lsqrx8x_tail 3478 3479 cmp 0+8(%rsp),$nptr # end of n[]? 
3480 jae .Lsqrx8x_tail_done # break out of loop 3481 3482 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3483 mov 48+8(%rsp),%rdx # pull n0*a[0] 3484 lea 8*8($nptr),$nptr 3485 adc 8*0($tptr),%r8 3486 adc 8*1($tptr),%r9 3487 adc 8*2($tptr),%r10 3488 adc 8*3($tptr),%r11 3489 adc 8*4($tptr),%r12 3490 adc 8*5($tptr),%r13 3491 adc 8*6($tptr),%r14 3492 adc 8*7($tptr),%r15 3493 lea 8*8($tptr),$tptr 3494 sbb %rax,%rax 3495 sub \$8,%rcx # mov \$-8,%rcx 3496 3497 xor $carry,$carry # of=0, cf=0 3498 mov %rax,16+8(%rsp) 3499 jmp .Lsqrx8x_tail 3500 3501.align 32 3502.Lsqrx8x_tail_done: 3503 xor %rax,%rax 3504 add 24+8(%rsp),%r8 # can this overflow? 3505 adc \$0,%r9 3506 adc \$0,%r10 3507 adc \$0,%r11 3508 adc \$0,%r12 3509 adc \$0,%r13 3510 adc \$0,%r14 3511 adc \$0,%r15 3512 adc \$0,%rax 3513 3514 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3515.Lsqrx8x_no_tail: # %cf is 0 if jumped here 3516 adc 8*0($tptr),%r8 3517 movq %xmm3,%rcx 3518 adc 8*1($tptr),%r9 3519 mov 8*7($nptr),$carry 3520 movq %xmm2,$nptr # restore $nptr 3521 adc 8*2($tptr),%r10 3522 adc 8*3($tptr),%r11 3523 adc 8*4($tptr),%r12 3524 adc 8*5($tptr),%r13 3525 adc 8*6($tptr),%r14 3526 adc 8*7($tptr),%r15 3527 adc \$0,%rax # top-most carry 3528 3529 mov 32+8(%rsp),%rbx # n0 3530 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3531 3532 mov %r8,8*0($tptr) # store top 512 bits 3533 lea 8*8($tptr),%r8 # borrow %r8 3534 mov %r9,8*1($tptr) 3535 mov %r10,8*2($tptr) 3536 mov %r11,8*3($tptr) 3537 mov %r12,8*4($tptr) 3538 mov %r13,8*5($tptr) 3539 mov %r14,8*6($tptr) 3540 mov %r15,8*7($tptr) 3541 3542 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3543 cmp 8+8(%rsp),%r8 # end of t[]? 3544 jb .Lsqrx8x_reduction_loop 3545 ret 3546.cfi_endproc 3547.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3548___ 3549} 3550############################################################## 3551# Post-condition, 4x unrolled 3552# 3553{ 3554my ($rptr,$nptr)=("%rdx","%rbp"); 3555$code.=<<___; 3556.align 32 3557__bn_postx4x_internal: 3558 mov 8*0($nptr),%r12 3559 mov %rcx,%r10 # -$num 3560 mov %rcx,%r9 # -$num 3561 neg %rax 3562 sar \$3+2,%rcx 3563 #lea 48+8(%rsp,%r9),$tptr 3564 movq %xmm1,$rptr # restore $rptr 3565 movq %xmm1,$aptr # prepare for back-to-back call 3566 dec %r12 # so that after 'not' we get -n[0] 3567 mov 8*1($nptr),%r13 3568 xor %r8,%r8 3569 mov 8*2($nptr),%r14 3570 mov 8*3($nptr),%r15 3571 jmp .Lsqrx4x_sub_entry 3572 3573.align 16 3574.Lsqrx4x_sub: 3575 mov 8*0($nptr),%r12 3576 mov 8*1($nptr),%r13 3577 mov 8*2($nptr),%r14 3578 mov 8*3($nptr),%r15 3579.Lsqrx4x_sub_entry: 3580 andn %rax,%r12,%r12 3581 lea 8*4($nptr),$nptr 3582 andn %rax,%r13,%r13 3583 andn %rax,%r14,%r14 3584 andn %rax,%r15,%r15 3585 3586 neg %r8 # mov %r8,%cf 3587 adc 8*0($tptr),%r12 3588 adc 8*1($tptr),%r13 3589 adc 8*2($tptr),%r14 3590 adc 8*3($tptr),%r15 3591 mov %r12,8*0($rptr) 3592 lea 8*4($tptr),$tptr 3593 mov %r13,8*1($rptr) 3594 sbb %r8,%r8 # mov %cf,%r8 3595 mov %r14,8*2($rptr) 3596 mov %r15,8*3($rptr) 3597 lea 8*4($rptr),$rptr 3598 3599 inc %rcx 3600 jnz .Lsqrx4x_sub 3601 3602 neg %r9 # restore $num 3603 3604 ret 3605.size __bn_postx4x_internal,.-__bn_postx4x_internal 3606___ 3607} 3608}}} 3609{ 3610my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order 3611 ("%rdi","%esi","%rdx","%ecx"); # Unix order 3612my $out=$inp; 3613my $STRIDE=2**5*8; 3614my $N=$STRIDE/4; 3615 3616$code.=<<___; 3617.globl bn_get_bits5 3618.type bn_get_bits5,\@abi-omnipotent 3619.align 16 3620bn_get_bits5: 3621 lea 0($inp),%r10 3622 lea 1($inp),%r11 3623 mov $num,%ecx 3624 shr 
\$4,$num 3625 and \$15,%ecx 3626 lea -8(%ecx),%eax 3627 cmp \$11,%ecx 3628 cmova %r11,%r10 3629 cmova %eax,%ecx 3630 movzw (%r10,$num,2),%eax 3631 shrl %cl,%eax 3632 and \$31,%eax 3633 ret 3634.size bn_get_bits5,.-bn_get_bits5 3635 3636.globl bn_scatter5 3637.type bn_scatter5,\@abi-omnipotent 3638.align 16 3639bn_scatter5: 3640 cmp \$0, $num 3641 jz .Lscatter_epilogue 3642 lea ($tbl,$idx,8),$tbl 3643.Lscatter: 3644 mov ($inp),%rax 3645 lea 8($inp),$inp 3646 mov %rax,($tbl) 3647 lea 32*8($tbl),$tbl 3648 sub \$1,$num 3649 jnz .Lscatter 3650.Lscatter_epilogue: 3651 ret 3652.size bn_scatter5,.-bn_scatter5 3653 3654.globl bn_gather5 3655.type bn_gather5,\@abi-omnipotent 3656.align 32 3657bn_gather5: 3658.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases 3659 # I can't trust assembler to use specific encoding:-( 3660 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 3661 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp 3662 lea .Linc(%rip),%rax 3663 and \$-16,%rsp # shouldn't be formally required 3664 3665 movd $idx,%xmm5 3666 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 3667 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 3668 lea 128($tbl),%r11 # size optimization 3669 lea 128(%rsp),%rax # size optimization 3670 3671 pshufd \$0,%xmm5,%xmm5 # broadcast $idx 3672 movdqa %xmm1,%xmm4 3673 movdqa %xmm1,%xmm2 3674___ 3675######################################################################## 3676# calculate mask by comparing 0..31 to $idx and save result to stack 3677# 3678for($i=0;$i<$STRIDE/16;$i+=4) { 3679$code.=<<___; 3680 paddd %xmm0,%xmm1 3681 pcmpeqd %xmm5,%xmm0 # compare to 1,0 3682___ 3683$code.=<<___ if ($i); 3684 movdqa %xmm3,`16*($i-1)-128`(%rax) 3685___ 3686$code.=<<___; 3687 movdqa %xmm4,%xmm3 3688 3689 paddd %xmm1,%xmm2 3690 pcmpeqd %xmm5,%xmm1 # compare to 3,2 3691 movdqa %xmm0,`16*($i+0)-128`(%rax) 3692 movdqa %xmm4,%xmm0 3693 3694 paddd %xmm2,%xmm3 3695 pcmpeqd %xmm5,%xmm2 # compare to 5,4 3696 movdqa %xmm1,`16*($i+1)-128`(%rax) 3697 movdqa %xmm4,%xmm1 3698 3699 paddd %xmm3,%xmm0 3700 pcmpeqd %xmm5,%xmm3 # compare to 7,6 3701 movdqa %xmm2,`16*($i+2)-128`(%rax) 3702 movdqa %xmm4,%xmm2 3703___ 3704} 3705$code.=<<___; 3706 movdqa %xmm3,`16*($i-1)-128`(%rax) 3707 jmp .Lgather 3708 3709.align 32 3710.Lgather: 3711 pxor %xmm4,%xmm4 3712 pxor %xmm5,%xmm5 3713___ 3714for($i=0;$i<$STRIDE/16;$i+=4) { 3715$code.=<<___; 3716 movdqa `16*($i+0)-128`(%r11),%xmm0 3717 movdqa `16*($i+1)-128`(%r11),%xmm1 3718 movdqa `16*($i+2)-128`(%r11),%xmm2 3719 pand `16*($i+0)-128`(%rax),%xmm0 3720 movdqa `16*($i+3)-128`(%r11),%xmm3 3721 pand `16*($i+1)-128`(%rax),%xmm1 3722 por %xmm0,%xmm4 3723 pand `16*($i+2)-128`(%rax),%xmm2 3724 por %xmm1,%xmm5 3725 pand `16*($i+3)-128`(%rax),%xmm3 3726 por %xmm2,%xmm4 3727 por %xmm3,%xmm5 3728___ 3729} 3730$code.=<<___; 3731 por %xmm5,%xmm4 3732 lea $STRIDE(%r11),%r11 3733 pshufd \$0x4e,%xmm4,%xmm0 3734 por %xmm4,%xmm0 3735 movq %xmm0,($out) # m0=bp[0] 3736 lea 8($out),$out 3737 sub \$1,$num 3738 jnz .Lgather 3739 3740 lea (%r10),%rsp 3741 ret 3742.LSEH_end_bn_gather5: 3743.size bn_gather5,.-bn_gather5 3744___ 3745} 3746$code.=<<___; 3747.align 64 3748.Linc: 3749 .long 0,0, 1,1 3750 .long 2,2, 2,2 3751.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3752___ 3753 3754# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3755# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3756if ($win64) { 3757$rec="%rcx"; 3758$frame="%rdx"; 3759$context="%r8"; 3760$disp="%r9"; 3761 
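########################################################################
# Reference model for the powers-table layout used by bn_scatter5 and
# bn_gather5 above (illustrative only, never called): limb $i of table
# entry $idx is stored at limb offset 32*$i+$idx, so bn_gather5 scans 32
# consecutive entries at every limb position and selects one with the
# mask technique modelled in gather_masked_ref() earlier.
sub scatter5_ref {
    my ($tbl, $val, $nlimbs, $idx) = @_;     # $tbl, $val: array refs of limbs
    $tbl->[32*$_ + $idx] = $val->[$_] for 0 .. $nlimbs-1;
}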
3762$code.=<<___; 3763.extern __imp_RtlVirtualUnwind 3764.type mul_handler,\@abi-omnipotent 3765.align 16 3766mul_handler: 3767 push %rsi 3768 push %rdi 3769 push %rbx 3770 push %rbp 3771 push %r12 3772 push %r13 3773 push %r14 3774 push %r15 3775 pushfq 3776 sub \$64,%rsp 3777 3778 mov 120($context),%rax # pull context->Rax 3779 mov 248($context),%rbx # pull context->Rip 3780 3781 mov 8($disp),%rsi # disp->ImageBase 3782 mov 56($disp),%r11 # disp->HandlerData 3783 3784 mov 0(%r11),%r10d # HandlerData[0] 3785 lea (%rsi,%r10),%r10 # end of prologue label 3786 cmp %r10,%rbx # context->Rip<end of prologue label 3787 jb .Lcommon_seh_tail 3788 3789 mov 4(%r11),%r10d # HandlerData[1] 3790 lea (%rsi,%r10),%r10 # beginning of body label 3791 cmp %r10,%rbx # context->Rip<body label 3792 jb .Lcommon_pop_regs 3793 3794 mov 152($context),%rax # pull context->Rsp 3795 3796 mov 8(%r11),%r10d # HandlerData[2] 3797 lea (%rsi,%r10),%r10 # epilogue label 3798 cmp %r10,%rbx # context->Rip>=epilogue label 3799 jae .Lcommon_seh_tail 3800 3801 lea .Lmul_epilogue(%rip),%r10 3802 cmp %r10,%rbx 3803 ja .Lbody_40 3804 3805 mov 192($context),%r10 # pull $num 3806 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 3807 3808 jmp .Lcommon_pop_regs 3809 3810.Lbody_40: 3811 mov 40(%rax),%rax # pull saved stack pointer 3812.Lcommon_pop_regs: 3813 mov -8(%rax),%rbx 3814 mov -16(%rax),%rbp 3815 mov -24(%rax),%r12 3816 mov -32(%rax),%r13 3817 mov -40(%rax),%r14 3818 mov -48(%rax),%r15 3819 mov %rbx,144($context) # restore context->Rbx 3820 mov %rbp,160($context) # restore context->Rbp 3821 mov %r12,216($context) # restore context->R12 3822 mov %r13,224($context) # restore context->R13 3823 mov %r14,232($context) # restore context->R14 3824 mov %r15,240($context) # restore context->R15 3825 3826.Lcommon_seh_tail: 3827 mov 8(%rax),%rdi 3828 mov 16(%rax),%rsi 3829 mov %rax,152($context) # restore context->Rsp 3830 mov %rsi,168($context) # restore context->Rsi 3831 mov %rdi,176($context) # restore context->Rdi 3832 3833 mov 40($disp),%rdi # disp->ContextRecord 3834 mov $context,%rsi # context 3835 mov \$154,%ecx # sizeof(CONTEXT) 3836 .long 0xa548f3fc # cld; rep movsq 3837 3838 mov $disp,%rsi 3839 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3840 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3841 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3842 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3843 mov 40(%rsi),%r10 # disp->ContextRecord 3844 lea 56(%rsi),%r11 # &disp->HandlerData 3845 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3846 mov %r10,32(%rsp) # arg5 3847 mov %r11,40(%rsp) # arg6 3848 mov %r12,48(%rsp) # arg7 3849 mov %rcx,56(%rsp) # arg8, (NULL) 3850 call *__imp_RtlVirtualUnwind(%rip) 3851 3852 mov \$1,%eax # ExceptionContinueSearch 3853 add \$64,%rsp 3854 popfq 3855 pop %r15 3856 pop %r14 3857 pop %r13 3858 pop %r12 3859 pop %rbp 3860 pop %rbx 3861 pop %rdi 3862 pop %rsi 3863 ret 3864.size mul_handler,.-mul_handler 3865 3866.section .pdata 3867.align 4 3868 .rva .LSEH_begin_bn_mul_mont_gather5 3869 .rva .LSEH_end_bn_mul_mont_gather5 3870 .rva .LSEH_info_bn_mul_mont_gather5 3871 3872 .rva .LSEH_begin_bn_mul4x_mont_gather5 3873 .rva .LSEH_end_bn_mul4x_mont_gather5 3874 .rva .LSEH_info_bn_mul4x_mont_gather5 3875 3876 .rva .LSEH_begin_bn_power5 3877 .rva .LSEH_end_bn_power5 3878 .rva .LSEH_info_bn_power5 3879 3880 .rva .LSEH_begin_bn_from_mont8x 3881 .rva .LSEH_end_bn_from_mont8x 3882 .rva .LSEH_info_bn_from_mont8x 3883___ 3884$code.=<<___ if ($addx); 3885 .rva .LSEH_begin_bn_mulx4x_mont_gather5 3886 .rva .LSEH_end_bn_mulx4x_mont_gather5 
	.rva	.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue	# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;
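########################################################################
# Reference model for bn_get_bits5 above (illustrative only, never
# called).  Given the exponent as a little-endian byte string (the
# in-memory layout of the BN_ULONG array on x86_64) and a bit offset, it
# returns the 5-bit window (exp>>$pos)&31.  The assembly computes the
# same value with 16-bit (movzw) loads, bumping the load address by one
# byte whenever the window would otherwise spill past bit 15.
sub get_bits5_ref {
    my ($bytes, $pos) = @_;
    my $word = unpack("v", substr($bytes."\0\0", int($pos/8), 2));   # 16-bit LE load
    return ($word >> ($pos%8)) & 0x1f;
}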