1#! /usr/bin/env perl 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# Multi-buffer SHA1 procedure processes n buffers in parallel by 18# placing buffer data to designated lane of SIMD register. n is 19# naturally limited to 4 on pre-AVX2 processors and to 8 on 20# AVX2-capable processors such as Haswell. 21# 22# this +aesni(i) sha1 aesni-sha1 gain(iv) 23# ------------------------------------------------------------------- 24# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% 25# Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51% 26# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% 27# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% 28# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% 29# Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145% 30# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% 31# 32# (i) multi-block CBC encrypt with 128-bit key; 33# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, 34# because of lower AES-NI instruction throughput; 35# (iii) "this" is for n=8, when we gather twice as much data, result 36# for n=4 is 8.00+4.44=12.4; 37# (iv) presented improvement coefficients are asymptotic limits and 38# in real-life application are somewhat lower, e.g. 
for 2KB 39# fragments they range from 30% to 100% (on Haswell); 40 41$flavour = shift; 42$output = shift; 43if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 44 45$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 46 47$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 48( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 49( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 50die "can't locate x86_64-xlate.pl"; 51 52$avx=0; 53 54if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 55 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 56 $avx = ($1>=2.19) + ($1>=2.22); 57} 58 59if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 60 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 61 $avx = ($1>=2.09) + ($1>=2.10); 62} 63 64if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 65 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 66 $avx = ($1>=10) + ($1>=11); 67} 68 69if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { 70 $avx = ($2>=3.0) + ($2>3.0); 71} 72 73open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 74*STDOUT=*OUT; 75 76# void sha1_multi_block ( 77# struct { unsigned int A[8]; 78# unsigned int B[8]; 79# unsigned int C[8]; 80# unsigned int D[8]; 81# unsigned int E[8]; } *ctx, 82# struct { void *ptr; int blocks; } inp[8], 83# int num); /* 1 or 2 */ 84# 85$ctx="%rdi"; # 1st arg 86$inp="%rsi"; # 2nd arg 87$num="%edx"; 88@ptr=map("%r$_",(8..11)); 89$Tbl="%rbp"; 90 91@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4)); 92($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9)); 93@Xi=map("%xmm$_",(10..14)); 94$K="%xmm15"; 95 96if (1) { 97 # Atom-specific optimization aiming to eliminate pshufb with high 98 # registers [and thus get rid of 48 cycles accumulated penalty] 99 @Xi=map("%xmm$_",(0..4)); 100 ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); 101 @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); 102} 103 104$REG_SZ=16; 105 106sub Xi_off { 107my $off = shift; 108 109 
$off %= 16; $off *= $REG_SZ; 110 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; 111} 112 113sub BODY_00_19 { 114my ($i,$a,$b,$c,$d,$e)=@_; 115my $j=$i+1; 116my $k=$i+2; 117 118# Loads are performed 2+3/4 iterations in advance. 3/4 means that out 119# of 4 words you would expect to be loaded per given iteration one is 120# spilled to next iteration. In other words indices in four input 121# streams are distributed as following: 122# 123# $i==0: 0,0,0,0,1,1,1,1,2,2,2, 124# $i==1: 2,3,3,3, 125# $i==2: 3,4,4,4, 126# ... 127# $i==13: 14,15,15,15, 128# $i==14: 15 129# 130# Then at $i==15 Xupdate is applied one iteration in advance... 131$code.=<<___ if ($i==0); 132 movd (@ptr[0]),@Xi[0] 133 lea `16*4`(@ptr[0]),@ptr[0] 134 movd (@ptr[1]),@Xi[2] # borrow @Xi[2] 135 lea `16*4`(@ptr[1]),@ptr[1] 136 movd (@ptr[2]),@Xi[3] # borrow @Xi[3] 137 lea `16*4`(@ptr[2]),@ptr[2] 138 movd (@ptr[3]),@Xi[4] # borrow @Xi[4] 139 lea `16*4`(@ptr[3]),@ptr[3] 140 punpckldq @Xi[3],@Xi[0] 141 movd `4*$j-16*4`(@ptr[0]),@Xi[1] 142 punpckldq @Xi[4],@Xi[2] 143 movd `4*$j-16*4`(@ptr[1]),$t3 144 punpckldq @Xi[2],@Xi[0] 145 movd `4*$j-16*4`(@ptr[2]),$t2 146 pshufb $tx,@Xi[0] 147___ 148$code.=<<___ if ($i<14); # just load input 149 movd `4*$j-16*4`(@ptr[3]),$t1 150 punpckldq $t2,@Xi[1] 151 movdqa $a,$t2 152 paddd $K,$e # e+=K_00_19 153 punpckldq $t1,$t3 154 movdqa $b,$t1 155 movdqa $b,$t0 156 pslld \$5,$t2 157 pandn $d,$t1 158 pand $c,$t0 159 punpckldq $t3,@Xi[1] 160 movdqa $a,$t3 161 162 movdqa @Xi[0],`&Xi_off($i)` 163 paddd @Xi[0],$e # e+=X[i] 164 movd `4*$k-16*4`(@ptr[0]),@Xi[2] 165 psrld \$27,$t3 166 pxor $t1,$t0 # Ch(b,c,d) 167 movdqa $b,$t1 168 169 por $t3,$t2 # rol(a,5) 170 movd `4*$k-16*4`(@ptr[1]),$t3 171 pslld \$30,$t1 172 paddd $t0,$e # e+=Ch(b,c,d) 173 174 psrld \$2,$b 175 paddd $t2,$e # e+=rol(a,5) 176 pshufb $tx,@Xi[1] 177 movd `4*$k-16*4`(@ptr[2]),$t2 178 por $t1,$b # b=rol(b,30) 179___ 180$code.=<<___ if ($i==14); # just load input 181 movd `4*$j-16*4`(@ptr[3]),$t1 182 punpckldq 
$t2,@Xi[1] 183 movdqa $a,$t2 184 paddd $K,$e # e+=K_00_19 185 punpckldq $t1,$t3 186 movdqa $b,$t1 187 movdqa $b,$t0 188 pslld \$5,$t2 189 prefetcht0 63(@ptr[0]) 190 pandn $d,$t1 191 pand $c,$t0 192 punpckldq $t3,@Xi[1] 193 movdqa $a,$t3 194 195 movdqa @Xi[0],`&Xi_off($i)` 196 paddd @Xi[0],$e # e+=X[i] 197 psrld \$27,$t3 198 pxor $t1,$t0 # Ch(b,c,d) 199 movdqa $b,$t1 200 prefetcht0 63(@ptr[1]) 201 202 por $t3,$t2 # rol(a,5) 203 pslld \$30,$t1 204 paddd $t0,$e # e+=Ch(b,c,d) 205 prefetcht0 63(@ptr[2]) 206 207 psrld \$2,$b 208 paddd $t2,$e # e+=rol(a,5) 209 pshufb $tx,@Xi[1] 210 prefetcht0 63(@ptr[3]) 211 por $t1,$b # b=rol(b,30) 212___ 213$code.=<<___ if ($i>=13 && $i<15); 214 movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" 215___ 216$code.=<<___ if ($i>=15); # apply Xupdate 217 pxor @Xi[-2],@Xi[1] # "X[13]" 218 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 219 220 movdqa $a,$t2 221 pxor `&Xi_off($j+8)`,@Xi[1] 222 paddd $K,$e # e+=K_00_19 223 movdqa $b,$t1 224 pslld \$5,$t2 225 pxor @Xi[3],@Xi[1] 226 movdqa $b,$t0 227 pandn $d,$t1 228 movdqa @Xi[1],$tx 229 pand $c,$t0 230 movdqa $a,$t3 231 psrld \$31,$tx 232 paddd @Xi[1],@Xi[1] 233 234 movdqa @Xi[0],`&Xi_off($i)` 235 paddd @Xi[0],$e # e+=X[i] 236 psrld \$27,$t3 237 pxor $t1,$t0 # Ch(b,c,d) 238 239 movdqa $b,$t1 240 por $t3,$t2 # rol(a,5) 241 pslld \$30,$t1 242 paddd $t0,$e # e+=Ch(b,c,d) 243 244 psrld \$2,$b 245 paddd $t2,$e # e+=rol(a,5) 246 por $tx,@Xi[1] # rol \$1,@Xi[1] 247 por $t1,$b # b=rol(b,30) 248___ 249push(@Xi,shift(@Xi)); 250} 251 252sub BODY_20_39 { 253my ($i,$a,$b,$c,$d,$e)=@_; 254my $j=$i+1; 255 256$code.=<<___ if ($i<79); 257 pxor @Xi[-2],@Xi[1] # "X[13]" 258 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 259 260 movdqa $a,$t2 261 movdqa $d,$t0 262 pxor `&Xi_off($j+8)`,@Xi[1] 263 paddd $K,$e # e+=K_20_39 264 pslld \$5,$t2 265 pxor $b,$t0 266 267 movdqa $a,$t3 268___ 269$code.=<<___ if ($i<72); 270 movdqa @Xi[0],`&Xi_off($i)` 271___ 272$code.=<<___ if ($i<79); 273 paddd @Xi[0],$e # e+=X[i] 274 pxor @Xi[3],@Xi[1] 275 
psrld \$27,$t3 276 pxor $c,$t0 # Parity(b,c,d) 277 movdqa $b,$t1 278 279 pslld \$30,$t1 280 movdqa @Xi[1],$tx 281 por $t3,$t2 # rol(a,5) 282 psrld \$31,$tx 283 paddd $t0,$e # e+=Parity(b,c,d) 284 paddd @Xi[1],@Xi[1] 285 286 psrld \$2,$b 287 paddd $t2,$e # e+=rol(a,5) 288 por $tx,@Xi[1] # rol(@Xi[1],1) 289 por $t1,$b # b=rol(b,30) 290___ 291$code.=<<___ if ($i==79); 292 movdqa $a,$t2 293 paddd $K,$e # e+=K_20_39 294 movdqa $d,$t0 295 pslld \$5,$t2 296 pxor $b,$t0 297 298 movdqa $a,$t3 299 paddd @Xi[0],$e # e+=X[i] 300 psrld \$27,$t3 301 movdqa $b,$t1 302 pxor $c,$t0 # Parity(b,c,d) 303 304 pslld \$30,$t1 305 por $t3,$t2 # rol(a,5) 306 paddd $t0,$e # e+=Parity(b,c,d) 307 308 psrld \$2,$b 309 paddd $t2,$e # e+=rol(a,5) 310 por $t1,$b # b=rol(b,30) 311___ 312push(@Xi,shift(@Xi)); 313} 314 315sub BODY_40_59 { 316my ($i,$a,$b,$c,$d,$e)=@_; 317my $j=$i+1; 318 319$code.=<<___; 320 pxor @Xi[-2],@Xi[1] # "X[13]" 321 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 322 323 movdqa $a,$t2 324 movdqa $d,$t1 325 pxor `&Xi_off($j+8)`,@Xi[1] 326 pxor @Xi[3],@Xi[1] 327 paddd $K,$e # e+=K_40_59 328 pslld \$5,$t2 329 movdqa $a,$t3 330 pand $c,$t1 331 332 movdqa $d,$t0 333 movdqa @Xi[1],$tx 334 psrld \$27,$t3 335 paddd $t1,$e 336 pxor $c,$t0 337 338 movdqa @Xi[0],`&Xi_off($i)` 339 paddd @Xi[0],$e # e+=X[i] 340 por $t3,$t2 # rol(a,5) 341 psrld \$31,$tx 342 pand $b,$t0 343 movdqa $b,$t1 344 345 pslld \$30,$t1 346 paddd @Xi[1],@Xi[1] 347 paddd $t0,$e # e+=Maj(b,d,c) 348 349 psrld \$2,$b 350 paddd $t2,$e # e+=rol(a,5) 351 por $tx,@Xi[1] # rol(@X[1],1) 352 por $t1,$b # b=rol(b,30) 353___ 354push(@Xi,shift(@Xi)); 355} 356 357$code.=<<___; 358.text 359 360.extern OPENSSL_ia32cap_P 361 362.globl sha1_multi_block 363.type sha1_multi_block,\@function,3 364.align 32 365sha1_multi_block: 366.cfi_startproc 367 mov OPENSSL_ia32cap_P+4(%rip),%rcx 368 bt \$61,%rcx # check SHA bit 369 jc _shaext_shortcut 370___ 371$code.=<<___ if ($avx); 372 test \$`1<<28`,%ecx 373 jnz _avx_shortcut 374___ 375$code.=<<___; 376 
mov %rsp,%rax 377.cfi_def_cfa_register %rax 378 push %rbx 379.cfi_push %rbx 380 push %rbp 381.cfi_push %rbx 382___ 383$code.=<<___ if ($win64); 384 lea -0xa8(%rsp),%rsp 385 movaps %xmm6,(%rsp) 386 movaps %xmm7,0x10(%rsp) 387 movaps %xmm8,0x20(%rsp) 388 movaps %xmm9,0x30(%rsp) 389 movaps %xmm10,-0x78(%rax) 390 movaps %xmm11,-0x68(%rax) 391 movaps %xmm12,-0x58(%rax) 392 movaps %xmm13,-0x48(%rax) 393 movaps %xmm14,-0x38(%rax) 394 movaps %xmm15,-0x28(%rax) 395___ 396$code.=<<___; 397 sub \$`$REG_SZ*18`,%rsp 398 and \$-256,%rsp 399 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 400.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 401.Lbody: 402 lea K_XX_XX(%rip),$Tbl 403 lea `$REG_SZ*16`(%rsp),%rbx 404 405.Loop_grande: 406 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 407 xor $num,$num 408___ 409for($i=0;$i<4;$i++) { 410 $code.=<<___; 411 mov `16*$i+0`($inp),@ptr[$i] # input pointer 412 mov `16*$i+8`($inp),%ecx # number of blocks 413 cmp $num,%ecx 414 cmovg %ecx,$num # find maximum 415 test %ecx,%ecx 416 mov %ecx,`4*$i`(%rbx) # initialize counters 417 cmovle $Tbl,@ptr[$i] # cancel input 418___ 419} 420$code.=<<___; 421 test $num,$num 422 jz .Ldone 423 424 movdqu 0x00($ctx),$A # load context 425 lea 128(%rsp),%rax 426 movdqu 0x20($ctx),$B 427 movdqu 0x40($ctx),$C 428 movdqu 0x60($ctx),$D 429 movdqu 0x80($ctx),$E 430 movdqa 0x60($Tbl),$tx # pbswap_mask 431 movdqa -0x20($Tbl),$K # K_00_19 432 jmp .Loop 433 434.align 32 435.Loop: 436___ 437for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 438$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39 439for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 440$code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59 441for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 442$code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79 443for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 444$code.=<<___; 445 movdqa (%rbx),@Xi[0] # pull counters 446 mov \$1,%ecx 447 cmp 4*0(%rbx),%ecx # examine counters 448 pxor $t2,$t2 449 cmovge 
$Tbl,@ptr[0] # cancel input 450 cmp 4*1(%rbx),%ecx 451 movdqa @Xi[0],@Xi[1] 452 cmovge $Tbl,@ptr[1] 453 cmp 4*2(%rbx),%ecx 454 pcmpgtd $t2,@Xi[1] # mask value 455 cmovge $Tbl,@ptr[2] 456 cmp 4*3(%rbx),%ecx 457 paddd @Xi[1],@Xi[0] # counters-- 458 cmovge $Tbl,@ptr[3] 459 460 movdqu 0x00($ctx),$t0 461 pand @Xi[1],$A 462 movdqu 0x20($ctx),$t1 463 pand @Xi[1],$B 464 paddd $t0,$A 465 movdqu 0x40($ctx),$t2 466 pand @Xi[1],$C 467 paddd $t1,$B 468 movdqu 0x60($ctx),$t3 469 pand @Xi[1],$D 470 paddd $t2,$C 471 movdqu 0x80($ctx),$tx 472 pand @Xi[1],$E 473 movdqu $A,0x00($ctx) 474 paddd $t3,$D 475 movdqu $B,0x20($ctx) 476 paddd $tx,$E 477 movdqu $C,0x40($ctx) 478 movdqu $D,0x60($ctx) 479 movdqu $E,0x80($ctx) 480 481 movdqa @Xi[0],(%rbx) # save counters 482 movdqa 0x60($Tbl),$tx # pbswap_mask 483 movdqa -0x20($Tbl),$K # K_00_19 484 dec $num 485 jnz .Loop 486 487 mov `$REG_SZ*17+8`(%rsp),$num 488 lea $REG_SZ($ctx),$ctx 489 lea `16*$REG_SZ/4`($inp),$inp 490 dec $num 491 jnz .Loop_grande 492 493.Ldone: 494 mov `$REG_SZ*17`(%rsp),%rax # original %rsp 495.cfi_def_cfa %rax,8 496___ 497$code.=<<___ if ($win64); 498 movaps -0xb8(%rax),%xmm6 499 movaps -0xa8(%rax),%xmm7 500 movaps -0x98(%rax),%xmm8 501 movaps -0x88(%rax),%xmm9 502 movaps -0x78(%rax),%xmm10 503 movaps -0x68(%rax),%xmm11 504 movaps -0x58(%rax),%xmm12 505 movaps -0x48(%rax),%xmm13 506 movaps -0x38(%rax),%xmm14 507 movaps -0x28(%rax),%xmm15 508___ 509$code.=<<___; 510 mov -16(%rax),%rbp 511.cfi_restore %rbp 512 mov -8(%rax),%rbx 513.cfi_restore %rbx 514 lea (%rax),%rsp 515.cfi_def_cfa_register %rsp 516.Lepilogue: 517 ret 518.cfi_endproc 519.size sha1_multi_block,.-sha1_multi_block 520___ 521 {{{ 522my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10)); 523my @MSG0=map("%xmm$_",(4..7)); 524my @MSG1=map("%xmm$_",(11..14)); 525 526$code.=<<___; 527.type sha1_multi_block_shaext,\@function,3 528.align 32 529sha1_multi_block_shaext: 530.cfi_startproc 531_shaext_shortcut: 532 mov %rsp,%rax 533.cfi_def_cfa_register 
%rax 534 push %rbx 535.cfi_push %rbx 536 push %rbp 537.cfi_push %rbp 538___ 539$code.=<<___ if ($win64); 540 lea -0xa8(%rsp),%rsp 541 movaps %xmm6,(%rsp) 542 movaps %xmm7,0x10(%rsp) 543 movaps %xmm8,0x20(%rsp) 544 movaps %xmm9,0x30(%rsp) 545 movaps %xmm10,-0x78(%rax) 546 movaps %xmm11,-0x68(%rax) 547 movaps %xmm12,-0x58(%rax) 548 movaps %xmm13,-0x48(%rax) 549 movaps %xmm14,-0x38(%rax) 550 movaps %xmm15,-0x28(%rax) 551___ 552$code.=<<___; 553 sub \$`$REG_SZ*18`,%rsp 554 shl \$1,$num # we process pair at a time 555 and \$-256,%rsp 556 lea 0x40($ctx),$ctx # size optimization 557 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 558.Lbody_shaext: 559 lea `$REG_SZ*16`(%rsp),%rbx 560 movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap 561 562.Loop_grande_shaext: 563 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 564 xor $num,$num 565___ 566for($i=0;$i<2;$i++) { 567 $code.=<<___; 568 mov `16*$i+0`($inp),@ptr[$i] # input pointer 569 mov `16*$i+8`($inp),%ecx # number of blocks 570 cmp $num,%ecx 571 cmovg %ecx,$num # find maximum 572 test %ecx,%ecx 573 mov %ecx,`4*$i`(%rbx) # initialize counters 574 cmovle %rsp,@ptr[$i] # cancel input 575___ 576} 577$code.=<<___; 578 test $num,$num 579 jz .Ldone_shaext 580 581 movq 0x00-0x40($ctx),$ABCD0 # a1.a0 582 movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 583 movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 584 movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 585 movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 586 587 punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 588 punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 589 590 movdqa $ABCD0,$ABCD1 591 punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0 592 punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 593 594 pshufd \$0b00111111,@MSG0[3],$E0 595 pshufd \$0b01111111,@MSG0[3],$E1 596 pshufd \$0b00011011,$ABCD0,$ABCD0 597 pshufd \$0b00011011,$ABCD1,$ABCD1 598 jmp .Loop_shaext 599 600.align 32 601.Loop_shaext: 602 movdqu 0x00(@ptr[0]),@MSG0[0] 603 movdqu 0x00(@ptr[1]),@MSG1[0] 604 movdqu 0x10(@ptr[0]),@MSG0[1] 605 movdqu 0x10(@ptr[1]),@MSG1[1] 606 movdqu 
0x20(@ptr[0]),@MSG0[2] 607 pshufb $BSWAP,@MSG0[0] 608 movdqu 0x20(@ptr[1]),@MSG1[2] 609 pshufb $BSWAP,@MSG1[0] 610 movdqu 0x30(@ptr[0]),@MSG0[3] 611 lea 0x40(@ptr[0]),@ptr[0] 612 pshufb $BSWAP,@MSG0[1] 613 movdqu 0x30(@ptr[1]),@MSG1[3] 614 lea 0x40(@ptr[1]),@ptr[1] 615 pshufb $BSWAP,@MSG1[1] 616 617 movdqa $E0,0x50(%rsp) # offload 618 paddd @MSG0[0],$E0 619 movdqa $E1,0x70(%rsp) 620 paddd @MSG1[0],$E1 621 movdqa $ABCD0,0x40(%rsp) # offload 622 movdqa $ABCD0,$E0_ 623 movdqa $ABCD1,0x60(%rsp) 624 movdqa $ABCD1,$E1_ 625 sha1rnds4 \$0,$E0,$ABCD0 # 0-3 626 sha1nexte @MSG0[1],$E0_ 627 sha1rnds4 \$0,$E1,$ABCD1 # 0-3 628 sha1nexte @MSG1[1],$E1_ 629 pshufb $BSWAP,@MSG0[2] 630 prefetcht0 127(@ptr[0]) 631 sha1msg1 @MSG0[1],@MSG0[0] 632 pshufb $BSWAP,@MSG1[2] 633 prefetcht0 127(@ptr[1]) 634 sha1msg1 @MSG1[1],@MSG1[0] 635 636 pshufb $BSWAP,@MSG0[3] 637 movdqa $ABCD0,$E0 638 pshufb $BSWAP,@MSG1[3] 639 movdqa $ABCD1,$E1 640 sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 641 sha1nexte @MSG0[2],$E0 642 sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 643 sha1nexte @MSG1[2],$E1 644 pxor @MSG0[2],@MSG0[0] 645 sha1msg1 @MSG0[2],@MSG0[1] 646 pxor @MSG1[2],@MSG1[0] 647 sha1msg1 @MSG1[2],@MSG1[1] 648___ 649for($i=2;$i<20-4;$i++) { 650$code.=<<___; 651 movdqa $ABCD0,$E0_ 652 movdqa $ABCD1,$E1_ 653 sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 654 sha1nexte @MSG0[3],$E0_ 655 sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 656 sha1nexte @MSG1[3],$E1_ 657 sha1msg2 @MSG0[3],@MSG0[0] 658 sha1msg2 @MSG1[3],@MSG1[0] 659 pxor @MSG0[3],@MSG0[1] 660 sha1msg1 @MSG0[3],@MSG0[2] 661 pxor @MSG1[3],@MSG1[1] 662 sha1msg1 @MSG1[3],@MSG1[2] 663___ 664 ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); 665 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); 666} 667$code.=<<___; 668 movdqa $ABCD0,$E0_ 669 movdqa $ABCD1,$E1_ 670 sha1rnds4 \$3,$E0,$ABCD0 # 64-67 671 sha1nexte @MSG0[3],$E0_ 672 sha1rnds4 \$3,$E1,$ABCD1 # 64-67 673 sha1nexte @MSG1[3],$E1_ 674 sha1msg2 @MSG0[3],@MSG0[0] 675 sha1msg2 @MSG1[3],@MSG1[0] 676 pxor @MSG0[3],@MSG0[1] 677 pxor 
@MSG1[3],@MSG1[1] 678 679 mov \$1,%ecx 680 pxor @MSG0[2],@MSG0[2] # zero 681 cmp 4*0(%rbx),%ecx # examine counters 682 cmovge %rsp,@ptr[0] # cancel input 683 684 movdqa $ABCD0,$E0 685 movdqa $ABCD1,$E1 686 sha1rnds4 \$3,$E0_,$ABCD0 # 68-71 687 sha1nexte @MSG0[0],$E0 688 sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 689 sha1nexte @MSG1[0],$E1 690 sha1msg2 @MSG0[0],@MSG0[1] 691 sha1msg2 @MSG1[0],@MSG1[1] 692 693 cmp 4*1(%rbx),%ecx 694 cmovge %rsp,@ptr[1] 695 movq (%rbx),@MSG0[0] # pull counters 696 697 movdqa $ABCD0,$E0_ 698 movdqa $ABCD1,$E1_ 699 sha1rnds4 \$3,$E0,$ABCD0 # 72-75 700 sha1nexte @MSG0[1],$E0_ 701 sha1rnds4 \$3,$E1,$ABCD1 # 72-75 702 sha1nexte @MSG1[1],$E1_ 703 704 pshufd \$0x00,@MSG0[0],@MSG1[2] 705 pshufd \$0x55,@MSG0[0],@MSG1[3] 706 movdqa @MSG0[0],@MSG0[1] 707 pcmpgtd @MSG0[2],@MSG1[2] 708 pcmpgtd @MSG0[2],@MSG1[3] 709 710 movdqa $ABCD0,$E0 711 movdqa $ABCD1,$E1 712 sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 713 sha1nexte $MSG0[2],$E0 714 sha1rnds4 \$3,$E1_,$ABCD1 # 76-79 715 sha1nexte $MSG0[2],$E1 716 717 pcmpgtd @MSG0[2],@MSG0[1] # counter mask 718 pand @MSG1[2],$ABCD0 719 pand @MSG1[2],$E0 720 pand @MSG1[3],$ABCD1 721 pand @MSG1[3],$E1 722 paddd @MSG0[1],@MSG0[0] # counters-- 723 724 paddd 0x40(%rsp),$ABCD0 725 paddd 0x50(%rsp),$E0 726 paddd 0x60(%rsp),$ABCD1 727 paddd 0x70(%rsp),$E1 728 729 movq @MSG0[0],(%rbx) # save counters 730 dec $num 731 jnz .Loop_shaext 732 733 mov `$REG_SZ*17+8`(%rsp),$num 734 735 pshufd \$0b00011011,$ABCD0,$ABCD0 736 pshufd \$0b00011011,$ABCD1,$ABCD1 737 738 movdqa $ABCD0,@MSG0[0] 739 punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 740 punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 741 punpckhdq $E1,$E0 # e1.e0.xx.xx 742 movq $ABCD0,0x00-0x40($ctx) # a1.a0 743 psrldq \$8,$ABCD0 744 movq @MSG0[0],0x40-0x40($ctx)# c1.c0 745 psrldq \$8,@MSG0[0] 746 movq $ABCD0,0x20-0x40($ctx) # b1.b0 747 psrldq \$8,$E0 748 movq @MSG0[0],0x60-0x40($ctx)# d1.d0 749 movq $E0,0x80-0x40($ctx) # e1.e0 750 751 lea `$REG_SZ/2`($ctx),$ctx 752 lea `16*2`($inp),$inp 753 dec $num 754 jnz 
.Loop_grande_shaext 755 756.Ldone_shaext: 757 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp 758___ 759$code.=<<___ if ($win64); 760 movaps -0xb8(%rax),%xmm6 761 movaps -0xa8(%rax),%xmm7 762 movaps -0x98(%rax),%xmm8 763 movaps -0x88(%rax),%xmm9 764 movaps -0x78(%rax),%xmm10 765 movaps -0x68(%rax),%xmm11 766 movaps -0x58(%rax),%xmm12 767 movaps -0x48(%rax),%xmm13 768 movaps -0x38(%rax),%xmm14 769 movaps -0x28(%rax),%xmm15 770___ 771$code.=<<___; 772 mov -16(%rax),%rbp 773.cfi_restore %rbp 774 mov -8(%rax),%rbx 775.cfi_restore %rbx 776 lea (%rax),%rsp 777.cfi_def_cfa_register %rsp 778.Lepilogue_shaext: 779 ret 780.cfi_endproc 781.size sha1_multi_block_shaext,.-sha1_multi_block_shaext 782___ 783 }}} 784 785 if ($avx) {{{ 786sub BODY_00_19_avx { 787my ($i,$a,$b,$c,$d,$e)=@_; 788my $j=$i+1; 789my $k=$i+2; 790my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128"; 791my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4]; 792 793$code.=<<___ if ($i==0 && $REG_SZ==16); 794 vmovd (@ptr[0]),@Xi[0] 795 lea `16*4`(@ptr[0]),@ptr[0] 796 vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] 797 lea `16*4`(@ptr[1]),@ptr[1] 798 vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 799 lea `16*4`(@ptr[2]),@ptr[2] 800 vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] 801 lea `16*4`(@ptr[3]),@ptr[3] 802 vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 803 vpunpckldq @Xi[2],@Xi[0],@Xi[0] 804 vmovd `4*$j-16*4`($ptr_n),$t3 805 vpshufb $tx,@Xi[0],@Xi[0] 806___ 807$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input 808 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 809 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 810___ 811$code.=<<___ if ($i==0 && $REG_SZ==32); 812 vmovd (@ptr[0]),@Xi[0] 813 lea `16*4`(@ptr[0]),@ptr[0] 814 vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] 815 lea `16*4`(@ptr[4]),@ptr[4] 816 vmovd (@ptr[1]),$t2 817 lea `16*4`(@ptr[1]),@ptr[1] 818 vmovd (@ptr[5]),$t1 819 lea `16*4`(@ptr[5]),@ptr[5] 820 vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 821 lea `16*4`(@ptr[2]),@ptr[2] 822 vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] 823 lea `16*4`(@ptr[6]),@ptr[6] 824 vpinsrd 
\$1,(@ptr[3]),$t2,$t2 825 lea `16*4`(@ptr[3]),@ptr[3] 826 vpunpckldq $t2,@Xi[0],@Xi[0] 827 vpinsrd \$1,(@ptr[7]),$t1,$t1 828 lea `16*4`(@ptr[7]),@ptr[7] 829 vpunpckldq $t1,@Xi[2],@Xi[2] 830 vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 831 vinserti128 @Xi[2],@Xi[0],@Xi[0] 832 vmovd `4*$j-16*4`($ptr_n),$t3 833 vpshufb $tx,@Xi[0],@Xi[0] 834___ 835$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input 836 vmovd `4*$j-16*4`(@ptr[1]),$t2 837 vmovd `4*$j-16*4`(@ptr[5]),$t1 838 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 839 vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 840 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 841 vpunpckldq $t2,@Xi[1],@Xi[1] 842 vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 843 vpunpckldq $t1,$t3,$t3 844___ 845$code.=<<___ if ($i<14); 846 vpaddd $K,$e,$e # e+=K_00_19 847 vpslld \$5,$a,$t2 848 vpandn $d,$b,$t1 849 vpand $c,$b,$t0 850 851 vmovdqa @Xi[0],`&Xi_off($i)` 852 vpaddd @Xi[0],$e,$e # e+=X[i] 853 $vpack $t3,@Xi[1],@Xi[1] 854 vpsrld \$27,$a,$t3 855 vpxor $t1,$t0,$t0 # Ch(b,c,d) 856 vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] 857 858 vpslld \$30,$b,$t1 859 vpor $t3,$t2,$t2 # rol(a,5) 860 vmovd `4*$k-16*4`($ptr_n),$t3 861 vpaddd $t0,$e,$e # e+=Ch(b,c,d) 862 863 vpsrld \$2,$b,$b 864 vpaddd $t2,$e,$e # e+=rol(a,5) 865 vpshufb $tx,@Xi[1],@Xi[1] 866 vpor $t1,$b,$b # b=rol(b,30) 867___ 868$code.=<<___ if ($i==14); 869 vpaddd $K,$e,$e # e+=K_00_19 870 prefetcht0 63(@ptr[0]) 871 vpslld \$5,$a,$t2 872 vpandn $d,$b,$t1 873 vpand $c,$b,$t0 874 875 vmovdqa @Xi[0],`&Xi_off($i)` 876 vpaddd @Xi[0],$e,$e # e+=X[i] 877 $vpack $t3,@Xi[1],@Xi[1] 878 vpsrld \$27,$a,$t3 879 prefetcht0 63(@ptr[1]) 880 vpxor $t1,$t0,$t0 # Ch(b,c,d) 881 882 vpslld \$30,$b,$t1 883 vpor $t3,$t2,$t2 # rol(a,5) 884 prefetcht0 63(@ptr[2]) 885 vpaddd $t0,$e,$e # e+=Ch(b,c,d) 886 887 vpsrld \$2,$b,$b 888 vpaddd $t2,$e,$e # e+=rol(a,5) 889 prefetcht0 63(@ptr[3]) 890 vpshufb $tx,@Xi[1],@Xi[1] 891 vpor $t1,$b,$b # b=rol(b,30) 892___ 893$code.=<<___ if ($i>=13 && $i<15); 894 vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" 
895___ 896$code.=<<___ if ($i>=15); # apply Xupdate 897 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 898 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 899 900 vpaddd $K,$e,$e # e+=K_00_19 901 vpslld \$5,$a,$t2 902 vpandn $d,$b,$t1 903 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` 904 vpand $c,$b,$t0 905 906 vmovdqa @Xi[0],`&Xi_off($i)` 907 vpaddd @Xi[0],$e,$e # e+=X[i] 908 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 909 vpsrld \$27,$a,$t3 910 vpxor $t1,$t0,$t0 # Ch(b,c,d) 911 vpxor @Xi[3],@Xi[1],@Xi[1] 912 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` 913 914 vpslld \$30,$b,$t1 915 vpor $t3,$t2,$t2 # rol(a,5) 916 vpaddd $t0,$e,$e # e+=Ch(b,c,d) 917 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` 918 vpsrld \$31,@Xi[1],$tx 919 vpaddd @Xi[1],@Xi[1],@Xi[1] 920 921 vpsrld \$2,$b,$b 922 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` 923 vpaddd $t2,$e,$e # e+=rol(a,5) 924 vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] 925 vpor $t1,$b,$b # b=rol(b,30) 926___ 927push(@Xi,shift(@Xi)); 928} 929 930sub BODY_20_39_avx { 931my ($i,$a,$b,$c,$d,$e)=@_; 932my $j=$i+1; 933 934$code.=<<___ if ($i<79); 935 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 936 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 937 938 vpslld \$5,$a,$t2 939 vpaddd $K,$e,$e # e+=K_20_39 940 vpxor $b,$d,$t0 941___ 942$code.=<<___ if ($i<72); 943 vmovdqa @Xi[0],`&Xi_off($i)` 944___ 945$code.=<<___ if ($i<79); 946 vpaddd @Xi[0],$e,$e # e+=X[i] 947 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 948 vpsrld \$27,$a,$t3 949 vpxor $c,$t0,$t0 # Parity(b,c,d) 950 vpxor @Xi[3],@Xi[1],@Xi[1] 951 952 vpslld \$30,$b,$t1 953 vpor $t3,$t2,$t2 # rol(a,5) 954 vpaddd $t0,$e,$e # e+=Parity(b,c,d) 955 vpsrld \$31,@Xi[1],$tx 956 vpaddd @Xi[1],@Xi[1],@Xi[1] 957 958 vpsrld \$2,$b,$b 959 vpaddd $t2,$e,$e # e+=rol(a,5) 960 vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) 961 vpor $t1,$b,$b # b=rol(b,30) 962___ 963$code.=<<___ if ($i==79); 964 vpslld \$5,$a,$t2 965 vpaddd $K,$e,$e # e+=K_20_39 966 vpxor $b,$d,$t0 967 968 vpsrld \$27,$a,$t3 969 vpaddd @Xi[0],$e,$e # 
e+=X[i] 970 vpxor $c,$t0,$t0 # Parity(b,c,d) 971 972 vpslld \$30,$b,$t1 973 vpor $t3,$t2,$t2 # rol(a,5) 974 vpaddd $t0,$e,$e # e+=Parity(b,c,d) 975 976 vpsrld \$2,$b,$b 977 vpaddd $t2,$e,$e # e+=rol(a,5) 978 vpor $t1,$b,$b # b=rol(b,30) 979___ 980push(@Xi,shift(@Xi)); 981} 982 983sub BODY_40_59_avx { 984my ($i,$a,$b,$c,$d,$e)=@_; 985my $j=$i+1; 986 987$code.=<<___; 988 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 989 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 990 991 vpaddd $K,$e,$e # e+=K_40_59 992 vpslld \$5,$a,$t2 993 vpand $c,$d,$t1 994 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 995 996 vpaddd $t1,$e,$e 997 vpsrld \$27,$a,$t3 998 vpxor $c,$d,$t0 999 vpxor @Xi[3],@Xi[1],@Xi[1] 1000 1001 vmovdqu @Xi[0],`&Xi_off($i)` 1002 vpaddd @Xi[0],$e,$e # e+=X[i] 1003 vpor $t3,$t2,$t2 # rol(a,5) 1004 vpsrld \$31,@Xi[1],$tx 1005 vpand $b,$t0,$t0 1006 vpaddd @Xi[1],@Xi[1],@Xi[1] 1007 1008 vpslld \$30,$b,$t1 1009 vpaddd $t0,$e,$e # e+=Maj(b,d,c) 1010 1011 vpsrld \$2,$b,$b 1012 vpaddd $t2,$e,$e # e+=rol(a,5) 1013 vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) 1014 vpor $t1,$b,$b # b=rol(b,30) 1015___ 1016push(@Xi,shift(@Xi)); 1017} 1018 1019$code.=<<___; 1020.type sha1_multi_block_avx,\@function,3 1021.align 32 1022sha1_multi_block_avx: 1023.cfi_startproc 1024_avx_shortcut: 1025___ 1026$code.=<<___ if ($avx>1); 1027 shr \$32,%rcx 1028 cmp \$2,$num 1029 jb .Lavx 1030 test \$`1<<5`,%ecx 1031 jnz _avx2_shortcut 1032 jmp .Lavx 1033.align 32 1034.Lavx: 1035___ 1036$code.=<<___; 1037 mov %rsp,%rax 1038.cfi_def_cfa_register %rax 1039 push %rbx 1040.cfi_push %rbx 1041 push %rbp 1042.cfi_push %rbp 1043___ 1044$code.=<<___ if ($win64); 1045 lea -0xa8(%rsp),%rsp 1046 movaps %xmm6,(%rsp) 1047 movaps %xmm7,0x10(%rsp) 1048 movaps %xmm8,0x20(%rsp) 1049 movaps %xmm9,0x30(%rsp) 1050 movaps %xmm10,-0x78(%rax) 1051 movaps %xmm11,-0x68(%rax) 1052 movaps %xmm12,-0x58(%rax) 1053 movaps %xmm13,-0x48(%rax) 1054 movaps %xmm14,-0x38(%rax) 1055 movaps %xmm15,-0x28(%rax) 1056___ 1057$code.=<<___; 1058 sub \$`$REG_SZ*18`, %rsp 1059 
and \$-256,%rsp 1060 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1061.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 1062.Lbody_avx: 1063 lea K_XX_XX(%rip),$Tbl 1064 lea `$REG_SZ*16`(%rsp),%rbx 1065 1066 vzeroupper 1067.Loop_grande_avx: 1068 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1069 xor $num,$num 1070___ 1071for($i=0;$i<4;$i++) { 1072 $code.=<<___; 1073 mov `16*$i+0`($inp),@ptr[$i] # input pointer 1074 mov `16*$i+8`($inp),%ecx # number of blocks 1075 cmp $num,%ecx 1076 cmovg %ecx,$num # find maximum 1077 test %ecx,%ecx 1078 mov %ecx,`4*$i`(%rbx) # initialize counters 1079 cmovle $Tbl,@ptr[$i] # cancel input 1080___ 1081} 1082$code.=<<___; 1083 test $num,$num 1084 jz .Ldone_avx 1085 1086 vmovdqu 0x00($ctx),$A # load context 1087 lea 128(%rsp),%rax 1088 vmovdqu 0x20($ctx),$B 1089 vmovdqu 0x40($ctx),$C 1090 vmovdqu 0x60($ctx),$D 1091 vmovdqu 0x80($ctx),$E 1092 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1093 jmp .Loop_avx 1094 1095.align 32 1096.Loop_avx: 1097___ 1098$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 1099for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 1100$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 1101for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1102$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 1103for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 1104$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 1105for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1106$code.=<<___; 1107 mov \$1,%ecx 1108___ 1109for($i=0;$i<4;$i++) { 1110 $code.=<<___; 1111 cmp `4*$i`(%rbx),%ecx # examine counters 1112 cmovge $Tbl,@ptr[$i] # cancel input 1113___ 1114} 1115$code.=<<___; 1116 vmovdqu (%rbx),$t0 # pull counters 1117 vpxor $t2,$t2,$t2 1118 vmovdqa $t0,$t1 1119 vpcmpgtd $t2,$t1,$t1 # mask value 1120 vpaddd $t1,$t0,$t0 # counters-- 1121 1122 vpand $t1,$A,$A 1123 vpand $t1,$B,$B 1124 vpaddd 0x00($ctx),$A,$A 1125 vpand $t1,$C,$C 1126 vpaddd 0x20($ctx),$B,$B 1127 vpand $t1,$D,$D 1128 vpaddd 
0x40($ctx),$C,$C 1129 vpand $t1,$E,$E 1130 vpaddd 0x60($ctx),$D,$D 1131 vpaddd 0x80($ctx),$E,$E 1132 vmovdqu $A,0x00($ctx) 1133 vmovdqu $B,0x20($ctx) 1134 vmovdqu $C,0x40($ctx) 1135 vmovdqu $D,0x60($ctx) 1136 vmovdqu $E,0x80($ctx) 1137 1138 vmovdqu $t0,(%rbx) # save counters 1139 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1140 dec $num 1141 jnz .Loop_avx 1142 1143 mov `$REG_SZ*17+8`(%rsp),$num 1144 lea $REG_SZ($ctx),$ctx 1145 lea `16*$REG_SZ/4`($inp),$inp 1146 dec $num 1147 jnz .Loop_grande_avx 1148 1149.Ldone_avx: 1150 mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1151.cfi_def_cfa %rax,8 1152 vzeroupper 1153___ 1154$code.=<<___ if ($win64); 1155 movaps -0xb8(%rax),%xmm6 1156 movaps -0xa8(%rax),%xmm7 1157 movaps -0x98(%rax),%xmm8 1158 movaps -0x88(%rax),%xmm9 1159 movaps -0x78(%rax),%xmm10 1160 movaps -0x68(%rax),%xmm11 1161 movaps -0x58(%rax),%xmm12 1162 movaps -0x48(%rax),%xmm13 1163 movaps -0x38(%rax),%xmm14 1164 movaps -0x28(%rax),%xmm15 1165___ 1166$code.=<<___; 1167 mov -16(%rax),%rbp 1168.cfi_restore %rbp 1169 mov -8(%rax),%rbx 1170.cfi_restore %rbx 1171 lea (%rax),%rsp 1172.cfi_def_cfa_register %rsp 1173.Lepilogue_avx: 1174 ret 1175.cfi_endproc 1176.size sha1_multi_block_avx,.-sha1_multi_block_avx 1177___ 1178 1179 if ($avx>1) { 1180$code =~ s/\`([^\`]*)\`/eval $1/gem; 1181 1182$REG_SZ=32; 1183 1184@ptr=map("%r$_",(12..15,8..11)); 1185 1186@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); 1187($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); 1188@Xi=map("%ymm$_",(10..14)); 1189$K="%ymm15"; 1190 1191$code.=<<___; 1192.type sha1_multi_block_avx2,\@function,3 1193.align 32 1194sha1_multi_block_avx2: 1195.cfi_startproc 1196_avx2_shortcut: 1197 mov %rsp,%rax 1198.cfi_def_cfa_register %rax 1199 push %rbx 1200.cfi_push %rbx 1201 push %rbp 1202.cfi_push %rbp 1203 push %r12 1204.cfi_push %r12 1205 push %r13 1206.cfi_push %r13 1207 push %r14 1208.cfi_push %r14 1209 push %r15 1210.cfi_push %r15 1211___ 1212$code.=<<___ if ($win64); 1213 lea -0xa8(%rsp),%rsp 1214 movaps %xmm6,(%rsp) 1215 
movaps %xmm7,0x10(%rsp) 1216 movaps %xmm8,0x20(%rsp) 1217 movaps %xmm9,0x30(%rsp) 1218 movaps %xmm10,0x40(%rsp) 1219 movaps %xmm11,0x50(%rsp) 1220 movaps %xmm12,-0x78(%rax) 1221 movaps %xmm13,-0x68(%rax) 1222 movaps %xmm14,-0x58(%rax) 1223 movaps %xmm15,-0x48(%rax) 1224___ 1225$code.=<<___; 1226 sub \$`$REG_SZ*18`, %rsp 1227 and \$-256,%rsp 1228 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1229.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 1230.Lbody_avx2: 1231 lea K_XX_XX(%rip),$Tbl 1232 shr \$1,$num 1233 1234 vzeroupper 1235.Loop_grande_avx2: 1236 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1237 xor $num,$num 1238 lea `$REG_SZ*16`(%rsp),%rbx 1239___ 1240for($i=0;$i<8;$i++) { 1241 $code.=<<___; 1242 mov `16*$i+0`($inp),@ptr[$i] # input pointer 1243 mov `16*$i+8`($inp),%ecx # number of blocks 1244 cmp $num,%ecx 1245 cmovg %ecx,$num # find maximum 1246 test %ecx,%ecx 1247 mov %ecx,`4*$i`(%rbx) # initialize counters 1248 cmovle $Tbl,@ptr[$i] # cancel input 1249___ 1250} 1251$code.=<<___; 1252 vmovdqu 0x00($ctx),$A # load context 1253 lea 128(%rsp),%rax 1254 vmovdqu 0x20($ctx),$B 1255 lea 256+128(%rsp),%rbx 1256 vmovdqu 0x40($ctx),$C 1257 vmovdqu 0x60($ctx),$D 1258 vmovdqu 0x80($ctx),$E 1259 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1260 jmp .Loop_avx2 1261 1262.align 32 1263.Loop_avx2: 1264___ 1265$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 1266for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 1267$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 1268for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1269$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 1270for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 1271$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 1272for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1273$code.=<<___; 1274 mov \$1,%ecx 1275 lea `$REG_SZ*16`(%rsp),%rbx 1276___ 1277for($i=0;$i<8;$i++) { 1278 $code.=<<___; 1279 cmp `4*$i`(%rbx),%ecx # examine counters 1280 cmovge $Tbl,@ptr[$i] # cancel input 
1281___ 1282} 1283$code.=<<___; 1284 vmovdqu (%rbx),$t0 # pull counters 1285 vpxor $t2,$t2,$t2 1286 vmovdqa $t0,$t1 1287 vpcmpgtd $t2,$t1,$t1 # mask value 1288 vpaddd $t1,$t0,$t0 # counters-- 1289 1290 vpand $t1,$A,$A 1291 vpand $t1,$B,$B 1292 vpaddd 0x00($ctx),$A,$A 1293 vpand $t1,$C,$C 1294 vpaddd 0x20($ctx),$B,$B 1295 vpand $t1,$D,$D 1296 vpaddd 0x40($ctx),$C,$C 1297 vpand $t1,$E,$E 1298 vpaddd 0x60($ctx),$D,$D 1299 vpaddd 0x80($ctx),$E,$E 1300 vmovdqu $A,0x00($ctx) 1301 vmovdqu $B,0x20($ctx) 1302 vmovdqu $C,0x40($ctx) 1303 vmovdqu $D,0x60($ctx) 1304 vmovdqu $E,0x80($ctx) 1305 1306 vmovdqu $t0,(%rbx) # save counters 1307 lea 256+128(%rsp),%rbx 1308 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1309 dec $num 1310 jnz .Loop_avx2 1311 1312 #mov `$REG_SZ*17+8`(%rsp),$num 1313 #lea $REG_SZ($ctx),$ctx 1314 #lea `16*$REG_SZ/4`($inp),$inp 1315 #dec $num 1316 #jnz .Loop_grande_avx2 1317 1318.Ldone_avx2: 1319 mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1320.cfi_def_cfa %rax,8 1321 vzeroupper 1322___ 1323$code.=<<___ if ($win64); 1324 movaps -0xd8(%rax),%xmm6 1325 movaps -0xc8(%rax),%xmm7 1326 movaps -0xb8(%rax),%xmm8 1327 movaps -0xa8(%rax),%xmm9 1328 movaps -0x98(%rax),%xmm10 1329 movaps -0x88(%rax),%xmm11 1330 movaps -0x78(%rax),%xmm12 1331 movaps -0x68(%rax),%xmm13 1332 movaps -0x58(%rax),%xmm14 1333 movaps -0x48(%rax),%xmm15 1334___ 1335$code.=<<___; 1336 mov -48(%rax),%r15 1337.cfi_restore %r15 1338 mov -40(%rax),%r14 1339.cfi_restore %r14 1340 mov -32(%rax),%r13 1341.cfi_restore %r13 1342 mov -24(%rax),%r12 1343.cfi_restore %r12 1344 mov -16(%rax),%rbp 1345.cfi_restore %rbp 1346 mov -8(%rax),%rbx 1347.cfi_restore %rbx 1348 lea (%rax),%rsp 1349.cfi_def_cfa_register %rsp 1350.Lepilogue_avx2: 1351 ret 1352.cfi_endproc 1353.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 1354___ 1355 } }}} 1356$code.=<<___; 1357 1358.align 256 1359 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1360 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1361K_XX_XX: 1362 
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Win64 only: structured-exception-handling support.  Two handlers are
# emitted, followed by the .pdata/.xdata tables that attach them to the
# generated functions.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: handler for the functions whose frame keeps only %rbx and
# %rbp (plus ten 16-byte %xmm slots) below the saved stack pointer.
# HandlerData[0]/[1] are the RVAs of the body and epilogue labels.  When
# context->Rip lies between them, the original stack pointer is fetched
# from 16*17 bytes above context->Rsp and the saved registers are copied
# back into the CONTEXT record.  In every case Rsp/Rsi/Rdi are then
# restored, the CONTEXT is copied to disp->ContextRecord,
# RtlVirtualUnwind is called, and ExceptionContinueSearch is returned.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

# avx2_handler: same protocol as se_handler, for the AVX2 function, which
# additionally saves %r12-%r15.  It shares the .Lin_prologue tail that is
# defined inside se_handler.
#
# The saved stack pointer is read relative to context->Rsp (held in %rax
# at that point), matching the `16*17`(%rax) load in se_handler and the
# slot at $REG_SZ*17(%rsp) that the AVX2 code uses for the original %rsp.
# The previous form, `32*17`($context), indexed the CONTEXT record itself
# and so picked up unrelated saved register data.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___

# .pdata: one (begin, end, unwind-info) RVA triple per generated function.
# The AVX and AVX2 entries are emitted only when the corresponding code
# path was generated ($avx / $avx>1).
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___

# .xdata: unwind-info records.  Each names its handler and supplies the
# body/epilogue label pair that the handler reads as HandlerData[0..1].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

# Prepend a REX prefix byte to an opcode sequence when either register
# operand has number 8 or higher.
#   $opcode - reference to the array of opcode bytes (modified in place)
#   $dst    - register number placed in the ModR/M reg field (sets 0x04)
#   $src    - register number placed in the ModR/M r/m field (sets 0x01)
# Nothing is added when both register numbers are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @$opcode,$rex|0x40	if ($rex);
}

# Hand-encode "sha1rnds4 \$imm,%xmmS,%xmmD" as a .byte sequence.
#   $_[0] - operand text following the mnemonic
# Returns the ".byte ..." line when the operands are an immediate and two
# %xmm registers; otherwise returns the instruction text unchanged.
sub sha1rnds4 {
  my $args=$_[0];

    if ($args =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my ($imm,$src,$dst)=($1,$2,$3);
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# a leading 0 marks a hex/octal literal, which oct() converts
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$args;
    }
}

# Hand-encode the two-operand 0x0f,0x38 SHA1 instructions (sha1nexte,
# sha1msg1, sha1msg2) as a .byte sequence.
#   $instr - mnemonic
#   $_[0]  - operand text following the mnemonic
# Returns the ".byte ..." line when the mnemonic is known and both
# operands are %xmm registers; otherwise returns the instruction text
# unchanged.
sub sha1op38 {
    my $instr = shift;
    my $args = $_[0];
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my ($src,$dst)=($1,$2);
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$args;
    }
}

# Post-process the accumulated code line by line and print it (STDOUT was
# redirected to the x86_64-xlate.pl pipe earlier in the file).
foreach (split("\n",$code)) {
	# evaluate `...` arithmetic embedded in the heredocs
	s/\`([^\`]*)\`/eval($1)/ge;

	# At most one of the following rewrites applies to a line: the SHA
	# instructions are replaced with .byte encodings, and the listed
	# AVX instructions get their %ymmN operands rewritten as %xmmN.
	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

# closing the pipe surfaces write errors and a failing translator
close STDOUT or die "error closing STDOUT: $!";