1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# Multi-buffer SHA1 procedure processes n buffers in parallel by 11# placing buffer data to designated lane of SIMD register. n is 12# naturally limited to 4 on pre-AVX2 processors and to 8 on 13# AVX2-capable processors such as Haswell. 14# 15# this +aesni(i) sha1 aesni-sha1 gain(iv) 16# ------------------------------------------------------------------- 17# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% 18# Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51% 19# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% 20# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% 21# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% 22# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% 23# 24# (i) multi-block CBC encrypt with 128-bit key; 25# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, 26# because of lower AES-NI instruction throughput; 27# (iii) "this" is for n=8, when we gather twice as much data, result 28# for n=4 is 8.00+4.44=12.4; 29# (iv) presented improvement coefficients are asymptotic limits and 30# in real-life application are somewhat lower, e.g. for 2KB 31# fragments they range from 30% to 100% (on Haswell); 32 33$flavour = shift; 34$output = shift; 35if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 36 37$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 38 39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 40( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 41( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 42die "can't locate x86_64-xlate.pl"; 43 44$avx=0; 45 46if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 47 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 48 $avx = ($1>=2.19) + ($1>=2.22); 49} 50 51if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 52 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 53 $avx = ($1>=2.09) + ($1>=2.10); 54} 55 56if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 57 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 58 $avx = ($1>=10) + ($1>=11); 59} 60 61if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) { 62 $avx = ($2>=3.0) + ($2>3.0); 63} 64 65open OUT,"| \"$^X\" $xlate $flavour $output"; 66*STDOUT=*OUT; 67 68# void sha1_multi_block ( 69# struct { unsigned int A[8]; 70# unsigned int B[8]; 71# unsigned int C[8]; 72# unsigned int D[8]; 73# unsigned int E[8]; } *ctx, 74# struct { void *ptr; int blocks; } inp[8], 75# int num); /* 1 or 2 */ 76# 77$ctx="%rdi"; # 1st arg 78$inp="%rsi"; # 2nd arg 79$num="%edx"; 80@ptr=map("%r$_",(8..11)); 81$Tbl="%rbp"; 82 83@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4)); 84($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9)); 85@Xi=map("%xmm$_",(10..14)); 86$K="%xmm15"; 87 88if (1) { 89 # Atom-specific optimization aiming to eliminate pshufb with high 90 # registers [and thus get rid of 48 cycles accumulated penalty] 91 @Xi=map("%xmm$_",(0..4)); 92 ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); 93 @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); 94} 95 96$REG_SZ=16; 97 98sub Xi_off { 99my $off = shift; 100 101 $off %= 16; $off *= $REG_SZ; 102 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; 103} 104 105sub BODY_00_19 { 106my ($i,$a,$b,$c,$d,$e)=@_; 107my $j=$i+1; 108my $k=$i+2; 109 110# Loads are performed 2+3/4 iterations in advance. 3/4 means that out 111# of 4 words you would expect to be loaded per given iteration one is 112# spilled to next iteration. In other words indices in four input 113# streams are distributed as following: 114# 115# $i==0: 0,0,0,0,1,1,1,1,2,2,2, 116# $i==1: 2,3,3,3, 117# $i==2: 3,4,4,4, 118# ... 119# $i==13: 14,15,15,15, 120# $i==14: 15 121# 122# Then at $i==15 Xupdate is applied one iteration in advance... 123$code.=<<___ if ($i==0); 124 movd (@ptr[0]),@Xi[0] 125 lea `16*4`(@ptr[0]),@ptr[0] 126 movd (@ptr[1]),@Xi[2] # borrow @Xi[2] 127 lea `16*4`(@ptr[1]),@ptr[1] 128 movd (@ptr[2]),@Xi[3] # borrow @Xi[3] 129 lea `16*4`(@ptr[2]),@ptr[2] 130 movd (@ptr[3]),@Xi[4] # borrow @Xi[4] 131 lea `16*4`(@ptr[3]),@ptr[3] 132 punpckldq @Xi[3],@Xi[0] 133 movd `4*$j-16*4`(@ptr[0]),@Xi[1] 134 punpckldq @Xi[4],@Xi[2] 135 movd `4*$j-16*4`(@ptr[1]),$t3 136 punpckldq @Xi[2],@Xi[0] 137 movd `4*$j-16*4`(@ptr[2]),$t2 138 pshufb $tx,@Xi[0] 139___ 140$code.=<<___ if ($i<14); # just load input 141 movd `4*$j-16*4`(@ptr[3]),$t1 142 punpckldq $t2,@Xi[1] 143 movdqa $a,$t2 144 paddd $K,$e # e+=K_00_19 145 punpckldq $t1,$t3 146 movdqa $b,$t1 147 movdqa $b,$t0 148 pslld \$5,$t2 149 pandn $d,$t1 150 pand $c,$t0 151 punpckldq $t3,@Xi[1] 152 movdqa $a,$t3 153 154 movdqa @Xi[0],`&Xi_off($i)` 155 paddd @Xi[0],$e # e+=X[i] 156 movd `4*$k-16*4`(@ptr[0]),@Xi[2] 157 psrld \$27,$t3 158 pxor $t1,$t0 # Ch(b,c,d) 159 movdqa $b,$t1 160 161 por $t3,$t2 # rol(a,5) 162 movd `4*$k-16*4`(@ptr[1]),$t3 163 pslld \$30,$t1 164 paddd $t0,$e # e+=Ch(b,c,d) 165 166 psrld \$2,$b 167 paddd $t2,$e # e+=rol(a,5) 168 pshufb $tx,@Xi[1] 169 movd `4*$k-16*4`(@ptr[2]),$t2 170 por $t1,$b # b=rol(b,30) 171___ 172$code.=<<___ if ($i==14); # just load input 173 movd `4*$j-16*4`(@ptr[3]),$t1 174 punpckldq $t2,@Xi[1] 175 movdqa $a,$t2 176 paddd $K,$e # e+=K_00_19 177 punpckldq $t1,$t3 178 movdqa $b,$t1 179 movdqa $b,$t0 180 pslld \$5,$t2 181 prefetcht0 63(@ptr[0]) 182 pandn $d,$t1 183 pand $c,$t0 184 punpckldq $t3,@Xi[1] 185 movdqa $a,$t3 186 187 movdqa @Xi[0],`&Xi_off($i)` 188 paddd @Xi[0],$e # e+=X[i] 189 psrld \$27,$t3 190 pxor $t1,$t0 # Ch(b,c,d) 191 movdqa $b,$t1 192 prefetcht0 63(@ptr[1]) 193 194 por $t3,$t2 # rol(a,5) 195 pslld \$30,$t1 196 paddd $t0,$e # e+=Ch(b,c,d) 197 prefetcht0 63(@ptr[2]) 198 199 psrld \$2,$b 200 paddd $t2,$e # e+=rol(a,5) 201 pshufb $tx,@Xi[1] 202 prefetcht0 63(@ptr[3]) 203 por $t1,$b # b=rol(b,30) 204___ 205$code.=<<___ if ($i>=13 && $i<15); 206 movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" 207___ 208$code.=<<___ if ($i>=15); # apply Xupdate 209 pxor @Xi[-2],@Xi[1] # "X[13]" 210 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 211 212 movdqa $a,$t2 213 pxor `&Xi_off($j+8)`,@Xi[1] 214 paddd $K,$e # e+=K_00_19 215 movdqa $b,$t1 216 pslld \$5,$t2 217 pxor @Xi[3],@Xi[1] 218 movdqa $b,$t0 219 pandn $d,$t1 220 movdqa @Xi[1],$tx 221 pand $c,$t0 222 movdqa $a,$t3 223 psrld \$31,$tx 224 paddd @Xi[1],@Xi[1] 225 226 movdqa @Xi[0],`&Xi_off($i)` 227 paddd @Xi[0],$e # e+=X[i] 228 psrld \$27,$t3 229 pxor $t1,$t0 # Ch(b,c,d) 230 231 movdqa $b,$t1 232 por $t3,$t2 # rol(a,5) 233 pslld \$30,$t1 234 paddd $t0,$e # e+=Ch(b,c,d) 235 236 psrld \$2,$b 237 paddd $t2,$e # e+=rol(a,5) 238 por $tx,@Xi[1] # rol \$1,@Xi[1] 239 por $t1,$b # b=rol(b,30) 240___ 241push(@Xi,shift(@Xi)); 242} 243 244sub BODY_20_39 { 245my ($i,$a,$b,$c,$d,$e)=@_; 246my $j=$i+1; 247 248$code.=<<___ if ($i<79); 249 pxor @Xi[-2],@Xi[1] # "X[13]" 250 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 251 252 movdqa $a,$t2 253 movdqa $d,$t0 254 pxor `&Xi_off($j+8)`,@Xi[1] 255 paddd $K,$e # e+=K_20_39 256 pslld \$5,$t2 257 pxor $b,$t0 258 259 movdqa $a,$t3 260___ 261$code.=<<___ if ($i<72); 262 movdqa @Xi[0],`&Xi_off($i)` 263___ 264$code.=<<___ if ($i<79); 265 paddd @Xi[0],$e # e+=X[i] 266 pxor @Xi[3],@Xi[1] 267 psrld \$27,$t3 268 pxor $c,$t0 # Parity(b,c,d) 269 movdqa $b,$t1 270 271 pslld \$30,$t1 272 movdqa @Xi[1],$tx 273 por $t3,$t2 # rol(a,5) 274 psrld \$31,$tx 275 paddd $t0,$e # e+=Parity(b,c,d) 276 paddd @Xi[1],@Xi[1] 277 278 psrld \$2,$b 279 paddd $t2,$e # e+=rol(a,5) 280 por $tx,@Xi[1] # rol(@Xi[1],1) 281 por $t1,$b # b=rol(b,30) 282___ 283$code.=<<___ if ($i==79); 284 movdqa $a,$t2 285 paddd $K,$e # e+=K_20_39 286 movdqa $d,$t0 287 pslld \$5,$t2 288 pxor $b,$t0 289 290 movdqa $a,$t3 291 paddd @Xi[0],$e # e+=X[i] 292 psrld \$27,$t3 293 movdqa $b,$t1 294 pxor $c,$t0 # Parity(b,c,d) 295 296 pslld \$30,$t1 297 por $t3,$t2 # rol(a,5) 298 paddd $t0,$e # e+=Parity(b,c,d) 299 300 psrld \$2,$b 301 paddd $t2,$e # e+=rol(a,5) 302 por $t1,$b # b=rol(b,30) 303___ 304push(@Xi,shift(@Xi)); 305} 306 307sub BODY_40_59 { 308my ($i,$a,$b,$c,$d,$e)=@_; 309my $j=$i+1; 310 311$code.=<<___; 312 pxor @Xi[-2],@Xi[1] # "X[13]" 313 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 314 315 movdqa $a,$t2 316 movdqa $d,$t1 317 pxor `&Xi_off($j+8)`,@Xi[1] 318 pxor @Xi[3],@Xi[1] 319 paddd $K,$e # e+=K_40_59 320 pslld \$5,$t2 321 movdqa $a,$t3 322 pand $c,$t1 323 324 movdqa $d,$t0 325 movdqa @Xi[1],$tx 326 psrld \$27,$t3 327 paddd $t1,$e 328 pxor $c,$t0 329 330 movdqa @Xi[0],`&Xi_off($i)` 331 paddd @Xi[0],$e # e+=X[i] 332 por $t3,$t2 # rol(a,5) 333 psrld \$31,$tx 334 pand $b,$t0 335 movdqa $b,$t1 336 337 pslld \$30,$t1 338 paddd @Xi[1],@Xi[1] 339 paddd $t0,$e # e+=Maj(b,d,c) 340 341 psrld \$2,$b 342 paddd $t2,$e # e+=rol(a,5) 343 por $tx,@Xi[1] # rol(@X[1],1) 344 por $t1,$b # b=rol(b,30) 345___ 346push(@Xi,shift(@Xi)); 347} 348 349$code.=<<___; 350.text 351 352.extern OPENSSL_ia32cap_P 353 354.globl sha1_multi_block 355.type sha1_multi_block,\@function,3 356.align 32 357sha1_multi_block: 358 mov OPENSSL_ia32cap_P+4(%rip),%rcx 359 bt \$61,%rcx # check SHA bit 360 jc _shaext_shortcut 361___ 362$code.=<<___ if ($avx); 363 test \$`1<<28`,%ecx 364 jnz _avx_shortcut 365___ 366$code.=<<___; 367 mov %rsp,%rax 368 push %rbx 369 push %rbp 370___ 371$code.=<<___ if ($win64); 372 lea -0xa8(%rsp),%rsp 373 movaps %xmm6,(%rsp) 374 movaps %xmm7,0x10(%rsp) 375 movaps %xmm8,0x20(%rsp) 376 movaps %xmm9,0x30(%rsp) 377 movaps %xmm10,-0x78(%rax) 378 movaps %xmm11,-0x68(%rax) 379 movaps %xmm12,-0x58(%rax) 380 movaps %xmm13,-0x48(%rax) 381 movaps %xmm14,-0x38(%rax) 382 movaps %xmm15,-0x28(%rax) 383___ 384$code.=<<___; 385 sub \$`$REG_SZ*18`,%rsp 386 and \$-256,%rsp 387 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 388.Lbody: 389 lea K_XX_XX(%rip),$Tbl 390 lea `$REG_SZ*16`(%rsp),%rbx 391 392.Loop_grande: 393 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 394 xor $num,$num 395___ 396for($i=0;$i<4;$i++) { 397 $code.=<<___; 398 mov `16*$i+0`($inp),@ptr[$i] # input pointer 399 mov `16*$i+8`($inp),%ecx # number of blocks 400 cmp $num,%ecx 401 cmovg %ecx,$num # find maximum 402 test %ecx,%ecx 403 mov %ecx,`4*$i`(%rbx) # initialize counters 404 cmovle $Tbl,@ptr[$i] # cancel input 405___ 406} 407$code.=<<___; 408 test $num,$num 409 jz .Ldone 410 411 movdqu 0x00($ctx),$A # load context 412 lea 128(%rsp),%rax 413 movdqu 0x20($ctx),$B 414 movdqu 0x40($ctx),$C 415 movdqu 0x60($ctx),$D 416 movdqu 0x80($ctx),$E 417 movdqa 0x60($Tbl),$tx # pbswap_mask 418 movdqa -0x20($Tbl),$K # K_00_19 419 jmp .Loop 420 421.align 32 422.Loop: 423___ 424for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 425$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39 426for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 427$code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59 428for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 429$code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79 430for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 431$code.=<<___; 432 movdqa (%rbx),@Xi[0] # pull counters 433 mov \$1,%ecx 434 cmp 4*0(%rbx),%ecx # examinte counters 435 pxor $t2,$t2 436 cmovge $Tbl,@ptr[0] # cancel input 437 cmp 4*1(%rbx),%ecx 438 movdqa @Xi[0],@Xi[1] 439 cmovge $Tbl,@ptr[1] 440 cmp 4*2(%rbx),%ecx 441 pcmpgtd $t2,@Xi[1] # mask value 442 cmovge $Tbl,@ptr[2] 443 cmp 4*3(%rbx),%ecx 444 paddd @Xi[1],@Xi[0] # counters-- 445 cmovge $Tbl,@ptr[3] 446 447 movdqu 0x00($ctx),$t0 448 pand @Xi[1],$A 449 movdqu 0x20($ctx),$t1 450 pand @Xi[1],$B 451 paddd $t0,$A 452 movdqu 0x40($ctx),$t2 453 pand @Xi[1],$C 454 paddd $t1,$B 455 movdqu 0x60($ctx),$t3 456 pand @Xi[1],$D 457 paddd $t2,$C 458 movdqu 0x80($ctx),$tx 459 pand @Xi[1],$E 460 movdqu $A,0x00($ctx) 461 paddd $t3,$D 462 movdqu $B,0x20($ctx) 463 paddd $tx,$E 464 movdqu $C,0x40($ctx) 465 movdqu $D,0x60($ctx) 466 movdqu $E,0x80($ctx) 467 468 movdqa @Xi[0],(%rbx) # save counters 469 movdqa 0x60($Tbl),$tx # pbswap_mask 470 movdqa -0x20($Tbl),$K # K_00_19 471 dec $num 472 jnz .Loop 473 474 mov `$REG_SZ*17+8`(%rsp),$num 475 lea $REG_SZ($ctx),$ctx 476 lea `16*$REG_SZ/4`($inp),$inp 477 dec $num 478 jnz .Loop_grande 479 480.Ldone: 481 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp 482___ 483$code.=<<___ if ($win64); 484 movaps -0xb8(%rax),%xmm6 485 movaps -0xa8(%rax),%xmm7 486 movaps -0x98(%rax),%xmm8 487 movaps -0x88(%rax),%xmm9 488 movaps -0x78(%rax),%xmm10 489 movaps -0x68(%rax),%xmm11 490 movaps -0x58(%rax),%xmm12 491 movaps -0x48(%rax),%xmm13 492 movaps -0x38(%rax),%xmm14 493 movaps -0x28(%rax),%xmm15 494___ 495$code.=<<___; 496 mov -16(%rax),%rbp 497 mov -8(%rax),%rbx 498 lea (%rax),%rsp 499.Lepilogue: 500 ret 501.size sha1_multi_block,.-sha1_multi_block 502___ 503 {{{ 504my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10)); 505my @MSG0=map("%xmm$_",(4..7)); 506my @MSG1=map("%xmm$_",(11..14)); 507 508$code.=<<___; 509.type sha1_multi_block_shaext,\@function,3 510.align 32 511sha1_multi_block_shaext: 512_shaext_shortcut: 513 mov %rsp,%rax 514 push %rbx 515 push %rbp 516___ 517$code.=<<___ if ($win64); 518 lea -0xa8(%rsp),%rsp 519 movaps %xmm6,(%rsp) 520 movaps %xmm7,0x10(%rsp) 521 movaps %xmm8,0x20(%rsp) 522 movaps %xmm9,0x30(%rsp) 523 movaps %xmm10,-0x78(%rax) 524 movaps %xmm11,-0x68(%rax) 525 movaps %xmm12,-0x58(%rax) 526 movaps %xmm13,-0x48(%rax) 527 movaps %xmm14,-0x38(%rax) 528 movaps %xmm15,-0x28(%rax) 529___ 530$code.=<<___; 531 sub \$`$REG_SZ*18`,%rsp 532 shl \$1,$num # we process pair at a time 533 and \$-256,%rsp 534 lea 0x40($ctx),$ctx # size optimization 535 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 536.Lbody_shaext: 537 lea `$REG_SZ*16`(%rsp),%rbx 538 movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap 539 540.Loop_grande_shaext: 541 mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num 542 xor $num,$num 543___ 544for($i=0;$i<2;$i++) { 545 $code.=<<___; 546 mov `16*$i+0`($inp),@ptr[$i] # input pointer 547 mov `16*$i+8`($inp),%ecx # number of blocks 548 cmp $num,%ecx 549 cmovg %ecx,$num # find maximum 550 test %ecx,%ecx 551 mov %ecx,`4*$i`(%rbx) # initialize counters 552 cmovle %rsp,@ptr[$i] # cancel input 553___ 554} 555$code.=<<___; 556 test $num,$num 557 jz .Ldone_shaext 558 559 movq 0x00-0x40($ctx),$ABCD0 # a1.a0 560 movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 561 movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 562 movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 563 movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 564 565 punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 566 punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 567 568 movdqa $ABCD0,$ABCD1 569 punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0 570 punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 571 572 pshufd \$0b00111111,@MSG0[3],$E0 573 pshufd \$0b01111111,@MSG0[3],$E1 574 pshufd \$0b00011011,$ABCD0,$ABCD0 575 pshufd \$0b00011011,$ABCD1,$ABCD1 576 jmp .Loop_shaext 577 578.align 32 579.Loop_shaext: 580 movdqu 0x00(@ptr[0]),@MSG0[0] 581 movdqu 0x00(@ptr[1]),@MSG1[0] 582 movdqu 0x10(@ptr[0]),@MSG0[1] 583 movdqu 0x10(@ptr[1]),@MSG1[1] 584 movdqu 0x20(@ptr[0]),@MSG0[2] 585 pshufb $BSWAP,@MSG0[0] 586 movdqu 0x20(@ptr[1]),@MSG1[2] 587 pshufb $BSWAP,@MSG1[0] 588 movdqu 0x30(@ptr[0]),@MSG0[3] 589 lea 0x40(@ptr[0]),@ptr[0] 590 pshufb $BSWAP,@MSG0[1] 591 movdqu 0x30(@ptr[1]),@MSG1[3] 592 lea 0x40(@ptr[1]),@ptr[1] 593 pshufb $BSWAP,@MSG1[1] 594 595 movdqa $E0,0x50(%rsp) # offload 596 paddd @MSG0[0],$E0 597 movdqa $E1,0x70(%rsp) 598 paddd @MSG1[0],$E1 599 movdqa $ABCD0,0x40(%rsp) # offload 600 movdqa $ABCD0,$E0_ 601 movdqa $ABCD1,0x60(%rsp) 602 movdqa $ABCD1,$E1_ 603 sha1rnds4 \$0,$E0,$ABCD0 # 0-3 604 sha1nexte @MSG0[1],$E0_ 605 sha1rnds4 \$0,$E1,$ABCD1 # 0-3 606 sha1nexte @MSG1[1],$E1_ 607 pshufb $BSWAP,@MSG0[2] 608 prefetcht0 127(@ptr[0]) 609 sha1msg1 @MSG0[1],@MSG0[0] 610 pshufb $BSWAP,@MSG1[2] 611 prefetcht0 127(@ptr[1]) 612 sha1msg1 @MSG1[1],@MSG1[0] 613 614 pshufb $BSWAP,@MSG0[3] 615 movdqa $ABCD0,$E0 616 pshufb $BSWAP,@MSG1[3] 617 movdqa $ABCD1,$E1 618 sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 619 sha1nexte @MSG0[2],$E0 620 sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 621 sha1nexte @MSG1[2],$E1 622 pxor @MSG0[2],@MSG0[0] 623 sha1msg1 @MSG0[2],@MSG0[1] 624 pxor @MSG1[2],@MSG1[0] 625 sha1msg1 @MSG1[2],@MSG1[1] 626___ 627for($i=2;$i<20-4;$i++) { 628$code.=<<___; 629 movdqa $ABCD0,$E0_ 630 movdqa $ABCD1,$E1_ 631 sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 632 sha1nexte @MSG0[3],$E0_ 633 sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 634 sha1nexte @MSG1[3],$E1_ 635 sha1msg2 @MSG0[3],@MSG0[0] 636 sha1msg2 @MSG1[3],@MSG1[0] 637 pxor @MSG0[3],@MSG0[1] 638 sha1msg1 @MSG0[3],@MSG0[2] 639 pxor @MSG1[3],@MSG1[1] 640 sha1msg1 @MSG1[3],@MSG1[2] 641___ 642 ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); 643 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); 644} 645$code.=<<___; 646 movdqa $ABCD0,$E0_ 647 movdqa $ABCD1,$E1_ 648 sha1rnds4 \$3,$E0,$ABCD0 # 64-67 649 sha1nexte @MSG0[3],$E0_ 650 sha1rnds4 \$3,$E1,$ABCD1 # 64-67 651 sha1nexte @MSG1[3],$E1_ 652 sha1msg2 @MSG0[3],@MSG0[0] 653 sha1msg2 @MSG1[3],@MSG1[0] 654 pxor @MSG0[3],@MSG0[1] 655 pxor @MSG1[3],@MSG1[1] 656 657 mov \$1,%ecx 658 pxor @MSG0[2],@MSG0[2] # zero 659 cmp 4*0(%rbx),%ecx # examine counters 660 cmovge %rsp,@ptr[0] # cancel input 661 662 movdqa $ABCD0,$E0 663 movdqa $ABCD1,$E1 664 sha1rnds4 \$3,$E0_,$ABCD0 # 68-71 665 sha1nexte @MSG0[0],$E0 666 sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 667 sha1nexte @MSG1[0],$E1 668 sha1msg2 @MSG0[0],@MSG0[1] 669 sha1msg2 @MSG1[0],@MSG1[1] 670 671 cmp 4*1(%rbx),%ecx 672 cmovge %rsp,@ptr[1] 673 movq (%rbx),@MSG0[0] # pull counters 674 675 movdqa $ABCD0,$E0_ 676 movdqa $ABCD1,$E1_ 677 sha1rnds4 \$3,$E0,$ABCD0 # 72-75 678 sha1nexte @MSG0[1],$E0_ 679 sha1rnds4 \$3,$E1,$ABCD1 # 72-75 680 sha1nexte @MSG1[1],$E1_ 681 682 pshufd \$0x00,@MSG0[0],@MSG1[2] 683 pshufd \$0x55,@MSG0[0],@MSG1[3] 684 movdqa @MSG0[0],@MSG0[1] 685 pcmpgtd @MSG0[2],@MSG1[2] 686 pcmpgtd @MSG0[2],@MSG1[3] 687 688 movdqa $ABCD0,$E0 689 movdqa $ABCD1,$E1 690 sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 691 sha1nexte $MSG0[2],$E0 692 sha1rnds4 \$3,$E1_,$ABCD1 # 76-79 693 sha1nexte $MSG0[2],$E1 694 695 pcmpgtd @MSG0[2],@MSG0[1] # counter mask 696 pand @MSG1[2],$ABCD0 697 pand @MSG1[2],$E0 698 pand @MSG1[3],$ABCD1 699 pand @MSG1[3],$E1 700 paddd @MSG0[1],@MSG0[0] # counters-- 701 702 paddd 0x40(%rsp),$ABCD0 703 paddd 0x50(%rsp),$E0 704 paddd 0x60(%rsp),$ABCD1 705 paddd 0x70(%rsp),$E1 706 707 movq @MSG0[0],(%rbx) # save counters 708 dec $num 709 jnz .Loop_shaext 710 711 mov `$REG_SZ*17+8`(%rsp),$num 712 713 pshufd \$0b00011011,$ABCD0,$ABCD0 714 pshufd \$0b00011011,$ABCD1,$ABCD1 715 716 movdqa $ABCD0,@MSG0[0] 717 punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 718 punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 719 punpckhdq $E1,$E0 # e1.e0.xx.xx 720 movq $ABCD0,0x00-0x40($ctx) # a1.a0 721 psrldq \$8,$ABCD0 722 movq @MSG0[0],0x40-0x40($ctx)# c1.c0 723 psrldq \$8,@MSG0[0] 724 movq $ABCD0,0x20-0x40($ctx) # b1.b0 725 psrldq \$8,$E0 726 movq @MSG0[0],0x60-0x40($ctx)# d1.d0 727 movq $E0,0x80-0x40($ctx) # e1.e0 728 729 lea `$REG_SZ/2`($ctx),$ctx 730 lea `16*2`($inp),$inp 731 dec $num 732 jnz .Loop_grande_shaext 733 734.Ldone_shaext: 735 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp 736___ 737$code.=<<___ if ($win64); 738 movaps -0xb8(%rax),%xmm6 739 movaps -0xa8(%rax),%xmm7 740 movaps -0x98(%rax),%xmm8 741 movaps -0x88(%rax),%xmm9 742 movaps -0x78(%rax),%xmm10 743 movaps -0x68(%rax),%xmm11 744 movaps -0x58(%rax),%xmm12 745 movaps -0x48(%rax),%xmm13 746 movaps -0x38(%rax),%xmm14 747 movaps -0x28(%rax),%xmm15 748___ 749$code.=<<___; 750 mov -16(%rax),%rbp 751 mov -8(%rax),%rbx 752 lea (%rax),%rsp 753.Lepilogue_shaext: 754 ret 755.size sha1_multi_block_shaext,.-sha1_multi_block_shaext 756___ 757 }}} 758 759 if ($avx) {{{ 760sub BODY_00_19_avx { 761my ($i,$a,$b,$c,$d,$e)=@_; 762my $j=$i+1; 763my $k=$i+2; 764my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128"; 765my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4]; 766 767$code.=<<___ if ($i==0 && $REG_SZ==16); 768 vmovd (@ptr[0]),@Xi[0] 769 lea `16*4`(@ptr[0]),@ptr[0] 770 vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] 771 lea `16*4`(@ptr[1]),@ptr[1] 772 vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 773 lea `16*4`(@ptr[2]),@ptr[2] 774 vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] 775 lea `16*4`(@ptr[3]),@ptr[3] 776 vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 777 vpunpckldq @Xi[2],@Xi[0],@Xi[0] 778 vmovd `4*$j-16*4`($ptr_n),$t3 779 vpshufb $tx,@Xi[0],@Xi[0] 780___ 781$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input 782 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 783 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 784___ 785$code.=<<___ if ($i==0 && $REG_SZ==32); 786 vmovd (@ptr[0]),@Xi[0] 787 lea `16*4`(@ptr[0]),@ptr[0] 788 vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] 789 lea `16*4`(@ptr[4]),@ptr[4] 790 vmovd (@ptr[1]),$t2 791 lea `16*4`(@ptr[1]),@ptr[1] 792 vmovd (@ptr[5]),$t1 793 lea `16*4`(@ptr[5]),@ptr[5] 794 vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 795 lea `16*4`(@ptr[2]),@ptr[2] 796 vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] 797 lea `16*4`(@ptr[6]),@ptr[6] 798 vpinsrd \$1,(@ptr[3]),$t2,$t2 799 lea `16*4`(@ptr[3]),@ptr[3] 800 vpunpckldq $t2,@Xi[0],@Xi[0] 801 vpinsrd \$1,(@ptr[7]),$t1,$t1 802 lea `16*4`(@ptr[7]),@ptr[7] 803 vpunpckldq $t1,@Xi[2],@Xi[2] 804 vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 805 vinserti128 @Xi[2],@Xi[0],@Xi[0] 806 vmovd `4*$j-16*4`($ptr_n),$t3 807 vpshufb $tx,@Xi[0],@Xi[0] 808___ 809$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input 810 vmovd `4*$j-16*4`(@ptr[1]),$t2 811 vmovd `4*$j-16*4`(@ptr[5]),$t1 812 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 813 vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 814 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 815 vpunpckldq $t2,@Xi[1],@Xi[1] 816 vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 817 vpunpckldq $t1,$t3,$t3 818___ 819$code.=<<___ if ($i<14); 820 vpaddd $K,$e,$e # e+=K_00_19 821 vpslld \$5,$a,$t2 822 vpandn $d,$b,$t1 823 vpand $c,$b,$t0 824 825 vmovdqa @Xi[0],`&Xi_off($i)` 826 vpaddd @Xi[0],$e,$e # e+=X[i] 827 $vpack $t3,@Xi[1],@Xi[1] 828 vpsrld \$27,$a,$t3 829 vpxor $t1,$t0,$t0 # Ch(b,c,d) 830 vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] 831 832 vpslld \$30,$b,$t1 833 vpor $t3,$t2,$t2 # rol(a,5) 834 vmovd `4*$k-16*4`($ptr_n),$t3 835 vpaddd $t0,$e,$e # e+=Ch(b,c,d) 836 837 vpsrld \$2,$b,$b 838 vpaddd $t2,$e,$e # e+=rol(a,5) 839 vpshufb $tx,@Xi[1],@Xi[1] 840 vpor $t1,$b,$b # b=rol(b,30) 841___ 842$code.=<<___ if ($i==14); 843 vpaddd $K,$e,$e # e+=K_00_19 844 prefetcht0 63(@ptr[0]) 845 vpslld \$5,$a,$t2 846 vpandn $d,$b,$t1 847 vpand $c,$b,$t0 848 849 vmovdqa @Xi[0],`&Xi_off($i)` 850 vpaddd @Xi[0],$e,$e # e+=X[i] 851 $vpack $t3,@Xi[1],@Xi[1] 852 vpsrld \$27,$a,$t3 853 prefetcht0 63(@ptr[1]) 854 vpxor $t1,$t0,$t0 # Ch(b,c,d) 855 856 vpslld \$30,$b,$t1 857 vpor $t3,$t2,$t2 # rol(a,5) 858 prefetcht0 63(@ptr[2]) 859 vpaddd $t0,$e,$e # e+=Ch(b,c,d) 860 861 vpsrld \$2,$b,$b 862 vpaddd $t2,$e,$e # e+=rol(a,5) 863 prefetcht0 63(@ptr[3]) 864 vpshufb $tx,@Xi[1],@Xi[1] 865 vpor $t1,$b,$b # b=rol(b,30) 866___ 867$code.=<<___ if ($i>=13 && $i<15); 868 vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" 869___ 870$code.=<<___ if ($i>=15); # apply Xupdate 871 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 872 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 873 874 vpaddd $K,$e,$e # e+=K_00_19 875 vpslld \$5,$a,$t2 876 vpandn $d,$b,$t1 877 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` 878 vpand $c,$b,$t0 879 880 vmovdqa @Xi[0],`&Xi_off($i)` 881 vpaddd @Xi[0],$e,$e # e+=X[i] 882 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 883 vpsrld \$27,$a,$t3 884 vpxor $t1,$t0,$t0 # Ch(b,c,d) 885 vpxor @Xi[3],@Xi[1],@Xi[1] 886 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` 887 888 vpslld \$30,$b,$t1 889 vpor $t3,$t2,$t2 # rol(a,5) 890 vpaddd $t0,$e,$e # e+=Ch(b,c,d) 891 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` 892 vpsrld \$31,@Xi[1],$tx 893 vpaddd @Xi[1],@Xi[1],@Xi[1] 894 895 vpsrld \$2,$b,$b 896 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` 897 vpaddd $t2,$e,$e # e+=rol(a,5) 898 vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] 899 vpor $t1,$b,$b # b=rol(b,30) 900___ 901push(@Xi,shift(@Xi)); 902} 903 904sub BODY_20_39_avx { 905my ($i,$a,$b,$c,$d,$e)=@_; 906my $j=$i+1; 907 908$code.=<<___ if ($i<79); 909 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 910 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 911 912 vpslld \$5,$a,$t2 913 vpaddd $K,$e,$e # e+=K_20_39 914 vpxor $b,$d,$t0 915___ 916$code.=<<___ if ($i<72); 917 vmovdqa @Xi[0],`&Xi_off($i)` 918___ 919$code.=<<___ if ($i<79); 920 vpaddd @Xi[0],$e,$e # e+=X[i] 921 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 922 vpsrld \$27,$a,$t3 923 vpxor $c,$t0,$t0 # Parity(b,c,d) 924 vpxor @Xi[3],@Xi[1],@Xi[1] 925 926 vpslld \$30,$b,$t1 927 vpor $t3,$t2,$t2 # rol(a,5) 928 vpaddd $t0,$e,$e # e+=Parity(b,c,d) 929 vpsrld \$31,@Xi[1],$tx 930 vpaddd @Xi[1],@Xi[1],@Xi[1] 931 932 vpsrld \$2,$b,$b 933 vpaddd $t2,$e,$e # e+=rol(a,5) 934 vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) 935 vpor $t1,$b,$b # b=rol(b,30) 936___ 937$code.=<<___ if ($i==79); 938 vpslld \$5,$a,$t2 939 vpaddd $K,$e,$e # e+=K_20_39 940 vpxor $b,$d,$t0 941 942 vpsrld \$27,$a,$t3 943 vpaddd @Xi[0],$e,$e # e+=X[i] 944 vpxor $c,$t0,$t0 # Parity(b,c,d) 945 946 vpslld \$30,$b,$t1 947 vpor $t3,$t2,$t2 # rol(a,5) 948 vpaddd $t0,$e,$e # e+=Parity(b,c,d) 949 950 vpsrld \$2,$b,$b 951 vpaddd $t2,$e,$e # e+=rol(a,5) 952 vpor $t1,$b,$b # b=rol(b,30) 953___ 954push(@Xi,shift(@Xi)); 955} 956 957sub BODY_40_59_avx { 958my ($i,$a,$b,$c,$d,$e)=@_; 959my $j=$i+1; 960 961$code.=<<___; 962 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 963 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 964 965 vpaddd $K,$e,$e # e+=K_40_59 966 vpslld \$5,$a,$t2 967 vpand $c,$d,$t1 968 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 969 970 vpaddd $t1,$e,$e 971 vpsrld \$27,$a,$t3 972 vpxor $c,$d,$t0 973 vpxor @Xi[3],@Xi[1],@Xi[1] 974 975 vmovdqu @Xi[0],`&Xi_off($i)` 976 vpaddd @Xi[0],$e,$e # e+=X[i] 977 vpor $t3,$t2,$t2 # rol(a,5) 978 vpsrld \$31,@Xi[1],$tx 979 vpand $b,$t0,$t0 980 vpaddd @Xi[1],@Xi[1],@Xi[1] 981 982 vpslld \$30,$b,$t1 983 vpaddd $t0,$e,$e # e+=Maj(b,d,c) 984 985 vpsrld \$2,$b,$b 986 vpaddd $t2,$e,$e # e+=rol(a,5) 987 vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) 988 vpor $t1,$b,$b # b=rol(b,30) 989___ 990push(@Xi,shift(@Xi)); 991} 992 993$code.=<<___; 994.type sha1_multi_block_avx,\@function,3 995.align 32 996sha1_multi_block_avx: 997_avx_shortcut: 998___ 999$code.=<<___ if ($avx>1); 1000 shr \$32,%rcx 1001 cmp \$2,$num 1002 jb .Lavx 1003 test \$`1<<5`,%ecx 1004 jnz _avx2_shortcut 1005 jmp .Lavx 1006.align 32 1007.Lavx: 1008___ 1009$code.=<<___; 1010 mov %rsp,%rax 1011 push %rbx 1012 push %rbp 1013___ 1014$code.=<<___ if ($win64); 1015 lea -0xa8(%rsp),%rsp 1016 movaps %xmm6,(%rsp) 1017 movaps %xmm7,0x10(%rsp) 1018 movaps %xmm8,0x20(%rsp) 1019 movaps %xmm9,0x30(%rsp) 1020 movaps %xmm10,-0x78(%rax) 1021 movaps %xmm11,-0x68(%rax) 1022 movaps %xmm12,-0x58(%rax) 1023 movaps %xmm13,-0x48(%rax) 1024 movaps %xmm14,-0x38(%rax) 1025 movaps %xmm15,-0x28(%rax) 1026___ 1027$code.=<<___; 1028 sub \$`$REG_SZ*18`, %rsp 1029 and \$-256,%rsp 1030 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1031.Lbody_avx: 1032 lea K_XX_XX(%rip),$Tbl 1033 lea `$REG_SZ*16`(%rsp),%rbx 1034 1035 vzeroupper 1036.Loop_grande_avx: 1037 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1038 xor $num,$num 1039___ 1040for($i=0;$i<4;$i++) { 1041 $code.=<<___; 1042 mov `16*$i+0`($inp),@ptr[$i] # input pointer 1043 mov `16*$i+8`($inp),%ecx # number of blocks 1044 cmp $num,%ecx 1045 cmovg %ecx,$num # find maximum 1046 test %ecx,%ecx 1047 mov %ecx,`4*$i`(%rbx) # initialize counters 1048 cmovle $Tbl,@ptr[$i] # cancel input 1049___ 1050} 1051$code.=<<___; 1052 test $num,$num 1053 jz .Ldone_avx 1054 1055 vmovdqu 0x00($ctx),$A # load context 1056 lea 128(%rsp),%rax 1057 vmovdqu 0x20($ctx),$B 1058 vmovdqu 0x40($ctx),$C 1059 vmovdqu 0x60($ctx),$D 1060 vmovdqu 0x80($ctx),$E 1061 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1062 jmp .Loop_avx 1063 1064.align 32 1065.Loop_avx: 1066___ 1067$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 1068for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 1069$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 1070for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1071$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 1072for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 1073$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 1074for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1075$code.=<<___; 1076 mov \$1,%ecx 1077___ 1078for($i=0;$i<4;$i++) { 1079 $code.=<<___; 1080 cmp `4*$i`(%rbx),%ecx # examine counters 1081 cmovge $Tbl,@ptr[$i] # cancel input 1082___ 1083} 1084$code.=<<___; 1085 vmovdqu (%rbx),$t0 # pull counters 1086 vpxor $t2,$t2,$t2 1087 vmovdqa $t0,$t1 1088 vpcmpgtd $t2,$t1,$t1 # mask value 1089 vpaddd $t1,$t0,$t0 # counters-- 1090 1091 vpand $t1,$A,$A 1092 vpand $t1,$B,$B 1093 vpaddd 0x00($ctx),$A,$A 1094 vpand $t1,$C,$C 1095 vpaddd 0x20($ctx),$B,$B 1096 vpand $t1,$D,$D 1097 vpaddd 0x40($ctx),$C,$C 1098 vpand $t1,$E,$E 1099 vpaddd 0x60($ctx),$D,$D 1100 vpaddd 0x80($ctx),$E,$E 1101 vmovdqu $A,0x00($ctx) 1102 vmovdqu $B,0x20($ctx) 1103 vmovdqu $C,0x40($ctx) 1104 vmovdqu $D,0x60($ctx) 1105 vmovdqu $E,0x80($ctx) 1106 1107 vmovdqu $t0,(%rbx) # save counters 1108 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1109 dec $num 1110 jnz .Loop_avx 1111 1112 mov `$REG_SZ*17+8`(%rsp),$num 1113 lea $REG_SZ($ctx),$ctx 1114 lea `16*$REG_SZ/4`($inp),$inp 1115 dec $num 1116 jnz .Loop_grande_avx 1117 1118.Ldone_avx: 1119 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp 1120 vzeroupper 1121___ 1122$code.=<<___ if ($win64); 1123 movaps -0xb8(%rax),%xmm6 1124 movaps -0xa8(%rax),%xmm7 1125 movaps -0x98(%rax),%xmm8 1126 movaps -0x88(%rax),%xmm9 1127 movaps -0x78(%rax),%xmm10 1128 movaps -0x68(%rax),%xmm11 1129 movaps -0x58(%rax),%xmm12 1130 movaps -0x48(%rax),%xmm13 1131 movaps -0x38(%rax),%xmm14 1132 movaps -0x28(%rax),%xmm15 1133___ 1134$code.=<<___; 1135 mov -16(%rax),%rbp 1136 mov -8(%rax),%rbx 1137 lea (%rax),%rsp 1138.Lepilogue_avx: 1139 ret 1140.size sha1_multi_block_avx,.-sha1_multi_block_avx 1141___ 1142 1143 if ($avx>1) { 1144$code =~ s/\`([^\`]*)\`/eval $1/gem; 1145 1146$REG_SZ=32; 1147 1148@ptr=map("%r$_",(12..15,8..11)); 1149 1150@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); 1151($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); 1152@Xi=map("%ymm$_",(10..14)); 1153$K="%ymm15"; 1154 1155$code.=<<___; 1156.type sha1_multi_block_avx2,\@function,3 1157.align 32 1158sha1_multi_block_avx2: 1159_avx2_shortcut: 1160 mov %rsp,%rax 1161 push %rbx 1162 push %rbp 1163 push %r12 1164 push %r13 1165 push %r14 1166 push %r15 1167___ 1168$code.=<<___ if ($win64); 1169 lea -0xa8(%rsp),%rsp 1170 movaps %xmm6,(%rsp) 1171 movaps %xmm7,0x10(%rsp) 1172 movaps %xmm8,0x20(%rsp) 1173 movaps %xmm9,0x30(%rsp) 1174 movaps %xmm10,0x40(%rsp) 1175 movaps %xmm11,0x50(%rsp) 1176 movaps %xmm12,-0x78(%rax) 1177 movaps %xmm13,-0x68(%rax) 1178 movaps %xmm14,-0x58(%rax) 1179 movaps %xmm15,-0x48(%rax) 1180___ 1181$code.=<<___; 1182 sub \$`$REG_SZ*18`, %rsp 1183 and \$-256,%rsp 1184 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1185.Lbody_avx2: 1186 lea K_XX_XX(%rip),$Tbl 1187 shr \$1,$num 1188 1189 vzeroupper 1190.Loop_grande_avx2: 1191 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1192 xor $num,$num 1193 lea `$REG_SZ*16`(%rsp),%rbx 1194___ 1195for($i=0;$i<8;$i++) { 1196 $code.=<<___; 1197 mov `16*$i+0`($inp),@ptr[$i] # input pointer 1198 mov `16*$i+8`($inp),%ecx # number of blocks 1199 cmp $num,%ecx 1200 cmovg %ecx,$num # find maximum 1201 test %ecx,%ecx 1202 mov %ecx,`4*$i`(%rbx) # initialize counters 1203 cmovle $Tbl,@ptr[$i] # cancel input 1204___ 1205} 1206$code.=<<___; 1207 vmovdqu 0x00($ctx),$A # load context 1208 lea 128(%rsp),%rax 1209 vmovdqu 0x20($ctx),$B 1210 lea 256+128(%rsp),%rbx 1211 vmovdqu 0x40($ctx),$C 1212 vmovdqu 0x60($ctx),$D 1213 vmovdqu 0x80($ctx),$E 1214 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1215 jmp .Loop_avx2 1216 1217.align 32 1218.Loop_avx2: 1219___ 1220$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 1221for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 1222$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 1223for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1224$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 1225for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 1226$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 1227for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1228$code.=<<___; 1229 mov \$1,%ecx 1230 lea `$REG_SZ*16`(%rsp),%rbx 1231___ 1232for($i=0;$i<8;$i++) { 1233 $code.=<<___; 1234 cmp `4*$i`(%rbx),%ecx # examine counters 1235 cmovge $Tbl,@ptr[$i] # cancel input 1236___ 1237} 1238$code.=<<___; 1239 vmovdqu (%rbx),$t0 # pull counters 1240 vpxor $t2,$t2,$t2 1241 vmovdqa $t0,$t1 1242 vpcmpgtd $t2,$t1,$t1 # mask value 1243 vpaddd $t1,$t0,$t0 # counters-- 1244 1245 vpand $t1,$A,$A 1246 vpand $t1,$B,$B 1247 vpaddd 0x00($ctx),$A,$A 1248 vpand $t1,$C,$C 1249 vpaddd 0x20($ctx),$B,$B 1250 vpand $t1,$D,$D 1251 vpaddd 0x40($ctx),$C,$C 1252 vpand $t1,$E,$E 1253 vpaddd 0x60($ctx),$D,$D 1254 vpaddd 0x80($ctx),$E,$E 1255 vmovdqu $A,0x00($ctx) 1256 vmovdqu $B,0x20($ctx) 1257 vmovdqu $C,0x40($ctx) 1258 vmovdqu $D,0x60($ctx) 1259 vmovdqu $E,0x80($ctx) 1260 1261 vmovdqu $t0,(%rbx) # save counters 1262 lea 256+128(%rsp),%rbx 1263 vmovdqu 0x60($Tbl),$tx # pbswap_mask 1264 dec $num 1265 jnz .Loop_avx2 1266 1267 #mov `$REG_SZ*17+8`(%rsp),$num 1268 #lea $REG_SZ($ctx),$ctx 1269 #lea `16*$REG_SZ/4`($inp),$inp 1270 #dec $num 1271 #jnz .Loop_grande_avx2 1272 1273.Ldone_avx2: 1274 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp 1275 vzeroupper 1276___ 1277$code.=<<___ if ($win64); 1278 movaps -0xd8(%rax),%xmm6 1279 movaps -0xc8(%rax),%xmm7 1280 movaps -0xb8(%rax),%xmm8 1281 movaps -0xa8(%rax),%xmm9 1282 movaps -0x98(%rax),%xmm10 1283 movaps -0x88(%rax),%xmm11 1284 movaps -0x78(%rax),%xmm12 1285 movaps -0x68(%rax),%xmm13 1286 movaps -0x58(%rax),%xmm14 1287 movaps -0x48(%rax),%xmm15 1288___ 1289$code.=<<___; 1290 mov -48(%rax),%r15 1291 mov -40(%rax),%r14 1292 mov -32(%rax),%r13 1293 mov -24(%rax),%r12 1294 mov -16(%rax),%rbp 1295 mov -8(%rax),%rbx 1296 lea (%rax),%rsp 1297.Lepilogue_avx2: 1298 ret 1299.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 1300___ 1301 } }}} 1302$code.=<<___; 1303 1304.align 256 1305 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1306 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1307K_XX_XX: 1308 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 1309 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 1310 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 1311 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 1312 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 1313 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 1314 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1315 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1316 .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 1317 .asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1318___ 1319 1320if ($win64) { 1321# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1322# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1323$rec="%rcx"; 1324$frame="%rdx"; 1325$context="%r8"; 1326$disp="%r9"; 1327 1328$code.=<<___; 1329.extern __imp_RtlVirtualUnwind 1330.type se_handler,\@abi-omnipotent 1331.align 16 1332se_handler: 1333 push %rsi 1334 push %rdi 1335 push %rbx 1336 push %rbp 1337 push %r12 1338 push %r13 1339 push %r14 1340 push %r15 1341 pushfq 1342 sub \$64,%rsp 1343 1344 mov 120($context),%rax # pull context->Rax 1345 mov 248($context),%rbx # pull context->Rip 1346 1347 mov 8($disp),%rsi # disp->ImageBase 1348 mov 56($disp),%r11 # disp->HandlerData 1349 1350 mov 0(%r11),%r10d # HandlerData[0] 1351 lea (%rsi,%r10),%r10 # end of prologue label 1352 cmp %r10,%rbx # context->Rip<.Lbody 1353 jb .Lin_prologue 1354 1355 mov 152($context),%rax # pull context->Rsp 1356 1357 mov 4(%r11),%r10d # HandlerData[1] 1358 lea (%rsi,%r10),%r10 # epilogue label 1359 cmp %r10,%rbx # context->Rip>=.Lepilogue 1360 jae .Lin_prologue 1361 1362 mov `16*17`(%rax),%rax # pull saved stack pointer 1363 1364 mov -8(%rax),%rbx 1365 mov -16(%rax),%rbp 1366 mov %rbx,144($context) # restore context->Rbx 1367 mov %rbp,160($context) # restore context->Rbp 1368 1369 lea -24-10*16(%rax),%rsi 1370 lea 512($context),%rdi # &context.Xmm6 1371 mov \$20,%ecx 1372 .long 0xa548f3fc # cld; rep movsq 1373 1374.Lin_prologue: 1375 mov 8(%rax),%rdi 1376 mov 16(%rax),%rsi 1377 mov %rax,152($context) # restore context->Rsp 1378 mov %rsi,168($context) # restore context->Rsi 1379 mov %rdi,176($context) # restore context->Rdi 1380 1381 mov 40($disp),%rdi # disp->ContextRecord 1382 mov $context,%rsi # context 1383 mov \$154,%ecx # sizeof(CONTEXT) 1384 .long 0xa548f3fc # cld; rep movsq 1385 1386 mov $disp,%rsi 1387 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1388 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1389 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1390 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1391 mov 40(%rsi),%r10 # disp->ContextRecord 1392 lea 56(%rsi),%r11 # &disp->HandlerData 1393 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1394 mov %r10,32(%rsp) # arg5 1395 mov %r11,40(%rsp) # arg6 1396 mov %r12,48(%rsp) # arg7 1397 mov %rcx,56(%rsp) # arg8, (NULL) 1398 call *__imp_RtlVirtualUnwind(%rip) 1399 1400 mov \$1,%eax # ExceptionContinueSearch 1401 add \$64,%rsp 1402 popfq 1403 pop %r15 1404 pop %r14 1405 pop %r13 1406 pop %r12 1407 pop %rbp 1408 pop %rbx 1409 pop %rdi 1410 pop %rsi 1411 ret 1412.size se_handler,.-se_handler 1413___ 1414$code.=<<___ if ($avx>1); 1415.type avx2_handler,\@abi-omnipotent 1416.align 16 1417avx2_handler: 1418 push %rsi 1419 push %rdi 1420 push %rbx 1421 push %rbp 1422 push %r12 1423 push %r13 1424 push %r14 1425 push %r15 1426 pushfq 1427 sub \$64,%rsp 1428 1429 mov 120($context),%rax # pull context->Rax 1430 mov 248($context),%rbx # pull context->Rip 1431 1432 mov 8($disp),%rsi # disp->ImageBase 1433 mov 56($disp),%r11 # disp->HandlerData 1434 1435 mov 0(%r11),%r10d # HandlerData[0] 1436 lea (%rsi,%r10),%r10 # end of prologue label 1437 cmp %r10,%rbx # context->Rip<body label 1438 jb .Lin_prologue 1439 1440 mov 152($context),%rax # pull context->Rsp 1441 1442 mov 4(%r11),%r10d # HandlerData[1] 1443 lea (%rsi,%r10),%r10 # epilogue label 1444 cmp %r10,%rbx # context->Rip>=epilogue label 1445 jae .Lin_prologue 1446 1447 mov `32*17`($context),%rax # pull saved stack pointer 1448 1449 mov -8(%rax),%rbx 1450 mov -16(%rax),%rbp 1451 mov -24(%rax),%r12 1452 mov -32(%rax),%r13 1453 mov -40(%rax),%r14 1454 mov -48(%rax),%r15 1455 mov %rbx,144($context) # restore context->Rbx 1456 mov %rbp,160($context) # restore context->Rbp 1457 mov %r12,216($context) # restore cotnext->R12 1458 mov %r13,224($context) # restore cotnext->R13 1459 mov %r14,232($context) # restore cotnext->R14 1460 mov %r15,240($context) # restore cotnext->R15 1461 1462 lea -56-10*16(%rax),%rsi 1463 lea 512($context),%rdi # &context.Xmm6 1464 mov \$20,%ecx 1465 .long 0xa548f3fc # cld; rep movsq 1466 1467 jmp .Lin_prologue 1468.size avx2_handler,.-avx2_handler 1469___ 1470$code.=<<___; 1471.section .pdata 1472.align 4 1473 .rva .LSEH_begin_sha1_multi_block 1474 .rva .LSEH_end_sha1_multi_block 1475 .rva .LSEH_info_sha1_multi_block 1476 .rva .LSEH_begin_sha1_multi_block_shaext 1477 .rva .LSEH_end_sha1_multi_block_shaext 1478 .rva .LSEH_info_sha1_multi_block_shaext 1479___ 1480$code.=<<___ if ($avx); 1481 .rva .LSEH_begin_sha1_multi_block_avx 1482 .rva .LSEH_end_sha1_multi_block_avx 1483 .rva .LSEH_info_sha1_multi_block_avx 1484___ 1485$code.=<<___ if ($avx>1); 1486 .rva .LSEH_begin_sha1_multi_block_avx2 1487 .rva .LSEH_end_sha1_multi_block_avx2 1488 .rva .LSEH_info_sha1_multi_block_avx2 1489___ 1490$code.=<<___; 1491.section .xdata 1492.align 8 1493.LSEH_info_sha1_multi_block: 1494 .byte 9,0,0,0 1495 .rva se_handler 1496 .rva .Lbody,.Lepilogue # HandlerData[] 1497.LSEH_info_sha1_multi_block_shaext: 1498 .byte 9,0,0,0 1499 .rva se_handler 1500 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] 1501___ 1502$code.=<<___ if ($avx); 1503.LSEH_info_sha1_multi_block_avx: 1504 .byte 9,0,0,0 1505 .rva se_handler 1506 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] 1507___ 1508$code.=<<___ if ($avx>1); 1509.LSEH_info_sha1_multi_block_avx2: 1510 .byte 9,0,0,0 1511 .rva avx2_handler 1512 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] 1513___ 1514} 1515#################################################################### 1516 1517sub rex { 1518 local *opcode=shift; 1519 my ($dst,$src)=@_; 1520 my $rex=0; 1521 1522 $rex|=0x04 if ($dst>=8); 1523 $rex|=0x01 if ($src>=8); 1524 unshift @opcode,$rex|0x40 if ($rex); 1525} 1526 1527sub sha1rnds4 { 1528 if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1529 my @opcode=(0x0f,0x3a,0xcc); 1530 rex(\@opcode,$3,$2); 1531 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 1532 my $c=$1; 1533 push @opcode,$c=~/^0/?oct($c):$c; 1534 return ".byte\t".join(',',@opcode); 1535 } else { 1536 return "sha1rnds4\t".@_[0]; 1537 } 1538} 1539 1540sub sha1op38 { 1541 my $instr = shift; 1542 my %opcodelet = ( 1543 "sha1nexte" => 0xc8, 1544 "sha1msg1" => 0xc9, 1545 "sha1msg2" => 0xca ); 1546 1547 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1548 my @opcode=(0x0f,0x38); 1549 rex(\@opcode,$2,$1); 1550 push @opcode,$opcodelet{$instr}; 1551 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1552 return ".byte\t".join(',',@opcode); 1553 } else { 1554 return $instr."\t".@_[0]; 1555 } 1556} 1557 1558foreach (split("\n",$code)) { 1559 s/\`([^\`]*)\`/eval($1)/ge; 1560 1561 s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or 1562 s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or 1563 1564 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1565 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1566 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or 1567 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1568 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or 1569 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1570 1571 print $_,"\n"; 1572} 1573 1574close STDOUT; 1575