#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's very
# same instruction sequence used for both SHA-256 and SHA-512. In
# former case the instructions operate on 32-bit operands, while in
# latter - on 64-bit ones. All I had to do is to get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to IA-64 implementation, which maintains
# X[16] in register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
# issue Opteron pipeline and X[16] maintained in memory. So that *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. SSSE3
# code path was not attempted for SHA512, because improvement is not
# estimated to be high enough, noticeably less than 9%, to justify
# the effort, not on pre-AVX processors. [Obviously with exclusion
# for VIA Nano, but it has SHA512 instruction that is faster and
# should be used instead.] For reference, corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from first block to least
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#                SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8         14.9    -           -               9.57    -
# P4             17.3    -           -               30.8    -
# Core 2         15.6    13.8(+13%)  -               9.97    -
# Westmere       14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge   17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge     12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell        12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake        11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer      21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen          11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano       23.0    16.5(+39%)  -               14.7    -
# Atom           23.0    18.9(+22%)  -               14.7    -
# Silvermont     27.4    20.6(+33%)  -               17.5    -
# Knights L      27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont       18.9    14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)   whichever best applicable, including SHAEXT;
# (**)  switch from ror to shrd stands for fair share of improvement;
# (***) execution time is fully determined by remaining integer-only
#       part, body_00_15; reducing the amount of SIMD instructions
#       below certain limit makes no difference/sense; to conserve
#       space SHA256 XOP code path is therefore omitted;
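# Illustrative helper, a sketch that is not used anywhere below: it reproduces
# the throughput arithmetic quoted above, e.g. 64*1000/1005 for sha256_block
# on Opteron. The cycles-per-byte figures in the table are simply cycles per
# block divided by the block size, 64 for SHA256 and 128 for SHA512. The
# helper name is made up for illustration only.
sub _mbps_per_ghz {
    my ($block_bytes, $cycles_per_block) = @_;   # e.g. (64,1005) or (128,1275)
    return $block_bytes*1000/$cycles_per_block;  # 1GHz = 1000M cycles per second
}
# _mbps_per_ghz(64,1005) is ~63.7, _mbps_per_ghz(128,1275) is ~100.4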
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";

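# For reference, a plain-Perl sketch, not used by the generated code, of the
# single compression round that ROUND_00_15 below emits, written with the
# SHA-256 rotation counts 2,13,22 and 6,11,25 assigned to @Sigma0/@Sigma1
# above; the SHA-512 flavour is identical modulo 64-bit operands. Note the
# "alternative Maj" from the May 2012 note: Maj(a,b,c) == Ch(a^b,c,b)
# == b^((a^b)&(b^c)), which lets the a^b computed in one round double as
# the b^c of the next. Helper names here are illustrative only.
sub _rotr32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }

sub _sha256_round_ref {
    my ($w,$k,$a,$b,$c,$d,$e,$f,$g,$h) = @_;                # W[t], K[t], state
    my $S1  = _rotr32($e,6)^_rotr32($e,11)^_rotr32($e,25);  # Sigma1(e)
    my $ch  = ($e&$f)^(~$e&$g&0xffffffff);                  # Ch(e,f,g)
    my $T1  = ($h+$S1+$ch+$k+$w) & 0xffffffff;
    my $S0  = _rotr32($a,2)^_rotr32($a,13)^_rotr32($a,22);  # Sigma0(a)
    my $maj = $b^(($a^$b)&($b^$c));                         # Maj(a,b,c)
    return (($T1+$S0+$maj)&0xffffffff,                      # a' = T1+T2
            $a,$b,$c, ($d+$T1)&0xffffffff, $e,$f,$g);       # e' = d+T1
}
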
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

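# Likewise a plain-Perl sketch, never called, of the message-schedule update
# that ROUND_16_XX below performs in registers, again with the SHA-256
# parameters sigma0=(7,18,3) and sigma1=(17,19,10). The 16-word window lives
# in stack slots, so W[i-15] is X[(i+1)&0xf], W[i-2] is X[(i+14)&0xf] and
# W[i-7] is X[(i+9)&0xf], matching the offsets in the comments below; uses
# _rotr32 from the sketch above.
sub _sha256_schedule_ref {
    my @W = @_;                          # 16 big-endian 32-bit message words
    for (my $i=16; $i<64; $i++) {
	my $s0 = _rotr32($W[$i-15],7)^_rotr32($W[$i-15],18)^($W[$i-15]>>3);
	my $s1 = _rotr32($W[$i-2],17)^_rotr32($W[$i-2],19)^($W[$i-2]>>10);
	$W[$i] = ($W[$i-16]+$s0+$W[$i-7]+$s1) & 0xffffffff;
    }
    return @W;                           # W[0..63], one word per round
}
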
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
	jnz	.Lxop_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
ret 415.cfi_endproc 416.size $func,.-$func 417___ 418 419if ($SZ==4) { 420$code.=<<___; 421.align 64 422.type $TABLE,\@object 423$TABLE: 424 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 425 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 426 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 427 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 428 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 429 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 430 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 431 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 432 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 433 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 434 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 435 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 436 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 437 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 438 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 439 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 440 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 441 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 442 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 443 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 444 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 445 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 446 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 447 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 448 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 449 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 450 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 451 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 452 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 453 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 454 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 455 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 456 457 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 458 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 459 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 460 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 461 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 462 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 463 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 464___ 465} else { 466$code.=<<___; 467.align 64 468.type $TABLE,\@object 469$TABLE: 470 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 471 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 472 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 473 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 474 .quad 0x3956c25bf348b538,0x59f111f1b605d019 475 .quad 0x3956c25bf348b538,0x59f111f1b605d019 476 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 477 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 478 .quad 0xd807aa98a3030242,0x12835b0145706fbe 479 .quad 0xd807aa98a3030242,0x12835b0145706fbe 480 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 481 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 482 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 483 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 484 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 485 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 486 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 487 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 488 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 489 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 490 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 491 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 492 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 493 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 494 .quad 
0x983e5152ee66dfab,0xa831c66d2db43210 495 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 496 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 497 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 498 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 499 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 500 .quad 0x06ca6351e003826f,0x142929670a0e6e70 501 .quad 0x06ca6351e003826f,0x142929670a0e6e70 502 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 503 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 504 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 505 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 506 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 507 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 508 .quad 0x81c2c92e47edaee6,0x92722c851482353b 509 .quad 0x81c2c92e47edaee6,0x92722c851482353b 510 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 511 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 512 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 513 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 514 .quad 0xd192e819d6ef5218,0xd69906245565a910 515 .quad 0xd192e819d6ef5218,0xd69906245565a910 516 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 517 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 518 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 519 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 520 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 521 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 522 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 523 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 524 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 525 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 526 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 527 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 528 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 529 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 530 .quad 0x90befffa23631e28,0xa4506cebde82bde9 531 .quad 0x90befffa23631e28,0xa4506cebde82bde9 532 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 533 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 534 .quad 0xca273eceea26619c,0xd186b8c721c0c207 535 .quad 0xca273eceea26619c,0xd186b8c721c0c207 536 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 537 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 538 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 539 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 540 .quad 0x113f9804bef90dae,0x1b710b35131c471b 541 .quad 0x113f9804bef90dae,0x1b710b35131c471b 542 .quad 0x28db77f523047d84,0x32caab7b40c72493 543 .quad 0x28db77f523047d84,0x32caab7b40c72493 544 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 545 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 546 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 547 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 548 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 549 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 550 551 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 552 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 553 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 554___ 555} 556 557###################################################################### 558# SIMD code paths 559# 560if ($SZ==4 && $shaext) {{{ 561###################################################################### 562# Intel SHA Extensions implementation of SHA256 update function. 
563# 564my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 565 566my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 567my @MSG=map("%xmm$_",(3..6)); 568 569$code.=<<___; 570.type sha256_block_data_order_shaext,\@function,3 571.align 64 572sha256_block_data_order_shaext: 573_shaext_shortcut: 574.cfi_startproc 575___ 576$code.=<<___ if ($win64); 577 lea `-8-5*16`(%rsp),%rsp 578 movaps %xmm6,-8-5*16(%rax) 579 movaps %xmm7,-8-4*16(%rax) 580 movaps %xmm8,-8-3*16(%rax) 581 movaps %xmm9,-8-2*16(%rax) 582 movaps %xmm10,-8-1*16(%rax) 583.Lprologue_shaext: 584___ 585$code.=<<___; 586 lea K256+0x80(%rip),$Tbl 587 movdqu ($ctx),$ABEF # DCBA 588 movdqu 16($ctx),$CDGH # HGFE 589 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 590 591 pshufd \$0x1b,$ABEF,$Wi # ABCD 592 pshufd \$0xb1,$ABEF,$ABEF # CDAB 593 pshufd \$0x1b,$CDGH,$CDGH # EFGH 594 movdqa $TMP,$BSWAP # offload 595 palignr \$8,$CDGH,$ABEF # ABEF 596 punpcklqdq $Wi,$CDGH # CDGH 597 jmp .Loop_shaext 598 599.align 16 600.Loop_shaext: 601 movdqu ($inp),@MSG[0] 602 movdqu 0x10($inp),@MSG[1] 603 movdqu 0x20($inp),@MSG[2] 604 pshufb $TMP,@MSG[0] 605 movdqu 0x30($inp),@MSG[3] 606 607 movdqa 0*32-0x80($Tbl),$Wi 608 paddd @MSG[0],$Wi 609 pshufb $TMP,@MSG[1] 610 movdqa $CDGH,$CDGH_SAVE # offload 611 sha256rnds2 $ABEF,$CDGH # 0-3 612 pshufd \$0x0e,$Wi,$Wi 613 nop 614 movdqa $ABEF,$ABEF_SAVE # offload 615 sha256rnds2 $CDGH,$ABEF 616 617 movdqa 1*32-0x80($Tbl),$Wi 618 paddd @MSG[1],$Wi 619 pshufb $TMP,@MSG[2] 620 sha256rnds2 $ABEF,$CDGH # 4-7 621 pshufd \$0x0e,$Wi,$Wi 622 lea 0x40($inp),$inp 623 sha256msg1 @MSG[1],@MSG[0] 624 sha256rnds2 $CDGH,$ABEF 625 626 movdqa 2*32-0x80($Tbl),$Wi 627 paddd @MSG[2],$Wi 628 pshufb $TMP,@MSG[3] 629 sha256rnds2 $ABEF,$CDGH # 8-11 630 pshufd \$0x0e,$Wi,$Wi 631 movdqa @MSG[3],$TMP 632 palignr \$4,@MSG[2],$TMP 633 nop 634 paddd $TMP,@MSG[0] 635 sha256msg1 @MSG[2],@MSG[1] 636 sha256rnds2 $CDGH,$ABEF 637 638 movdqa 3*32-0x80($Tbl),$Wi 639 paddd @MSG[3],$Wi 640 sha256msg2 @MSG[3],@MSG[0] 641 sha256rnds2 $ABEF,$CDGH # 12-15 642 pshufd \$0x0e,$Wi,$Wi 643 movdqa @MSG[0],$TMP 644 palignr \$4,@MSG[3],$TMP 645 nop 646 paddd $TMP,@MSG[1] 647 sha256msg1 @MSG[3],@MSG[2] 648 sha256rnds2 $CDGH,$ABEF 649___ 650for($i=4;$i<16-3;$i++) { 651$code.=<<___; 652 movdqa $i*32-0x80($Tbl),$Wi 653 paddd @MSG[0],$Wi 654 sha256msg2 @MSG[0],@MSG[1] 655 sha256rnds2 $ABEF,$CDGH # 16-19... 
656 pshufd \$0x0e,$Wi,$Wi 657 movdqa @MSG[1],$TMP 658 palignr \$4,@MSG[0],$TMP 659 nop 660 paddd $TMP,@MSG[2] 661 sha256msg1 @MSG[0],@MSG[3] 662 sha256rnds2 $CDGH,$ABEF 663___ 664 push(@MSG,shift(@MSG)); 665} 666$code.=<<___; 667 movdqa 13*32-0x80($Tbl),$Wi 668 paddd @MSG[0],$Wi 669 sha256msg2 @MSG[0],@MSG[1] 670 sha256rnds2 $ABEF,$CDGH # 52-55 671 pshufd \$0x0e,$Wi,$Wi 672 movdqa @MSG[1],$TMP 673 palignr \$4,@MSG[0],$TMP 674 sha256rnds2 $CDGH,$ABEF 675 paddd $TMP,@MSG[2] 676 677 movdqa 14*32-0x80($Tbl),$Wi 678 paddd @MSG[1],$Wi 679 sha256rnds2 $ABEF,$CDGH # 56-59 680 pshufd \$0x0e,$Wi,$Wi 681 sha256msg2 @MSG[1],@MSG[2] 682 movdqa $BSWAP,$TMP 683 sha256rnds2 $CDGH,$ABEF 684 685 movdqa 15*32-0x80($Tbl),$Wi 686 paddd @MSG[2],$Wi 687 nop 688 sha256rnds2 $ABEF,$CDGH # 60-63 689 pshufd \$0x0e,$Wi,$Wi 690 dec $num 691 nop 692 sha256rnds2 $CDGH,$ABEF 693 694 paddd $CDGH_SAVE,$CDGH 695 paddd $ABEF_SAVE,$ABEF 696 jnz .Loop_shaext 697 698 pshufd \$0xb1,$CDGH,$CDGH # DCHG 699 pshufd \$0x1b,$ABEF,$TMP # FEBA 700 pshufd \$0xb1,$ABEF,$ABEF # BAFE 701 punpckhqdq $CDGH,$ABEF # DCBA 702 palignr \$8,$TMP,$CDGH # HGFE 703 704 movdqu $ABEF,($ctx) 705 movdqu $CDGH,16($ctx) 706___ 707$code.=<<___ if ($win64); 708 movaps -8-5*16(%rax),%xmm6 709 movaps -8-4*16(%rax),%xmm7 710 movaps -8-3*16(%rax),%xmm8 711 movaps -8-2*16(%rax),%xmm9 712 movaps -8-1*16(%rax),%xmm10 713 mov %rax,%rsp 714.Lepilogue_shaext: 715___ 716$code.=<<___; 717 ret 718.cfi_endproc 719.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 720___ 721}}} 722{{{ 723 724my $a4=$T1; 725my ($a,$b,$c,$d,$e,$f,$g,$h); 726 727sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 728{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 729 my $arg = pop; 730 $arg = "\$$arg" if ($arg*1 eq $arg); 731 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 732} 733 734sub body_00_15 () { 735 ( 736 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 737 738 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 739 '&mov ($a,$a1)', 740 '&mov ($a4,$f)', 741 742 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 743 '&xor ($a0,$e)', 744 '&xor ($a4,$g)', # f^g 745 746 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 747 '&xor ($a1,$a)', 748 '&and ($a4,$e)', # (f^g)&e 749 750 '&xor ($a0,$e)', 751 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 752 '&mov ($a2,$a)', 753 754 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 755 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 756 '&xor ($a2,$b)', # a^b, b^c in next round 757 758 '&add ($h,$a4)', # h+=Ch(e,f,g) 759 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 760 '&and ($a3,$a2)', # (b^c)&(a^b) 761 762 '&xor ($a1,$a)', 763 '&add ($h,$a0)', # h+=Sigma1(e) 764 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 765 766 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 767 '&add ($d,$h)', # d+=h 768 '&add ($h,$a3)', # h+=Maj(a,b,c) 769 770 '&mov ($a0,$d)', 771 '&add ($a1,$h);'. 
# h+=Sigma0(a) 772 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 773 ); 774} 775 776###################################################################### 777# SSSE3 code path 778# 779if ($SZ==4) { # SHA256 only 780my @X = map("%xmm$_",(0..3)); 781my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 782 783$code.=<<___; 784.type ${func}_ssse3,\@function,3 785.align 64 786${func}_ssse3: 787.cfi_startproc 788.Lssse3_shortcut: 789 mov %rsp,%rax # copy %rsp 790.cfi_def_cfa_register %rax 791 push %rbx 792.cfi_push %rbx 793 push %rbp 794.cfi_push %rbp 795 push %r12 796.cfi_push %r12 797 push %r13 798.cfi_push %r13 799 push %r14 800.cfi_push %r14 801 push %r15 802.cfi_push %r15 803 shl \$4,%rdx # num*16 804 sub \$`$framesz+$win64*16*4`,%rsp 805 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 806 and \$-64,%rsp # align stack frame 807 mov $ctx,$_ctx # save ctx, 1st arg 808 mov $inp,$_inp # save inp, 2nd arh 809 mov %rdx,$_end # save end pointer, "3rd" arg 810 mov %rax,$_rsp # save copy of %rsp 811.cfi_cfa_expression $_rsp,deref,+8 812___ 813$code.=<<___ if ($win64); 814 movaps %xmm6,16*$SZ+32(%rsp) 815 movaps %xmm7,16*$SZ+48(%rsp) 816 movaps %xmm8,16*$SZ+64(%rsp) 817 movaps %xmm9,16*$SZ+80(%rsp) 818___ 819$code.=<<___; 820.Lprologue_ssse3: 821 822 mov $SZ*0($ctx),$A 823 mov $SZ*1($ctx),$B 824 mov $SZ*2($ctx),$C 825 mov $SZ*3($ctx),$D 826 mov $SZ*4($ctx),$E 827 mov $SZ*5($ctx),$F 828 mov $SZ*6($ctx),$G 829 mov $SZ*7($ctx),$H 830___ 831 832$code.=<<___; 833 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 834 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 835 jmp .Lloop_ssse3 836.align 16 837.Lloop_ssse3: 838 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 839 movdqu 0x00($inp),@X[0] 840 movdqu 0x10($inp),@X[1] 841 movdqu 0x20($inp),@X[2] 842 pshufb $t3,@X[0] 843 movdqu 0x30($inp),@X[3] 844 lea $TABLE(%rip),$Tbl 845 pshufb $t3,@X[1] 846 movdqa 0x00($Tbl),$t0 847 movdqa 0x20($Tbl),$t1 848 pshufb $t3,@X[2] 849 paddd @X[0],$t0 850 movdqa 0x40($Tbl),$t2 851 pshufb $t3,@X[3] 852 movdqa 0x60($Tbl),$t3 853 paddd @X[1],$t1 854 paddd @X[2],$t2 855 paddd @X[3],$t3 856 movdqa $t0,0x00(%rsp) 857 mov $A,$a1 858 movdqa $t1,0x10(%rsp) 859 mov $B,$a3 860 movdqa $t2,0x20(%rsp) 861 xor $C,$a3 # magic 862 movdqa $t3,0x30(%rsp) 863 mov $E,$a0 864 jmp .Lssse3_00_47 865 866.align 16 867.Lssse3_00_47: 868 sub \$`-16*2*$SZ`,$Tbl # size optimization 869___ 870sub Xupdate_256_SSSE3 () { 871 ( 872 '&movdqa ($t0,@X[1]);', 873 '&movdqa ($t3,@X[3])', 874 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 875 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 876 '&movdqa ($t1,$t0)', 877 '&movdqa ($t2,$t0);', 878 '&psrld ($t0,$sigma0[2])', 879 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 880 '&psrld ($t2,$sigma0[0])', 881 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 882 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 883 '&pxor ($t0,$t2)', 884 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 885 '&pxor ($t0,$t1)', 886 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
887 '&pxor ($t0,$t2);', 888 '&movdqa ($t2,$t3)', 889 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 890 '&psrld ($t3,$sigma1[2])', 891 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 892 '&psrlq ($t2,$sigma1[0])', 893 '&pxor ($t3,$t2);', 894 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 895 '&pxor ($t3,$t2)', 896 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 897 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 898 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 899 '&movdqa ($t2,$t3);', 900 '&psrld ($t3,$sigma1[2])', 901 '&psrlq ($t2,$sigma1[0])', 902 '&pxor ($t3,$t2);', 903 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 904 '&pxor ($t3,$t2);', 905 '&movdqa ($t2,16*2*$j."($Tbl)")', 906 '&pshufb ($t3,$t5)', 907 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 908 ); 909} 910 911sub SSSE3_256_00_47 () { 912my $j = shift; 913my $body = shift; 914my @X = @_; 915my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 916 917 if (0) { 918 foreach (Xupdate_256_SSSE3()) { # 36 instructions 919 eval; 920 eval(shift(@insns)); 921 eval(shift(@insns)); 922 eval(shift(@insns)); 923 } 924 } else { # squeeze extra 4% on Westmere and 19% on Atom 925 eval(shift(@insns)); #@ 926 &movdqa ($t0,@X[1]); 927 eval(shift(@insns)); 928 eval(shift(@insns)); 929 &movdqa ($t3,@X[3]); 930 eval(shift(@insns)); #@ 931 eval(shift(@insns)); 932 eval(shift(@insns)); 933 eval(shift(@insns)); #@ 934 eval(shift(@insns)); 935 &palignr ($t0,@X[0],$SZ); # X[1..4] 936 eval(shift(@insns)); 937 eval(shift(@insns)); 938 &palignr ($t3,@X[2],$SZ); # X[9..12] 939 eval(shift(@insns)); 940 eval(shift(@insns)); 941 eval(shift(@insns)); 942 eval(shift(@insns)); #@ 943 &movdqa ($t1,$t0); 944 eval(shift(@insns)); 945 eval(shift(@insns)); 946 &movdqa ($t2,$t0); 947 eval(shift(@insns)); #@ 948 eval(shift(@insns)); 949 &psrld ($t0,$sigma0[2]); 950 eval(shift(@insns)); 951 eval(shift(@insns)); 952 eval(shift(@insns)); 953 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 954 eval(shift(@insns)); #@ 955 eval(shift(@insns)); 956 &psrld ($t2,$sigma0[0]); 957 eval(shift(@insns)); 958 eval(shift(@insns)); 959 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 960 eval(shift(@insns)); 961 eval(shift(@insns)); #@ 962 &pslld ($t1,8*$SZ-$sigma0[1]); 963 eval(shift(@insns)); 964 eval(shift(@insns)); 965 &pxor ($t0,$t2); 966 eval(shift(@insns)); #@ 967 eval(shift(@insns)); 968 eval(shift(@insns)); 969 eval(shift(@insns)); #@ 970 &psrld ($t2,$sigma0[1]-$sigma0[0]); 971 eval(shift(@insns)); 972 &pxor ($t0,$t1); 973 eval(shift(@insns)); 974 eval(shift(@insns)); 975 &pslld ($t1,$sigma0[1]-$sigma0[0]); 976 eval(shift(@insns)); 977 eval(shift(@insns)); 978 &pxor ($t0,$t2); 979 eval(shift(@insns)); 980 eval(shift(@insns)); #@ 981 &movdqa ($t2,$t3); 982 eval(shift(@insns)); 983 eval(shift(@insns)); 984 &pxor ($t0,$t1); # sigma0(X[1..4]) 985 eval(shift(@insns)); #@ 986 eval(shift(@insns)); 987 eval(shift(@insns)); 988 &psrld ($t3,$sigma1[2]); 989 eval(shift(@insns)); 990 eval(shift(@insns)); 991 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 992 eval(shift(@insns)); #@ 993 eval(shift(@insns)); 994 &psrlq ($t2,$sigma1[0]); 995 eval(shift(@insns)); 996 eval(shift(@insns)); 997 eval(shift(@insns)); 998 &pxor ($t3,$t2); 999 eval(shift(@insns)); #@ 1000 eval(shift(@insns)); 1001 eval(shift(@insns)); 1002 eval(shift(@insns)); #@ 1003 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1004 eval(shift(@insns)); 1005 eval(shift(@insns)); 1006 &pxor ($t3,$t2); 1007 eval(shift(@insns)); #@ 1008 eval(shift(@insns)); 1009 eval(shift(@insns)); 1010 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 1011 &pshufd ($t3,$t3,0b10000000); 
1012 eval(shift(@insns)); 1013 eval(shift(@insns)); 1014 eval(shift(@insns)); 1015 &psrldq ($t3,8); 1016 eval(shift(@insns)); 1017 eval(shift(@insns)); #@ 1018 eval(shift(@insns)); 1019 eval(shift(@insns)); 1020 eval(shift(@insns)); #@ 1021 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1022 eval(shift(@insns)); 1023 eval(shift(@insns)); 1024 eval(shift(@insns)); 1025 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 1026 eval(shift(@insns)); 1027 eval(shift(@insns)); #@ 1028 eval(shift(@insns)); 1029 &movdqa ($t2,$t3); 1030 eval(shift(@insns)); 1031 eval(shift(@insns)); 1032 &psrld ($t3,$sigma1[2]); 1033 eval(shift(@insns)); 1034 eval(shift(@insns)); #@ 1035 &psrlq ($t2,$sigma1[0]); 1036 eval(shift(@insns)); 1037 eval(shift(@insns)); 1038 &pxor ($t3,$t2); 1039 eval(shift(@insns)); #@ 1040 eval(shift(@insns)); 1041 eval(shift(@insns)); 1042 eval(shift(@insns)); #@ 1043 eval(shift(@insns)); 1044 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1045 eval(shift(@insns)); 1046 eval(shift(@insns)); 1047 eval(shift(@insns)); 1048 &pxor ($t3,$t2); 1049 eval(shift(@insns)); 1050 eval(shift(@insns)); 1051 eval(shift(@insns)); #@ 1052 #&pshufb ($t3,$t5); 1053 &pshufd ($t3,$t3,0b00001000); 1054 eval(shift(@insns)); 1055 eval(shift(@insns)); 1056 &movdqa ($t2,16*2*$j."($Tbl)"); 1057 eval(shift(@insns)); #@ 1058 eval(shift(@insns)); 1059 &pslldq ($t3,8); 1060 eval(shift(@insns)); 1061 eval(shift(@insns)); 1062 eval(shift(@insns)); 1063 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1064 eval(shift(@insns)); #@ 1065 eval(shift(@insns)); 1066 eval(shift(@insns)); 1067 } 1068 &paddd ($t2,@X[0]); 1069 foreach (@insns) { eval; } # remaining instructions 1070 &movdqa (16*$j."(%rsp)",$t2); 1071} 1072 1073 for ($i=0,$j=0; $j<4; $j++) { 1074 &SSSE3_256_00_47($j,\&body_00_15,@X); 1075 push(@X,shift(@X)); # rotate(@X) 1076 } 1077 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1078 &jne (".Lssse3_00_47"); 1079 1080 for ($i=0; $i<16; ) { 1081 foreach(body_00_15()) { eval; } 1082 } 1083$code.=<<___; 1084 mov $_ctx,$ctx 1085 mov $a1,$A 1086 1087 add $SZ*0($ctx),$A 1088 lea 16*$SZ($inp),$inp 1089 add $SZ*1($ctx),$B 1090 add $SZ*2($ctx),$C 1091 add $SZ*3($ctx),$D 1092 add $SZ*4($ctx),$E 1093 add $SZ*5($ctx),$F 1094 add $SZ*6($ctx),$G 1095 add $SZ*7($ctx),$H 1096 1097 cmp $_end,$inp 1098 1099 mov $A,$SZ*0($ctx) 1100 mov $B,$SZ*1($ctx) 1101 mov $C,$SZ*2($ctx) 1102 mov $D,$SZ*3($ctx) 1103 mov $E,$SZ*4($ctx) 1104 mov $F,$SZ*5($ctx) 1105 mov $G,$SZ*6($ctx) 1106 mov $H,$SZ*7($ctx) 1107 jb .Lloop_ssse3 1108 1109 mov $_rsp,%rsi 1110.cfi_def_cfa %rsi,8 1111___ 1112$code.=<<___ if ($win64); 1113 movaps 16*$SZ+32(%rsp),%xmm6 1114 movaps 16*$SZ+48(%rsp),%xmm7 1115 movaps 16*$SZ+64(%rsp),%xmm8 1116 movaps 16*$SZ+80(%rsp),%xmm9 1117___ 1118$code.=<<___; 1119 mov -48(%rsi),%r15 1120.cfi_restore %r15 1121 mov -40(%rsi),%r14 1122.cfi_restore %r14 1123 mov -32(%rsi),%r13 1124.cfi_restore %r13 1125 mov -24(%rsi),%r12 1126.cfi_restore %r12 1127 mov -16(%rsi),%rbp 1128.cfi_restore %rbp 1129 mov -8(%rsi),%rbx 1130.cfi_restore %rbx 1131 lea (%rsi),%rsp 1132.cfi_def_cfa_register %rsp 1133.Lepilogue_ssse3: 1134 ret 1135.cfi_endproc 1136.size ${func}_ssse3,.-${func}_ssse3 1137___ 1138} 1139 1140if ($avx) {{ 1141###################################################################### 1142# XOP code path 1143# 1144if ($SZ==8) { # SHA512 only 1145$code.=<<___; 1146.type ${func}_xop,\@function,3 1147.align 64 1148${func}_xop: 1149.cfi_startproc 1150.Lxop_shortcut: 1151 mov %rsp,%rax # copy %rsp 1152.cfi_def_cfa_register %rax 1153 push %rbx 1154.cfi_push %rbx 1155 push %rbp 
1156.cfi_push %rbp 1157 push %r12 1158.cfi_push %r12 1159 push %r13 1160.cfi_push %r13 1161 push %r14 1162.cfi_push %r14 1163 push %r15 1164.cfi_push %r15 1165 shl \$4,%rdx # num*16 1166 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1167 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1168 and \$-64,%rsp # align stack frame 1169 mov $ctx,$_ctx # save ctx, 1st arg 1170 mov $inp,$_inp # save inp, 2nd arh 1171 mov %rdx,$_end # save end pointer, "3rd" arg 1172 mov %rax,$_rsp # save copy of %rsp 1173.cfi_cfa_expression $_rsp,deref,+8 1174___ 1175$code.=<<___ if ($win64); 1176 movaps %xmm6,16*$SZ+32(%rsp) 1177 movaps %xmm7,16*$SZ+48(%rsp) 1178 movaps %xmm8,16*$SZ+64(%rsp) 1179 movaps %xmm9,16*$SZ+80(%rsp) 1180___ 1181$code.=<<___ if ($win64 && $SZ>4); 1182 movaps %xmm10,16*$SZ+96(%rsp) 1183 movaps %xmm11,16*$SZ+112(%rsp) 1184___ 1185$code.=<<___; 1186.Lprologue_xop: 1187 1188 vzeroupper 1189 mov $SZ*0($ctx),$A 1190 mov $SZ*1($ctx),$B 1191 mov $SZ*2($ctx),$C 1192 mov $SZ*3($ctx),$D 1193 mov $SZ*4($ctx),$E 1194 mov $SZ*5($ctx),$F 1195 mov $SZ*6($ctx),$G 1196 mov $SZ*7($ctx),$H 1197 jmp .Lloop_xop 1198___ 1199 if ($SZ==4) { # SHA256 1200 my @X = map("%xmm$_",(0..3)); 1201 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1202 1203$code.=<<___; 1204.align 16 1205.Lloop_xop: 1206 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1207 vmovdqu 0x00($inp),@X[0] 1208 vmovdqu 0x10($inp),@X[1] 1209 vmovdqu 0x20($inp),@X[2] 1210 vmovdqu 0x30($inp),@X[3] 1211 vpshufb $t3,@X[0],@X[0] 1212 lea $TABLE(%rip),$Tbl 1213 vpshufb $t3,@X[1],@X[1] 1214 vpshufb $t3,@X[2],@X[2] 1215 vpaddd 0x00($Tbl),@X[0],$t0 1216 vpshufb $t3,@X[3],@X[3] 1217 vpaddd 0x20($Tbl),@X[1],$t1 1218 vpaddd 0x40($Tbl),@X[2],$t2 1219 vpaddd 0x60($Tbl),@X[3],$t3 1220 vmovdqa $t0,0x00(%rsp) 1221 mov $A,$a1 1222 vmovdqa $t1,0x10(%rsp) 1223 mov $B,$a3 1224 vmovdqa $t2,0x20(%rsp) 1225 xor $C,$a3 # magic 1226 vmovdqa $t3,0x30(%rsp) 1227 mov $E,$a0 1228 jmp .Lxop_00_47 1229 1230.align 16 1231.Lxop_00_47: 1232 sub \$`-16*2*$SZ`,$Tbl # size optimization 1233___ 1234sub XOP_256_00_47 () { 1235my $j = shift; 1236my $body = shift; 1237my @X = @_; 1238my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1239 1240 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1241 eval(shift(@insns)); 1242 eval(shift(@insns)); 1243 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1247 eval(shift(@insns)); 1248 eval(shift(@insns)); 1249 &vpsrld ($t0,$t0,$sigma0[2]); 1250 eval(shift(@insns)); 1251 eval(shift(@insns)); 1252 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1253 eval(shift(@insns)); 1254 eval(shift(@insns)); 1255 eval(shift(@insns)); 1256 eval(shift(@insns)); 1257 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1258 eval(shift(@insns)); 1259 eval(shift(@insns)); 1260 &vpxor ($t0,$t0,$t1); 1261 eval(shift(@insns)); 1262 eval(shift(@insns)); 1263 eval(shift(@insns)); 1264 eval(shift(@insns)); 1265 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1266 eval(shift(@insns)); 1267 eval(shift(@insns)); 1268 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1269 eval(shift(@insns)); 1270 eval(shift(@insns)); 1271 &vpsrld ($t2,@X[3],$sigma1[2]); 1272 eval(shift(@insns)); 1273 eval(shift(@insns)); 1274 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1275 eval(shift(@insns)); 1276 eval(shift(@insns)); 1277 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1278 eval(shift(@insns)); 1279 eval(shift(@insns)); 1280 &vpxor ($t3,$t3,$t2); 1281 eval(shift(@insns)); 1282 eval(shift(@insns)); 1283 eval(shift(@insns)); 1284 
eval(shift(@insns)); 1285 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1286 eval(shift(@insns)); 1287 eval(shift(@insns)); 1288 eval(shift(@insns)); 1289 eval(shift(@insns)); 1290 &vpsrldq ($t3,$t3,8); 1291 eval(shift(@insns)); 1292 eval(shift(@insns)); 1293 eval(shift(@insns)); 1294 eval(shift(@insns)); 1295 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1296 eval(shift(@insns)); 1297 eval(shift(@insns)); 1298 eval(shift(@insns)); 1299 eval(shift(@insns)); 1300 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1301 eval(shift(@insns)); 1302 eval(shift(@insns)); 1303 &vpsrld ($t2,@X[0],$sigma1[2]); 1304 eval(shift(@insns)); 1305 eval(shift(@insns)); 1306 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1307 eval(shift(@insns)); 1308 eval(shift(@insns)); 1309 &vpxor ($t3,$t3,$t2); 1310 eval(shift(@insns)); 1311 eval(shift(@insns)); 1312 eval(shift(@insns)); 1313 eval(shift(@insns)); 1314 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1315 eval(shift(@insns)); 1316 eval(shift(@insns)); 1317 eval(shift(@insns)); 1318 eval(shift(@insns)); 1319 &vpslldq ($t3,$t3,8); # 22 instructions 1320 eval(shift(@insns)); 1321 eval(shift(@insns)); 1322 eval(shift(@insns)); 1323 eval(shift(@insns)); 1324 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1325 eval(shift(@insns)); 1326 eval(shift(@insns)); 1327 eval(shift(@insns)); 1328 eval(shift(@insns)); 1329 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1330 foreach (@insns) { eval; } # remaining instructions 1331 &vmovdqa (16*$j."(%rsp)",$t2); 1332} 1333 1334 for ($i=0,$j=0; $j<4; $j++) { 1335 &XOP_256_00_47($j,\&body_00_15,@X); 1336 push(@X,shift(@X)); # rotate(@X) 1337 } 1338 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1339 &jne (".Lxop_00_47"); 1340 1341 for ($i=0; $i<16; ) { 1342 foreach(body_00_15()) { eval; } 1343 } 1344 1345 } else { # SHA512 1346 my @X = map("%xmm$_",(0..7)); 1347 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1348 1349$code.=<<___; 1350.align 16 1351.Lloop_xop: 1352 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1353 vmovdqu 0x00($inp),@X[0] 1354 lea $TABLE+0x80(%rip),$Tbl # size optimization 1355 vmovdqu 0x10($inp),@X[1] 1356 vmovdqu 0x20($inp),@X[2] 1357 vpshufb $t3,@X[0],@X[0] 1358 vmovdqu 0x30($inp),@X[3] 1359 vpshufb $t3,@X[1],@X[1] 1360 vmovdqu 0x40($inp),@X[4] 1361 vpshufb $t3,@X[2],@X[2] 1362 vmovdqu 0x50($inp),@X[5] 1363 vpshufb $t3,@X[3],@X[3] 1364 vmovdqu 0x60($inp),@X[6] 1365 vpshufb $t3,@X[4],@X[4] 1366 vmovdqu 0x70($inp),@X[7] 1367 vpshufb $t3,@X[5],@X[5] 1368 vpaddq -0x80($Tbl),@X[0],$t0 1369 vpshufb $t3,@X[6],@X[6] 1370 vpaddq -0x60($Tbl),@X[1],$t1 1371 vpshufb $t3,@X[7],@X[7] 1372 vpaddq -0x40($Tbl),@X[2],$t2 1373 vpaddq -0x20($Tbl),@X[3],$t3 1374 vmovdqa $t0,0x00(%rsp) 1375 vpaddq 0x00($Tbl),@X[4],$t0 1376 vmovdqa $t1,0x10(%rsp) 1377 vpaddq 0x20($Tbl),@X[5],$t1 1378 vmovdqa $t2,0x20(%rsp) 1379 vpaddq 0x40($Tbl),@X[6],$t2 1380 vmovdqa $t3,0x30(%rsp) 1381 vpaddq 0x60($Tbl),@X[7],$t3 1382 vmovdqa $t0,0x40(%rsp) 1383 mov $A,$a1 1384 vmovdqa $t1,0x50(%rsp) 1385 mov $B,$a3 1386 vmovdqa $t2,0x60(%rsp) 1387 xor $C,$a3 # magic 1388 vmovdqa $t3,0x70(%rsp) 1389 mov $E,$a0 1390 jmp .Lxop_00_47 1391 1392.align 16 1393.Lxop_00_47: 1394 add \$`16*2*$SZ`,$Tbl 1395___ 1396sub XOP_512_00_47 () { 1397my $j = shift; 1398my $body = shift; 1399my @X = @_; 1400my @insns = (&$body,&$body); # 52 instructions 1401 1402 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1403 eval(shift(@insns)); 1404 eval(shift(@insns)); 1405 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1406 eval(shift(@insns)); 1407 eval(shift(@insns)); 1408 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1409 
eval(shift(@insns)); 1410 eval(shift(@insns)); 1411 &vpsrlq ($t0,$t0,$sigma0[2]); 1412 eval(shift(@insns)); 1413 eval(shift(@insns)); 1414 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1415 eval(shift(@insns)); 1416 eval(shift(@insns)); 1417 eval(shift(@insns)); 1418 eval(shift(@insns)); 1419 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1420 eval(shift(@insns)); 1421 eval(shift(@insns)); 1422 &vpxor ($t0,$t0,$t1); 1423 eval(shift(@insns)); 1424 eval(shift(@insns)); 1425 eval(shift(@insns)); 1426 eval(shift(@insns)); 1427 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1428 eval(shift(@insns)); 1429 eval(shift(@insns)); 1430 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1431 eval(shift(@insns)); 1432 eval(shift(@insns)); 1433 &vpsrlq ($t2,@X[7],$sigma1[2]); 1434 eval(shift(@insns)); 1435 eval(shift(@insns)); 1436 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1437 eval(shift(@insns)); 1438 eval(shift(@insns)); 1439 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1440 eval(shift(@insns)); 1441 eval(shift(@insns)); 1442 &vpxor ($t3,$t3,$t2); 1443 eval(shift(@insns)); 1444 eval(shift(@insns)); 1445 eval(shift(@insns)); 1446 eval(shift(@insns)); 1447 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1448 eval(shift(@insns)); 1449 eval(shift(@insns)); 1450 eval(shift(@insns)); 1451 eval(shift(@insns)); 1452 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1453 eval(shift(@insns)); 1454 eval(shift(@insns)); 1455 eval(shift(@insns)); 1456 eval(shift(@insns)); 1457 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1458 foreach (@insns) { eval; } # remaining instructions 1459 &vmovdqa (16*$j."(%rsp)",$t2); 1460} 1461 1462 for ($i=0,$j=0; $j<8; $j++) { 1463 &XOP_512_00_47($j,\&body_00_15,@X); 1464 push(@X,shift(@X)); # rotate(@X) 1465 } 1466 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1467 &jne (".Lxop_00_47"); 1468 1469 for ($i=0; $i<16; ) { 1470 foreach(body_00_15()) { eval; } 1471 } 1472} 1473$code.=<<___; 1474 mov $_ctx,$ctx 1475 mov $a1,$A 1476 1477 add $SZ*0($ctx),$A 1478 lea 16*$SZ($inp),$inp 1479 add $SZ*1($ctx),$B 1480 add $SZ*2($ctx),$C 1481 add $SZ*3($ctx),$D 1482 add $SZ*4($ctx),$E 1483 add $SZ*5($ctx),$F 1484 add $SZ*6($ctx),$G 1485 add $SZ*7($ctx),$H 1486 1487 cmp $_end,$inp 1488 1489 mov $A,$SZ*0($ctx) 1490 mov $B,$SZ*1($ctx) 1491 mov $C,$SZ*2($ctx) 1492 mov $D,$SZ*3($ctx) 1493 mov $E,$SZ*4($ctx) 1494 mov $F,$SZ*5($ctx) 1495 mov $G,$SZ*6($ctx) 1496 mov $H,$SZ*7($ctx) 1497 jb .Lloop_xop 1498 1499 mov $_rsp,%rsi 1500.cfi_def_cfa %rsi,8 1501 vzeroupper 1502___ 1503$code.=<<___ if ($win64); 1504 movaps 16*$SZ+32(%rsp),%xmm6 1505 movaps 16*$SZ+48(%rsp),%xmm7 1506 movaps 16*$SZ+64(%rsp),%xmm8 1507 movaps 16*$SZ+80(%rsp),%xmm9 1508___ 1509$code.=<<___ if ($win64 && $SZ>4); 1510 movaps 16*$SZ+96(%rsp),%xmm10 1511 movaps 16*$SZ+112(%rsp),%xmm11 1512___ 1513$code.=<<___; 1514 mov -48(%rsi),%r15 1515.cfi_restore %r15 1516 mov -40(%rsi),%r14 1517.cfi_restore %r14 1518 mov -32(%rsi),%r13 1519.cfi_restore %r13 1520 mov -24(%rsi),%r12 1521.cfi_restore %r12 1522 mov -16(%rsi),%rbp 1523.cfi_restore %rbp 1524 mov -8(%rsi),%rbx 1525.cfi_restore %rbx 1526 lea (%rsi),%rsp 1527.cfi_def_cfa_register %rsp 1528.Lepilogue_xop: 1529 ret 1530.cfi_endproc 1531.size ${func}_xop,.-${func}_xop 1532___ 1533} 1534###################################################################### 1535# AVX+shrd code path 1536# 1537local *ror = sub { &shrd(@_[0],@_) }; 1538 1539$code.=<<___; 1540.type ${func}_avx,\@function,3 1541.align 64 1542${func}_avx: 1543.cfi_startproc 1544.Lavx_shortcut: 1545 mov %rsp,%rax # copy %rsp 1546.cfi_def_cfa_register %rax 
1547 push %rbx 1548.cfi_push %rbx 1549 push %rbp 1550.cfi_push %rbp 1551 push %r12 1552.cfi_push %r12 1553 push %r13 1554.cfi_push %r13 1555 push %r14 1556.cfi_push %r14 1557 push %r15 1558.cfi_push %r15 1559 shl \$4,%rdx # num*16 1560 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1561 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1562 and \$-64,%rsp # align stack frame 1563 mov $ctx,$_ctx # save ctx, 1st arg 1564 mov $inp,$_inp # save inp, 2nd arh 1565 mov %rdx,$_end # save end pointer, "3rd" arg 1566 mov %rax,$_rsp # save copy of %rsp 1567.cfi_cfa_expression $_rsp,deref,+8 1568___ 1569$code.=<<___ if ($win64); 1570 movaps %xmm6,16*$SZ+32(%rsp) 1571 movaps %xmm7,16*$SZ+48(%rsp) 1572 movaps %xmm8,16*$SZ+64(%rsp) 1573 movaps %xmm9,16*$SZ+80(%rsp) 1574___ 1575$code.=<<___ if ($win64 && $SZ>4); 1576 movaps %xmm10,16*$SZ+96(%rsp) 1577 movaps %xmm11,16*$SZ+112(%rsp) 1578___ 1579$code.=<<___; 1580.Lprologue_avx: 1581 1582 vzeroupper 1583 mov $SZ*0($ctx),$A 1584 mov $SZ*1($ctx),$B 1585 mov $SZ*2($ctx),$C 1586 mov $SZ*3($ctx),$D 1587 mov $SZ*4($ctx),$E 1588 mov $SZ*5($ctx),$F 1589 mov $SZ*6($ctx),$G 1590 mov $SZ*7($ctx),$H 1591___ 1592 if ($SZ==4) { # SHA256 1593 my @X = map("%xmm$_",(0..3)); 1594 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1595 1596$code.=<<___; 1597 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1598 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1599 jmp .Lloop_avx 1600.align 16 1601.Lloop_avx: 1602 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1603 vmovdqu 0x00($inp),@X[0] 1604 vmovdqu 0x10($inp),@X[1] 1605 vmovdqu 0x20($inp),@X[2] 1606 vmovdqu 0x30($inp),@X[3] 1607 vpshufb $t3,@X[0],@X[0] 1608 lea $TABLE(%rip),$Tbl 1609 vpshufb $t3,@X[1],@X[1] 1610 vpshufb $t3,@X[2],@X[2] 1611 vpaddd 0x00($Tbl),@X[0],$t0 1612 vpshufb $t3,@X[3],@X[3] 1613 vpaddd 0x20($Tbl),@X[1],$t1 1614 vpaddd 0x40($Tbl),@X[2],$t2 1615 vpaddd 0x60($Tbl),@X[3],$t3 1616 vmovdqa $t0,0x00(%rsp) 1617 mov $A,$a1 1618 vmovdqa $t1,0x10(%rsp) 1619 mov $B,$a3 1620 vmovdqa $t2,0x20(%rsp) 1621 xor $C,$a3 # magic 1622 vmovdqa $t3,0x30(%rsp) 1623 mov $E,$a0 1624 jmp .Lavx_00_47 1625 1626.align 16 1627.Lavx_00_47: 1628 sub \$`-16*2*$SZ`,$Tbl # size optimization 1629___ 1630sub Xupdate_256_AVX () { 1631 ( 1632 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1633 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1634 '&vpsrld ($t2,$t0,$sigma0[0]);', 1635 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1636 '&vpsrld ($t3,$t0,$sigma0[2])', 1637 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1638 '&vpxor ($t0,$t3,$t2)', 1639 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1640 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1641 '&vpxor ($t0,$t0,$t1)', 1642 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1643 '&vpxor ($t0,$t0,$t2)', 1644 '&vpsrld ($t2,$t3,$sigma1[2]);', 1645 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1646 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1647 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1648 '&vpxor ($t2,$t2,$t3);', 1649 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1650 '&vpxor ($t2,$t2,$t3)', 1651 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1652 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1653 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1654 '&vpsrld ($t2,$t3,$sigma1[2])', 1655 '&vpsrlq ($t3,$t3,$sigma1[0])', 1656 '&vpxor ($t2,$t2,$t3);', 1657 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1658 '&vpxor ($t2,$t2,$t3)', 1659 '&vpshufb ($t2,$t2,$t5)', 1660 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1661 ); 1662} 1663 1664sub AVX_256_00_47 () { 1665my $j = shift; 1666my $body = shift; 1667my @X = @_; 
1668my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1669 1670 foreach (Xupdate_256_AVX()) { # 29 instructions 1671 eval; 1672 eval(shift(@insns)); 1673 eval(shift(@insns)); 1674 eval(shift(@insns)); 1675 } 1676 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1677 foreach (@insns) { eval; } # remaining instructions 1678 &vmovdqa (16*$j."(%rsp)",$t2); 1679} 1680 1681 for ($i=0,$j=0; $j<4; $j++) { 1682 &AVX_256_00_47($j,\&body_00_15,@X); 1683 push(@X,shift(@X)); # rotate(@X) 1684 } 1685 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1686 &jne (".Lavx_00_47"); 1687 1688 for ($i=0; $i<16; ) { 1689 foreach(body_00_15()) { eval; } 1690 } 1691 1692 } else { # SHA512 1693 my @X = map("%xmm$_",(0..7)); 1694 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1695 1696$code.=<<___; 1697 jmp .Lloop_avx 1698.align 16 1699.Lloop_avx: 1700 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1701 vmovdqu 0x00($inp),@X[0] 1702 lea $TABLE+0x80(%rip),$Tbl # size optimization 1703 vmovdqu 0x10($inp),@X[1] 1704 vmovdqu 0x20($inp),@X[2] 1705 vpshufb $t3,@X[0],@X[0] 1706 vmovdqu 0x30($inp),@X[3] 1707 vpshufb $t3,@X[1],@X[1] 1708 vmovdqu 0x40($inp),@X[4] 1709 vpshufb $t3,@X[2],@X[2] 1710 vmovdqu 0x50($inp),@X[5] 1711 vpshufb $t3,@X[3],@X[3] 1712 vmovdqu 0x60($inp),@X[6] 1713 vpshufb $t3,@X[4],@X[4] 1714 vmovdqu 0x70($inp),@X[7] 1715 vpshufb $t3,@X[5],@X[5] 1716 vpaddq -0x80($Tbl),@X[0],$t0 1717 vpshufb $t3,@X[6],@X[6] 1718 vpaddq -0x60($Tbl),@X[1],$t1 1719 vpshufb $t3,@X[7],@X[7] 1720 vpaddq -0x40($Tbl),@X[2],$t2 1721 vpaddq -0x20($Tbl),@X[3],$t3 1722 vmovdqa $t0,0x00(%rsp) 1723 vpaddq 0x00($Tbl),@X[4],$t0 1724 vmovdqa $t1,0x10(%rsp) 1725 vpaddq 0x20($Tbl),@X[5],$t1 1726 vmovdqa $t2,0x20(%rsp) 1727 vpaddq 0x40($Tbl),@X[6],$t2 1728 vmovdqa $t3,0x30(%rsp) 1729 vpaddq 0x60($Tbl),@X[7],$t3 1730 vmovdqa $t0,0x40(%rsp) 1731 mov $A,$a1 1732 vmovdqa $t1,0x50(%rsp) 1733 mov $B,$a3 1734 vmovdqa $t2,0x60(%rsp) 1735 xor $C,$a3 # magic 1736 vmovdqa $t3,0x70(%rsp) 1737 mov $E,$a0 1738 jmp .Lavx_00_47 1739 1740.align 16 1741.Lavx_00_47: 1742 add \$`16*2*$SZ`,$Tbl 1743___ 1744sub Xupdate_512_AVX () { 1745 ( 1746 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1747 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1748 '&vpsrlq ($t2,$t0,$sigma0[0])', 1749 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1750 '&vpsrlq ($t3,$t0,$sigma0[2])', 1751 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1752 '&vpxor ($t0,$t3,$t2)', 1753 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1754 '&vpxor ($t0,$t0,$t1)', 1755 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1756 '&vpxor ($t0,$t0,$t2)', 1757 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1758 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1759 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1760 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1761 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1762 '&vpxor ($t3,$t3,$t2)', 1763 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1764 '&vpxor ($t3,$t3,$t1)', 1765 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1766 '&vpxor ($t3,$t3,$t2)', 1767 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1768 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1769 ); 1770} 1771 1772sub AVX_512_00_47 () { 1773my $j = shift; 1774my $body = shift; 1775my @X = @_; 1776my @insns = (&$body,&$body); # 52 instructions 1777 1778 foreach (Xupdate_512_AVX()) { # 23 instructions 1779 eval; 1780 eval(shift(@insns)); 1781 eval(shift(@insns)); 1782 } 1783 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1784 foreach (@insns) { eval; } # remaining instructions 1785 &vmovdqa (16*$j."(%rsp)",$t2); 1786} 1787 1788 for ($i=0,$j=0; $j<8; $j++) { 1789 
&AVX_512_00_47($j,\&body_00_15,@X); 1790 push(@X,shift(@X)); # rotate(@X) 1791 } 1792 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1793 &jne (".Lavx_00_47"); 1794 1795 for ($i=0; $i<16; ) { 1796 foreach(body_00_15()) { eval; } 1797 } 1798} 1799$code.=<<___; 1800 mov $_ctx,$ctx 1801 mov $a1,$A 1802 1803 add $SZ*0($ctx),$A 1804 lea 16*$SZ($inp),$inp 1805 add $SZ*1($ctx),$B 1806 add $SZ*2($ctx),$C 1807 add $SZ*3($ctx),$D 1808 add $SZ*4($ctx),$E 1809 add $SZ*5($ctx),$F 1810 add $SZ*6($ctx),$G 1811 add $SZ*7($ctx),$H 1812 1813 cmp $_end,$inp 1814 1815 mov $A,$SZ*0($ctx) 1816 mov $B,$SZ*1($ctx) 1817 mov $C,$SZ*2($ctx) 1818 mov $D,$SZ*3($ctx) 1819 mov $E,$SZ*4($ctx) 1820 mov $F,$SZ*5($ctx) 1821 mov $G,$SZ*6($ctx) 1822 mov $H,$SZ*7($ctx) 1823 jb .Lloop_avx 1824 1825 mov $_rsp,%rsi 1826.cfi_def_cfa %rsi,8 1827 vzeroupper 1828___ 1829$code.=<<___ if ($win64); 1830 movaps 16*$SZ+32(%rsp),%xmm6 1831 movaps 16*$SZ+48(%rsp),%xmm7 1832 movaps 16*$SZ+64(%rsp),%xmm8 1833 movaps 16*$SZ+80(%rsp),%xmm9 1834___ 1835$code.=<<___ if ($win64 && $SZ>4); 1836 movaps 16*$SZ+96(%rsp),%xmm10 1837 movaps 16*$SZ+112(%rsp),%xmm11 1838___ 1839$code.=<<___; 1840 mov -48(%rsi),%r15 1841.cfi_restore %r15 1842 mov -40(%rsi),%r14 1843.cfi_restore %r14 1844 mov -32(%rsi),%r13 1845.cfi_restore %r13 1846 mov -24(%rsi),%r12 1847.cfi_restore %r12 1848 mov -16(%rsi),%rbp 1849.cfi_restore %rbp 1850 mov -8(%rsi),%rbx 1851.cfi_restore %rbx 1852 lea (%rsi),%rsp 1853.cfi_def_cfa_register %rsp 1854.Lepilogue_avx: 1855 ret 1856.cfi_endproc 1857.size ${func}_avx,.-${func}_avx 1858___ 1859 1860if ($avx>1) {{ 1861###################################################################### 1862# AVX2+BMI code path 1863# 1864my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1865my $PUSH8=8*2*$SZ; 1866use integer; 1867 1868sub bodyx_00_15 () { 1869 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1870 ( 1871 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1872 1873 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1874 '&and ($a4,$e)', # f&e 1875 '&rorx ($a0,$e,$Sigma1[2])', 1876 '&rorx ($a2,$e,$Sigma1[1])', 1877 1878 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1879 '&lea ($h,"($h,$a4)")', 1880 '&andn ($a4,$e,$g)', # ~e&g 1881 '&xor ($a0,$a2)', 1882 1883 '&rorx ($a1,$e,$Sigma1[0])', 1884 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1885 '&xor ($a0,$a1)', # Sigma1(e) 1886 '&mov ($a2,$a)', 1887 1888 '&rorx ($a4,$a,$Sigma0[2])', 1889 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1890 '&xor ($a2,$b)', # a^b, b^c in next round 1891 '&rorx ($a1,$a,$Sigma0[1])', 1892 1893 '&rorx ($a0,$a,$Sigma0[0])', 1894 '&lea ($d,"($d,$h)")', # d+=h 1895 '&and ($a3,$a2)', # (b^c)&(a^b) 1896 '&xor ($a1,$a4)', 1897 1898 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1899 '&xor ($a1,$a0)', # Sigma0(a) 1900 '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) 1901 '&mov ($a4,$e)', # copy of f in future 1902 1903 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1904 ); 1905 # and at the finish one has to $a+=$a1 1906} 1907 1908$code.=<<___; 1909.type ${func}_avx2,\@function,3 1910.align 64 1911${func}_avx2: 1912.cfi_startproc 1913.Lavx2_shortcut: 1914 mov %rsp,%rax # copy %rsp 1915.cfi_def_cfa_register %rax 1916 push %rbx 1917.cfi_push %rbx 1918 push %rbp 1919.cfi_push %rbp 1920 push %r12 1921.cfi_push %r12 1922 push %r13 1923.cfi_push %r13 1924 push %r14 1925.cfi_push %r14 1926 push %r15 1927.cfi_push %r15 1928 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1929 shl \$4,%rdx # num*16 1930 and \$-256*$SZ,%rsp # align stack frame 1931 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1932 add \$`2*$SZ*($rounds-8)`,%rsp 1933 mov $ctx,$_ctx # save ctx, 1st arg 1934 mov $inp,$_inp # save inp, 2nd arh 1935 mov %rdx,$_end # save end pointer, "3rd" arg 1936 mov %rax,$_rsp # save copy of %rsp 1937.cfi_cfa_expression $_rsp,deref,+8 1938___ 1939$code.=<<___ if ($win64); 1940 movaps %xmm6,16*$SZ+32(%rsp) 1941 movaps %xmm7,16*$SZ+48(%rsp) 1942 movaps %xmm8,16*$SZ+64(%rsp) 1943 movaps %xmm9,16*$SZ+80(%rsp) 1944___ 1945$code.=<<___ if ($win64 && $SZ>4); 1946 movaps %xmm10,16*$SZ+96(%rsp) 1947 movaps %xmm11,16*$SZ+112(%rsp) 1948___ 1949$code.=<<___; 1950.Lprologue_avx2: 1951 1952 vzeroupper 1953 sub \$-16*$SZ,$inp # inp++, size optimization 1954 mov $SZ*0($ctx),$A 1955 mov $inp,%r12 # borrow $T1 1956 mov $SZ*1($ctx),$B 1957 cmp %rdx,$inp # $_end 1958 mov $SZ*2($ctx),$C 1959 cmove %rsp,%r12 # next block or random data 1960 mov $SZ*3($ctx),$D 1961 mov $SZ*4($ctx),$E 1962 mov $SZ*5($ctx),$F 1963 mov $SZ*6($ctx),$G 1964 mov $SZ*7($ctx),$H 1965___ 1966 if ($SZ==4) { # SHA256 1967 my @X = map("%ymm$_",(0..3)); 1968 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1969 1970$code.=<<___; 1971 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1972 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1973 jmp .Loop_avx2 1974.align 16 1975.Loop_avx2: 1976 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1977 vmovdqu -16*$SZ+0($inp),%xmm0 1978 vmovdqu -16*$SZ+16($inp),%xmm1 1979 vmovdqu -16*$SZ+32($inp),%xmm2 1980 vmovdqu -16*$SZ+48($inp),%xmm3 1981 #mov $inp,$_inp # offload $inp 1982 vinserti128 \$1,(%r12),@X[0],@X[0] 1983 vinserti128 \$1,16(%r12),@X[1],@X[1] 1984 vpshufb $t3,@X[0],@X[0] 1985 vinserti128 \$1,32(%r12),@X[2],@X[2] 1986 vpshufb $t3,@X[1],@X[1] 1987 vinserti128 \$1,48(%r12),@X[3],@X[3] 1988 1989 lea $TABLE(%rip),$Tbl 1990 vpshufb $t3,@X[2],@X[2] 1991 vpaddd 0x00($Tbl),@X[0],$t0 1992 vpshufb $t3,@X[3],@X[3] 1993 vpaddd 0x20($Tbl),@X[1],$t1 1994 vpaddd 0x40($Tbl),@X[2],$t2 1995 vpaddd 0x60($Tbl),@X[3],$t3 1996 vmovdqa $t0,0x00(%rsp) 1997 xor $a1,$a1 1998 vmovdqa $t1,0x20(%rsp) 1999___ 2000$code.=<<___ if (!$win64); 2001# temporarily use %rdi as frame pointer 2002 mov $_rsp,%rdi 2003.cfi_def_cfa %rdi,8 2004___ 2005$code.=<<___; 2006 lea -$PUSH8(%rsp),%rsp 2007___ 2008$code.=<<___ if (!$win64); 2009# the frame info is at $_rsp, but the stack is moving... 
$code.=<<___;
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%2)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
					} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp		# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
	mov	$_rsp,%rdi
.cfi_def_cfa	%rdi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rdi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___
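# Same interleaving idea as AVX2_256_00_47 above, scaled for SHA512:
# each call weaves only two copies of the round body (the 48 quoted
# instructions) into the 23-instruction Xupdate_512_AVX, and because
# every call still stores 32 bytes of precomputed X[i]+K[i] while
# $PUSH8 is twice as large, the stack window slides down only on every
# fourth call instead of every second.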
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%4)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}

	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
					}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
    }
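# The .Lower_avx2 rounds generated just above take care of the second
# block of every pair: its X[i]+K[i] sums already sit in the upper
# 128-bit lanes of the values stored on the stack during the schedule
# rounds, so they are simply re-read at lane offset +16($Tbl), with
# $Tbl repurposed as a cursor into the stack area that the code below
# walks back down towards %rsp.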
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
# restore frame pointer to original location at $_rsp
.cfi_cfa_expression	$_rsp,deref,+8

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8

.Ldone_avx2:
	mov	`16*$SZ+3*8`($Tbl),%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32($Tbl),%xmm6
	movaps	16*$SZ+48($Tbl),%xmm7
	movaps	16*$SZ+64($Tbl),%xmm8
	movaps	16*$SZ+80($Tbl),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96($Tbl),%xmm10
	movaps	16*$SZ+112($Tbl),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6 save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
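# The .rva pairs emitted per routine in the .xdata section further down
# (.Lprologue,.Lepilogue and friends) are what se_handler reads back as
# HandlerData[0] and HandlerData[1]: an RIP inside [prologue,epilogue)
# means the non-volatile registers have been saved, so they are
# recovered through the $_rsp slot; any other RIP is treated as a frame
# that is still, or again, intact.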
$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
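# As an illustration of the encoding above (byte values come out in
# decimal, that being how join() renders the numbers): a source line
# "sha256rnds2 %xmm3,%xmm4" matches the pattern with $1=3 and $2=4,
# giving ModR/M 0xc0|3|(4<<3) = 0xe3, so it is rewritten to
# ".byte 15,56,203,227", i.e. 0x0f,0x38,0xcb,0xe3, which matches the
# NP 0F 38 CB /r encoding of the instruction for assemblers that do
# not know the SHA extension mnemonics.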
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";