#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA1 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha1	aesni-sha1	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 8.00+4.44=12.4;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 30% to 100% (on Haswell);

# Command line is "[flavour] output".  A first argument that contains a
# dot is taken to be the output file name, in which case no flavour was
# given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by flavour (nasm/masm/mingw64) or by an
# output file ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first, then
# in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx ends up 0, 1 or 2 depending on the version reported by the
# assembler/compiler that is probed below; each probe after the first
# runs only while $avx is still 0.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# Accept any number of digits in the major version: the former [3-9]
# character class could not match "clang version 10.0" and later, which
# left $avx at 0 on current toolchains.  Versions below 3.0 still yield 0.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything written to OUT is piped through the translator.  Fail loudly
# if the pipe cannot be created instead of silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha1_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
$ctx = "%rdi";				# 1st arg
$inp = "%rsi";				# 2nd arg
$num = "%edx";
@ptr = map { "%r$_" } (8..11);		# %r8..%r11, dereferenced by the input loads
$Tbl = "%rbp";

# Default register allocation: working variables in %xmm0-4, temporaries
# in %xmm5-9, message-schedule window in %xmm10-14.
@V  = ($A,$B,$C,$D,$E)      = map { "%xmm$_" } (0..4);
($t0,$t1,$t2,$t3,$tx)       = map { "%xmm$_" } (5..9);
@Xi = map { "%xmm$_" } (10..14);
$K  = "%xmm15";

if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty].
    # This allocation replaces the default one above.
    @Xi = map { "%xmm$_" } (0..4);
    ($tx,$t0,$t1,$t2,$t3)  = map { "%xmm$_" } (5..9);
    @V  = ($A,$B,$C,$D,$E) = map { "%xmm$_" } (10..14);
}

$REG_SZ = 16;

# Xi_off(index)
#
# Returns the memory operand, as a string, of the slot that holds schedule
# word "index".  The index is reduced modulo 16 and scaled by the current
# value of $REG_SZ (read at call time).  Byte offsets below 256 are
# expressed relative to %rax with a -128 bias, larger ones relative to
# %rbx with a -256-128 bias.  The arithmetic inside the returned string
# is left unevaluated.
sub Xi_off {
    my ($index) = @_;
    my $bytes = ($index % 16) * $REG_SZ;

    return $bytes < 256 ? "$bytes-128(%rax)" : "$bytes-256-128(%rbx)";
}

# BODY_00_19(i, a, b, c, d, e)
#
# Appends the code for round i of the first twenty rounds to $code, with
# a..e naming the registers that play the corresponding roles in this
# round, then rotates @Xi by one position.  Ch(b,c,d) is used as the
# round function.
sub BODY_00_19 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i+1;
    my $k = $i+2;

    # Loads are performed 2+3/4 iterations in advance. 3/4 means that out
    # of 4 words you would expect to be loaded per given iteration one is
    # spilled to next iteration. In other words indices in four input
    # streams are distributed as following:
    #
    # $i==0:	0,0,0,0,1,1,1,1,2,2,2,
    # $i==1:	2,3,3,3,
    # $i==2:	3,4,4,4,
    # ...
    # $i==13:	14,15,15,15,
    # $i==14:	15
    #
    # Then at $i==15 Xupdate is applied one iteration in advance...

    if ($i == 0) {
	# Very first round: advance the four pointers past the block and
	# gather word 0 (plus the first three loads of word 1).
	$code .= <<___;
	movd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	 movd		`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[2]),$t2
	pshufb		$tx,@Xi[0]
___
    }

    if ($i < 14) {
	# Round interleaved with loading of upcoming input words.
	$code .= <<___;				# just load input
	movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	 movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2				# rol(a,5)
	 movd		`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 movd		`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b				# b=rol(b,30)
___
    }

    if ($i == 14) {
	# Last input word: no further loads are started, the freed slots
	# carry prefetches instead.
	$code .= <<___;				# just load input
	movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1
	prefetcht0	63(@ptr[1])

	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)
	prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	prefetcht0	63(@ptr[3])
	por	$t1,$b				# b=rol(b,30)
___
    }

    if ($i >= 13 && $i < 15) {
	$code .= <<___;
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
    }

    if ($i >= 15) {
	# Rounds 15..19: the schedule word for the next round is computed
	# here, interleaved with the round itself.
	$code .= <<___;				# apply Xupdate
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	 pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	 movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	 psrld	\$31,$tx
	 paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
	por	$t1,$b				# b=rol(b,30)
___
    }

    # Slide the schedule window: the register that held X[i] moves to the
    # back of the list.
    push @Xi, shift @Xi;
}

# BODY_20_39(i, a, b, c, d, e)
#
# Appends the code for a Parity(b,c,d) round to $code and rotates @Xi by
# one position.  For i < 79 the next schedule word is computed alongside
# the round; the store of X[i] to its stack slot is emitted only for
# i < 72.  Round 79 is emitted without any schedule update.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i+1;

    if ($i < 79) {
	$code .= <<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
	if ($i < 72) {
	    $code .= <<___;
	movdqa	@Xi[0],`&Xi_off($i)`
___
	}
	$code .= <<___;
	paddd	@Xi[0],$e			# e+=X[i]
	 pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0				# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	 movdqa	@Xi[1],$tx
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	paddd	$t0,$e				# e+=Parity(b,c,d)
	 paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
    }
    elsif ($i == 79) {
	$code .= <<___;
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0				# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2				# rol(a,5)
	paddd	$t0,$e				# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$t1,$b				# b=rol(b,30)
___
    }

    push @Xi, shift @Xi;
}

# BODY_40_59(i, a, b, c, d, e)
#
# Appends the code for a Maj round to $code: the majority function is
# accumulated into e in two steps (c&d first, then b&(c^d)), the next
# schedule word is computed alongside, X[i] is stored to its stack slot,
# and @Xi is rotated by one position.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i+1;

    $code .= <<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e				# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	 movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	 paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e				# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@X[1],1)
	por	$t1,$b				# b=rol(b,30)
___

    push @Xi, shift @Xi;
}

# Entry point.  Bit 61 of the quadword at OPENSSL_ia32cap_P+4 diverts to
# _shaext_shortcut; when the generator was run with a non-zero $avx, bit
# 28 of the same word diverts to _avx_shortcut.  Otherwise the code below
# is used.
$code .= <<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
if ($avx) {
    $code .= <<___;
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
}
$code .= <<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
if ($win64) {
    # Win64 only: spill %xmm6-%xmm15 below the two pushed registers.
    $code .= <<___;
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
}

# Frame: 18 register-sized slots, aligned down to 256 bytes.  Slot 16
# (addressed through %rbx) holds the per-lane block counters, slot 17 the
# caller's %rsp and, 8 bytes further, the saved $num.
$code .= <<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___

# One stanza per lane: fetch pointer and block count, track the largest
# count in $num, and point lanes whose count is <= 0 at $Tbl instead of
# their input.
for ($i = 0; $i < 4; $i++) {
    $code .= <<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}

$code .= <<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___

# Emit the 80 rounds.  After each round @V is rotated so that the same
# five registers take the next set of roles; $K is reloaded from the
# table between the four groups of twenty rounds.  $i is the file-level
# counter and is left at 80 afterwards, as before.
$i = 0;
while ($i < 20) {
    BODY_00_19($i, @V);
    unshift @V, pop @V;
    $i++;
}
$code .= "	movdqa	0x00($Tbl),$K\n";	# K_20_39
while ($i < 40) {
    BODY_20_39($i, @V);
    unshift @V, pop @V;
    $i++;
}
$code .= "	movdqa	0x20($Tbl),$K\n";	# K_40_59
while ($i < 60) {
    BODY_40_59($i, @V);
    unshift @V, pop @V;
    $i++;
}
$code .= "	movdqa	0x40($Tbl),$K\n";	# K_60_79
while ($i < 80) {
    BODY_20_39($i, @V);
    unshift @V, pop @V;
    $i++;
}

# End of block: lanes whose counter is <= 1 get their pointer redirected
# to $Tbl, counters that are still positive are decremented, and the
# freshly computed state is masked with the "counter was positive" mask
# before being added to the stored context.
$code .= <<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examinte counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
___
if ($win64) {
    $code .= <<___;
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
}
$code .= <<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha1_multi_block,.-sha1_multi_block
___

# SHA-extension code path.  The block opened here and the heredoc that
# ends this span both continue in the lines that follow.
						{{{
my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(11..14));

$code.=<<___;
.type	sha1_multi_block_shaext,\@function,3
.align	32
sha1_multi_block_shaext:
_shaext_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
529*7bded2dbSJung-uk Kim___ 530*7bded2dbSJung-uk Kim$code.=<<___; 531*7bded2dbSJung-uk Kim sub \$`$REG_SZ*18`,%rsp 532*7bded2dbSJung-uk Kim shl \$1,$num # we process pair at a time 533*7bded2dbSJung-uk Kim and \$-256,%rsp 534*7bded2dbSJung-uk Kim lea 0x40($ctx),$ctx # size optimization 535*7bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 536*7bded2dbSJung-uk Kim.Lbody_shaext: 537*7bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 538*7bded2dbSJung-uk Kim movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap 539*7bded2dbSJung-uk Kim 540*7bded2dbSJung-uk Kim.Loop_grande_shaext: 541*7bded2dbSJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num 542*7bded2dbSJung-uk Kim xor $num,$num 543*7bded2dbSJung-uk Kim___ 544*7bded2dbSJung-uk Kimfor($i=0;$i<2;$i++) { 545*7bded2dbSJung-uk Kim $code.=<<___; 546*7bded2dbSJung-uk Kim mov `16*$i+0`($inp),@ptr[$i] # input pointer 547*7bded2dbSJung-uk Kim mov `16*$i+8`($inp),%ecx # number of blocks 548*7bded2dbSJung-uk Kim cmp $num,%ecx 549*7bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 550*7bded2dbSJung-uk Kim test %ecx,%ecx 551*7bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 552*7bded2dbSJung-uk Kim cmovle %rsp,@ptr[$i] # cancel input 553*7bded2dbSJung-uk Kim___ 554*7bded2dbSJung-uk Kim} 555*7bded2dbSJung-uk Kim$code.=<<___; 556*7bded2dbSJung-uk Kim test $num,$num 557*7bded2dbSJung-uk Kim jz .Ldone_shaext 558*7bded2dbSJung-uk Kim 559*7bded2dbSJung-uk Kim movq 0x00-0x40($ctx),$ABCD0 # a1.a0 560*7bded2dbSJung-uk Kim movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 561*7bded2dbSJung-uk Kim movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 562*7bded2dbSJung-uk Kim movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 563*7bded2dbSJung-uk Kim movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 564*7bded2dbSJung-uk Kim 565*7bded2dbSJung-uk Kim punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 566*7bded2dbSJung-uk Kim punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 567*7bded2dbSJung-uk Kim 568*7bded2dbSJung-uk Kim movdqa $ABCD0,$ABCD1 569*7bded2dbSJung-uk Kim punpcklqdq 
@MSG0[1],$ABCD0 # d0.c0.b0.a0 570*7bded2dbSJung-uk Kim punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 571*7bded2dbSJung-uk Kim 572*7bded2dbSJung-uk Kim pshufd \$0b00111111,@MSG0[3],$E0 573*7bded2dbSJung-uk Kim pshufd \$0b01111111,@MSG0[3],$E1 574*7bded2dbSJung-uk Kim pshufd \$0b00011011,$ABCD0,$ABCD0 575*7bded2dbSJung-uk Kim pshufd \$0b00011011,$ABCD1,$ABCD1 576*7bded2dbSJung-uk Kim jmp .Loop_shaext 577*7bded2dbSJung-uk Kim 578*7bded2dbSJung-uk Kim.align 32 579*7bded2dbSJung-uk Kim.Loop_shaext: 580*7bded2dbSJung-uk Kim movdqu 0x00(@ptr[0]),@MSG0[0] 581*7bded2dbSJung-uk Kim movdqu 0x00(@ptr[1]),@MSG1[0] 582*7bded2dbSJung-uk Kim movdqu 0x10(@ptr[0]),@MSG0[1] 583*7bded2dbSJung-uk Kim movdqu 0x10(@ptr[1]),@MSG1[1] 584*7bded2dbSJung-uk Kim movdqu 0x20(@ptr[0]),@MSG0[2] 585*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG0[0] 586*7bded2dbSJung-uk Kim movdqu 0x20(@ptr[1]),@MSG1[2] 587*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[0] 588*7bded2dbSJung-uk Kim movdqu 0x30(@ptr[0]),@MSG0[3] 589*7bded2dbSJung-uk Kim lea 0x40(@ptr[0]),@ptr[0] 590*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG0[1] 591*7bded2dbSJung-uk Kim movdqu 0x30(@ptr[1]),@MSG1[3] 592*7bded2dbSJung-uk Kim lea 0x40(@ptr[1]),@ptr[1] 593*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[1] 594*7bded2dbSJung-uk Kim 595*7bded2dbSJung-uk Kim movdqa $E0,0x50(%rsp) # offload 596*7bded2dbSJung-uk Kim paddd @MSG0[0],$E0 597*7bded2dbSJung-uk Kim movdqa $E1,0x70(%rsp) 598*7bded2dbSJung-uk Kim paddd @MSG1[0],$E1 599*7bded2dbSJung-uk Kim movdqa $ABCD0,0x40(%rsp) # offload 600*7bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 601*7bded2dbSJung-uk Kim movdqa $ABCD1,0x60(%rsp) 602*7bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 603*7bded2dbSJung-uk Kim sha1rnds4 \$0,$E0,$ABCD0 # 0-3 604*7bded2dbSJung-uk Kim sha1nexte @MSG0[1],$E0_ 605*7bded2dbSJung-uk Kim sha1rnds4 \$0,$E1,$ABCD1 # 0-3 606*7bded2dbSJung-uk Kim sha1nexte @MSG1[1],$E1_ 607*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG0[2] 608*7bded2dbSJung-uk Kim prefetcht0 127(@ptr[0]) 609*7bded2dbSJung-uk Kim sha1msg1 
@MSG0[1],@MSG0[0] 610*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[2] 611*7bded2dbSJung-uk Kim prefetcht0 127(@ptr[1]) 612*7bded2dbSJung-uk Kim sha1msg1 @MSG1[1],@MSG1[0] 613*7bded2dbSJung-uk Kim 614*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG0[3] 615*7bded2dbSJung-uk Kim movdqa $ABCD0,$E0 616*7bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[3] 617*7bded2dbSJung-uk Kim movdqa $ABCD1,$E1 618*7bded2dbSJung-uk Kim sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 619*7bded2dbSJung-uk Kim sha1nexte @MSG0[2],$E0 620*7bded2dbSJung-uk Kim sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 621*7bded2dbSJung-uk Kim sha1nexte @MSG1[2],$E1 622*7bded2dbSJung-uk Kim pxor @MSG0[2],@MSG0[0] 623*7bded2dbSJung-uk Kim sha1msg1 @MSG0[2],@MSG0[1] 624*7bded2dbSJung-uk Kim pxor @MSG1[2],@MSG1[0] 625*7bded2dbSJung-uk Kim sha1msg1 @MSG1[2],@MSG1[1] 626*7bded2dbSJung-uk Kim___ 627*7bded2dbSJung-uk Kimfor($i=2;$i<20-4;$i++) { 628*7bded2dbSJung-uk Kim$code.=<<___; 629*7bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 630*7bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 631*7bded2dbSJung-uk Kim sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 632*7bded2dbSJung-uk Kim sha1nexte @MSG0[3],$E0_ 633*7bded2dbSJung-uk Kim sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 634*7bded2dbSJung-uk Kim sha1nexte @MSG1[3],$E1_ 635*7bded2dbSJung-uk Kim sha1msg2 @MSG0[3],@MSG0[0] 636*7bded2dbSJung-uk Kim sha1msg2 @MSG1[3],@MSG1[0] 637*7bded2dbSJung-uk Kim pxor @MSG0[3],@MSG0[1] 638*7bded2dbSJung-uk Kim sha1msg1 @MSG0[3],@MSG0[2] 639*7bded2dbSJung-uk Kim pxor @MSG1[3],@MSG1[1] 640*7bded2dbSJung-uk Kim sha1msg1 @MSG1[3],@MSG1[2] 641*7bded2dbSJung-uk Kim___ 642*7bded2dbSJung-uk Kim ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); 643*7bded2dbSJung-uk Kim push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); 644*7bded2dbSJung-uk Kim} 645*7bded2dbSJung-uk Kim$code.=<<___; 646*7bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 647*7bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 648*7bded2dbSJung-uk Kim sha1rnds4 \$3,$E0,$ABCD0 # 64-67 649*7bded2dbSJung-uk Kim sha1nexte @MSG0[3],$E0_ 650*7bded2dbSJung-uk Kim sha1rnds4 
\$3,$E1,$ABCD1 # 64-67 651*7bded2dbSJung-uk Kim sha1nexte @MSG1[3],$E1_ 652*7bded2dbSJung-uk Kim sha1msg2 @MSG0[3],@MSG0[0] 653*7bded2dbSJung-uk Kim sha1msg2 @MSG1[3],@MSG1[0] 654*7bded2dbSJung-uk Kim pxor @MSG0[3],@MSG0[1] 655*7bded2dbSJung-uk Kim pxor @MSG1[3],@MSG1[1] 656*7bded2dbSJung-uk Kim 657*7bded2dbSJung-uk Kim mov \$1,%ecx 658*7bded2dbSJung-uk Kim pxor @MSG0[2],@MSG0[2] # zero 659*7bded2dbSJung-uk Kim cmp 4*0(%rbx),%ecx # examine counters 660*7bded2dbSJung-uk Kim cmovge %rsp,@ptr[0] # cancel input 661*7bded2dbSJung-uk Kim 662*7bded2dbSJung-uk Kim movdqa $ABCD0,$E0 663*7bded2dbSJung-uk Kim movdqa $ABCD1,$E1 664*7bded2dbSJung-uk Kim sha1rnds4 \$3,$E0_,$ABCD0 # 68-71 665*7bded2dbSJung-uk Kim sha1nexte @MSG0[0],$E0 666*7bded2dbSJung-uk Kim sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 667*7bded2dbSJung-uk Kim sha1nexte @MSG1[0],$E1 668*7bded2dbSJung-uk Kim sha1msg2 @MSG0[0],@MSG0[1] 669*7bded2dbSJung-uk Kim sha1msg2 @MSG1[0],@MSG1[1] 670*7bded2dbSJung-uk Kim 671*7bded2dbSJung-uk Kim cmp 4*1(%rbx),%ecx 672*7bded2dbSJung-uk Kim cmovge %rsp,@ptr[1] 673*7bded2dbSJung-uk Kim movq (%rbx),@MSG0[0] # pull counters 674*7bded2dbSJung-uk Kim 675*7bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 676*7bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 677*7bded2dbSJung-uk Kim sha1rnds4 \$3,$E0,$ABCD0 # 72-75 678*7bded2dbSJung-uk Kim sha1nexte @MSG0[1],$E0_ 679*7bded2dbSJung-uk Kim sha1rnds4 \$3,$E1,$ABCD1 # 72-75 680*7bded2dbSJung-uk Kim sha1nexte @MSG1[1],$E1_ 681*7bded2dbSJung-uk Kim 682*7bded2dbSJung-uk Kim pshufd \$0x00,@MSG0[0],@MSG1[2] 683*7bded2dbSJung-uk Kim pshufd \$0x55,@MSG0[0],@MSG1[3] 684*7bded2dbSJung-uk Kim movdqa @MSG0[0],@MSG0[1] 685*7bded2dbSJung-uk Kim pcmpgtd @MSG0[2],@MSG1[2] 686*7bded2dbSJung-uk Kim pcmpgtd @MSG0[2],@MSG1[3] 687*7bded2dbSJung-uk Kim 688*7bded2dbSJung-uk Kim movdqa $ABCD0,$E0 689*7bded2dbSJung-uk Kim movdqa $ABCD1,$E1 690*7bded2dbSJung-uk Kim sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 691*7bded2dbSJung-uk Kim sha1nexte $MSG0[2],$E0 692*7bded2dbSJung-uk Kim sha1rnds4 
\$3,$E1_,$ABCD1 # 76-79 693*7bded2dbSJung-uk Kim sha1nexte $MSG0[2],$E1 694*7bded2dbSJung-uk Kim 695*7bded2dbSJung-uk Kim pcmpgtd @MSG0[2],@MSG0[1] # counter mask 696*7bded2dbSJung-uk Kim pand @MSG1[2],$ABCD0 697*7bded2dbSJung-uk Kim pand @MSG1[2],$E0 698*7bded2dbSJung-uk Kim pand @MSG1[3],$ABCD1 699*7bded2dbSJung-uk Kim pand @MSG1[3],$E1 700*7bded2dbSJung-uk Kim paddd @MSG0[1],@MSG0[0] # counters-- 701*7bded2dbSJung-uk Kim 702*7bded2dbSJung-uk Kim paddd 0x40(%rsp),$ABCD0 703*7bded2dbSJung-uk Kim paddd 0x50(%rsp),$E0 704*7bded2dbSJung-uk Kim paddd 0x60(%rsp),$ABCD1 705*7bded2dbSJung-uk Kim paddd 0x70(%rsp),$E1 706*7bded2dbSJung-uk Kim 707*7bded2dbSJung-uk Kim movq @MSG0[0],(%rbx) # save counters 708*7bded2dbSJung-uk Kim dec $num 709*7bded2dbSJung-uk Kim jnz .Loop_shaext 710*7bded2dbSJung-uk Kim 711*7bded2dbSJung-uk Kim mov `$REG_SZ*17+8`(%rsp),$num 712*7bded2dbSJung-uk Kim 713*7bded2dbSJung-uk Kim pshufd \$0b00011011,$ABCD0,$ABCD0 714*7bded2dbSJung-uk Kim pshufd \$0b00011011,$ABCD1,$ABCD1 715*7bded2dbSJung-uk Kim 716*7bded2dbSJung-uk Kim movdqa $ABCD0,@MSG0[0] 717*7bded2dbSJung-uk Kim punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 718*7bded2dbSJung-uk Kim punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 719*7bded2dbSJung-uk Kim punpckhdq $E1,$E0 # e1.e0.xx.xx 720*7bded2dbSJung-uk Kim movq $ABCD0,0x00-0x40($ctx) # a1.a0 721*7bded2dbSJung-uk Kim psrldq \$8,$ABCD0 722*7bded2dbSJung-uk Kim movq @MSG0[0],0x40-0x40($ctx)# c1.c0 723*7bded2dbSJung-uk Kim psrldq \$8,@MSG0[0] 724*7bded2dbSJung-uk Kim movq $ABCD0,0x20-0x40($ctx) # b1.b0 725*7bded2dbSJung-uk Kim psrldq \$8,$E0 726*7bded2dbSJung-uk Kim movq @MSG0[0],0x60-0x40($ctx)# d1.d0 727*7bded2dbSJung-uk Kim movq $E0,0x80-0x40($ctx) # e1.e0 728*7bded2dbSJung-uk Kim 729*7bded2dbSJung-uk Kim lea `$REG_SZ/2`($ctx),$ctx 730*7bded2dbSJung-uk Kim lea `16*2`($inp),$inp 731*7bded2dbSJung-uk Kim dec $num 732*7bded2dbSJung-uk Kim jnz .Loop_grande_shaext 733*7bded2dbSJung-uk Kim 734*7bded2dbSJung-uk Kim.Ldone_shaext: 735*7bded2dbSJung-uk Kim #mov 
`$REG_SZ*17`(%rsp),%rax # original %rsp 736*7bded2dbSJung-uk Kim___ 737*7bded2dbSJung-uk Kim$code.=<<___ if ($win64); 738*7bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm6 739*7bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm7 740*7bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm8 741*7bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm9 742*7bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm10 743*7bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm11 744*7bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm12 745*7bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm13 746*7bded2dbSJung-uk Kim movaps -0x38(%rax),%xmm14 747*7bded2dbSJung-uk Kim movaps -0x28(%rax),%xmm15 748*7bded2dbSJung-uk Kim___ 749*7bded2dbSJung-uk Kim$code.=<<___; 750*7bded2dbSJung-uk Kim mov -16(%rax),%rbp 751*7bded2dbSJung-uk Kim mov -8(%rax),%rbx 752*7bded2dbSJung-uk Kim lea (%rax),%rsp 753*7bded2dbSJung-uk Kim.Lepilogue_shaext: 754*7bded2dbSJung-uk Kim ret 755*7bded2dbSJung-uk Kim.size sha1_multi_block_shaext,.-sha1_multi_block_shaext 756*7bded2dbSJung-uk Kim___ 757*7bded2dbSJung-uk Kim }}} 758*7bded2dbSJung-uk Kim 759*7bded2dbSJung-uk Kim if ($avx) {{{ 760*7bded2dbSJung-uk Kimsub BODY_00_19_avx { 761*7bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 762*7bded2dbSJung-uk Kimmy $j=$i+1; 763*7bded2dbSJung-uk Kimmy $k=$i+2; 764*7bded2dbSJung-uk Kimmy $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128"; 765*7bded2dbSJung-uk Kimmy $ptr_n = $REG_SZ==16 ? 
@ptr[1] : @ptr[4]; 766*7bded2dbSJung-uk Kim 767*7bded2dbSJung-uk Kim$code.=<<___ if ($i==0 && $REG_SZ==16); 768*7bded2dbSJung-uk Kim vmovd (@ptr[0]),@Xi[0] 769*7bded2dbSJung-uk Kim lea `16*4`(@ptr[0]),@ptr[0] 770*7bded2dbSJung-uk Kim vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] 771*7bded2dbSJung-uk Kim lea `16*4`(@ptr[1]),@ptr[1] 772*7bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 773*7bded2dbSJung-uk Kim lea `16*4`(@ptr[2]),@ptr[2] 774*7bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] 775*7bded2dbSJung-uk Kim lea `16*4`(@ptr[3]),@ptr[3] 776*7bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 777*7bded2dbSJung-uk Kim vpunpckldq @Xi[2],@Xi[0],@Xi[0] 778*7bded2dbSJung-uk Kim vmovd `4*$j-16*4`($ptr_n),$t3 779*7bded2dbSJung-uk Kim vpshufb $tx,@Xi[0],@Xi[0] 780*7bded2dbSJung-uk Kim___ 781*7bded2dbSJung-uk Kim$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input 782*7bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 783*7bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 784*7bded2dbSJung-uk Kim___ 785*7bded2dbSJung-uk Kim$code.=<<___ if ($i==0 && $REG_SZ==32); 786*7bded2dbSJung-uk Kim vmovd (@ptr[0]),@Xi[0] 787*7bded2dbSJung-uk Kim lea `16*4`(@ptr[0]),@ptr[0] 788*7bded2dbSJung-uk Kim vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] 789*7bded2dbSJung-uk Kim lea `16*4`(@ptr[4]),@ptr[4] 790*7bded2dbSJung-uk Kim vmovd (@ptr[1]),$t2 791*7bded2dbSJung-uk Kim lea `16*4`(@ptr[1]),@ptr[1] 792*7bded2dbSJung-uk Kim vmovd (@ptr[5]),$t1 793*7bded2dbSJung-uk Kim lea `16*4`(@ptr[5]),@ptr[5] 794*7bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 795*7bded2dbSJung-uk Kim lea `16*4`(@ptr[2]),@ptr[2] 796*7bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] 797*7bded2dbSJung-uk Kim lea `16*4`(@ptr[6]),@ptr[6] 798*7bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[3]),$t2,$t2 799*7bded2dbSJung-uk Kim lea `16*4`(@ptr[3]),@ptr[3] 800*7bded2dbSJung-uk Kim vpunpckldq $t2,@Xi[0],@Xi[0] 801*7bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[7]),$t1,$t1 802*7bded2dbSJung-uk Kim 
lea `16*4`(@ptr[7]),@ptr[7] 803*7bded2dbSJung-uk Kim vpunpckldq $t1,@Xi[2],@Xi[2] 804*7bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 805*7bded2dbSJung-uk Kim vinserti128 @Xi[2],@Xi[0],@Xi[0] 806*7bded2dbSJung-uk Kim vmovd `4*$j-16*4`($ptr_n),$t3 807*7bded2dbSJung-uk Kim vpshufb $tx,@Xi[0],@Xi[0] 808*7bded2dbSJung-uk Kim___ 809*7bded2dbSJung-uk Kim$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input 810*7bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[1]),$t2 811*7bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[5]),$t1 812*7bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 813*7bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 814*7bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 815*7bded2dbSJung-uk Kim vpunpckldq $t2,@Xi[1],@Xi[1] 816*7bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 817*7bded2dbSJung-uk Kim vpunpckldq $t1,$t3,$t3 818*7bded2dbSJung-uk Kim___ 819*7bded2dbSJung-uk Kim$code.=<<___ if ($i<14); 820*7bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 821*7bded2dbSJung-uk Kim vpslld \$5,$a,$t2 822*7bded2dbSJung-uk Kim vpandn $d,$b,$t1 823*7bded2dbSJung-uk Kim vpand $c,$b,$t0 824*7bded2dbSJung-uk Kim 825*7bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 826*7bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 827*7bded2dbSJung-uk Kim $vpack $t3,@Xi[1],@Xi[1] 828*7bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 829*7bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 830*7bded2dbSJung-uk Kim vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] 831*7bded2dbSJung-uk Kim 832*7bded2dbSJung-uk Kim vpslld \$30,$b,$t1 833*7bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 834*7bded2dbSJung-uk Kim vmovd `4*$k-16*4`($ptr_n),$t3 835*7bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 836*7bded2dbSJung-uk Kim 837*7bded2dbSJung-uk Kim vpsrld \$2,$b,$b 838*7bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 839*7bded2dbSJung-uk Kim vpshufb $tx,@Xi[1],@Xi[1] 840*7bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 841*7bded2dbSJung-uk Kim___ 
842*7bded2dbSJung-uk Kim$code.=<<___ if ($i==14); 843*7bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 844*7bded2dbSJung-uk Kim prefetcht0 63(@ptr[0]) 845*7bded2dbSJung-uk Kim vpslld \$5,$a,$t2 846*7bded2dbSJung-uk Kim vpandn $d,$b,$t1 847*7bded2dbSJung-uk Kim vpand $c,$b,$t0 848*7bded2dbSJung-uk Kim 849*7bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 850*7bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 851*7bded2dbSJung-uk Kim $vpack $t3,@Xi[1],@Xi[1] 852*7bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 853*7bded2dbSJung-uk Kim prefetcht0 63(@ptr[1]) 854*7bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 855*7bded2dbSJung-uk Kim 856*7bded2dbSJung-uk Kim vpslld \$30,$b,$t1 857*7bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 858*7bded2dbSJung-uk Kim prefetcht0 63(@ptr[2]) 859*7bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 860*7bded2dbSJung-uk Kim 861*7bded2dbSJung-uk Kim vpsrld \$2,$b,$b 862*7bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 863*7bded2dbSJung-uk Kim prefetcht0 63(@ptr[3]) 864*7bded2dbSJung-uk Kim vpshufb $tx,@Xi[1],@Xi[1] 865*7bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 866*7bded2dbSJung-uk Kim___ 867*7bded2dbSJung-uk Kim$code.=<<___ if ($i>=13 && $i<15); 868*7bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" 869*7bded2dbSJung-uk Kim___ 870*7bded2dbSJung-uk Kim$code.=<<___ if ($i>=15); # apply Xupdate 871*7bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 872*7bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 873*7bded2dbSJung-uk Kim 874*7bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 875*7bded2dbSJung-uk Kim vpslld \$5,$a,$t2 876*7bded2dbSJung-uk Kim vpandn $d,$b,$t1 877*7bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` 878*7bded2dbSJung-uk Kim vpand $c,$b,$t0 879*7bded2dbSJung-uk Kim 880*7bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 881*7bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 882*7bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 883*7bded2dbSJung-uk Kim 
vpsrld \$27,$a,$t3 884*7bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 885*7bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 886*7bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` 887*7bded2dbSJung-uk Kim 888*7bded2dbSJung-uk Kim vpslld \$30,$b,$t1 889*7bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 890*7bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 891*7bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` 892*7bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 893*7bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 894*7bded2dbSJung-uk Kim 895*7bded2dbSJung-uk Kim vpsrld \$2,$b,$b 896*7bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` 897*7bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 898*7bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] 899*7bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 900*7bded2dbSJung-uk Kim___ 901*7bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 902*7bded2dbSJung-uk Kim} 903*7bded2dbSJung-uk Kim 904*7bded2dbSJung-uk Kimsub BODY_20_39_avx { 905*7bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 906*7bded2dbSJung-uk Kimmy $j=$i+1; 907*7bded2dbSJung-uk Kim 908*7bded2dbSJung-uk Kim$code.=<<___ if ($i<79); 909*7bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 910*7bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 911*7bded2dbSJung-uk Kim 912*7bded2dbSJung-uk Kim vpslld \$5,$a,$t2 913*7bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_20_39 914*7bded2dbSJung-uk Kim vpxor $b,$d,$t0 915*7bded2dbSJung-uk Kim___ 916*7bded2dbSJung-uk Kim$code.=<<___ if ($i<72); 917*7bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 918*7bded2dbSJung-uk Kim___ 919*7bded2dbSJung-uk Kim$code.=<<___ if ($i<79); 920*7bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 921*7bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 922*7bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 923*7bded2dbSJung-uk Kim vpxor $c,$t0,$t0 # Parity(b,c,d) 924*7bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 
925*7bded2dbSJung-uk Kim 926*7bded2dbSJung-uk Kim vpslld \$30,$b,$t1 927*7bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 928*7bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Parity(b,c,d) 929*7bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 930*7bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 931*7bded2dbSJung-uk Kim 932*7bded2dbSJung-uk Kim vpsrld \$2,$b,$b 933*7bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 934*7bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) 935*7bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 936*7bded2dbSJung-uk Kim___ 937*7bded2dbSJung-uk Kim$code.=<<___ if ($i==79); 938*7bded2dbSJung-uk Kim vpslld \$5,$a,$t2 939*7bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_20_39 940*7bded2dbSJung-uk Kim vpxor $b,$d,$t0 941*7bded2dbSJung-uk Kim 942*7bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 943*7bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 944*7bded2dbSJung-uk Kim vpxor $c,$t0,$t0 # Parity(b,c,d) 945*7bded2dbSJung-uk Kim 946*7bded2dbSJung-uk Kim vpslld \$30,$b,$t1 947*7bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 948*7bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Parity(b,c,d) 949*7bded2dbSJung-uk Kim 950*7bded2dbSJung-uk Kim vpsrld \$2,$b,$b 951*7bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 952*7bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 953*7bded2dbSJung-uk Kim___ 954*7bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 955*7bded2dbSJung-uk Kim} 956*7bded2dbSJung-uk Kim 957*7bded2dbSJung-uk Kimsub BODY_40_59_avx { 958*7bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 959*7bded2dbSJung-uk Kimmy $j=$i+1; 960*7bded2dbSJung-uk Kim 961*7bded2dbSJung-uk Kim$code.=<<___; 962*7bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 963*7bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 964*7bded2dbSJung-uk Kim 965*7bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_40_59 966*7bded2dbSJung-uk Kim vpslld \$5,$a,$t2 967*7bded2dbSJung-uk Kim vpand $c,$d,$t1 968*7bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 969*7bded2dbSJung-uk Kim 
970*7bded2dbSJung-uk Kim vpaddd $t1,$e,$e 971*7bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 972*7bded2dbSJung-uk Kim vpxor $c,$d,$t0 973*7bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 974*7bded2dbSJung-uk Kim 975*7bded2dbSJung-uk Kim vmovdqu @Xi[0],`&Xi_off($i)` 976*7bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 977*7bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 978*7bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 979*7bded2dbSJung-uk Kim vpand $b,$t0,$t0 980*7bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 981*7bded2dbSJung-uk Kim 982*7bded2dbSJung-uk Kim vpslld \$30,$b,$t1 983*7bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Maj(b,d,c) 984*7bded2dbSJung-uk Kim 985*7bded2dbSJung-uk Kim vpsrld \$2,$b,$b 986*7bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 987*7bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) 988*7bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 989*7bded2dbSJung-uk Kim___ 990*7bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 991*7bded2dbSJung-uk Kim} 992*7bded2dbSJung-uk Kim 993*7bded2dbSJung-uk Kim$code.=<<___; 994*7bded2dbSJung-uk Kim.type sha1_multi_block_avx,\@function,3 995*7bded2dbSJung-uk Kim.align 32 996*7bded2dbSJung-uk Kimsha1_multi_block_avx: 997*7bded2dbSJung-uk Kim_avx_shortcut: 998*7bded2dbSJung-uk Kim___ 999*7bded2dbSJung-uk Kim$code.=<<___ if ($avx>1); 1000*7bded2dbSJung-uk Kim shr \$32,%rcx 1001*7bded2dbSJung-uk Kim cmp \$2,$num 1002*7bded2dbSJung-uk Kim jb .Lavx 1003*7bded2dbSJung-uk Kim test \$`1<<5`,%ecx 1004*7bded2dbSJung-uk Kim jnz _avx2_shortcut 1005*7bded2dbSJung-uk Kim jmp .Lavx 1006*7bded2dbSJung-uk Kim.align 32 1007*7bded2dbSJung-uk Kim.Lavx: 1008*7bded2dbSJung-uk Kim___ 1009*7bded2dbSJung-uk Kim$code.=<<___; 1010*7bded2dbSJung-uk Kim mov %rsp,%rax 1011*7bded2dbSJung-uk Kim push %rbx 1012*7bded2dbSJung-uk Kim push %rbp 1013*7bded2dbSJung-uk Kim___ 1014*7bded2dbSJung-uk Kim$code.=<<___ if ($win64); 1015*7bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 1016*7bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 1017*7bded2dbSJung-uk Kim movaps 
%xmm7,0x10(%rsp) 1018*7bded2dbSJung-uk Kim movaps %xmm8,0x20(%rsp) 1019*7bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 1020*7bded2dbSJung-uk Kim movaps %xmm10,-0x78(%rax) 1021*7bded2dbSJung-uk Kim movaps %xmm11,-0x68(%rax) 1022*7bded2dbSJung-uk Kim movaps %xmm12,-0x58(%rax) 1023*7bded2dbSJung-uk Kim movaps %xmm13,-0x48(%rax) 1024*7bded2dbSJung-uk Kim movaps %xmm14,-0x38(%rax) 1025*7bded2dbSJung-uk Kim movaps %xmm15,-0x28(%rax) 1026*7bded2dbSJung-uk Kim___ 1027*7bded2dbSJung-uk Kim$code.=<<___; 1028*7bded2dbSJung-uk Kim sub \$`$REG_SZ*18`, %rsp 1029*7bded2dbSJung-uk Kim and \$-256,%rsp 1030*7bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1031*7bded2dbSJung-uk Kim.Lbody_avx: 1032*7bded2dbSJung-uk Kim lea K_XX_XX(%rip),$Tbl 1033*7bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 1034*7bded2dbSJung-uk Kim 1035*7bded2dbSJung-uk Kim vzeroupper 1036*7bded2dbSJung-uk Kim.Loop_grande_avx: 1037*7bded2dbSJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1038*7bded2dbSJung-uk Kim xor $num,$num 1039*7bded2dbSJung-uk Kim___ 1040*7bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) { 1041*7bded2dbSJung-uk Kim $code.=<<___; 1042*7bded2dbSJung-uk Kim mov `16*$i+0`($inp),@ptr[$i] # input pointer 1043*7bded2dbSJung-uk Kim mov `16*$i+8`($inp),%ecx # number of blocks 1044*7bded2dbSJung-uk Kim cmp $num,%ecx 1045*7bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 1046*7bded2dbSJung-uk Kim test %ecx,%ecx 1047*7bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 1048*7bded2dbSJung-uk Kim cmovle $Tbl,@ptr[$i] # cancel input 1049*7bded2dbSJung-uk Kim___ 1050*7bded2dbSJung-uk Kim} 1051*7bded2dbSJung-uk Kim$code.=<<___; 1052*7bded2dbSJung-uk Kim test $num,$num 1053*7bded2dbSJung-uk Kim jz .Ldone_avx 1054*7bded2dbSJung-uk Kim 1055*7bded2dbSJung-uk Kim vmovdqu 0x00($ctx),$A # load context 1056*7bded2dbSJung-uk Kim lea 128(%rsp),%rax 1057*7bded2dbSJung-uk Kim vmovdqu 0x20($ctx),$B 1058*7bded2dbSJung-uk Kim vmovdqu 0x40($ctx),$C 1059*7bded2dbSJung-uk Kim vmovdqu 0x60($ctx),$D 
1060*7bded2dbSJung-uk Kim vmovdqu 0x80($ctx),$E 1061*7bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 1062*7bded2dbSJung-uk Kim jmp .Loop_avx 1063*7bded2dbSJung-uk Kim 1064*7bded2dbSJung-uk Kim.align 32 1065*7bded2dbSJung-uk Kim.Loop_avx: 1066*7bded2dbSJung-uk Kim___ 1067*7bded2dbSJung-uk Kim$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 1068*7bded2dbSJung-uk Kimfor($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 1069*7bded2dbSJung-uk Kim$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 1070*7bded2dbSJung-uk Kimfor(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1071*7bded2dbSJung-uk Kim$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 1072*7bded2dbSJung-uk Kimfor(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 1073*7bded2dbSJung-uk Kim$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 1074*7bded2dbSJung-uk Kimfor(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1075*7bded2dbSJung-uk Kim$code.=<<___; 1076*7bded2dbSJung-uk Kim mov \$1,%ecx 1077*7bded2dbSJung-uk Kim___ 1078*7bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) { 1079*7bded2dbSJung-uk Kim $code.=<<___; 1080*7bded2dbSJung-uk Kim cmp `4*$i`(%rbx),%ecx # examine counters 1081*7bded2dbSJung-uk Kim cmovge $Tbl,@ptr[$i] # cancel input 1082*7bded2dbSJung-uk Kim___ 1083*7bded2dbSJung-uk Kim} 1084*7bded2dbSJung-uk Kim$code.=<<___; 1085*7bded2dbSJung-uk Kim vmovdqu (%rbx),$t0 # pull counters 1086*7bded2dbSJung-uk Kim vpxor $t2,$t2,$t2 1087*7bded2dbSJung-uk Kim vmovdqa $t0,$t1 1088*7bded2dbSJung-uk Kim vpcmpgtd $t2,$t1,$t1 # mask value 1089*7bded2dbSJung-uk Kim vpaddd $t1,$t0,$t0 # counters-- 1090*7bded2dbSJung-uk Kim 1091*7bded2dbSJung-uk Kim vpand $t1,$A,$A 1092*7bded2dbSJung-uk Kim vpand $t1,$B,$B 1093*7bded2dbSJung-uk Kim vpaddd 0x00($ctx),$A,$A 1094*7bded2dbSJung-uk Kim vpand $t1,$C,$C 1095*7bded2dbSJung-uk Kim vpaddd 0x20($ctx),$B,$B 1096*7bded2dbSJung-uk Kim vpand $t1,$D,$D 1097*7bded2dbSJung-uk Kim vpaddd 0x40($ctx),$C,$C 1098*7bded2dbSJung-uk Kim vpand $t1,$E,$E 
1099*7bded2dbSJung-uk Kim vpaddd 0x60($ctx),$D,$D 1100*7bded2dbSJung-uk Kim vpaddd 0x80($ctx),$E,$E 1101*7bded2dbSJung-uk Kim vmovdqu $A,0x00($ctx) 1102*7bded2dbSJung-uk Kim vmovdqu $B,0x20($ctx) 1103*7bded2dbSJung-uk Kim vmovdqu $C,0x40($ctx) 1104*7bded2dbSJung-uk Kim vmovdqu $D,0x60($ctx) 1105*7bded2dbSJung-uk Kim vmovdqu $E,0x80($ctx) 1106*7bded2dbSJung-uk Kim 1107*7bded2dbSJung-uk Kim vmovdqu $t0,(%rbx) # save counters 1108*7bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 1109*7bded2dbSJung-uk Kim dec $num 1110*7bded2dbSJung-uk Kim jnz .Loop_avx 1111*7bded2dbSJung-uk Kim 1112*7bded2dbSJung-uk Kim mov `$REG_SZ*17+8`(%rsp),$num 1113*7bded2dbSJung-uk Kim lea $REG_SZ($ctx),$ctx 1114*7bded2dbSJung-uk Kim lea `16*$REG_SZ/4`($inp),$inp 1115*7bded2dbSJung-uk Kim dec $num 1116*7bded2dbSJung-uk Kim jnz .Loop_grande_avx 1117*7bded2dbSJung-uk Kim 1118*7bded2dbSJung-uk Kim.Ldone_avx: 1119*7bded2dbSJung-uk Kim mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1120*7bded2dbSJung-uk Kim vzeroupper 1121*7bded2dbSJung-uk Kim___ 1122*7bded2dbSJung-uk Kim$code.=<<___ if ($win64); 1123*7bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm6 1124*7bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm7 1125*7bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm8 1126*7bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm9 1127*7bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm10 1128*7bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm11 1129*7bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm12 1130*7bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm13 1131*7bded2dbSJung-uk Kim movaps -0x38(%rax),%xmm14 1132*7bded2dbSJung-uk Kim movaps -0x28(%rax),%xmm15 1133*7bded2dbSJung-uk Kim___ 1134*7bded2dbSJung-uk Kim$code.=<<___; 1135*7bded2dbSJung-uk Kim mov -16(%rax),%rbp 1136*7bded2dbSJung-uk Kim mov -8(%rax),%rbx 1137*7bded2dbSJung-uk Kim lea (%rax),%rsp 1138*7bded2dbSJung-uk Kim.Lepilogue_avx: 1139*7bded2dbSJung-uk Kim ret 1140*7bded2dbSJung-uk Kim.size sha1_multi_block_avx,.-sha1_multi_block_avx 1141*7bded2dbSJung-uk Kim___ 1142*7bded2dbSJung-uk Kim
1143*7bded2dbSJung-uk Kim if ($avx>1) { 1144*7bded2dbSJung-uk Kim$code =~ s/\`([^\`]*)\`/eval $1/gem; 1145*7bded2dbSJung-uk Kim 1146*7bded2dbSJung-uk Kim$REG_SZ=32; 1147*7bded2dbSJung-uk Kim 1148*7bded2dbSJung-uk Kim@ptr=map("%r$_",(12..15,8..11)); 1149*7bded2dbSJung-uk Kim 1150*7bded2dbSJung-uk Kim@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); 1151*7bded2dbSJung-uk Kim($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); 1152*7bded2dbSJung-uk Kim@Xi=map("%ymm$_",(10..14)); 1153*7bded2dbSJung-uk Kim$K="%ymm15"; 1154*7bded2dbSJung-uk Kim 1155*7bded2dbSJung-uk Kim$code.=<<___; 1156*7bded2dbSJung-uk Kim.type sha1_multi_block_avx2,\@function,3 1157*7bded2dbSJung-uk Kim.align 32 1158*7bded2dbSJung-uk Kimsha1_multi_block_avx2: 1159*7bded2dbSJung-uk Kim_avx2_shortcut: 1160*7bded2dbSJung-uk Kim mov %rsp,%rax 1161*7bded2dbSJung-uk Kim push %rbx 1162*7bded2dbSJung-uk Kim push %rbp 1163*7bded2dbSJung-uk Kim push %r12 1164*7bded2dbSJung-uk Kim push %r13 1165*7bded2dbSJung-uk Kim push %r14 1166*7bded2dbSJung-uk Kim push %r15 1167*7bded2dbSJung-uk Kim___ 1168*7bded2dbSJung-uk Kim$code.=<<___ if ($win64); 1169*7bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 1170*7bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 1171*7bded2dbSJung-uk Kim movaps %xmm7,0x10(%rsp) 1172*7bded2dbSJung-uk Kim movaps %xmm8,0x20(%rsp) 1173*7bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 1174*7bded2dbSJung-uk Kim movaps %xmm10,0x40(%rsp) 1175*7bded2dbSJung-uk Kim movaps %xmm11,0x50(%rsp) 1176*7bded2dbSJung-uk Kim movaps %xmm12,-0x78(%rax) 1177*7bded2dbSJung-uk Kim movaps %xmm13,-0x68(%rax) 1178*7bded2dbSJung-uk Kim movaps %xmm14,-0x58(%rax) 1179*7bded2dbSJung-uk Kim movaps %xmm15,-0x48(%rax) 1180*7bded2dbSJung-uk Kim___ 1181*7bded2dbSJung-uk Kim$code.=<<___; 1182*7bded2dbSJung-uk Kim sub \$`$REG_SZ*18`, %rsp 1183*7bded2dbSJung-uk Kim and \$-256,%rsp 1184*7bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1185*7bded2dbSJung-uk Kim.Lbody_avx2: 1186*7bded2dbSJung-uk Kim lea K_XX_XX(%rip),$Tbl 1187*7bded2dbSJung-uk Kim shr 
\$1,$num 1188*7bded2dbSJung-uk Kim 1189*7bded2dbSJung-uk Kim vzeroupper 1190*7bded2dbSJung-uk Kim.Loop_grande_avx2: 1191*7bded2dbSJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1192*7bded2dbSJung-uk Kim xor $num,$num 1193*7bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 1194*7bded2dbSJung-uk Kim___ 1195*7bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 1196*7bded2dbSJung-uk Kim $code.=<<___; 1197*7bded2dbSJung-uk Kim mov `16*$i+0`($inp),@ptr[$i] # input pointer 1198*7bded2dbSJung-uk Kim mov `16*$i+8`($inp),%ecx # number of blocks 1199*7bded2dbSJung-uk Kim cmp $num,%ecx 1200*7bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 1201*7bded2dbSJung-uk Kim test %ecx,%ecx 1202*7bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 1203*7bded2dbSJung-uk Kim cmovle $Tbl,@ptr[$i] # cancel input 1204*7bded2dbSJung-uk Kim___ 1205*7bded2dbSJung-uk Kim} 1206*7bded2dbSJung-uk Kim$code.=<<___; 1207*7bded2dbSJung-uk Kim vmovdqu 0x00($ctx),$A # load context 1208*7bded2dbSJung-uk Kim lea 128(%rsp),%rax 1209*7bded2dbSJung-uk Kim vmovdqu 0x20($ctx),$B 1210*7bded2dbSJung-uk Kim lea 256+128(%rsp),%rbx 1211*7bded2dbSJung-uk Kim vmovdqu 0x40($ctx),$C 1212*7bded2dbSJung-uk Kim vmovdqu 0x60($ctx),$D 1213*7bded2dbSJung-uk Kim vmovdqu 0x80($ctx),$E 1214*7bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 1215*7bded2dbSJung-uk Kim jmp .Loop_avx2 1216*7bded2dbSJung-uk Kim 1217*7bded2dbSJung-uk Kim.align 32 1218*7bded2dbSJung-uk Kim.Loop_avx2: 1219*7bded2dbSJung-uk Kim___ 1220*7bded2dbSJung-uk Kim$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 1221*7bded2dbSJung-uk Kimfor($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 1222*7bded2dbSJung-uk Kim$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 1223*7bded2dbSJung-uk Kimfor(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1224*7bded2dbSJung-uk Kim$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 1225*7bded2dbSJung-uk Kimfor(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 1226*7bded2dbSJung-uk 
Kim$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 1227*7bded2dbSJung-uk Kimfor(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 1228*7bded2dbSJung-uk Kim$code.=<<___; 1229*7bded2dbSJung-uk Kim mov \$1,%ecx 1230*7bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 1231*7bded2dbSJung-uk Kim___ 1232*7bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 1233*7bded2dbSJung-uk Kim $code.=<<___; 1234*7bded2dbSJung-uk Kim cmp `4*$i`(%rbx),%ecx # examine counters 1235*7bded2dbSJung-uk Kim cmovge $Tbl,@ptr[$i] # cancel input 1236*7bded2dbSJung-uk Kim___ 1237*7bded2dbSJung-uk Kim} 1238*7bded2dbSJung-uk Kim$code.=<<___; 1239*7bded2dbSJung-uk Kim vmovdqu (%rbx),$t0 # pull counters 1240*7bded2dbSJung-uk Kim vpxor $t2,$t2,$t2 1241*7bded2dbSJung-uk Kim vmovdqa $t0,$t1 1242*7bded2dbSJung-uk Kim vpcmpgtd $t2,$t1,$t1 # mask value 1243*7bded2dbSJung-uk Kim vpaddd $t1,$t0,$t0 # counters-- 1244*7bded2dbSJung-uk Kim 1245*7bded2dbSJung-uk Kim vpand $t1,$A,$A 1246*7bded2dbSJung-uk Kim vpand $t1,$B,$B 1247*7bded2dbSJung-uk Kim vpaddd 0x00($ctx),$A,$A 1248*7bded2dbSJung-uk Kim vpand $t1,$C,$C 1249*7bded2dbSJung-uk Kim vpaddd 0x20($ctx),$B,$B 1250*7bded2dbSJung-uk Kim vpand $t1,$D,$D 1251*7bded2dbSJung-uk Kim vpaddd 0x40($ctx),$C,$C 1252*7bded2dbSJung-uk Kim vpand $t1,$E,$E 1253*7bded2dbSJung-uk Kim vpaddd 0x60($ctx),$D,$D 1254*7bded2dbSJung-uk Kim vpaddd 0x80($ctx),$E,$E 1255*7bded2dbSJung-uk Kim vmovdqu $A,0x00($ctx) 1256*7bded2dbSJung-uk Kim vmovdqu $B,0x20($ctx) 1257*7bded2dbSJung-uk Kim vmovdqu $C,0x40($ctx) 1258*7bded2dbSJung-uk Kim vmovdqu $D,0x60($ctx) 1259*7bded2dbSJung-uk Kim vmovdqu $E,0x80($ctx) 1260*7bded2dbSJung-uk Kim 1261*7bded2dbSJung-uk Kim vmovdqu $t0,(%rbx) # save counters 1262*7bded2dbSJung-uk Kim lea 256+128(%rsp),%rbx 1263*7bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 1264*7bded2dbSJung-uk Kim dec $num 1265*7bded2dbSJung-uk Kim jnz .Loop_avx2 1266*7bded2dbSJung-uk Kim 1267*7bded2dbSJung-uk Kim #mov `$REG_SZ*17+8`(%rsp),$num 1268*7bded2dbSJung-uk Kim #lea 
$REG_SZ($ctx),$ctx 1269*7bded2dbSJung-uk Kim #lea `16*$REG_SZ/4`($inp),$inp 1270*7bded2dbSJung-uk Kim #dec $num 1271*7bded2dbSJung-uk Kim #jnz .Loop_grande_avx2 1272*7bded2dbSJung-uk Kim 1273*7bded2dbSJung-uk Kim.Ldone_avx2: 1274*7bded2dbSJung-uk Kim mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1275*7bded2dbSJung-uk Kim vzeroupper 1276*7bded2dbSJung-uk Kim___ 1277*7bded2dbSJung-uk Kim$code.=<<___ if ($win64); 1278*7bded2dbSJung-uk Kim movaps -0xd8(%rax),%xmm6 1279*7bded2dbSJung-uk Kim movaps -0xc8(%rax),%xmm7 1280*7bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm8 1281*7bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm9 1282*7bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm10 1283*7bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm11 1284*7bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm12 1285*7bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm13 1286*7bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm14 1287*7bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm15 1288*7bded2dbSJung-uk Kim___ 1289*7bded2dbSJung-uk Kim$code.=<<___; 1290*7bded2dbSJung-uk Kim mov -48(%rax),%r15 1291*7bded2dbSJung-uk Kim mov -40(%rax),%r14 1292*7bded2dbSJung-uk Kim mov -32(%rax),%r13 1293*7bded2dbSJung-uk Kim mov -24(%rax),%r12 1294*7bded2dbSJung-uk Kim mov -16(%rax),%rbp 1295*7bded2dbSJung-uk Kim mov -8(%rax),%rbx 1296*7bded2dbSJung-uk Kim lea (%rax),%rsp 1297*7bded2dbSJung-uk Kim.Lepilogue_avx2: 1298*7bded2dbSJung-uk Kim ret 1299*7bded2dbSJung-uk Kim.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 1300*7bded2dbSJung-uk Kim___ 1301*7bded2dbSJung-uk Kim } }}} 1302*7bded2dbSJung-uk Kim$code.=<<___; 1303*7bded2dbSJung-uk Kim 1304*7bded2dbSJung-uk Kim.align 256 1305*7bded2dbSJung-uk Kim .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1306*7bded2dbSJung-uk Kim .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1307*7bded2dbSJung-uk KimK_XX_XX: 1308*7bded2dbSJung-uk Kim .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 1309*7bded2dbSJung-uk Kim .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Win64 structured exception handling support. Two language-specific
# handlers are emitted, followed by the .pdata/.xdata records that bind
# them to the generated functions. Each .xdata record carries two RVAs
# (HandlerData[0] and HandlerData[1]): the end-of-prologue label and the
# epilogue label of the function it describes.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: when context->Rip lies between HandlerData[0] and
# HandlerData[1] it fetches the saved stack pointer from the frame,
# restores Rbx/Rbp and copies 20 quadwords into the context starting at
# Xmm6. From .Lin_prologue on it restores Rsp/Rsi/Rdi, copies the context
# to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

# avx2_handler: same scheme as se_handler, but it also restores R12-R15
# and then jumps into se_handler's .Lin_prologue tail.
#
# The saved stack pointer is read relative to context->Rsp (held in %rax
# at that point), matching the AVX2 epilogue which reloads it from
# `$REG_SZ*17`(%rsp). It used to be read from `32*17`($context), i.e. from
# inside the CONTEXT record's XMM area, which is not a stack pointer.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___

# .pdata: one (begin, end, info) RVA triplet per generated function; the
# AVX and AVX2 entries are emitted only when those code paths were
# generated.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___

# .xdata: handler selection plus the two labels each handler compares
# context->Rip against.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################
# Hand-encoders that turn SHA extension mnemonics into `.byte` sequences
# (presumably so the output still assembles where the assembler does not
# know these mnemonics), followed by the post-processing pass that prints
# the generated code.

# rex(\@opcode,$dst,$src)
# Prepends a REX prefix byte to the array referenced by the first
# argument when either register number is 8 or above: bit 0x04 is set for
# $dst (the ModR/M reg field), bit 0x01 for $src (the ModR/M r/m field).
# The array is left untouched when both numbers are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if ($dst>=8);
    $rex|=0x01 if ($src>=8);
    unshift @$opcode,$rex|0x40 if ($rex);
}

# sha1rnds4($operands)
# $operands is the operand text of a sha1rnds4 instruction. For the form
# "\$imm,%xmmS,%xmmD" returns the instruction as a ".byte" directive;
# any other operand form is returned as the plain mnemonic followed by
# the operands unchanged.
sub sha1rnds4 {
    my ($operands)=@_;

    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures before anything else runs a regex.
	my ($imm,$src,$dst)=($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);

	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# Immediates with a leading 0 (0x.., 0..) go through oct().
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$operands;
    }
}

# sha1op38($instr,$operands)
# Encodes the two-operand instructions sha1nexte, sha1msg1 and sha1msg2
# (opcode bytes 0x0f,0x38,<opcodelet>) for the form "%xmmS,%xmmD" as a
# ".byte" directive. Unknown mnemonics or other operand forms are
# returned as "$instr\t$operands" unchanged.
sub sha1op38 {
    my ($instr,$operands)=@_;
    my %opcodelet = (
	"sha1nexte" => 0xc8,
	"sha1msg1"  => 0xc9,
	"sha1msg2"  => 0xca );

    if (defined($opcodelet{$instr})
	&& $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($src,$dst)=($1,$2);
	my @opcode=(0x0f,0x38);

	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
}

# Post-process the accumulated code line by line: evaluate `...`
# expressions, hand-encode SHA instructions, and rewrite %ymm operands to
# %xmm for the instructions listed below. At most one of the rewrites in
# the `or` chain is applied to a line.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;

    s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge		or
    s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge		or

    s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g	or
    s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/g	or
    s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/g	or
    s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g	or
    s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/g		or
    s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/g;

    print $_,"\n";
}

# STDOUT is the pipe to the xlate script opened at the top of the file;
# a failing close means the translated output is incomplete or the
# translator exited with an error, so do not finish silently.
close STDOUT or die "error closing STDOUT: $! (status $?)";