#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA1 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha1	aesni-sha1	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
# Skylake	(8.70	+5.00=13.7)/n	3.64	4.20		+145%
# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 8.00+4.44=12.4;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 30% to 100% (on Haswell);

# Command line is "[flavour] output".  A first argument that contains a
# dot is taken to be the output file name, i.e. no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by a nasm/masm/mingw64 flavour or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the translator either next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx is an assembler capability level.  Each probe below yields 0, 1
# or 2 depending on the reported tool version; any non-zero value enables
# the _avx_shortcut dispatch and the "if ($avx)" section of this file.
# NOTE(review): level 2 presumably gates AVX2 code further down the
# file - confirm against the part of the script not shown here.
$avx=0;

# GNU assembler, queried through $CC.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# NASM, Win64 only.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# MASM (ml64), Win64 only.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang/LLVM, queried through $CC.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Redirect STDOUT into the translator.  Fail loudly if the pipe cannot be
# set up instead of silently carrying on without a translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha1_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
@Xi=map("%xmm$_",(10..14));
$K="%xmm15";

# The assignment below is unconditional and overrides the one above.
if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty]
    @Xi=map("%xmm$_",(0..4));
    ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
    @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
}

$REG_SZ=16;

# Xi_off(index) - return the memory operand for slot "index" of the
# 16-entry circular X[] buffer.  The byte offset is (index mod 16) *
# $REG_SZ; offsets below 256 are expressed relative to %rax with a -128
# bias, the remaining ones relative to %rbx with a -256-128 bias.  The
# arithmetic is left in the returned string, not evaluated here.
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

# BODY_00_19(i,a,b,c,d,e) - append the code for round i (0..19) to
# $code.  $a..$e name the registers holding the working variables for
# this round.  Rounds 0..14 also gather and byte-swap input words from
# the four @ptr streams; rounds 15 and up apply Xupdate instead.  On
# return @Xi has been rotated by one position.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;

# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
# of 4 words you would expect to be loaded per given iteration one is
# spilled to next iteration. In other words indices in four input
# streams are distributed as following:
#
# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
# $i==1:	2,3,3,3,
# $i==2:	3,4,4,4,
# ...
# $i==13:	14,15,15,15,
# $i==14:	15
#
# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
	movd	(@ptr[0]),@Xi[0]
	lea	`16*4`(@ptr[0]),@ptr[0]
	movd	(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	lea	`16*4`(@ptr[1]),@ptr[1]
	movd	(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	lea	`16*4`(@ptr[2]),@ptr[2]
	movd	(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	lea	`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	movd	`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	movd	`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	movd	`4*$j-16*4`(@ptr[2]),$t2
	pshufb	$tx,@Xi[0]
___
$code.=<<___ if ($i<14);	# just load input
	movd	`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e	# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e	# e+=X[i]
	movd	`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0	# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2	# rol(a,5)
	movd	`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e	# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e	# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	movd	`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b	# b=rol(b,30)
___
$code.=<<___ if ($i==14);	# just load input
	movd	`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e	# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e	# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0	# Ch(b,c,d)
	movdqa	$b,$t1
	prefetcht0	63(@ptr[1])

	por	$t3,$t2	# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e	# e+=Ch(b,c,d)
	prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e	# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	prefetcht0	63(@ptr[3])
	por	$t1,$b	# b=rol(b,30)
___
$code.=<<___ if ($i>=13 && $i<15);
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# preload "X[2]"
___
$code.=<<___ if ($i>=15);	# apply Xupdate
	pxor	@Xi[-2],@Xi[1]	# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	movdqa	$a,$t2
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e	# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	psrld	\$31,$tx
	paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e	# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0	# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2	# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e	# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e	# e+=rol(a,5)
	por	$tx,@Xi[1]	# rol	\$1,@Xi[1]
	por	$t1,$b	# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# BODY_20_39(i,a,b,c,d,e) - append the code for a Parity(b,c,d) round to
# $code.  Rounds below 79 also compute the next X[] word (Xupdate);
# X[i] is written back to the circular buffer only for i<72; round 79
# emits the bare round with no Xupdate.  On return @Xi has been rotated
# by one position.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___ if ($i<79);
	pxor	@Xi[-2],@Xi[1]	# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e	# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
$code.=<<___ if ($i<72);
	movdqa	@Xi[0],`&Xi_off($i)`
___
$code.=<<___ if ($i<79);
	paddd	@Xi[0],$e	# e+=X[i]
	pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0	# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	movdqa	@Xi[1],$tx
	por	$t3,$t2	# rol(a,5)
	psrld	\$31,$tx
	paddd	$t0,$e	# e+=Parity(b,c,d)
	paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e	# e+=rol(a,5)
	por	$tx,@Xi[1]	# rol(@Xi[1],1)
	por	$t1,$b	# b=rol(b,30)
___
$code.=<<___ if ($i==79);
	movdqa	$a,$t2
	paddd	$K,$e	# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e	# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0	# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2	# rol(a,5)
	paddd	$t0,$e	# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e	# e+=rol(a,5)
	por	$t1,$b	# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# BODY_40_59(i,a,b,c,d,e) - append the code for a Maj round to $code,
# including Xupdate for the next X[] word and the write-back of X[i].
# On return @Xi has been rotated by one position.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___;
	pxor	@Xi[-2],@Xi[1]	# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e	# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e	# e+=X[i]
	por	$t3,$t2	# rol(a,5)
	psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e	# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e	# e+=rol(a,5)
	por	$tx,@Xi[1]	# rol(@Xi[1],1)
	por	$t1,$b	# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# ---------------------------------------------------------------------
# sha1_multi_block: 4-lane SSE code path.
#
# Entry dispatch: bit 61 of the 64-bit word loaded from
# OPENSSL_ia32cap_P+4 selects the SHA-extension path; when $avx is set,
# bit 28 of its low half selects the AVX path.
# ---------------------------------------------------------------------
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Prologue.  Each push is annotated with a .cfi_push naming the register
# that was just pushed, so that the unwind information records both
# %rbx and %rbp (the epilogue restores both).
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: spill %xmm6-%xmm15 below the two pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame: $REG_SZ*18 bytes, aligned down to 256.  The original %rsp is
# kept at $REG_SZ*17, the outer-loop $num at $REG_SZ*17+8, and %rbx
# points at the per-lane block counters at $REG_SZ*16.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane: fetch the input pointer and block count, keep the maximum
# count in $num, store the count in the lane's counter slot, and point
# lanes whose count is <= 0 at $Tbl instead of their input.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___
# 80 rounds.  @V is rotated after every round so that each BODY_* call
# sees the working variables in a..e order; $K is reloaded from $Tbl at
# every 20-round boundary.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# End of block: decrement the per-lane counters, mask the result with
# the "counter > 0" mask, add the previous context and store it back.
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
# Win64 only: reload %xmm6-%xmm15 from where the prologue spilled them.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_multi_block,.-sha1_multi_block
___
						{{{
# ---------------------------------------------------------------------
# sha1_multi_block_shaext: SHA-extension code path, two lanes per pass.
# The lexicals below alias the xmm registers used by this path only.
# ---------------------------------------------------------------------
my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(11..14));

$code.=<<___;
.type	sha1_multi_block_shaext,\@function,3
.align	32
sha1_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: spill %xmm6-%xmm15 below the two pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Same frame layout as the SSE path; $num is doubled because each outer
# iteration handles a pair of lanes, and $ctx is biased by 0x40.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x40($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	movdqa	K_XX_XX+0x80(%rip),$BSWAP	# byte-n-word swap

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane (two of them): as in the SSE path, except that a lane with a
# count <= 0 has its pointer replaced by %rsp rather than $Tbl.
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq	0x00-0x40($ctx),$ABCD0	# a1.a0
	movq	0x20-0x40($ctx),@MSG0[0]# b1.b0
	movq	0x40-0x40($ctx),@MSG0[1]# c1.c0
	movq	0x60-0x40($ctx),@MSG0[2]# d1.d0
	movq	0x80-0x40($ctx),@MSG0[3]# e1.e0

	punpckldq	@MSG0[0],$ABCD0		# b1.a1.b0.a0
	punpckldq	@MSG0[2],@MSG0[1]	# d1.c1.d0.c0

	movdqa	$ABCD0,$ABCD1
	punpcklqdq	@MSG0[1],$ABCD0		# d0.c0.b0.a0
	punpckhqdq	@MSG0[1],$ABCD1		# d1.c1.b1.a1

	pshufd	\$0b00111111,@MSG0[3],$E0
	pshufd	\$0b01111111,@MSG0[3],$E1
	pshufd	\$0b00011011,$ABCD0,$ABCD0
	pshufd	\$0b00011011,$ABCD1,$ABCD1
	jmp	.Loop_shaext

.align	32
.Loop_shaext:
	movdqu	0x00(@ptr[0]),@MSG0[0]
	movdqu	0x00(@ptr[1]),@MSG1[0]
	movdqu	0x10(@ptr[0]),@MSG0[1]
	movdqu	0x10(@ptr[1]),@MSG1[1]
	movdqu	0x20(@ptr[0]),@MSG0[2]
	pshufb	$BSWAP,@MSG0[0]
	movdqu	0x20(@ptr[1]),@MSG1[2]
	pshufb	$BSWAP,@MSG1[0]
	movdqu	0x30(@ptr[0]),@MSG0[3]
	lea	0x40(@ptr[0]),@ptr[0]
	pshufb	$BSWAP,@MSG0[1]
	movdqu	0x30(@ptr[1]),@MSG1[3]
	lea	0x40(@ptr[1]),@ptr[1]
	pshufb	$BSWAP,@MSG1[1]

	movdqa	$E0,0x50(%rsp)		# offload
	paddd	@MSG0[0],$E0
	movdqa	$E1,0x70(%rsp)
	paddd	@MSG1[0],$E1
	movdqa	$ABCD0,0x40(%rsp)	# offload
	movdqa	$ABCD0,$E0_
	movdqa	$ABCD1,0x60(%rsp)
	movdqa	$ABCD1,$E1_
	sha1rnds4	\$0,$E0,$ABCD0		# 0-3
	sha1nexte	@MSG0[1],$E0_
	sha1rnds4	\$0,$E1,$ABCD1		# 0-3
	sha1nexte	@MSG1[1],$E1_
	pshufb	$BSWAP,@MSG0[2]
	prefetcht0	127(@ptr[0])
	sha1msg1	@MSG0[1],@MSG0[0]
	pshufb	$BSWAP,@MSG1[2]
	prefetcht0	127(@ptr[1])
	sha1msg1	@MSG1[1],@MSG1[0]

	pshufb	$BSWAP,@MSG0[3]
	movdqa	$ABCD0,$E0
	pshufb	$BSWAP,@MSG1[3]
	movdqa	$ABCD1,$E1
	sha1rnds4	\$0,$E0_,$ABCD0		# 4-7
	sha1nexte	@MSG0[2],$E0
	sha1rnds4	\$0,$E1_,$ABCD1		# 4-7
	sha1nexte	@MSG1[2],$E1
	pxor	@MSG0[2],@MSG0[0]
	sha1msg1	@MSG0[2],@MSG0[1]
	pxor	@MSG1[2],@MSG1[0]
	sha1msg1	@MSG1[2],@MSG1[1]
___
# Four-round groups 2..15.  The round-function selector is int($i/5);
# after each group the E/E_ pairs are swapped and the message register
# lists are rotated, so the same template serves every group.
for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa	$ABCD0,$E0_
	movdqa	$ABCD1,$E1_
	sha1rnds4	\$`int($i/5)`,$E0,$ABCD0	# 8-11
	sha1nexte	@MSG0[3],$E0_
	sha1rnds4	\$`int($i/5)`,$E1,$ABCD1	# 8-11
	sha1nexte	@MSG1[3],$E1_
	sha1msg2	@MSG0[3],@MSG0[0]
	sha1msg2	@MSG1[3],@MSG1[0]
	pxor	@MSG0[3],@MSG0[1]
	sha1msg1	@MSG0[3],@MSG0[2]
	pxor	@MSG1[3],@MSG1[1]
	sha1msg1	@MSG1[3],@MSG1[2]
___
	($E0,$E0_)=($E0_,$E0);		($E1,$E1_)=($E1_,$E1);
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
# Last four groups (rounds 64-79), interleaved with the per-lane counter
# handling, followed by the masked context update and write-back.
$code.=<<___;
	movdqa	$ABCD0,$E0_
	movdqa	$ABCD1,$E1_
	sha1rnds4	\$3,$E0,$ABCD0		# 64-67
	sha1nexte	@MSG0[3],$E0_
	sha1rnds4	\$3,$E1,$ABCD1		# 64-67
	sha1nexte	@MSG1[3],$E1_
	sha1msg2	@MSG0[3],@MSG0[0]
	sha1msg2	@MSG1[3],@MSG1[0]
	pxor	@MSG0[3],@MSG0[1]
	pxor	@MSG1[3],@MSG1[1]

	mov	\$1,%ecx
	pxor	@MSG0[2],@MSG0[2]		# zero
	cmp	4*0(%rbx),%ecx			# examine counters
	cmovge	%rsp,@ptr[0]			# cancel input

	movdqa	$ABCD0,$E0
	movdqa	$ABCD1,$E1
	sha1rnds4	\$3,$E0_,$ABCD0		# 68-71
	sha1nexte	@MSG0[0],$E0
	sha1rnds4	\$3,$E1_,$ABCD1		# 68-71
	sha1nexte	@MSG1[0],$E1
	sha1msg2	@MSG0[0],@MSG0[1]
	sha1msg2	@MSG1[0],@MSG1[1]

	cmp	4*1(%rbx),%ecx
	cmovge	%rsp,@ptr[1]
	movq	(%rbx),@MSG0[0]		# pull counters

	movdqa	$ABCD0,$E0_
	movdqa	$ABCD1,$E1_
	sha1rnds4	\$3,$E0,$ABCD0		# 72-75
	sha1nexte	@MSG0[1],$E0_
	sha1rnds4	\$3,$E1,$ABCD1		# 72-75
	sha1nexte	@MSG1[1],$E1_

	pshufd	\$0x00,@MSG0[0],@MSG1[2]
	pshufd	\$0x55,@MSG0[0],@MSG1[3]
	movdqa	@MSG0[0],@MSG0[1]
	pcmpgtd	@MSG0[2],@MSG1[2]
	pcmpgtd	@MSG0[2],@MSG1[3]

	movdqa	$ABCD0,$E0
	movdqa	$ABCD1,$E1
	sha1rnds4	\$3,$E0_,$ABCD0		# 76-79
	sha1nexte	$MSG0[2],$E0
	sha1rnds4	\$3,$E1_,$ABCD1		# 76-79
	sha1nexte	$MSG0[2],$E1

	pcmpgtd	@MSG0[2],@MSG0[1]		# counter mask
	pand	@MSG1[2],$ABCD0
	pand	@MSG1[2],$E0
	pand	@MSG1[3],$ABCD1
	pand	@MSG1[3],$E1
	paddd	@MSG0[1],@MSG0[0]		# counters--

	paddd	0x40(%rsp),$ABCD0
	paddd	0x50(%rsp),$E0
	paddd	0x60(%rsp),$ABCD1
	paddd	0x70(%rsp),$E1

	movq	@MSG0[0],(%rbx)		# save counters
	dec	$num
	jnz	.Loop_shaext

	mov	`$REG_SZ*17+8`(%rsp),$num

	pshufd	\$0b00011011,$ABCD0,$ABCD0
	pshufd	\$0b00011011,$ABCD1,$ABCD1

	movdqa	$ABCD0,@MSG0[0]
	punpckldq	$ABCD1,$ABCD0		# b1.b0.a1.a0
	punpckhdq	$ABCD1,@MSG0[0]		# d1.d0.c1.c0
	punpckhdq	$E1,$E0			# e1.e0.xx.xx
	movq	$ABCD0,0x00-0x40($ctx)	# a1.a0
	psrldq	\$8,$ABCD0
	movq	@MSG0[0],0x40-0x40($ctx)# c1.c0
	psrldq	\$8,@MSG0[0]
	movq	$ABCD0,0x20-0x40($ctx)	# b1.b0
	psrldq	\$8,$E0
	movq	@MSG0[0],0x60-0x40($ctx)# d1.d0
	movq	$E0,0x80-0x40($ctx)	# e1.e0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
# Win64 only: reload %xmm6-%xmm15.  The reload of %rax above is left
# commented out: nothing emitted for this path writes %rax after the
# prologue, so it still holds the original %rsp.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
___
						}}}

						if ($avx) {{{
sub BODY_00_19_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;
my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
my $ptr_n = $REG_SZ==16 ? 
@ptr[1] : @ptr[4]; 7927bded2dbSJung-uk Kim 7937bded2dbSJung-uk Kim$code.=<<___ if ($i==0 && $REG_SZ==16); 7947bded2dbSJung-uk Kim vmovd (@ptr[0]),@Xi[0] 7957bded2dbSJung-uk Kim lea `16*4`(@ptr[0]),@ptr[0] 7967bded2dbSJung-uk Kim vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] 7977bded2dbSJung-uk Kim lea `16*4`(@ptr[1]),@ptr[1] 7987bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 7997bded2dbSJung-uk Kim lea `16*4`(@ptr[2]),@ptr[2] 8007bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] 8017bded2dbSJung-uk Kim lea `16*4`(@ptr[3]),@ptr[3] 8027bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 8037bded2dbSJung-uk Kim vpunpckldq @Xi[2],@Xi[0],@Xi[0] 8047bded2dbSJung-uk Kim vmovd `4*$j-16*4`($ptr_n),$t3 8057bded2dbSJung-uk Kim vpshufb $tx,@Xi[0],@Xi[0] 8067bded2dbSJung-uk Kim___ 8077bded2dbSJung-uk Kim$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input 8087bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 8097bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 8107bded2dbSJung-uk Kim___ 8117bded2dbSJung-uk Kim$code.=<<___ if ($i==0 && $REG_SZ==32); 8127bded2dbSJung-uk Kim vmovd (@ptr[0]),@Xi[0] 8137bded2dbSJung-uk Kim lea `16*4`(@ptr[0]),@ptr[0] 8147bded2dbSJung-uk Kim vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] 8157bded2dbSJung-uk Kim lea `16*4`(@ptr[4]),@ptr[4] 8167bded2dbSJung-uk Kim vmovd (@ptr[1]),$t2 8177bded2dbSJung-uk Kim lea `16*4`(@ptr[1]),@ptr[1] 8187bded2dbSJung-uk Kim vmovd (@ptr[5]),$t1 8197bded2dbSJung-uk Kim lea `16*4`(@ptr[5]),@ptr[5] 8207bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 8217bded2dbSJung-uk Kim lea `16*4`(@ptr[2]),@ptr[2] 8227bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] 8237bded2dbSJung-uk Kim lea `16*4`(@ptr[6]),@ptr[6] 8247bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[3]),$t2,$t2 8257bded2dbSJung-uk Kim lea `16*4`(@ptr[3]),@ptr[3] 8267bded2dbSJung-uk Kim vpunpckldq $t2,@Xi[0],@Xi[0] 8277bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[7]),$t1,$t1 8287bded2dbSJung-uk Kim lea `16*4`(@ptr[7]),@ptr[7] 
8297bded2dbSJung-uk Kim vpunpckldq $t1,@Xi[2],@Xi[2] 8307bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 8317bded2dbSJung-uk Kim vinserti128 @Xi[2],@Xi[0],@Xi[0] 8327bded2dbSJung-uk Kim vmovd `4*$j-16*4`($ptr_n),$t3 8337bded2dbSJung-uk Kim vpshufb $tx,@Xi[0],@Xi[0] 8347bded2dbSJung-uk Kim___ 8357bded2dbSJung-uk Kim$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input 8367bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[1]),$t2 8377bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[5]),$t1 8387bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 8397bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 8407bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 8417bded2dbSJung-uk Kim vpunpckldq $t2,@Xi[1],@Xi[1] 8427bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 8437bded2dbSJung-uk Kim vpunpckldq $t1,$t3,$t3 8447bded2dbSJung-uk Kim___ 8457bded2dbSJung-uk Kim$code.=<<___ if ($i<14); 8467bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 8477bded2dbSJung-uk Kim vpslld \$5,$a,$t2 8487bded2dbSJung-uk Kim vpandn $d,$b,$t1 8497bded2dbSJung-uk Kim vpand $c,$b,$t0 8507bded2dbSJung-uk Kim 8517bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 8527bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 8537bded2dbSJung-uk Kim $vpack $t3,@Xi[1],@Xi[1] 8547bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 8557bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 8567bded2dbSJung-uk Kim vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] 8577bded2dbSJung-uk Kim 8587bded2dbSJung-uk Kim vpslld \$30,$b,$t1 8597bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 8607bded2dbSJung-uk Kim vmovd `4*$k-16*4`($ptr_n),$t3 8617bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 8627bded2dbSJung-uk Kim 8637bded2dbSJung-uk Kim vpsrld \$2,$b,$b 8647bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 8657bded2dbSJung-uk Kim vpshufb $tx,@Xi[1],@Xi[1] 8667bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 8677bded2dbSJung-uk Kim___ 8687bded2dbSJung-uk Kim$code.=<<___ if ($i==14); 8697bded2dbSJung-uk Kim 
vpaddd $K,$e,$e # e+=K_00_19 8707bded2dbSJung-uk Kim prefetcht0 63(@ptr[0]) 8717bded2dbSJung-uk Kim vpslld \$5,$a,$t2 8727bded2dbSJung-uk Kim vpandn $d,$b,$t1 8737bded2dbSJung-uk Kim vpand $c,$b,$t0 8747bded2dbSJung-uk Kim 8757bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 8767bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 8777bded2dbSJung-uk Kim $vpack $t3,@Xi[1],@Xi[1] 8787bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 8797bded2dbSJung-uk Kim prefetcht0 63(@ptr[1]) 8807bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 8817bded2dbSJung-uk Kim 8827bded2dbSJung-uk Kim vpslld \$30,$b,$t1 8837bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 8847bded2dbSJung-uk Kim prefetcht0 63(@ptr[2]) 8857bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 8867bded2dbSJung-uk Kim 8877bded2dbSJung-uk Kim vpsrld \$2,$b,$b 8887bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 8897bded2dbSJung-uk Kim prefetcht0 63(@ptr[3]) 8907bded2dbSJung-uk Kim vpshufb $tx,@Xi[1],@Xi[1] 8917bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 8927bded2dbSJung-uk Kim___ 8937bded2dbSJung-uk Kim$code.=<<___ if ($i>=13 && $i<15); 8947bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" 8957bded2dbSJung-uk Kim___ 8967bded2dbSJung-uk Kim$code.=<<___ if ($i>=15); # apply Xupdate 8977bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 8987bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 8997bded2dbSJung-uk Kim 9007bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 9017bded2dbSJung-uk Kim vpslld \$5,$a,$t2 9027bded2dbSJung-uk Kim vpandn $d,$b,$t1 9037bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` 9047bded2dbSJung-uk Kim vpand $c,$b,$t0 9057bded2dbSJung-uk Kim 9067bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 9077bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 9087bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 9097bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 9107bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 9117bded2dbSJung-uk Kim vpxor 
@Xi[3],@Xi[1],@Xi[1] 9127bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` 9137bded2dbSJung-uk Kim 9147bded2dbSJung-uk Kim vpslld \$30,$b,$t1 9157bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 9167bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 9177bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` 9187bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 9197bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 9207bded2dbSJung-uk Kim 9217bded2dbSJung-uk Kim vpsrld \$2,$b,$b 9227bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` 9237bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 9247bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] 9257bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 9267bded2dbSJung-uk Kim___ 9277bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 9287bded2dbSJung-uk Kim} 9297bded2dbSJung-uk Kim 9307bded2dbSJung-uk Kimsub BODY_20_39_avx { 9317bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 9327bded2dbSJung-uk Kimmy $j=$i+1; 9337bded2dbSJung-uk Kim 9347bded2dbSJung-uk Kim$code.=<<___ if ($i<79); 9357bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 9367bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 9377bded2dbSJung-uk Kim 9387bded2dbSJung-uk Kim vpslld \$5,$a,$t2 9397bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_20_39 9407bded2dbSJung-uk Kim vpxor $b,$d,$t0 9417bded2dbSJung-uk Kim___ 9427bded2dbSJung-uk Kim$code.=<<___ if ($i<72); 9437bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 9447bded2dbSJung-uk Kim___ 9457bded2dbSJung-uk Kim$code.=<<___ if ($i<79); 9467bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 9477bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 9487bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 9497bded2dbSJung-uk Kim vpxor $c,$t0,$t0 # Parity(b,c,d) 9507bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 9517bded2dbSJung-uk Kim 9527bded2dbSJung-uk Kim vpslld \$30,$b,$t1 9537bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 9547bded2dbSJung-uk Kim vpaddd 
$t0,$e,$e # e+=Parity(b,c,d) 9557bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 9567bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 9577bded2dbSJung-uk Kim 9587bded2dbSJung-uk Kim vpsrld \$2,$b,$b 9597bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 9607bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) 9617bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 9627bded2dbSJung-uk Kim___ 9637bded2dbSJung-uk Kim$code.=<<___ if ($i==79); 9647bded2dbSJung-uk Kim vpslld \$5,$a,$t2 9657bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_20_39 9667bded2dbSJung-uk Kim vpxor $b,$d,$t0 9677bded2dbSJung-uk Kim 9687bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 9697bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 9707bded2dbSJung-uk Kim vpxor $c,$t0,$t0 # Parity(b,c,d) 9717bded2dbSJung-uk Kim 9727bded2dbSJung-uk Kim vpslld \$30,$b,$t1 9737bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 9747bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Parity(b,c,d) 9757bded2dbSJung-uk Kim 9767bded2dbSJung-uk Kim vpsrld \$2,$b,$b 9777bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 9787bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 9797bded2dbSJung-uk Kim___ 9807bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 9817bded2dbSJung-uk Kim} 9827bded2dbSJung-uk Kim 9837bded2dbSJung-uk Kimsub BODY_40_59_avx { 9847bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 9857bded2dbSJung-uk Kimmy $j=$i+1; 9867bded2dbSJung-uk Kim 9877bded2dbSJung-uk Kim$code.=<<___; 9887bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 9897bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 9907bded2dbSJung-uk Kim 9917bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_40_59 9927bded2dbSJung-uk Kim vpslld \$5,$a,$t2 9937bded2dbSJung-uk Kim vpand $c,$d,$t1 9947bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 9957bded2dbSJung-uk Kim 9967bded2dbSJung-uk Kim vpaddd $t1,$e,$e 9977bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 9987bded2dbSJung-uk Kim vpxor $c,$d,$t0 9997bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 10007bded2dbSJung-uk Kim 
10017bded2dbSJung-uk Kim vmovdqu @Xi[0],`&Xi_off($i)` 10027bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 10037bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 10047bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 10057bded2dbSJung-uk Kim vpand $b,$t0,$t0 10067bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 10077bded2dbSJung-uk Kim 10087bded2dbSJung-uk Kim vpslld \$30,$b,$t1 10097bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Maj(b,d,c) 10107bded2dbSJung-uk Kim 10117bded2dbSJung-uk Kim vpsrld \$2,$b,$b 10127bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 10137bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) 10147bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 10157bded2dbSJung-uk Kim___ 10167bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 10177bded2dbSJung-uk Kim} 10187bded2dbSJung-uk Kim 10197bded2dbSJung-uk Kim$code.=<<___; 10207bded2dbSJung-uk Kim.type sha1_multi_block_avx,\@function,3 10217bded2dbSJung-uk Kim.align 32 10227bded2dbSJung-uk Kimsha1_multi_block_avx: 1023e71b7053SJung-uk Kim.cfi_startproc 10247bded2dbSJung-uk Kim_avx_shortcut: 10257bded2dbSJung-uk Kim___ 10267bded2dbSJung-uk Kim$code.=<<___ if ($avx>1); 10277bded2dbSJung-uk Kim shr \$32,%rcx 10287bded2dbSJung-uk Kim cmp \$2,$num 10297bded2dbSJung-uk Kim jb .Lavx 10307bded2dbSJung-uk Kim test \$`1<<5`,%ecx 10317bded2dbSJung-uk Kim jnz _avx2_shortcut 10327bded2dbSJung-uk Kim jmp .Lavx 10337bded2dbSJung-uk Kim.align 32 10347bded2dbSJung-uk Kim.Lavx: 10357bded2dbSJung-uk Kim___ 10367bded2dbSJung-uk Kim$code.=<<___; 10377bded2dbSJung-uk Kim mov %rsp,%rax 1038e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 10397bded2dbSJung-uk Kim push %rbx 1040e71b7053SJung-uk Kim.cfi_push %rbx 10417bded2dbSJung-uk Kim push %rbp 1042e71b7053SJung-uk Kim.cfi_push %rbp 10437bded2dbSJung-uk Kim___ 10447bded2dbSJung-uk Kim$code.=<<___ if ($win64); 10457bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 10467bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 10477bded2dbSJung-uk Kim movaps %xmm7,0x10(%rsp) 10487bded2dbSJung-uk Kim movaps 
%xmm8,0x20(%rsp) 10497bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 10507bded2dbSJung-uk Kim movaps %xmm10,-0x78(%rax) 10517bded2dbSJung-uk Kim movaps %xmm11,-0x68(%rax) 10527bded2dbSJung-uk Kim movaps %xmm12,-0x58(%rax) 10537bded2dbSJung-uk Kim movaps %xmm13,-0x48(%rax) 10547bded2dbSJung-uk Kim movaps %xmm14,-0x38(%rax) 10557bded2dbSJung-uk Kim movaps %xmm15,-0x28(%rax) 10567bded2dbSJung-uk Kim___ 10577bded2dbSJung-uk Kim$code.=<<___; 10587bded2dbSJung-uk Kim sub \$`$REG_SZ*18`, %rsp 10597bded2dbSJung-uk Kim and \$-256,%rsp 10607bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1061e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 10627bded2dbSJung-uk Kim.Lbody_avx: 10637bded2dbSJung-uk Kim lea K_XX_XX(%rip),$Tbl 10647bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 10657bded2dbSJung-uk Kim 10667bded2dbSJung-uk Kim vzeroupper 10677bded2dbSJung-uk Kim.Loop_grande_avx: 10687bded2dbSJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # original $num 10697bded2dbSJung-uk Kim xor $num,$num 10707bded2dbSJung-uk Kim___ 10717bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) { 10727bded2dbSJung-uk Kim $code.=<<___; 10737bded2dbSJung-uk Kim mov `16*$i+0`($inp),@ptr[$i] # input pointer 10747bded2dbSJung-uk Kim mov `16*$i+8`($inp),%ecx # number of blocks 10757bded2dbSJung-uk Kim cmp $num,%ecx 10767bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 10777bded2dbSJung-uk Kim test %ecx,%ecx 10787bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 10797bded2dbSJung-uk Kim cmovle $Tbl,@ptr[$i] # cancel input 10807bded2dbSJung-uk Kim___ 10817bded2dbSJung-uk Kim} 10827bded2dbSJung-uk Kim$code.=<<___; 10837bded2dbSJung-uk Kim test $num,$num 10847bded2dbSJung-uk Kim jz .Ldone_avx 10857bded2dbSJung-uk Kim 10867bded2dbSJung-uk Kim vmovdqu 0x00($ctx),$A # load context 10877bded2dbSJung-uk Kim lea 128(%rsp),%rax 10887bded2dbSJung-uk Kim vmovdqu 0x20($ctx),$B 10897bded2dbSJung-uk Kim vmovdqu 0x40($ctx),$C 10907bded2dbSJung-uk Kim vmovdqu 0x60($ctx),$D 10917bded2dbSJung-uk 
Kim vmovdqu 0x80($ctx),$E 10927bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 10937bded2dbSJung-uk Kim jmp .Loop_avx 10947bded2dbSJung-uk Kim 10957bded2dbSJung-uk Kim.align 32 10967bded2dbSJung-uk Kim.Loop_avx: 10977bded2dbSJung-uk Kim___ 10987bded2dbSJung-uk Kim$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 10997bded2dbSJung-uk Kimfor($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 11007bded2dbSJung-uk Kim$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 11017bded2dbSJung-uk Kimfor(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 11027bded2dbSJung-uk Kim$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 11037bded2dbSJung-uk Kimfor(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 11047bded2dbSJung-uk Kim$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 11057bded2dbSJung-uk Kimfor(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 11067bded2dbSJung-uk Kim$code.=<<___; 11077bded2dbSJung-uk Kim mov \$1,%ecx 11087bded2dbSJung-uk Kim___ 11097bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) { 11107bded2dbSJung-uk Kim $code.=<<___; 11117bded2dbSJung-uk Kim cmp `4*$i`(%rbx),%ecx # examine counters 11127bded2dbSJung-uk Kim cmovge $Tbl,@ptr[$i] # cancel input 11137bded2dbSJung-uk Kim___ 11147bded2dbSJung-uk Kim} 11157bded2dbSJung-uk Kim$code.=<<___; 11167bded2dbSJung-uk Kim vmovdqu (%rbx),$t0 # pull counters 11177bded2dbSJung-uk Kim vpxor $t2,$t2,$t2 11187bded2dbSJung-uk Kim vmovdqa $t0,$t1 11197bded2dbSJung-uk Kim vpcmpgtd $t2,$t1,$t1 # mask value 11207bded2dbSJung-uk Kim vpaddd $t1,$t0,$t0 # counters-- 11217bded2dbSJung-uk Kim 11227bded2dbSJung-uk Kim vpand $t1,$A,$A 11237bded2dbSJung-uk Kim vpand $t1,$B,$B 11247bded2dbSJung-uk Kim vpaddd 0x00($ctx),$A,$A 11257bded2dbSJung-uk Kim vpand $t1,$C,$C 11267bded2dbSJung-uk Kim vpaddd 0x20($ctx),$B,$B 11277bded2dbSJung-uk Kim vpand $t1,$D,$D 11287bded2dbSJung-uk Kim vpaddd 0x40($ctx),$C,$C 11297bded2dbSJung-uk Kim vpand $t1,$E,$E 11307bded2dbSJung-uk Kim vpaddd 0x60($ctx),$D,$D 
11317bded2dbSJung-uk Kim vpaddd 0x80($ctx),$E,$E 11327bded2dbSJung-uk Kim vmovdqu $A,0x00($ctx) 11337bded2dbSJung-uk Kim vmovdqu $B,0x20($ctx) 11347bded2dbSJung-uk Kim vmovdqu $C,0x40($ctx) 11357bded2dbSJung-uk Kim vmovdqu $D,0x60($ctx) 11367bded2dbSJung-uk Kim vmovdqu $E,0x80($ctx) 11377bded2dbSJung-uk Kim 11387bded2dbSJung-uk Kim vmovdqu $t0,(%rbx) # save counters 11397bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 11407bded2dbSJung-uk Kim dec $num 11417bded2dbSJung-uk Kim jnz .Loop_avx 11427bded2dbSJung-uk Kim 11437bded2dbSJung-uk Kim mov `$REG_SZ*17+8`(%rsp),$num 11447bded2dbSJung-uk Kim lea $REG_SZ($ctx),$ctx 11457bded2dbSJung-uk Kim lea `16*$REG_SZ/4`($inp),$inp 11467bded2dbSJung-uk Kim dec $num 11477bded2dbSJung-uk Kim jnz .Loop_grande_avx 11487bded2dbSJung-uk Kim 11497bded2dbSJung-uk Kim.Ldone_avx: 1150e71b7053SJung-uk Kim mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1151e71b7053SJung-uk Kim.cfi_def_cfa %rax,8 11527bded2dbSJung-uk Kim vzeroupper 11537bded2dbSJung-uk Kim___ 11547bded2dbSJung-uk Kim$code.=<<___ if ($win64); 11557bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm6 11567bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm7 11577bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm8 11587bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm9 11597bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm10 11607bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm11 11617bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm12 11627bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm13 11637bded2dbSJung-uk Kim movaps -0x38(%rax),%xmm14 11647bded2dbSJung-uk Kim movaps -0x28(%rax),%xmm15 11657bded2dbSJung-uk Kim___ 11667bded2dbSJung-uk Kim$code.=<<___; 11677bded2dbSJung-uk Kim mov -16(%rax),%rbp 1168e71b7053SJung-uk Kim.cfi_restore %rbp 11697bded2dbSJung-uk Kim mov -8(%rax),%rbx 1170e71b7053SJung-uk Kim.cfi_restore %rbx 11717bded2dbSJung-uk Kim lea (%rax),%rsp 1172e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 11737bded2dbSJung-uk Kim.Lepilogue_avx: 11747bded2dbSJung-uk Kim ret 1175e71b7053SJung-uk Kim.cfi_endproc 
11767bded2dbSJung-uk Kim.size sha1_multi_block_avx,.-sha1_multi_block_avx 11777bded2dbSJung-uk Kim___ 11787bded2dbSJung-uk Kim 11797bded2dbSJung-uk Kim if ($avx>1) { 11807bded2dbSJung-uk Kim$code =~ s/\`([^\`]*)\`/eval $1/gem; 11817bded2dbSJung-uk Kim 11827bded2dbSJung-uk Kim$REG_SZ=32; 11837bded2dbSJung-uk Kim 11847bded2dbSJung-uk Kim@ptr=map("%r$_",(12..15,8..11)); 11857bded2dbSJung-uk Kim 11867bded2dbSJung-uk Kim@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); 11877bded2dbSJung-uk Kim($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); 11887bded2dbSJung-uk Kim@Xi=map("%ymm$_",(10..14)); 11897bded2dbSJung-uk Kim$K="%ymm15"; 11907bded2dbSJung-uk Kim 11917bded2dbSJung-uk Kim$code.=<<___; 11927bded2dbSJung-uk Kim.type sha1_multi_block_avx2,\@function,3 11937bded2dbSJung-uk Kim.align 32 11947bded2dbSJung-uk Kimsha1_multi_block_avx2: 1195e71b7053SJung-uk Kim.cfi_startproc 11967bded2dbSJung-uk Kim_avx2_shortcut: 11977bded2dbSJung-uk Kim mov %rsp,%rax 1198e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 11997bded2dbSJung-uk Kim push %rbx 1200e71b7053SJung-uk Kim.cfi_push %rbx 12017bded2dbSJung-uk Kim push %rbp 1202e71b7053SJung-uk Kim.cfi_push %rbp 12037bded2dbSJung-uk Kim push %r12 1204e71b7053SJung-uk Kim.cfi_push %r12 12057bded2dbSJung-uk Kim push %r13 1206e71b7053SJung-uk Kim.cfi_push %r13 12077bded2dbSJung-uk Kim push %r14 1208e71b7053SJung-uk Kim.cfi_push %r14 12097bded2dbSJung-uk Kim push %r15 1210e71b7053SJung-uk Kim.cfi_push %r15 12117bded2dbSJung-uk Kim___ 12127bded2dbSJung-uk Kim$code.=<<___ if ($win64); 12137bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 12147bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 12157bded2dbSJung-uk Kim movaps %xmm7,0x10(%rsp) 12167bded2dbSJung-uk Kim movaps %xmm8,0x20(%rsp) 12177bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 12187bded2dbSJung-uk Kim movaps %xmm10,0x40(%rsp) 12197bded2dbSJung-uk Kim movaps %xmm11,0x50(%rsp) 12207bded2dbSJung-uk Kim movaps %xmm12,-0x78(%rax) 12217bded2dbSJung-uk Kim movaps %xmm13,-0x68(%rax) 12227bded2dbSJung-uk Kim movaps 
%xmm14,-0x58(%rax) 12237bded2dbSJung-uk Kim movaps %xmm15,-0x48(%rax) 12247bded2dbSJung-uk Kim___ 12257bded2dbSJung-uk Kim$code.=<<___; 12267bded2dbSJung-uk Kim sub \$`$REG_SZ*18`, %rsp 12277bded2dbSJung-uk Kim and \$-256,%rsp 12287bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1229e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 12307bded2dbSJung-uk Kim.Lbody_avx2: 12317bded2dbSJung-uk Kim lea K_XX_XX(%rip),$Tbl 12327bded2dbSJung-uk Kim shr \$1,$num 12337bded2dbSJung-uk Kim 12347bded2dbSJung-uk Kim vzeroupper 12357bded2dbSJung-uk Kim.Loop_grande_avx2: 12367bded2dbSJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # original $num 12377bded2dbSJung-uk Kim xor $num,$num 12387bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 12397bded2dbSJung-uk Kim___ 12407bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 12417bded2dbSJung-uk Kim $code.=<<___; 12427bded2dbSJung-uk Kim mov `16*$i+0`($inp),@ptr[$i] # input pointer 12437bded2dbSJung-uk Kim mov `16*$i+8`($inp),%ecx # number of blocks 12447bded2dbSJung-uk Kim cmp $num,%ecx 12457bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 12467bded2dbSJung-uk Kim test %ecx,%ecx 12477bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 12487bded2dbSJung-uk Kim cmovle $Tbl,@ptr[$i] # cancel input 12497bded2dbSJung-uk Kim___ 12507bded2dbSJung-uk Kim} 12517bded2dbSJung-uk Kim$code.=<<___; 12527bded2dbSJung-uk Kim vmovdqu 0x00($ctx),$A # load context 12537bded2dbSJung-uk Kim lea 128(%rsp),%rax 12547bded2dbSJung-uk Kim vmovdqu 0x20($ctx),$B 12557bded2dbSJung-uk Kim lea 256+128(%rsp),%rbx 12567bded2dbSJung-uk Kim vmovdqu 0x40($ctx),$C 12577bded2dbSJung-uk Kim vmovdqu 0x60($ctx),$D 12587bded2dbSJung-uk Kim vmovdqu 0x80($ctx),$E 12597bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 12607bded2dbSJung-uk Kim jmp .Loop_avx2 12617bded2dbSJung-uk Kim 12627bded2dbSJung-uk Kim.align 32 12637bded2dbSJung-uk Kim.Loop_avx2: 12647bded2dbSJung-uk Kim___ 12657bded2dbSJung-uk Kim$code.=" vmovdqa -0x20($Tbl),$K\n"; # 
K_00_19 12667bded2dbSJung-uk Kimfor($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 12677bded2dbSJung-uk Kim$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 12687bded2dbSJung-uk Kimfor(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 12697bded2dbSJung-uk Kim$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 12707bded2dbSJung-uk Kimfor(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 12717bded2dbSJung-uk Kim$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 12727bded2dbSJung-uk Kimfor(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 12737bded2dbSJung-uk Kim$code.=<<___; 12747bded2dbSJung-uk Kim mov \$1,%ecx 12757bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 12767bded2dbSJung-uk Kim___ 12777bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 12787bded2dbSJung-uk Kim $code.=<<___; 12797bded2dbSJung-uk Kim cmp `4*$i`(%rbx),%ecx # examine counters 12807bded2dbSJung-uk Kim cmovge $Tbl,@ptr[$i] # cancel input 12817bded2dbSJung-uk Kim___ 12827bded2dbSJung-uk Kim} 12837bded2dbSJung-uk Kim$code.=<<___; 12847bded2dbSJung-uk Kim vmovdqu (%rbx),$t0 # pull counters 12857bded2dbSJung-uk Kim vpxor $t2,$t2,$t2 12867bded2dbSJung-uk Kim vmovdqa $t0,$t1 12877bded2dbSJung-uk Kim vpcmpgtd $t2,$t1,$t1 # mask value 12887bded2dbSJung-uk Kim vpaddd $t1,$t0,$t0 # counters-- 12897bded2dbSJung-uk Kim 12907bded2dbSJung-uk Kim vpand $t1,$A,$A 12917bded2dbSJung-uk Kim vpand $t1,$B,$B 12927bded2dbSJung-uk Kim vpaddd 0x00($ctx),$A,$A 12937bded2dbSJung-uk Kim vpand $t1,$C,$C 12947bded2dbSJung-uk Kim vpaddd 0x20($ctx),$B,$B 12957bded2dbSJung-uk Kim vpand $t1,$D,$D 12967bded2dbSJung-uk Kim vpaddd 0x40($ctx),$C,$C 12977bded2dbSJung-uk Kim vpand $t1,$E,$E 12987bded2dbSJung-uk Kim vpaddd 0x60($ctx),$D,$D 12997bded2dbSJung-uk Kim vpaddd 0x80($ctx),$E,$E 13007bded2dbSJung-uk Kim vmovdqu $A,0x00($ctx) 13017bded2dbSJung-uk Kim vmovdqu $B,0x20($ctx) 13027bded2dbSJung-uk Kim vmovdqu $C,0x40($ctx) 13037bded2dbSJung-uk Kim vmovdqu $D,0x60($ctx) 13047bded2dbSJung-uk Kim vmovdqu 
$E,0x80($ctx) 13057bded2dbSJung-uk Kim 13067bded2dbSJung-uk Kim vmovdqu $t0,(%rbx) # save counters 13077bded2dbSJung-uk Kim lea 256+128(%rsp),%rbx 13087bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 13097bded2dbSJung-uk Kim dec $num 13107bded2dbSJung-uk Kim jnz .Loop_avx2 13117bded2dbSJung-uk Kim 13127bded2dbSJung-uk Kim #mov `$REG_SZ*17+8`(%rsp),$num 13137bded2dbSJung-uk Kim #lea $REG_SZ($ctx),$ctx 13147bded2dbSJung-uk Kim #lea `16*$REG_SZ/4`($inp),$inp 13157bded2dbSJung-uk Kim #dec $num 13167bded2dbSJung-uk Kim #jnz .Loop_grande_avx2 13177bded2dbSJung-uk Kim 13187bded2dbSJung-uk Kim.Ldone_avx2: 1319e71b7053SJung-uk Kim mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1320e71b7053SJung-uk Kim.cfi_def_cfa %rax,8 13217bded2dbSJung-uk Kim vzeroupper 13227bded2dbSJung-uk Kim___ 13237bded2dbSJung-uk Kim$code.=<<___ if ($win64); 13247bded2dbSJung-uk Kim movaps -0xd8(%rax),%xmm6 13257bded2dbSJung-uk Kim movaps -0xc8(%rax),%xmm7 13267bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm8 13277bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm9 13287bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm10 13297bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm11 13307bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm12 13317bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm13 13327bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm14 13337bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm15 13347bded2dbSJung-uk Kim___ 13357bded2dbSJung-uk Kim$code.=<<___; 13367bded2dbSJung-uk Kim mov -48(%rax),%r15 1337e71b7053SJung-uk Kim.cfi_restore %r15 13387bded2dbSJung-uk Kim mov -40(%rax),%r14 1339e71b7053SJung-uk Kim.cfi_restore %r14 13407bded2dbSJung-uk Kim mov -32(%rax),%r13 1341e71b7053SJung-uk Kim.cfi_restore %r13 13427bded2dbSJung-uk Kim mov -24(%rax),%r12 1343e71b7053SJung-uk Kim.cfi_restore %r12 13447bded2dbSJung-uk Kim mov -16(%rax),%rbp 1345e71b7053SJung-uk Kim.cfi_restore %rbp 13467bded2dbSJung-uk Kim mov -8(%rax),%rbx 1347e71b7053SJung-uk Kim.cfi_restore %rbx 13487bded2dbSJung-uk Kim lea (%rax),%rsp 1349e71b7053SJung-uk 
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
					}	}}}

# Constant pool shared by the code paths above: each SHA-1 round constant
# is replicated across eight 32-bit lanes (two rows of four), followed by
# the byte-swap masks and the identification string.  Note that K_XX_XX
# labels the K_20_39 rows; the K_00_19 rows sit immediately before it.
$code.=<<___;

.align	256
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
K_XX_XX:
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# Win64 structured exception handling support.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# The handlers below receive their arguments in the registers named here.
# HandlerData[] (see .xdata at the end) carries two image-relative
# addresses: the end-of-prologue label and the epilogue label of the
# function being unwound.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler serves the functions whose frame keeps the original stack
# pointer at 16*17 bytes above context->Rsp.  Flow:
#   - Rip before the body label: nothing was saved yet, context->Rax is
#     used as the frame base;
#   - Rip at/after the epilogue label: context->Rsp is used as is;
#   - otherwise: the saved stack pointer is pulled from the frame, Rbx/Rbp
#     and the ten XMM registers starting at context.Xmm6 (20 quadwords)
#     are copied back into the CONTEXT record.
# .Lin_prologue then fixes up Rsp/Rsi/Rdi, copies the CONTEXT record to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.  The label is shared with avx2_handler.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10 XMM registers, 20 quadwords
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

# avx2_handler is the counterpart of se_handler for the AVX2 code path.
# Besides Rbx/Rbp it also restores R12-R15, and the XMM save area starts
# 56 bytes (instead of 24) below the saved stack pointer.  It finishes by
# jumping into se_handler's .Lin_prologue tail.
#
# The saved stack pointer is fetched relative to context->Rsp (held in
# %rax at that point), exactly as se_handler does.  Applying the 32*17
# offset to the CONTEXT pointer instead would read from the middle of the
# CONTEXT XMM save area (512 is &context.Xmm6) and yield a bogus frame
# address.
# NOTE(review): assumes the sha1_multi_block_avx2 prologue stores the
# original stack pointer at 32*17 bytes above its aligned %rsp; the
# prologue is outside this excerpt - confirm.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10 XMM registers, 20 quadwords
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___

# .pdata: one (begin, end, unwind-info) triplet of image-relative
# addresses per exported function; the AVX and AVX2 entries are emitted
# only when the corresponding code paths are generated.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___

# .xdata: each record is a 4-byte header, the handler address and the
# two HandlerData[] labels (end of prologue, epilogue) that the handler
# compares context->Rip against.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

# rex(\@opcode,$dst,$src)
#
# Prepend a REX prefix byte to the opcode array when either register
# number is 8 or above: bit 0x04 is set for $dst, bit 0x01 for $src, and
# 0x40 is or-ed in.  The array is modified in place through the reference;
# nothing is prepended when both registers are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @$opcode,$rex|0x40	if ($rex);
}

# sha1rnds4($operands)
#
# Hand-encode "sha1rnds4 $imm,%xmmS,%xmmD" as a .byte sequence
# (0x0f,0x3a,0xcc, ModR/M, immediate), with a REX prefix when needed.
# An immediate starting with "0" is converted with oct(), which also
# covers the 0x... spelling; anything else is emitted as written.
# Operands that do not match the register form are passed through as a
# plain "sha1rnds4" instruction.
sub sha1rnds4 {
  my ($operands)=@_;

    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      # Copy the captures right away so they cannot be clobbered later.
      my ($imm,$src,$dst)=($1,$2,$3);
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$operands;
    }
}

# sha1op38($instr,$operands)
#
# Hand-encode the two-operand instructions listed in %opcodelet
# ("instr %xmmS,%xmmD") as a .byte sequence (0x0f,0x38, opcode, ModR/M),
# with a REX prefix when needed.  Unknown mnemonics, or operands that are
# not a pair of %xmm registers, are passed through unchanged.
sub sha1op38 {
    my ($instr,$operands)=@_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      # Copy the captures right away so they cannot be clobbered later.
      my ($src,$dst)=($1,$2);
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
}

# Post-process the accumulated code line by line:
#   1. evaluate every back-quoted arithmetic expression;
#   2. apply at most one of the rewrites below (first match wins):
#      hand-encode the SHA instructions, or replace %ymmN operands by
#      %xmmN for the listed AVX instructions (the vmovdqu rule only
#      touches operands spelled literally as %x%ymmN, and the vinserti128
#      rule also inserts a leading \$1 immediate);
#   3. print the resulting line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

# Buffered write errors surface only at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";