#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA1 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#               this    +aesni(i)       sha1    aesni-sha1      gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)  10.7/n  +1.28=3.96(n=4) 5.30    6.66            +68%
# Atom(ii)      18.1/n  +3.93=8.46(n=4) 9.37    12.8            +51%
# Sandy Bridge  (8.16   +5.15=13.3)/n   4.99    5.98            +80%
# Ivy Bridge    (8.08   +5.14=13.2)/n   4.60    5.54            +68%
# Haswell(iii)  (8.96   +5.00=14.0)/n   3.57    4.55            +160%
# Skylake       (8.70   +5.00=13.7)/n   3.64    4.20            +145%
# Bulldozer     (9.76   +5.76=15.5)/n   5.95    6.37            +64%
#
# (i)   multi-block CBC encrypt with 128-bit key;
# (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#       because of lower AES-NI instruction throughput;
# (iii) "this" is for n=8, when we gather twice as much data, result
#       for n=4 is 8.00+4.44=12.4;
# (iv)  presented improvement coefficients are asymptotic limits and
#       in real-life application are somewhat lower, e.g. for 2KB
#       fragments they range from 30% to 100% (on Haswell);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 conventions are selected either by flavour or by a ".asm" output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# pointer_size() and pointer_register() used below come from this helper.
push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

# $avx ends up as 0, 1 or 2: each probe below adds one for each of the two
# version thresholds the detected assembler meets.  Later probes only run
# while $avx is still 0.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha1_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";
@ptr=map("%r$_",(8..11));	# one input pointer register per lane
$Tbl="%rbp";
$inp_elm_size=2*$ptr_size;	# stride between consecutive inp[] elements

@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
@Xi=map("%xmm$_",(10..14));
$K="%xmm15";

if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty]
    @Xi=map("%xmm$_",(0..4));
    ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
    @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
}

$REG_SZ=16;

# Xi_off(index): return the memory operand of the stack slot that holds
# message-schedule word `index` (taken modulo 16).  Each slot is $REG_SZ
# bytes.  Byte offsets below 256 are expressed relative to %rax, the rest
# relative to %rbx.  With $REG_SZ==16 the largest offset is 15*16=240, so
# only the %rax form is produced by the code in this section.
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

# BODY_00_19(i,a,b,c,d,e): append the code for round i (0..19) to $code.
# $a..$e name the registers holding the working variables for this round.
# Rounds 0..14 also load and byte-swap upcoming input words from the four
# @ptr streams; rounds 15..19 apply the schedule update instead.  @Xi is
# rotated by one position on exit.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;

# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
# of 4 words you would expect to be loaded per given iteration one is
# spilled to next iteration. In other words indices in four input
# streams are distributed as following:
#
# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
# $i==1:	2,3,3,3,
# $i==2:	3,4,4,4,
# ...
# $i==13:	14,15,15,15,
# $i==14:	15
#
# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
	movd		(@ptr[0]),@Xi[0]
	lea		`16*4`(@ptr[0]),@ptr[0]
	movd		(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	lea		`16*4`(@ptr[1]),@ptr[1]
	movd		(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	lea		`16*4`(@ptr[2]),@ptr[2]
	movd		(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	movd		`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	movd		`4*$j-16*4`(@ptr[2]),$t2
	pshufb		$tx,@Xi[0]
___
$code.=<<___ if ($i<14);			# just load input
	movd		`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2				# rol(a,5)
	movd		`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	movd		`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b				# b=rol(b,30)
___
$code.=<<___ if ($i==14);			# just load input
	movd		`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1
	prefetcht0	63(@ptr[1])

	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)
	prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	prefetcht0	63(@ptr[3])
	por	$t1,$b				# b=rol(b,30)
___
$code.=<<___ if ($i>=13 && $i<15);
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
$code.=<<___ if ($i>=15);			# apply Xupdate
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	psrld	\$31,$tx
	paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# BODY_20_39(i,a,b,c,d,e): append the code for a Parity round to $code.
# The driver loops below call it for rounds 20..39 and again for 60..79.
# Storing X[i] back to the stack is skipped from round 72 on, and round 79
# omits the schedule update altogether.  @Xi is rotated by one on exit.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___ if ($i<79);
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
$code.=<<___ if ($i<72);
	movdqa	@Xi[0],`&Xi_off($i)`
___
$code.=<<___ if ($i<79);
	paddd	@Xi[0],$e			# e+=X[i]
	pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0				# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	movdqa	@Xi[1],$tx
	por	$t3,$t2				# rol(a,5)
	psrld	\$31,$tx
	paddd	$t0,$e				# e+=Parity(b,c,d)
	paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
$code.=<<___ if ($i==79);
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0				# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2				# rol(a,5)
	paddd	$t0,$e				# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# BODY_40_59(i,a,b,c,d,e): append the code for a Maj round (40..59) to
# $code, including the schedule update.  @Xi is rotated by one on exit.
# The last emitted comment names @Xi[1]; it previously named a nonexistent
# array @X, which interpolated to an empty string.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e				# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	por	$t3,$t2				# rol(a,5)
	psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e				# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# Entry point: dispatch to the SHAEXT path when bit 61 of the capability
# word is set, and (only if an AVX-capable assembler was detected) to the
# AVX path when bit 28 is set; otherwise fall through to the code below.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Prologue.  Each push is annotated with the register it actually saves:
# the second annotation used to say %rbx although the instruction pushes
# %rbp, which disagreed with the ".cfi_restore %rbp" from -16(%rax) in the
# epilogue.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: spill %xmm6-%xmm15 below the two pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame: 16 slots of $REG_SZ bytes for X[], then the per-lane counters at
# $REG_SZ*16 (addressed via %rbx), the original %rsp at $REG_SZ*17 and the
# original $num at $REG_SZ*17+8.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane: fetch the input pointer and block count, track the maximum
# count in $num, and point lanes with a non-positive count at $Tbl.
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___
# Unroll all 80 rounds; @V is rotated after every round so that each body
# sees the working variables under the names a..e.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_multi_block,.-sha1_multi_block
___
						{{{
my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(11..14));

Kim$code.=<<___; 5387bded2dbSJung-uk Kim.type sha1_multi_block_shaext,\@function,3 5397bded2dbSJung-uk Kim.align 32 5407bded2dbSJung-uk Kimsha1_multi_block_shaext: 541e71b7053SJung-uk Kim.cfi_startproc 5427bded2dbSJung-uk Kim_shaext_shortcut: 5437bded2dbSJung-uk Kim mov %rsp,%rax 544e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 5457bded2dbSJung-uk Kim push %rbx 546e71b7053SJung-uk Kim.cfi_push %rbx 5477bded2dbSJung-uk Kim push %rbp 548e71b7053SJung-uk Kim.cfi_push %rbp 5497bded2dbSJung-uk Kim___ 5507bded2dbSJung-uk Kim$code.=<<___ if ($win64); 5517bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 5527bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 5537bded2dbSJung-uk Kim movaps %xmm7,0x10(%rsp) 5547bded2dbSJung-uk Kim movaps %xmm8,0x20(%rsp) 5557bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 5567bded2dbSJung-uk Kim movaps %xmm10,-0x78(%rax) 5577bded2dbSJung-uk Kim movaps %xmm11,-0x68(%rax) 5587bded2dbSJung-uk Kim movaps %xmm12,-0x58(%rax) 5597bded2dbSJung-uk Kim movaps %xmm13,-0x48(%rax) 5607bded2dbSJung-uk Kim movaps %xmm14,-0x38(%rax) 5617bded2dbSJung-uk Kim movaps %xmm15,-0x28(%rax) 5627bded2dbSJung-uk Kim___ 5637bded2dbSJung-uk Kim$code.=<<___; 5647bded2dbSJung-uk Kim sub \$`$REG_SZ*18`,%rsp 5657bded2dbSJung-uk Kim shl \$1,$num # we process pair at a time 5667bded2dbSJung-uk Kim and \$-256,%rsp 5677bded2dbSJung-uk Kim lea 0x40($ctx),$ctx # size optimization 5687bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 5697bded2dbSJung-uk Kim.Lbody_shaext: 5707bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 5717bded2dbSJung-uk Kim movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap 5727bded2dbSJung-uk Kim 5737bded2dbSJung-uk Kim.Loop_grande_shaext: 574e71b7053SJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # original $num 5757bded2dbSJung-uk Kim xor $num,$num 5767bded2dbSJung-uk Kim___ 5777bded2dbSJung-uk Kimfor($i=0;$i<2;$i++) { 578*b077aed3SPierre Pronchery $ptr_reg=&pointer_register($flavour,@ptr[$i]); 5797bded2dbSJung-uk Kim $code.=<<___; 580*b077aed3SPierre Pronchery # input 
pointer 581*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+0`($inp),$ptr_reg 582*b077aed3SPierre Pronchery # number of blocks 583*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx 5847bded2dbSJung-uk Kim cmp $num,%ecx 5857bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 5867bded2dbSJung-uk Kim test %ecx,%ecx 5877bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 5887bded2dbSJung-uk Kim cmovle %rsp,@ptr[$i] # cancel input 5897bded2dbSJung-uk Kim___ 5907bded2dbSJung-uk Kim} 5917bded2dbSJung-uk Kim$code.=<<___; 5927bded2dbSJung-uk Kim test $num,$num 5937bded2dbSJung-uk Kim jz .Ldone_shaext 5947bded2dbSJung-uk Kim 5957bded2dbSJung-uk Kim movq 0x00-0x40($ctx),$ABCD0 # a1.a0 5967bded2dbSJung-uk Kim movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 5977bded2dbSJung-uk Kim movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 5987bded2dbSJung-uk Kim movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 5997bded2dbSJung-uk Kim movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 6007bded2dbSJung-uk Kim 6017bded2dbSJung-uk Kim punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 6027bded2dbSJung-uk Kim punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 6037bded2dbSJung-uk Kim 6047bded2dbSJung-uk Kim movdqa $ABCD0,$ABCD1 6057bded2dbSJung-uk Kim punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0 6067bded2dbSJung-uk Kim punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 6077bded2dbSJung-uk Kim 6087bded2dbSJung-uk Kim pshufd \$0b00111111,@MSG0[3],$E0 6097bded2dbSJung-uk Kim pshufd \$0b01111111,@MSG0[3],$E1 6107bded2dbSJung-uk Kim pshufd \$0b00011011,$ABCD0,$ABCD0 6117bded2dbSJung-uk Kim pshufd \$0b00011011,$ABCD1,$ABCD1 6127bded2dbSJung-uk Kim jmp .Loop_shaext 6137bded2dbSJung-uk Kim 6147bded2dbSJung-uk Kim.align 32 6157bded2dbSJung-uk Kim.Loop_shaext: 6167bded2dbSJung-uk Kim movdqu 0x00(@ptr[0]),@MSG0[0] 6177bded2dbSJung-uk Kim movdqu 0x00(@ptr[1]),@MSG1[0] 6187bded2dbSJung-uk Kim movdqu 0x10(@ptr[0]),@MSG0[1] 6197bded2dbSJung-uk Kim movdqu 0x10(@ptr[1]),@MSG1[1] 6207bded2dbSJung-uk Kim movdqu 0x20(@ptr[0]),@MSG0[2] 6217bded2dbSJung-uk Kim 
pshufb $BSWAP,@MSG0[0] 6227bded2dbSJung-uk Kim movdqu 0x20(@ptr[1]),@MSG1[2] 6237bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[0] 6247bded2dbSJung-uk Kim movdqu 0x30(@ptr[0]),@MSG0[3] 6257bded2dbSJung-uk Kim lea 0x40(@ptr[0]),@ptr[0] 6267bded2dbSJung-uk Kim pshufb $BSWAP,@MSG0[1] 6277bded2dbSJung-uk Kim movdqu 0x30(@ptr[1]),@MSG1[3] 6287bded2dbSJung-uk Kim lea 0x40(@ptr[1]),@ptr[1] 6297bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[1] 6307bded2dbSJung-uk Kim 6317bded2dbSJung-uk Kim movdqa $E0,0x50(%rsp) # offload 6327bded2dbSJung-uk Kim paddd @MSG0[0],$E0 6337bded2dbSJung-uk Kim movdqa $E1,0x70(%rsp) 6347bded2dbSJung-uk Kim paddd @MSG1[0],$E1 6357bded2dbSJung-uk Kim movdqa $ABCD0,0x40(%rsp) # offload 6367bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 6377bded2dbSJung-uk Kim movdqa $ABCD1,0x60(%rsp) 6387bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 6397bded2dbSJung-uk Kim sha1rnds4 \$0,$E0,$ABCD0 # 0-3 6407bded2dbSJung-uk Kim sha1nexte @MSG0[1],$E0_ 6417bded2dbSJung-uk Kim sha1rnds4 \$0,$E1,$ABCD1 # 0-3 6427bded2dbSJung-uk Kim sha1nexte @MSG1[1],$E1_ 6437bded2dbSJung-uk Kim pshufb $BSWAP,@MSG0[2] 6447bded2dbSJung-uk Kim prefetcht0 127(@ptr[0]) 6457bded2dbSJung-uk Kim sha1msg1 @MSG0[1],@MSG0[0] 6467bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[2] 6477bded2dbSJung-uk Kim prefetcht0 127(@ptr[1]) 6487bded2dbSJung-uk Kim sha1msg1 @MSG1[1],@MSG1[0] 6497bded2dbSJung-uk Kim 6507bded2dbSJung-uk Kim pshufb $BSWAP,@MSG0[3] 6517bded2dbSJung-uk Kim movdqa $ABCD0,$E0 6527bded2dbSJung-uk Kim pshufb $BSWAP,@MSG1[3] 6537bded2dbSJung-uk Kim movdqa $ABCD1,$E1 6547bded2dbSJung-uk Kim sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 6557bded2dbSJung-uk Kim sha1nexte @MSG0[2],$E0 6567bded2dbSJung-uk Kim sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 6577bded2dbSJung-uk Kim sha1nexte @MSG1[2],$E1 6587bded2dbSJung-uk Kim pxor @MSG0[2],@MSG0[0] 6597bded2dbSJung-uk Kim sha1msg1 @MSG0[2],@MSG0[1] 6607bded2dbSJung-uk Kim pxor @MSG1[2],@MSG1[0] 6617bded2dbSJung-uk Kim sha1msg1 @MSG1[2],@MSG1[1] 6627bded2dbSJung-uk Kim___ 6637bded2dbSJung-uk 
Kimfor($i=2;$i<20-4;$i++) { 6647bded2dbSJung-uk Kim$code.=<<___; 6657bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 6667bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 6677bded2dbSJung-uk Kim sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 6687bded2dbSJung-uk Kim sha1nexte @MSG0[3],$E0_ 6697bded2dbSJung-uk Kim sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 6707bded2dbSJung-uk Kim sha1nexte @MSG1[3],$E1_ 6717bded2dbSJung-uk Kim sha1msg2 @MSG0[3],@MSG0[0] 6727bded2dbSJung-uk Kim sha1msg2 @MSG1[3],@MSG1[0] 6737bded2dbSJung-uk Kim pxor @MSG0[3],@MSG0[1] 6747bded2dbSJung-uk Kim sha1msg1 @MSG0[3],@MSG0[2] 6757bded2dbSJung-uk Kim pxor @MSG1[3],@MSG1[1] 6767bded2dbSJung-uk Kim sha1msg1 @MSG1[3],@MSG1[2] 6777bded2dbSJung-uk Kim___ 6787bded2dbSJung-uk Kim ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); 6797bded2dbSJung-uk Kim push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); 6807bded2dbSJung-uk Kim} 6817bded2dbSJung-uk Kim$code.=<<___; 6827bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 6837bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 6847bded2dbSJung-uk Kim sha1rnds4 \$3,$E0,$ABCD0 # 64-67 6857bded2dbSJung-uk Kim sha1nexte @MSG0[3],$E0_ 6867bded2dbSJung-uk Kim sha1rnds4 \$3,$E1,$ABCD1 # 64-67 6877bded2dbSJung-uk Kim sha1nexte @MSG1[3],$E1_ 6887bded2dbSJung-uk Kim sha1msg2 @MSG0[3],@MSG0[0] 6897bded2dbSJung-uk Kim sha1msg2 @MSG1[3],@MSG1[0] 6907bded2dbSJung-uk Kim pxor @MSG0[3],@MSG0[1] 6917bded2dbSJung-uk Kim pxor @MSG1[3],@MSG1[1] 6927bded2dbSJung-uk Kim 6937bded2dbSJung-uk Kim mov \$1,%ecx 6947bded2dbSJung-uk Kim pxor @MSG0[2],@MSG0[2] # zero 6957bded2dbSJung-uk Kim cmp 4*0(%rbx),%ecx # examine counters 6967bded2dbSJung-uk Kim cmovge %rsp,@ptr[0] # cancel input 6977bded2dbSJung-uk Kim 6987bded2dbSJung-uk Kim movdqa $ABCD0,$E0 6997bded2dbSJung-uk Kim movdqa $ABCD1,$E1 7007bded2dbSJung-uk Kim sha1rnds4 \$3,$E0_,$ABCD0 # 68-71 7017bded2dbSJung-uk Kim sha1nexte @MSG0[0],$E0 7027bded2dbSJung-uk Kim sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 7037bded2dbSJung-uk Kim sha1nexte @MSG1[0],$E1 7047bded2dbSJung-uk Kim sha1msg2 
@MSG0[0],@MSG0[1] 7057bded2dbSJung-uk Kim sha1msg2 @MSG1[0],@MSG1[1] 7067bded2dbSJung-uk Kim 7077bded2dbSJung-uk Kim cmp 4*1(%rbx),%ecx 7087bded2dbSJung-uk Kim cmovge %rsp,@ptr[1] 7097bded2dbSJung-uk Kim movq (%rbx),@MSG0[0] # pull counters 7107bded2dbSJung-uk Kim 7117bded2dbSJung-uk Kim movdqa $ABCD0,$E0_ 7127bded2dbSJung-uk Kim movdqa $ABCD1,$E1_ 7137bded2dbSJung-uk Kim sha1rnds4 \$3,$E0,$ABCD0 # 72-75 7147bded2dbSJung-uk Kim sha1nexte @MSG0[1],$E0_ 7157bded2dbSJung-uk Kim sha1rnds4 \$3,$E1,$ABCD1 # 72-75 7167bded2dbSJung-uk Kim sha1nexte @MSG1[1],$E1_ 7177bded2dbSJung-uk Kim 7187bded2dbSJung-uk Kim pshufd \$0x00,@MSG0[0],@MSG1[2] 7197bded2dbSJung-uk Kim pshufd \$0x55,@MSG0[0],@MSG1[3] 7207bded2dbSJung-uk Kim movdqa @MSG0[0],@MSG0[1] 7217bded2dbSJung-uk Kim pcmpgtd @MSG0[2],@MSG1[2] 7227bded2dbSJung-uk Kim pcmpgtd @MSG0[2],@MSG1[3] 7237bded2dbSJung-uk Kim 7247bded2dbSJung-uk Kim movdqa $ABCD0,$E0 7257bded2dbSJung-uk Kim movdqa $ABCD1,$E1 7267bded2dbSJung-uk Kim sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 7277bded2dbSJung-uk Kim sha1nexte $MSG0[2],$E0 7287bded2dbSJung-uk Kim sha1rnds4 \$3,$E1_,$ABCD1 # 76-79 7297bded2dbSJung-uk Kim sha1nexte $MSG0[2],$E1 7307bded2dbSJung-uk Kim 7317bded2dbSJung-uk Kim pcmpgtd @MSG0[2],@MSG0[1] # counter mask 7327bded2dbSJung-uk Kim pand @MSG1[2],$ABCD0 7337bded2dbSJung-uk Kim pand @MSG1[2],$E0 7347bded2dbSJung-uk Kim pand @MSG1[3],$ABCD1 7357bded2dbSJung-uk Kim pand @MSG1[3],$E1 7367bded2dbSJung-uk Kim paddd @MSG0[1],@MSG0[0] # counters-- 7377bded2dbSJung-uk Kim 7387bded2dbSJung-uk Kim paddd 0x40(%rsp),$ABCD0 7397bded2dbSJung-uk Kim paddd 0x50(%rsp),$E0 7407bded2dbSJung-uk Kim paddd 0x60(%rsp),$ABCD1 7417bded2dbSJung-uk Kim paddd 0x70(%rsp),$E1 7427bded2dbSJung-uk Kim 7437bded2dbSJung-uk Kim movq @MSG0[0],(%rbx) # save counters 7447bded2dbSJung-uk Kim dec $num 7457bded2dbSJung-uk Kim jnz .Loop_shaext 7467bded2dbSJung-uk Kim 7477bded2dbSJung-uk Kim mov `$REG_SZ*17+8`(%rsp),$num 7487bded2dbSJung-uk Kim 7497bded2dbSJung-uk Kim pshufd 
\$0b00011011,$ABCD0,$ABCD0 7507bded2dbSJung-uk Kim pshufd \$0b00011011,$ABCD1,$ABCD1 7517bded2dbSJung-uk Kim 7527bded2dbSJung-uk Kim movdqa $ABCD0,@MSG0[0] 7537bded2dbSJung-uk Kim punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 7547bded2dbSJung-uk Kim punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 7557bded2dbSJung-uk Kim punpckhdq $E1,$E0 # e1.e0.xx.xx 7567bded2dbSJung-uk Kim movq $ABCD0,0x00-0x40($ctx) # a1.a0 7577bded2dbSJung-uk Kim psrldq \$8,$ABCD0 7587bded2dbSJung-uk Kim movq @MSG0[0],0x40-0x40($ctx)# c1.c0 7597bded2dbSJung-uk Kim psrldq \$8,@MSG0[0] 7607bded2dbSJung-uk Kim movq $ABCD0,0x20-0x40($ctx) # b1.b0 7617bded2dbSJung-uk Kim psrldq \$8,$E0 7627bded2dbSJung-uk Kim movq @MSG0[0],0x60-0x40($ctx)# d1.d0 7637bded2dbSJung-uk Kim movq $E0,0x80-0x40($ctx) # e1.e0 7647bded2dbSJung-uk Kim 7657bded2dbSJung-uk Kim lea `$REG_SZ/2`($ctx),$ctx 766*b077aed3SPierre Pronchery lea `$inp_elm_size*2`($inp),$inp 7677bded2dbSJung-uk Kim dec $num 7687bded2dbSJung-uk Kim jnz .Loop_grande_shaext 7697bded2dbSJung-uk Kim 7707bded2dbSJung-uk Kim.Ldone_shaext: 7717bded2dbSJung-uk Kim #mov `$REG_SZ*17`(%rsp),%rax # original %rsp 7727bded2dbSJung-uk Kim___ 7737bded2dbSJung-uk Kim$code.=<<___ if ($win64); 7747bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm6 7757bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm7 7767bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm8 7777bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm9 7787bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm10 7797bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm11 7807bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm12 7817bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm13 7827bded2dbSJung-uk Kim movaps -0x38(%rax),%xmm14 7837bded2dbSJung-uk Kim movaps -0x28(%rax),%xmm15 7847bded2dbSJung-uk Kim___ 7857bded2dbSJung-uk Kim$code.=<<___; 7867bded2dbSJung-uk Kim mov -16(%rax),%rbp 787e71b7053SJung-uk Kim.cfi_restore %rbp 7887bded2dbSJung-uk Kim mov -8(%rax),%rbx 789e71b7053SJung-uk Kim.cfi_restore %rbx 7907bded2dbSJung-uk Kim lea (%rax),%rsp 791e71b7053SJung-uk Kim.cfi_def_cfa_register 
%rsp 7927bded2dbSJung-uk Kim.Lepilogue_shaext: 7937bded2dbSJung-uk Kim ret 794e71b7053SJung-uk Kim.cfi_endproc 7957bded2dbSJung-uk Kim.size sha1_multi_block_shaext,.-sha1_multi_block_shaext 7967bded2dbSJung-uk Kim___ 7977bded2dbSJung-uk Kim }}} 7987bded2dbSJung-uk Kim 7997bded2dbSJung-uk Kim if ($avx) {{{ 8007bded2dbSJung-uk Kimsub BODY_00_19_avx { 8017bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 8027bded2dbSJung-uk Kimmy $j=$i+1; 8037bded2dbSJung-uk Kimmy $k=$i+2; 8047bded2dbSJung-uk Kimmy $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128"; 8057bded2dbSJung-uk Kimmy $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4]; 8067bded2dbSJung-uk Kim 8077bded2dbSJung-uk Kim$code.=<<___ if ($i==0 && $REG_SZ==16); 8087bded2dbSJung-uk Kim vmovd (@ptr[0]),@Xi[0] 8097bded2dbSJung-uk Kim lea `16*4`(@ptr[0]),@ptr[0] 8107bded2dbSJung-uk Kim vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] 8117bded2dbSJung-uk Kim lea `16*4`(@ptr[1]),@ptr[1] 8127bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 8137bded2dbSJung-uk Kim lea `16*4`(@ptr[2]),@ptr[2] 8147bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] 8157bded2dbSJung-uk Kim lea `16*4`(@ptr[3]),@ptr[3] 8167bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 8177bded2dbSJung-uk Kim vpunpckldq @Xi[2],@Xi[0],@Xi[0] 8187bded2dbSJung-uk Kim vmovd `4*$j-16*4`($ptr_n),$t3 8197bded2dbSJung-uk Kim vpshufb $tx,@Xi[0],@Xi[0] 8207bded2dbSJung-uk Kim___ 8217bded2dbSJung-uk Kim$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input 8227bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 8237bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 8247bded2dbSJung-uk Kim___ 8257bded2dbSJung-uk Kim$code.=<<___ if ($i==0 && $REG_SZ==32); 8267bded2dbSJung-uk Kim vmovd (@ptr[0]),@Xi[0] 8277bded2dbSJung-uk Kim lea `16*4`(@ptr[0]),@ptr[0] 8287bded2dbSJung-uk Kim vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] 8297bded2dbSJung-uk Kim lea `16*4`(@ptr[4]),@ptr[4] 8307bded2dbSJung-uk Kim vmovd (@ptr[1]),$t2 8317bded2dbSJung-uk Kim lea 
`16*4`(@ptr[1]),@ptr[1] 8327bded2dbSJung-uk Kim vmovd (@ptr[5]),$t1 8337bded2dbSJung-uk Kim lea `16*4`(@ptr[5]),@ptr[5] 8347bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] 8357bded2dbSJung-uk Kim lea `16*4`(@ptr[2]),@ptr[2] 8367bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] 8377bded2dbSJung-uk Kim lea `16*4`(@ptr[6]),@ptr[6] 8387bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[3]),$t2,$t2 8397bded2dbSJung-uk Kim lea `16*4`(@ptr[3]),@ptr[3] 8407bded2dbSJung-uk Kim vpunpckldq $t2,@Xi[0],@Xi[0] 8417bded2dbSJung-uk Kim vpinsrd \$1,(@ptr[7]),$t1,$t1 8427bded2dbSJung-uk Kim lea `16*4`(@ptr[7]),@ptr[7] 8437bded2dbSJung-uk Kim vpunpckldq $t1,@Xi[2],@Xi[2] 8447bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] 8457bded2dbSJung-uk Kim vinserti128 @Xi[2],@Xi[0],@Xi[0] 8467bded2dbSJung-uk Kim vmovd `4*$j-16*4`($ptr_n),$t3 8477bded2dbSJung-uk Kim vpshufb $tx,@Xi[0],@Xi[0] 8487bded2dbSJung-uk Kim___ 8497bded2dbSJung-uk Kim$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input 8507bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[1]),$t2 8517bded2dbSJung-uk Kim vmovd `4*$j-16*4`(@ptr[5]),$t1 8527bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] 8537bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 8547bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 8557bded2dbSJung-uk Kim vpunpckldq $t2,@Xi[1],@Xi[1] 8567bded2dbSJung-uk Kim vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 8577bded2dbSJung-uk Kim vpunpckldq $t1,$t3,$t3 8587bded2dbSJung-uk Kim___ 8597bded2dbSJung-uk Kim$code.=<<___ if ($i<14); 8607bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 8617bded2dbSJung-uk Kim vpslld \$5,$a,$t2 8627bded2dbSJung-uk Kim vpandn $d,$b,$t1 8637bded2dbSJung-uk Kim vpand $c,$b,$t0 8647bded2dbSJung-uk Kim 8657bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 8667bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 8677bded2dbSJung-uk Kim $vpack $t3,@Xi[1],@Xi[1] 8687bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 8697bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 
8707bded2dbSJung-uk Kim vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] 8717bded2dbSJung-uk Kim 8727bded2dbSJung-uk Kim vpslld \$30,$b,$t1 8737bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 8747bded2dbSJung-uk Kim vmovd `4*$k-16*4`($ptr_n),$t3 8757bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 8767bded2dbSJung-uk Kim 8777bded2dbSJung-uk Kim vpsrld \$2,$b,$b 8787bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 8797bded2dbSJung-uk Kim vpshufb $tx,@Xi[1],@Xi[1] 8807bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 8817bded2dbSJung-uk Kim___ 8827bded2dbSJung-uk Kim$code.=<<___ if ($i==14); 8837bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 8847bded2dbSJung-uk Kim prefetcht0 63(@ptr[0]) 8857bded2dbSJung-uk Kim vpslld \$5,$a,$t2 8867bded2dbSJung-uk Kim vpandn $d,$b,$t1 8877bded2dbSJung-uk Kim vpand $c,$b,$t0 8887bded2dbSJung-uk Kim 8897bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 8907bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 8917bded2dbSJung-uk Kim $vpack $t3,@Xi[1],@Xi[1] 8927bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 8937bded2dbSJung-uk Kim prefetcht0 63(@ptr[1]) 8947bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 8957bded2dbSJung-uk Kim 8967bded2dbSJung-uk Kim vpslld \$30,$b,$t1 8977bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 8987bded2dbSJung-uk Kim prefetcht0 63(@ptr[2]) 8997bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 9007bded2dbSJung-uk Kim 9017bded2dbSJung-uk Kim vpsrld \$2,$b,$b 9027bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 9037bded2dbSJung-uk Kim prefetcht0 63(@ptr[3]) 9047bded2dbSJung-uk Kim vpshufb $tx,@Xi[1],@Xi[1] 9057bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 9067bded2dbSJung-uk Kim___ 9077bded2dbSJung-uk Kim$code.=<<___ if ($i>=13 && $i<15); 9087bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" 9097bded2dbSJung-uk Kim___ 9107bded2dbSJung-uk Kim$code.=<<___ if ($i>=15); # apply Xupdate 9117bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 9127bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 
9137bded2dbSJung-uk Kim 9147bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_00_19 9157bded2dbSJung-uk Kim vpslld \$5,$a,$t2 9167bded2dbSJung-uk Kim vpandn $d,$b,$t1 9177bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` 9187bded2dbSJung-uk Kim vpand $c,$b,$t0 9197bded2dbSJung-uk Kim 9207bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 9217bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 9227bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 9237bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 9247bded2dbSJung-uk Kim vpxor $t1,$t0,$t0 # Ch(b,c,d) 9257bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 9267bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` 9277bded2dbSJung-uk Kim 9287bded2dbSJung-uk Kim vpslld \$30,$b,$t1 9297bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 9307bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Ch(b,c,d) 9317bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` 9327bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 9337bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 9347bded2dbSJung-uk Kim 9357bded2dbSJung-uk Kim vpsrld \$2,$b,$b 9367bded2dbSJung-uk Kim `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` 9377bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 9387bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] 9397bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 9407bded2dbSJung-uk Kim___ 9417bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 9427bded2dbSJung-uk Kim} 9437bded2dbSJung-uk Kim 9447bded2dbSJung-uk Kimsub BODY_20_39_avx { 9457bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 9467bded2dbSJung-uk Kimmy $j=$i+1; 9477bded2dbSJung-uk Kim 9487bded2dbSJung-uk Kim$code.=<<___ if ($i<79); 9497bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 9507bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 9517bded2dbSJung-uk Kim 9527bded2dbSJung-uk Kim vpslld \$5,$a,$t2 9537bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_20_39 9547bded2dbSJung-uk Kim vpxor $b,$d,$t0 9557bded2dbSJung-uk Kim___ 
9567bded2dbSJung-uk Kim$code.=<<___ if ($i<72); 9577bded2dbSJung-uk Kim vmovdqa @Xi[0],`&Xi_off($i)` 9587bded2dbSJung-uk Kim___ 9597bded2dbSJung-uk Kim$code.=<<___ if ($i<79); 9607bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 9617bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 9627bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 9637bded2dbSJung-uk Kim vpxor $c,$t0,$t0 # Parity(b,c,d) 9647bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 9657bded2dbSJung-uk Kim 9667bded2dbSJung-uk Kim vpslld \$30,$b,$t1 9677bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 9687bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Parity(b,c,d) 9697bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 9707bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 9717bded2dbSJung-uk Kim 9727bded2dbSJung-uk Kim vpsrld \$2,$b,$b 9737bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 9747bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) 9757bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 9767bded2dbSJung-uk Kim___ 9777bded2dbSJung-uk Kim$code.=<<___ if ($i==79); 9787bded2dbSJung-uk Kim vpslld \$5,$a,$t2 9797bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_20_39 9807bded2dbSJung-uk Kim vpxor $b,$d,$t0 9817bded2dbSJung-uk Kim 9827bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 9837bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 9847bded2dbSJung-uk Kim vpxor $c,$t0,$t0 # Parity(b,c,d) 9857bded2dbSJung-uk Kim 9867bded2dbSJung-uk Kim vpslld \$30,$b,$t1 9877bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 9887bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Parity(b,c,d) 9897bded2dbSJung-uk Kim 9907bded2dbSJung-uk Kim vpsrld \$2,$b,$b 9917bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 9927bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 9937bded2dbSJung-uk Kim___ 9947bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 9957bded2dbSJung-uk Kim} 9967bded2dbSJung-uk Kim 9977bded2dbSJung-uk Kimsub BODY_40_59_avx { 9987bded2dbSJung-uk Kimmy ($i,$a,$b,$c,$d,$e)=@_; 9997bded2dbSJung-uk Kimmy $j=$i+1; 10007bded2dbSJung-uk Kim 10017bded2dbSJung-uk 
Kim$code.=<<___; 10027bded2dbSJung-uk Kim vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" 10037bded2dbSJung-uk Kim vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" 10047bded2dbSJung-uk Kim 10057bded2dbSJung-uk Kim vpaddd $K,$e,$e # e+=K_40_59 10067bded2dbSJung-uk Kim vpslld \$5,$a,$t2 10077bded2dbSJung-uk Kim vpand $c,$d,$t1 10087bded2dbSJung-uk Kim vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] 10097bded2dbSJung-uk Kim 10107bded2dbSJung-uk Kim vpaddd $t1,$e,$e 10117bded2dbSJung-uk Kim vpsrld \$27,$a,$t3 10127bded2dbSJung-uk Kim vpxor $c,$d,$t0 10137bded2dbSJung-uk Kim vpxor @Xi[3],@Xi[1],@Xi[1] 10147bded2dbSJung-uk Kim 10157bded2dbSJung-uk Kim vmovdqu @Xi[0],`&Xi_off($i)` 10167bded2dbSJung-uk Kim vpaddd @Xi[0],$e,$e # e+=X[i] 10177bded2dbSJung-uk Kim vpor $t3,$t2,$t2 # rol(a,5) 10187bded2dbSJung-uk Kim vpsrld \$31,@Xi[1],$tx 10197bded2dbSJung-uk Kim vpand $b,$t0,$t0 10207bded2dbSJung-uk Kim vpaddd @Xi[1],@Xi[1],@Xi[1] 10217bded2dbSJung-uk Kim 10227bded2dbSJung-uk Kim vpslld \$30,$b,$t1 10237bded2dbSJung-uk Kim vpaddd $t0,$e,$e # e+=Maj(b,d,c) 10247bded2dbSJung-uk Kim 10257bded2dbSJung-uk Kim vpsrld \$2,$b,$b 10267bded2dbSJung-uk Kim vpaddd $t2,$e,$e # e+=rol(a,5) 10277bded2dbSJung-uk Kim vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) 10287bded2dbSJung-uk Kim vpor $t1,$b,$b # b=rol(b,30) 10297bded2dbSJung-uk Kim___ 10307bded2dbSJung-uk Kimpush(@Xi,shift(@Xi)); 10317bded2dbSJung-uk Kim} 10327bded2dbSJung-uk Kim 10337bded2dbSJung-uk Kim$code.=<<___; 10347bded2dbSJung-uk Kim.type sha1_multi_block_avx,\@function,3 10357bded2dbSJung-uk Kim.align 32 10367bded2dbSJung-uk Kimsha1_multi_block_avx: 1037e71b7053SJung-uk Kim.cfi_startproc 10387bded2dbSJung-uk Kim_avx_shortcut: 10397bded2dbSJung-uk Kim___ 10407bded2dbSJung-uk Kim$code.=<<___ if ($avx>1); 10417bded2dbSJung-uk Kim shr \$32,%rcx 10427bded2dbSJung-uk Kim cmp \$2,$num 10437bded2dbSJung-uk Kim jb .Lavx 10447bded2dbSJung-uk Kim test \$`1<<5`,%ecx 10457bded2dbSJung-uk Kim jnz _avx2_shortcut 10467bded2dbSJung-uk Kim jmp .Lavx 10477bded2dbSJung-uk Kim.align 
32 10487bded2dbSJung-uk Kim.Lavx: 10497bded2dbSJung-uk Kim___ 10507bded2dbSJung-uk Kim$code.=<<___; 10517bded2dbSJung-uk Kim mov %rsp,%rax 1052e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 10537bded2dbSJung-uk Kim push %rbx 1054e71b7053SJung-uk Kim.cfi_push %rbx 10557bded2dbSJung-uk Kim push %rbp 1056e71b7053SJung-uk Kim.cfi_push %rbp 10577bded2dbSJung-uk Kim___ 10587bded2dbSJung-uk Kim$code.=<<___ if ($win64); 10597bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 10607bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 10617bded2dbSJung-uk Kim movaps %xmm7,0x10(%rsp) 10627bded2dbSJung-uk Kim movaps %xmm8,0x20(%rsp) 10637bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 10647bded2dbSJung-uk Kim movaps %xmm10,-0x78(%rax) 10657bded2dbSJung-uk Kim movaps %xmm11,-0x68(%rax) 10667bded2dbSJung-uk Kim movaps %xmm12,-0x58(%rax) 10677bded2dbSJung-uk Kim movaps %xmm13,-0x48(%rax) 10687bded2dbSJung-uk Kim movaps %xmm14,-0x38(%rax) 10697bded2dbSJung-uk Kim movaps %xmm15,-0x28(%rax) 10707bded2dbSJung-uk Kim___ 10717bded2dbSJung-uk Kim$code.=<<___; 10727bded2dbSJung-uk Kim sub \$`$REG_SZ*18`, %rsp 10737bded2dbSJung-uk Kim and \$-256,%rsp 10747bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1075e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 10767bded2dbSJung-uk Kim.Lbody_avx: 10777bded2dbSJung-uk Kim lea K_XX_XX(%rip),$Tbl 10787bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 10797bded2dbSJung-uk Kim 10807bded2dbSJung-uk Kim vzeroupper 10817bded2dbSJung-uk Kim.Loop_grande_avx: 10827bded2dbSJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # original $num 10837bded2dbSJung-uk Kim xor $num,$num 10847bded2dbSJung-uk Kim___ 10857bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) { 1086*b077aed3SPierre Pronchery $ptr_reg=&pointer_register($flavour,@ptr[$i]); 10877bded2dbSJung-uk Kim $code.=<<___; 1088*b077aed3SPierre Pronchery # input pointer 1089*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+0`($inp),$ptr_reg 1090*b077aed3SPierre Pronchery # number of blocks 1091*b077aed3SPierre Pronchery 
mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx 10927bded2dbSJung-uk Kim cmp $num,%ecx 10937bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 10947bded2dbSJung-uk Kim test %ecx,%ecx 10957bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 10967bded2dbSJung-uk Kim cmovle $Tbl,@ptr[$i] # cancel input 10977bded2dbSJung-uk Kim___ 10987bded2dbSJung-uk Kim} 10997bded2dbSJung-uk Kim$code.=<<___; 11007bded2dbSJung-uk Kim test $num,$num 11017bded2dbSJung-uk Kim jz .Ldone_avx 11027bded2dbSJung-uk Kim 11037bded2dbSJung-uk Kim vmovdqu 0x00($ctx),$A # load context 11047bded2dbSJung-uk Kim lea 128(%rsp),%rax 11057bded2dbSJung-uk Kim vmovdqu 0x20($ctx),$B 11067bded2dbSJung-uk Kim vmovdqu 0x40($ctx),$C 11077bded2dbSJung-uk Kim vmovdqu 0x60($ctx),$D 11087bded2dbSJung-uk Kim vmovdqu 0x80($ctx),$E 11097bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 11107bded2dbSJung-uk Kim jmp .Loop_avx 11117bded2dbSJung-uk Kim 11127bded2dbSJung-uk Kim.align 32 11137bded2dbSJung-uk Kim.Loop_avx: 11147bded2dbSJung-uk Kim___ 11157bded2dbSJung-uk Kim$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 11167bded2dbSJung-uk Kimfor($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 11177bded2dbSJung-uk Kim$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 11187bded2dbSJung-uk Kimfor(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 11197bded2dbSJung-uk Kim$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 11207bded2dbSJung-uk Kimfor(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 11217bded2dbSJung-uk Kim$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 11227bded2dbSJung-uk Kimfor(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 11237bded2dbSJung-uk Kim$code.=<<___; 11247bded2dbSJung-uk Kim mov \$1,%ecx 11257bded2dbSJung-uk Kim___ 11267bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) { 11277bded2dbSJung-uk Kim $code.=<<___; 11287bded2dbSJung-uk Kim cmp `4*$i`(%rbx),%ecx # examine counters 11297bded2dbSJung-uk Kim cmovge $Tbl,@ptr[$i] # cancel input 11307bded2dbSJung-uk 
Kim___ 11317bded2dbSJung-uk Kim} 11327bded2dbSJung-uk Kim$code.=<<___; 11337bded2dbSJung-uk Kim vmovdqu (%rbx),$t0 # pull counters 11347bded2dbSJung-uk Kim vpxor $t2,$t2,$t2 11357bded2dbSJung-uk Kim vmovdqa $t0,$t1 11367bded2dbSJung-uk Kim vpcmpgtd $t2,$t1,$t1 # mask value 11377bded2dbSJung-uk Kim vpaddd $t1,$t0,$t0 # counters-- 11387bded2dbSJung-uk Kim 11397bded2dbSJung-uk Kim vpand $t1,$A,$A 11407bded2dbSJung-uk Kim vpand $t1,$B,$B 11417bded2dbSJung-uk Kim vpaddd 0x00($ctx),$A,$A 11427bded2dbSJung-uk Kim vpand $t1,$C,$C 11437bded2dbSJung-uk Kim vpaddd 0x20($ctx),$B,$B 11447bded2dbSJung-uk Kim vpand $t1,$D,$D 11457bded2dbSJung-uk Kim vpaddd 0x40($ctx),$C,$C 11467bded2dbSJung-uk Kim vpand $t1,$E,$E 11477bded2dbSJung-uk Kim vpaddd 0x60($ctx),$D,$D 11487bded2dbSJung-uk Kim vpaddd 0x80($ctx),$E,$E 11497bded2dbSJung-uk Kim vmovdqu $A,0x00($ctx) 11507bded2dbSJung-uk Kim vmovdqu $B,0x20($ctx) 11517bded2dbSJung-uk Kim vmovdqu $C,0x40($ctx) 11527bded2dbSJung-uk Kim vmovdqu $D,0x60($ctx) 11537bded2dbSJung-uk Kim vmovdqu $E,0x80($ctx) 11547bded2dbSJung-uk Kim 11557bded2dbSJung-uk Kim vmovdqu $t0,(%rbx) # save counters 11567bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 11577bded2dbSJung-uk Kim dec $num 11587bded2dbSJung-uk Kim jnz .Loop_avx 11597bded2dbSJung-uk Kim 11607bded2dbSJung-uk Kim mov `$REG_SZ*17+8`(%rsp),$num 11617bded2dbSJung-uk Kim lea $REG_SZ($ctx),$ctx 1162*b077aed3SPierre Pronchery lea `$inp_elm_size*$REG_SZ/4`($inp),$inp 11637bded2dbSJung-uk Kim dec $num 11647bded2dbSJung-uk Kim jnz .Loop_grande_avx 11657bded2dbSJung-uk Kim 11667bded2dbSJung-uk Kim.Ldone_avx: 1167e71b7053SJung-uk Kim mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1168e71b7053SJung-uk Kim.cfi_def_cfa %rax,8 11697bded2dbSJung-uk Kim vzeroupper 11707bded2dbSJung-uk Kim___ 11717bded2dbSJung-uk Kim$code.=<<___ if ($win64); 11727bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm6 11737bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm7 11747bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm8 11757bded2dbSJung-uk 
Kim movaps -0x88(%rax),%xmm9 11767bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm10 11777bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm11 11787bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm12 11797bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm13 11807bded2dbSJung-uk Kim movaps -0x38(%rax),%xmm14 11817bded2dbSJung-uk Kim movaps -0x28(%rax),%xmm15 11827bded2dbSJung-uk Kim___ 11837bded2dbSJung-uk Kim$code.=<<___; 11847bded2dbSJung-uk Kim mov -16(%rax),%rbp 1185e71b7053SJung-uk Kim.cfi_restore %rbp 11867bded2dbSJung-uk Kim mov -8(%rax),%rbx 1187e71b7053SJung-uk Kim.cfi_restore %rbx 11887bded2dbSJung-uk Kim lea (%rax),%rsp 1189e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 11907bded2dbSJung-uk Kim.Lepilogue_avx: 11917bded2dbSJung-uk Kim ret 1192e71b7053SJung-uk Kim.cfi_endproc 11937bded2dbSJung-uk Kim.size sha1_multi_block_avx,.-sha1_multi_block_avx 11947bded2dbSJung-uk Kim___ 11957bded2dbSJung-uk Kim 11967bded2dbSJung-uk Kim if ($avx>1) { 11977bded2dbSJung-uk Kim$code =~ s/\`([^\`]*)\`/eval $1/gem; 11987bded2dbSJung-uk Kim 11997bded2dbSJung-uk Kim$REG_SZ=32; 12007bded2dbSJung-uk Kim 12017bded2dbSJung-uk Kim@ptr=map("%r$_",(12..15,8..11)); 12027bded2dbSJung-uk Kim 12037bded2dbSJung-uk Kim@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); 12047bded2dbSJung-uk Kim($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); 12057bded2dbSJung-uk Kim@Xi=map("%ymm$_",(10..14)); 12067bded2dbSJung-uk Kim$K="%ymm15"; 12077bded2dbSJung-uk Kim 12087bded2dbSJung-uk Kim$code.=<<___; 12097bded2dbSJung-uk Kim.type sha1_multi_block_avx2,\@function,3 12107bded2dbSJung-uk Kim.align 32 12117bded2dbSJung-uk Kimsha1_multi_block_avx2: 1212e71b7053SJung-uk Kim.cfi_startproc 12137bded2dbSJung-uk Kim_avx2_shortcut: 12147bded2dbSJung-uk Kim mov %rsp,%rax 1215e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 12167bded2dbSJung-uk Kim push %rbx 1217e71b7053SJung-uk Kim.cfi_push %rbx 12187bded2dbSJung-uk Kim push %rbp 1219e71b7053SJung-uk Kim.cfi_push %rbp 12207bded2dbSJung-uk Kim push %r12 1221e71b7053SJung-uk Kim.cfi_push %r12 
12227bded2dbSJung-uk Kim push %r13 1223e71b7053SJung-uk Kim.cfi_push %r13 12247bded2dbSJung-uk Kim push %r14 1225e71b7053SJung-uk Kim.cfi_push %r14 12267bded2dbSJung-uk Kim push %r15 1227e71b7053SJung-uk Kim.cfi_push %r15 12287bded2dbSJung-uk Kim___ 12297bded2dbSJung-uk Kim$code.=<<___ if ($win64); 12307bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 12317bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 12327bded2dbSJung-uk Kim movaps %xmm7,0x10(%rsp) 12337bded2dbSJung-uk Kim movaps %xmm8,0x20(%rsp) 12347bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 12357bded2dbSJung-uk Kim movaps %xmm10,0x40(%rsp) 12367bded2dbSJung-uk Kim movaps %xmm11,0x50(%rsp) 12377bded2dbSJung-uk Kim movaps %xmm12,-0x78(%rax) 12387bded2dbSJung-uk Kim movaps %xmm13,-0x68(%rax) 12397bded2dbSJung-uk Kim movaps %xmm14,-0x58(%rax) 12407bded2dbSJung-uk Kim movaps %xmm15,-0x48(%rax) 12417bded2dbSJung-uk Kim___ 12427bded2dbSJung-uk Kim$code.=<<___; 12437bded2dbSJung-uk Kim sub \$`$REG_SZ*18`, %rsp 12447bded2dbSJung-uk Kim and \$-256,%rsp 12457bded2dbSJung-uk Kim mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1246e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 12477bded2dbSJung-uk Kim.Lbody_avx2: 12487bded2dbSJung-uk Kim lea K_XX_XX(%rip),$Tbl 12497bded2dbSJung-uk Kim shr \$1,$num 12507bded2dbSJung-uk Kim 12517bded2dbSJung-uk Kim vzeroupper 12527bded2dbSJung-uk Kim.Loop_grande_avx2: 12537bded2dbSJung-uk Kim mov $num,`$REG_SZ*17+8`(%rsp) # original $num 12547bded2dbSJung-uk Kim xor $num,$num 12557bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 12567bded2dbSJung-uk Kim___ 12577bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 1258*b077aed3SPierre Pronchery $ptr_reg=&pointer_register($flavour,@ptr[$i]); 12597bded2dbSJung-uk Kim $code.=<<___; 1260*b077aed3SPierre Pronchery # input pointer 1261*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+0`($inp),$ptr_reg 1262*b077aed3SPierre Pronchery # number of blocks 1263*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx 12647bded2dbSJung-uk Kim cmp 
$num,%ecx 12657bded2dbSJung-uk Kim cmovg %ecx,$num # find maximum 12667bded2dbSJung-uk Kim test %ecx,%ecx 12677bded2dbSJung-uk Kim mov %ecx,`4*$i`(%rbx) # initialize counters 12687bded2dbSJung-uk Kim cmovle $Tbl,@ptr[$i] # cancel input 12697bded2dbSJung-uk Kim___ 12707bded2dbSJung-uk Kim} 12717bded2dbSJung-uk Kim$code.=<<___; 12727bded2dbSJung-uk Kim vmovdqu 0x00($ctx),$A # load context 12737bded2dbSJung-uk Kim lea 128(%rsp),%rax 12747bded2dbSJung-uk Kim vmovdqu 0x20($ctx),$B 12757bded2dbSJung-uk Kim lea 256+128(%rsp),%rbx 12767bded2dbSJung-uk Kim vmovdqu 0x40($ctx),$C 12777bded2dbSJung-uk Kim vmovdqu 0x60($ctx),$D 12787bded2dbSJung-uk Kim vmovdqu 0x80($ctx),$E 12797bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 12807bded2dbSJung-uk Kim jmp .Loop_avx2 12817bded2dbSJung-uk Kim 12827bded2dbSJung-uk Kim.align 32 12837bded2dbSJung-uk Kim.Loop_avx2: 12847bded2dbSJung-uk Kim___ 12857bded2dbSJung-uk Kim$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 12867bded2dbSJung-uk Kimfor($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } 12877bded2dbSJung-uk Kim$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 12887bded2dbSJung-uk Kimfor(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 12897bded2dbSJung-uk Kim$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 12907bded2dbSJung-uk Kimfor(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } 12917bded2dbSJung-uk Kim$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 12927bded2dbSJung-uk Kimfor(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } 12937bded2dbSJung-uk Kim$code.=<<___; 12947bded2dbSJung-uk Kim mov \$1,%ecx 12957bded2dbSJung-uk Kim lea `$REG_SZ*16`(%rsp),%rbx 12967bded2dbSJung-uk Kim___ 12977bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 12987bded2dbSJung-uk Kim $code.=<<___; 12997bded2dbSJung-uk Kim cmp `4*$i`(%rbx),%ecx # examine counters 13007bded2dbSJung-uk Kim cmovge $Tbl,@ptr[$i] # cancel input 13017bded2dbSJung-uk Kim___ 13027bded2dbSJung-uk Kim} 13037bded2dbSJung-uk Kim$code.=<<___; 
13047bded2dbSJung-uk Kim vmovdqu (%rbx),$t0 # pull counters 13057bded2dbSJung-uk Kim vpxor $t2,$t2,$t2 13067bded2dbSJung-uk Kim vmovdqa $t0,$t1 13077bded2dbSJung-uk Kim vpcmpgtd $t2,$t1,$t1 # mask value 13087bded2dbSJung-uk Kim vpaddd $t1,$t0,$t0 # counters-- 13097bded2dbSJung-uk Kim 13107bded2dbSJung-uk Kim vpand $t1,$A,$A 13117bded2dbSJung-uk Kim vpand $t1,$B,$B 13127bded2dbSJung-uk Kim vpaddd 0x00($ctx),$A,$A 13137bded2dbSJung-uk Kim vpand $t1,$C,$C 13147bded2dbSJung-uk Kim vpaddd 0x20($ctx),$B,$B 13157bded2dbSJung-uk Kim vpand $t1,$D,$D 13167bded2dbSJung-uk Kim vpaddd 0x40($ctx),$C,$C 13177bded2dbSJung-uk Kim vpand $t1,$E,$E 13187bded2dbSJung-uk Kim vpaddd 0x60($ctx),$D,$D 13197bded2dbSJung-uk Kim vpaddd 0x80($ctx),$E,$E 13207bded2dbSJung-uk Kim vmovdqu $A,0x00($ctx) 13217bded2dbSJung-uk Kim vmovdqu $B,0x20($ctx) 13227bded2dbSJung-uk Kim vmovdqu $C,0x40($ctx) 13237bded2dbSJung-uk Kim vmovdqu $D,0x60($ctx) 13247bded2dbSJung-uk Kim vmovdqu $E,0x80($ctx) 13257bded2dbSJung-uk Kim 13267bded2dbSJung-uk Kim vmovdqu $t0,(%rbx) # save counters 13277bded2dbSJung-uk Kim lea 256+128(%rsp),%rbx 13287bded2dbSJung-uk Kim vmovdqu 0x60($Tbl),$tx # pbswap_mask 13297bded2dbSJung-uk Kim dec $num 13307bded2dbSJung-uk Kim jnz .Loop_avx2 13317bded2dbSJung-uk Kim 13327bded2dbSJung-uk Kim #mov `$REG_SZ*17+8`(%rsp),$num 13337bded2dbSJung-uk Kim #lea $REG_SZ($ctx),$ctx 1334*b077aed3SPierre Pronchery #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp 13357bded2dbSJung-uk Kim #dec $num 13367bded2dbSJung-uk Kim #jnz .Loop_grande_avx2 13377bded2dbSJung-uk Kim 13387bded2dbSJung-uk Kim.Ldone_avx2: 1339e71b7053SJung-uk Kim mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1340e71b7053SJung-uk Kim.cfi_def_cfa %rax,8 13417bded2dbSJung-uk Kim vzeroupper 13427bded2dbSJung-uk Kim___ 13437bded2dbSJung-uk Kim$code.=<<___ if ($win64); 13447bded2dbSJung-uk Kim movaps -0xd8(%rax),%xmm6 13457bded2dbSJung-uk Kim movaps -0xc8(%rax),%xmm7 13467bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm8 13477bded2dbSJung-uk Kim movaps 
-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Common tail of the AVX2 epilogue: reload the six callee-saved GPRs from
# just below the frame address held in %rax, then make %rax the stack
# pointer again.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
					}	}}}
# Constant table.  Every round constant is replicated across eight 32-bit
# lanes (two rows of four).  The K_XX_XX label is placed after the K_00_19
# rows, i.e. it points at K_20_39.  The table ends with the pbswap byte
# shuffle masks and the module identification string.
$code.=<<___;

.align	256
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
K_XX_XX:
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: Win64 language-specific handler shared by the routines whose
# .xdata entry below names it.  HandlerData[0] and HandlerData[1] are the
# RVAs of the routine's body and epilogue labels.
#  - If the faulting Rip lies outside [body, epilogue), the frame address
#    is taken from context->Rax and only Rsi/Rdi are recovered.
#  - Otherwise the saved stack pointer is read from `16*17`(context->Rsp),
#    Rbx/Rbp are restored from below it and 20 quadwords are copied into
#    the context starting at Xmm6.
# Either way the patched CONTEXT is copied to disp->ContextRecord,
# RtlVirtualUnwind is called and ExceptionContinueSearch is returned.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# avx2_handler: same scheme as se_handler for the AVX2 routine, which also
# saves %r12-%r15; it finishes by jumping to se_handler's .Lin_prologue.
#
# The saved stack pointer is fetched relative to %rax, which holds
# context->Rsp at that point, exactly as se_handler does.  Indexing off
# $context instead would read CONTEXT offset 32*17 = 544, which lies inside
# the XMM save area (Xmm6 is at offset 512, see the copy further down) and
# would leave the preceding context->Rsp load unused.
# NOTE(review): assumes the AVX2 prologue stores the original %rsp at
# `32*17`(%rsp) -- confirm against sha1_multi_block_avx2.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# .pdata: one (begin, end, unwind-info) RVA triple per generated routine.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___
# .xdata: per-routine unwind info naming the handler and the two
# HandlerData[] RVAs (body label, epilogue label) the handlers compare
# context->Rip against.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

# rex(\@opcode,$dst,$src)
#
# Prepends a REX prefix byte to the opcode array when either register
# number is 8 or above: bit 0x04 is set for $dst, bit 0x01 for $src, and
# the result is OR-ed with 0x40.  Nothing is added when both are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @$opcode,$rex|0x40	if ($rex);
}

# sha1rnds4($operands)
#
# Hand-encodes "sha1rnds4 $imm,%xmmS,%xmmD" as a ".byte" directive
# (0x0f,0x3a,0xcc, ModR/M, immediate).  Operands that do not match that
# form are passed through as a plain "sha1rnds4" mnemonic.
sub sha1rnds4 {
  my ($args)=@_;

    if ($args =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my ($imm,$src,$dst)=($1,$2,$3);
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
	# an immediate with a leading 0 (0x.., 0..) is converted by oct()
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$args;
    }
}

# sha1op38($instr,$operands)
#
# Hand-encodes the two-operand SHA1 instructions listed in %opcodelet
# (0x0f,0x38,<opcode>,ModR/M) as a ".byte" directive.  Unknown mnemonics or
# operands that are not "%xmmS,%xmmD" are passed through unchanged.
sub sha1op38 {
  my ($instr,$args)=@_;
  my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my ($src,$dst)=($1,$2);
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$args;
    }
}

# Final pass over the generated text, one line at a time:
#  1. evaluate `...` arithmetic embedded in the templates;
#  2. replace SHA instructions with their .byte encodings;
#  3. otherwise rewrite %ymmN operands to %xmmN for the listed AVX
#     instructions.  The "or" chain means at most one of the rewrites in
#     steps 2-3 is applied per line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	# matches the literal "%x%ymmN" spelling -- presumably produced by
	# "%x" followed by an interpolated register variable in the templates
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

# Buffered write errors only surface at close, so a failure here is fatal.
close STDOUT or die "error closing STDOUT: $!";