xref: /freebsd/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl (revision 7bded2db17780f5b59bc532689d8a9541f06901e)
1*7bded2dbSJung-uk Kim#!/usr/bin/env perl
2*7bded2dbSJung-uk Kim
3*7bded2dbSJung-uk Kim# ====================================================================
4*7bded2dbSJung-uk Kim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5*7bded2dbSJung-uk Kim# project. The module is, however, dual licensed under OpenSSL and
6*7bded2dbSJung-uk Kim# CRYPTOGAMS licenses depending on where you obtain it. For further
7*7bded2dbSJung-uk Kim# details see http://www.openssl.org/~appro/cryptogams/.
8*7bded2dbSJung-uk Kim# ====================================================================
9*7bded2dbSJung-uk Kim
10*7bded2dbSJung-uk Kim# Multi-buffer SHA1 procedure processes n buffers in parallel by
11*7bded2dbSJung-uk Kim# placing buffer data to designated lane of SIMD register. n is
12*7bded2dbSJung-uk Kim# naturally limited to 4 on pre-AVX2 processors and to 8 on
13*7bded2dbSJung-uk Kim# AVX2-capable processors such as Haswell.
14*7bded2dbSJung-uk Kim#
15*7bded2dbSJung-uk Kim#		this	+aesni(i)	sha1	aesni-sha1	gain(iv)
16*7bded2dbSJung-uk Kim# -------------------------------------------------------------------
17*7bded2dbSJung-uk Kim# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
18*7bded2dbSJung-uk Kim# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
19*7bded2dbSJung-uk Kim# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
20*7bded2dbSJung-uk Kim# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
21*7bded2dbSJung-uk Kim# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
22*7bded2dbSJung-uk Kim# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
23*7bded2dbSJung-uk Kim#
24*7bded2dbSJung-uk Kim# (i)	multi-block CBC encrypt with 128-bit key;
25*7bded2dbSJung-uk Kim# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
26*7bded2dbSJung-uk Kim#	because of lower AES-NI instruction throughput;
27*7bded2dbSJung-uk Kim# (iii)	"this" is for n=8, when we gather twice as much data, result
28*7bded2dbSJung-uk Kim#	for n=4 is 8.00+4.44=12.4;
29*7bded2dbSJung-uk Kim# (iv)	presented improvement coefficients are asymptotic limits and
30*7bded2dbSJung-uk Kim#	in real-life application are somewhat lower, e.g. for 2KB
31*7bded2dbSJung-uk Kim#	fragments they range from 30% to 100% (on Haswell);
32*7bded2dbSJung-uk Kim
# Usage: <script> [flavour] [output].  A first argument that contains a
# dot is taken to be the output file name, in which case no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 code paths are selected for nasm/masm/mingw64 flavours or when the
# output file name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first next to this script, then under
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx ends up as 0, 1 or 2 depending on how recent the assembler is.
# The probes run in order; each later probe is skipped once $avx is set.
# NOTE(review): levels 1/2 presumably mean "AVX" / "AVX2 as well" -- the
# code that distinguishes them is outside this part of the file.
$avx=0;

# GNU assembler, queried through the C compiler driver.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# NASM on Win64.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# Microsoft ml64 on Win64.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang / LLVM integrated assembler.  The major version may have more than
# one digit ("clang version 10.0.0"), so it must not be limited to [3-9];
# with the single-digit pattern such releases were never recognised and
# $avx silently stayed 0.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be started instead of
# silently generating nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
67*7bded2dbSJung-uk Kim
# C prototype of the generated routine:
#
# void sha1_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
$ctx = "%rdi";				# 1st arg
$inp = "%rsi";				# 2nd arg
$num = "%edx";				# 3rd arg
@ptr = map { "%r$_" } 8 .. 11;		# per-lane input pointers
$Tbl = "%rbp";				# constant table pointer

if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty]:
    # the message schedule lives in the low registers.
    @Xi = map { "%xmm$_" } 0 .. 4;
    ($tx, $t0, $t1, $t2, $t3) = map { "%xmm$_" } 5 .. 9;
    @V = ($A, $B, $C, $D, $E) = map { "%xmm$_" } 10 .. 14;
} else {
    # Straightforward allocation, kept for reference.
    @V = ($A, $B, $C, $D, $E) = map { "%xmm$_" } 0 .. 4;
    ($t0, $t1, $t2, $t3, $tx) = map { "%xmm$_" } 5 .. 9;
    @Xi = map { "%xmm$_" } 10 .. 14;
}
$K = "%xmm15";				# current round constant

$REG_SZ = 16;				# bytes per SIMD register
97*7bded2dbSJung-uk Kim
# Return the (still unevaluated) address expression of slot $index of the
# 16-entry X[] ring kept on the stack.  The index wraps modulo 16 and is
# scaled by $REG_SZ; byte offsets below 256 are addressed off %rax, the
# rest off %rbx.
sub Xi_off {
    my ($index) = @_;
    my $bytes = ($index % 16) * $REG_SZ;

    return "$bytes-128(%rax)" if $bytes < 256;
    return "$bytes-256-128(%rbx)";
}
104*7bded2dbSJung-uk Kim
# Emit the code for round $i (0..19) into $code.  $a..$e are the registers
# currently holding the five working variables; the round uses Ch(b,c,d)
# and whatever constant is in $K.  On return @Xi has been rotated by one.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;

# Input words are gathered 2+3/4 rounds ahead of their use: of the four
# loads one would expect per round, one slips into the following round.
# The word indices fetched from the four input streams are therefore:
#
# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
# $i==1:	2,3,3,3,
# $i==2:	3,4,4,4,
# ...
# $i==13:	14,15,15,15,
# $i==14:	15
#
# From $i==15 on, Xupdate is applied one round in advance instead.

if ($i==0) {				# prime the pipeline: words 0 and 1
$code.=<<___;
	movd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	 movd		`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[2]),$t2
	pshufb		$tx,@Xi[0]
___
}

if ($i<14) {				# just load input
$code.=<<___;
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	 movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2				# rol(a,5)
	 movd		`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 movd		`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b				# b=rol(b,30)
___
} elsif ($i==14) {			# last load; prefetch instead of look-ahead
$code.=<<___;
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	 prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1
	 prefetcht0	63(@ptr[1])

	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)
	 prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 prefetcht0	63(@ptr[3])
	por	$t1,$b				# b=rol(b,30)
___
}

if (13<=$i && $i<15) {			# rounds 13 and 14 only
$code.=<<___;
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
}

if ($i>=15) {				# apply Xupdate
$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	 pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	 movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	 psrld	\$31,$tx
	 paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
	por	$t1,$b				# b=rol(b,30)
___
}

# Rotate the schedule registers so that @Xi[0] is the next round's word.
push @Xi, shift @Xi;
}
243*7bded2dbSJung-uk Kim
# Emit the code for one Parity(b,c,d) round into $code.  The caller uses it
# for rounds 20..39 and again for rounds 60..79 with a different constant
# in $K.  $a..$e are the registers currently holding the working variables.
# On return @Xi has been rotated by one.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

if ($i<79) {
    # Regular round: Xupdate for the next word is interleaved with the
    # round computation.
$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
    if ($i<72) {
	# Rounds 72 and later do not spill X[i] back to the stack.
$code.=<<___;
	movdqa	@Xi[0],`&Xi_off($i)`
___
    }
$code.=<<___;
	paddd	@Xi[0],$e			# e+=X[i]
	 pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0				# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	 movdqa	@Xi[1],$tx
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	paddd	$t0,$e				# e+=Parity(b,c,d)
	 paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
} elsif ($i==79) {
    # Final round: nothing left to schedule, only the round itself.
$code.=<<___;
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0				# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2				# rol(a,5)
	paddd	$t0,$e				# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$t1,$b				# b=rol(b,30)
___
}

# Rotate the schedule registers so that @Xi[0] is the next round's word.
push @Xi, shift @Xi;
}
306*7bded2dbSJung-uk Kim
# Emit the code for round $i (40..59) into $code.  $a..$e are the registers
# currently holding the five working variables.  The majority function is
# accumulated into e in two steps: first (c&d), then ((c^d)&b).  Xupdate
# for the next schedule word is interleaved with the round, and X[i] is
# written back to its stack slot.  On return @Xi has been rotated by one.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

# The trailing comment on the "por $tx,@Xi[1]" line used to name @X[1];
# no array @X exists, so it interpolated to nothing and the emitted
# comment read "rol(,1)".  It now names the register, as in BODY_20_39.
$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e				# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	 movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	 paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e				# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___

# Rotate the schedule registers so that @Xi[0] is the next round's word.
push(@Xi,shift(@Xi));
}
348*7bded2dbSJung-uk Kim
# ---------------------------------------------------------------------
# sha1_multi_block: entry point and the 4-lane SSE code path.
# ---------------------------------------------------------------------

# Entry: dispatch to the SHA-extension path when the capability bit is set.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___

# The jump to _avx_shortcut is only emitted when $avx is non-zero.
if ($avx) {
$code.=<<___;
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
}

# Prologue: remember the incoming stack pointer in %rax, save %rbx/%rbp.
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___

# Win64 only: save %xmm6-%xmm15 below the two pushed registers.
if ($win64) {
$code.=<<___;
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
}

# Allocate and align the frame, then start the outer loop.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___

# Per lane: fetch pointer and block count, track the maximum count in $num
# and redirect lanes with no blocks to the constant table.
for ($i = 0; $i < 4; $i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}

# Load the hash state and the constants needed by the first rounds.
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___

# The 80 rounds, fully unrolled.  The file-level counter $i keeps running
# across the four phases, and @V is rotated after every round so that each
# round generator receives the working registers in (a,b,c,d,e) order.
{
    my @phases = (
	# [ end round, generator, table offset of K loaded before the phase ]
	[ 20, \&BODY_00_19, undef  ],	# K_00_19 is already in $K
	[ 40, \&BODY_20_39, "0x00" ],	# K_20_39
	[ 60, \&BODY_40_59, "0x20" ],	# K_40_59
	[ 80, \&BODY_20_39, "0x40" ],	# K_60_79
    );

    $i = 0;
    foreach my $phase (@phases) {
	my ($end, $emit, $k_off) = @$phase;

	$code .= "\tmovdqa\t" . $k_off . "($Tbl),$K\n" if defined $k_off;
	for (; $i < $end; $i++) {
	    $emit->($i, @V);
	    unshift(@V, pop(@V));
	}
    }
}

# After each block: update the per-lane counters, mask the round result
# with the counter mask, add the saved context and store it back; then
# loop over blocks (.Loop) and over groups of inputs (.Loop_grande).
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examinte counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
___

# Win64 only: restore %xmm6-%xmm15 from where the prologue saved them.
if ($win64) {
$code.=<<___;
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
}

# Epilogue: restore %rbp/%rbx and the original stack pointer.
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha1_multi_block,.-sha1_multi_block
___
503*7bded2dbSJung-uk Kim						{{{
504*7bded2dbSJung-uk Kimmy ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
505*7bded2dbSJung-uk Kimmy @MSG0=map("%xmm$_",(4..7));
506*7bded2dbSJung-uk Kimmy @MSG1=map("%xmm$_",(11..14));
507*7bded2dbSJung-uk Kim
508*7bded2dbSJung-uk Kim$code.=<<___;
509*7bded2dbSJung-uk Kim.type	sha1_multi_block_shaext,\@function,3
510*7bded2dbSJung-uk Kim.align	32
511*7bded2dbSJung-uk Kimsha1_multi_block_shaext:
512*7bded2dbSJung-uk Kim_shaext_shortcut:
513*7bded2dbSJung-uk Kim	mov	%rsp,%rax
514*7bded2dbSJung-uk Kim	push	%rbx
515*7bded2dbSJung-uk Kim	push	%rbp
516*7bded2dbSJung-uk Kim___
517*7bded2dbSJung-uk Kim$code.=<<___ if ($win64);
518*7bded2dbSJung-uk Kim	lea	-0xa8(%rsp),%rsp
519*7bded2dbSJung-uk Kim	movaps	%xmm6,(%rsp)
520*7bded2dbSJung-uk Kim	movaps	%xmm7,0x10(%rsp)
521*7bded2dbSJung-uk Kim	movaps	%xmm8,0x20(%rsp)
522*7bded2dbSJung-uk Kim	movaps	%xmm9,0x30(%rsp)
523*7bded2dbSJung-uk Kim	movaps	%xmm10,-0x78(%rax)
524*7bded2dbSJung-uk Kim	movaps	%xmm11,-0x68(%rax)
525*7bded2dbSJung-uk Kim	movaps	%xmm12,-0x58(%rax)
526*7bded2dbSJung-uk Kim	movaps	%xmm13,-0x48(%rax)
527*7bded2dbSJung-uk Kim	movaps	%xmm14,-0x38(%rax)
528*7bded2dbSJung-uk Kim	movaps	%xmm15,-0x28(%rax)
529*7bded2dbSJung-uk Kim___
530*7bded2dbSJung-uk Kim$code.=<<___;
531*7bded2dbSJung-uk Kim	sub	\$`$REG_SZ*18`,%rsp
532*7bded2dbSJung-uk Kim	shl	\$1,$num			# we process pair at a time
533*7bded2dbSJung-uk Kim	and	\$-256,%rsp
534*7bded2dbSJung-uk Kim	lea	0x40($ctx),$ctx			# size optimization
535*7bded2dbSJung-uk Kim	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
536*7bded2dbSJung-uk Kim.Lbody_shaext:
537*7bded2dbSJung-uk Kim	lea	`$REG_SZ*16`(%rsp),%rbx
538*7bded2dbSJung-uk Kim	movdqa	K_XX_XX+0x80(%rip),$BSWAP	# byte-n-word swap
539*7bded2dbSJung-uk Kim
540*7bded2dbSJung-uk Kim.Loop_grande_shaext:
541*7bded2dbSJung-uk Kim	mov	$num,`$REG_SZ*17+8`(%rsp)	# orignal $num
542*7bded2dbSJung-uk Kim	xor	$num,$num
543*7bded2dbSJung-uk Kim___
544*7bded2dbSJung-uk Kimfor($i=0;$i<2;$i++) {
545*7bded2dbSJung-uk Kim    $code.=<<___;
546*7bded2dbSJung-uk Kim	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
547*7bded2dbSJung-uk Kim	mov	`16*$i+8`($inp),%ecx		# number of blocks
548*7bded2dbSJung-uk Kim	cmp	$num,%ecx
549*7bded2dbSJung-uk Kim	cmovg	%ecx,$num			# find maximum
550*7bded2dbSJung-uk Kim	test	%ecx,%ecx
551*7bded2dbSJung-uk Kim	mov	%ecx,`4*$i`(%rbx)		# initialize counters
552*7bded2dbSJung-uk Kim	cmovle	%rsp,@ptr[$i]			# cancel input
553*7bded2dbSJung-uk Kim___
554*7bded2dbSJung-uk Kim}
555*7bded2dbSJung-uk Kim$code.=<<___;
556*7bded2dbSJung-uk Kim	test	$num,$num
557*7bded2dbSJung-uk Kim	jz	.Ldone_shaext
558*7bded2dbSJung-uk Kim
559*7bded2dbSJung-uk Kim	movq		0x00-0x40($ctx),$ABCD0	# a1.a0
560*7bded2dbSJung-uk Kim	movq		0x20-0x40($ctx),@MSG0[0]# b1.b0
561*7bded2dbSJung-uk Kim	movq		0x40-0x40($ctx),@MSG0[1]# c1.c0
562*7bded2dbSJung-uk Kim	movq		0x60-0x40($ctx),@MSG0[2]# d1.d0
563*7bded2dbSJung-uk Kim	movq		0x80-0x40($ctx),@MSG0[3]# e1.e0
564*7bded2dbSJung-uk Kim
565*7bded2dbSJung-uk Kim	punpckldq	@MSG0[0],$ABCD0		# b1.a1.b0.a0
566*7bded2dbSJung-uk Kim	punpckldq	@MSG0[2],@MSG0[1]	# d1.c1.d0.c0
567*7bded2dbSJung-uk Kim
568*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$ABCD1
569*7bded2dbSJung-uk Kim	punpcklqdq	@MSG0[1],$ABCD0		# d0.c0.b0.a0
570*7bded2dbSJung-uk Kim	punpckhqdq	@MSG0[1],$ABCD1		# d1.c1.b1.a1
571*7bded2dbSJung-uk Kim
572*7bded2dbSJung-uk Kim	pshufd		\$0b00111111,@MSG0[3],$E0
573*7bded2dbSJung-uk Kim	pshufd		\$0b01111111,@MSG0[3],$E1
574*7bded2dbSJung-uk Kim	pshufd		\$0b00011011,$ABCD0,$ABCD0
575*7bded2dbSJung-uk Kim	pshufd		\$0b00011011,$ABCD1,$ABCD1
576*7bded2dbSJung-uk Kim	jmp		.Loop_shaext
577*7bded2dbSJung-uk Kim
578*7bded2dbSJung-uk Kim.align	32
579*7bded2dbSJung-uk Kim.Loop_shaext:
580*7bded2dbSJung-uk Kim	movdqu		0x00(@ptr[0]),@MSG0[0]
581*7bded2dbSJung-uk Kim	 movdqu		0x00(@ptr[1]),@MSG1[0]
582*7bded2dbSJung-uk Kim	movdqu		0x10(@ptr[0]),@MSG0[1]
583*7bded2dbSJung-uk Kim	 movdqu		0x10(@ptr[1]),@MSG1[1]
584*7bded2dbSJung-uk Kim	movdqu		0x20(@ptr[0]),@MSG0[2]
585*7bded2dbSJung-uk Kim	pshufb		$BSWAP,@MSG0[0]
586*7bded2dbSJung-uk Kim	 movdqu		0x20(@ptr[1]),@MSG1[2]
587*7bded2dbSJung-uk Kim	 pshufb		$BSWAP,@MSG1[0]
588*7bded2dbSJung-uk Kim	movdqu		0x30(@ptr[0]),@MSG0[3]
589*7bded2dbSJung-uk Kim	lea		0x40(@ptr[0]),@ptr[0]
590*7bded2dbSJung-uk Kim	pshufb		$BSWAP,@MSG0[1]
591*7bded2dbSJung-uk Kim	 movdqu		0x30(@ptr[1]),@MSG1[3]
592*7bded2dbSJung-uk Kim	 lea		0x40(@ptr[1]),@ptr[1]
593*7bded2dbSJung-uk Kim	 pshufb		$BSWAP,@MSG1[1]
594*7bded2dbSJung-uk Kim
595*7bded2dbSJung-uk Kim	movdqa		$E0,0x50(%rsp)		# offload
596*7bded2dbSJung-uk Kim	paddd		@MSG0[0],$E0
597*7bded2dbSJung-uk Kim	 movdqa		$E1,0x70(%rsp)
598*7bded2dbSJung-uk Kim	 paddd		@MSG1[0],$E1
599*7bded2dbSJung-uk Kim	movdqa		$ABCD0,0x40(%rsp)	# offload
600*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$E0_
601*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,0x60(%rsp)
602*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,$E1_
603*7bded2dbSJung-uk Kim	sha1rnds4	\$0,$E0,$ABCD0		# 0-3
604*7bded2dbSJung-uk Kim	sha1nexte	@MSG0[1],$E0_
605*7bded2dbSJung-uk Kim	 sha1rnds4	\$0,$E1,$ABCD1		# 0-3
606*7bded2dbSJung-uk Kim	 sha1nexte	@MSG1[1],$E1_
607*7bded2dbSJung-uk Kim	pshufb		$BSWAP,@MSG0[2]
608*7bded2dbSJung-uk Kim	prefetcht0	127(@ptr[0])
609*7bded2dbSJung-uk Kim	sha1msg1	@MSG0[1],@MSG0[0]
610*7bded2dbSJung-uk Kim	 pshufb		$BSWAP,@MSG1[2]
611*7bded2dbSJung-uk Kim	 prefetcht0	127(@ptr[1])
612*7bded2dbSJung-uk Kim	 sha1msg1	@MSG1[1],@MSG1[0]
613*7bded2dbSJung-uk Kim
614*7bded2dbSJung-uk Kim	pshufb		$BSWAP,@MSG0[3]
615*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$E0
616*7bded2dbSJung-uk Kim	 pshufb		$BSWAP,@MSG1[3]
617*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,$E1
618*7bded2dbSJung-uk Kim	sha1rnds4	\$0,$E0_,$ABCD0		# 4-7
619*7bded2dbSJung-uk Kim	sha1nexte	@MSG0[2],$E0
620*7bded2dbSJung-uk Kim	 sha1rnds4	\$0,$E1_,$ABCD1		# 4-7
621*7bded2dbSJung-uk Kim	 sha1nexte	@MSG1[2],$E1
622*7bded2dbSJung-uk Kim	pxor		@MSG0[2],@MSG0[0]
623*7bded2dbSJung-uk Kim	sha1msg1	@MSG0[2],@MSG0[1]
624*7bded2dbSJung-uk Kim	 pxor		@MSG1[2],@MSG1[0]
625*7bded2dbSJung-uk Kim	 sha1msg1	@MSG1[2],@MSG1[1]
626*7bded2dbSJung-uk Kim___
627*7bded2dbSJung-uk Kimfor($i=2;$i<20-4;$i++) {
628*7bded2dbSJung-uk Kim$code.=<<___;
629*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$E0_
630*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,$E1_
631*7bded2dbSJung-uk Kim	sha1rnds4	\$`int($i/5)`,$E0,$ABCD0	# 8-11
632*7bded2dbSJung-uk Kim	sha1nexte	@MSG0[3],$E0_
633*7bded2dbSJung-uk Kim	 sha1rnds4	\$`int($i/5)`,$E1,$ABCD1	# 8-11
634*7bded2dbSJung-uk Kim	 sha1nexte	@MSG1[3],$E1_
635*7bded2dbSJung-uk Kim	sha1msg2	@MSG0[3],@MSG0[0]
636*7bded2dbSJung-uk Kim	 sha1msg2	@MSG1[3],@MSG1[0]
637*7bded2dbSJung-uk Kim	pxor		@MSG0[3],@MSG0[1]
638*7bded2dbSJung-uk Kim	sha1msg1	@MSG0[3],@MSG0[2]
639*7bded2dbSJung-uk Kim	 pxor		@MSG1[3],@MSG1[1]
640*7bded2dbSJung-uk Kim	 sha1msg1	@MSG1[3],@MSG1[2]
641*7bded2dbSJung-uk Kim___
642*7bded2dbSJung-uk Kim	($E0,$E0_)=($E0_,$E0);		($E1,$E1_)=($E1_,$E1);
643*7bded2dbSJung-uk Kim	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
644*7bded2dbSJung-uk Kim}
645*7bded2dbSJung-uk Kim$code.=<<___;
646*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$E0_
647*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,$E1_
648*7bded2dbSJung-uk Kim	sha1rnds4	\$3,$E0,$ABCD0		# 64-67
649*7bded2dbSJung-uk Kim	sha1nexte	@MSG0[3],$E0_
650*7bded2dbSJung-uk Kim	 sha1rnds4	\$3,$E1,$ABCD1		# 64-67
651*7bded2dbSJung-uk Kim	 sha1nexte	@MSG1[3],$E1_
652*7bded2dbSJung-uk Kim	sha1msg2	@MSG0[3],@MSG0[0]
653*7bded2dbSJung-uk Kim	 sha1msg2	@MSG1[3],@MSG1[0]
654*7bded2dbSJung-uk Kim	pxor		@MSG0[3],@MSG0[1]
655*7bded2dbSJung-uk Kim	 pxor		@MSG1[3],@MSG1[1]
656*7bded2dbSJung-uk Kim
657*7bded2dbSJung-uk Kim	mov		\$1,%ecx
658*7bded2dbSJung-uk Kim	pxor		@MSG0[2],@MSG0[2]	# zero
659*7bded2dbSJung-uk Kim	cmp		4*0(%rbx),%ecx		# examine counters
660*7bded2dbSJung-uk Kim	cmovge		%rsp,@ptr[0]		# cancel input
661*7bded2dbSJung-uk Kim
662*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$E0
663*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,$E1
664*7bded2dbSJung-uk Kim	sha1rnds4	\$3,$E0_,$ABCD0		# 68-71
665*7bded2dbSJung-uk Kim	sha1nexte	@MSG0[0],$E0
666*7bded2dbSJung-uk Kim	 sha1rnds4	\$3,$E1_,$ABCD1		# 68-71
667*7bded2dbSJung-uk Kim	 sha1nexte	@MSG1[0],$E1
668*7bded2dbSJung-uk Kim	sha1msg2	@MSG0[0],@MSG0[1]
669*7bded2dbSJung-uk Kim	 sha1msg2	@MSG1[0],@MSG1[1]
670*7bded2dbSJung-uk Kim
671*7bded2dbSJung-uk Kim	cmp		4*1(%rbx),%ecx
672*7bded2dbSJung-uk Kim	cmovge		%rsp,@ptr[1]
673*7bded2dbSJung-uk Kim	movq		(%rbx),@MSG0[0]		# pull counters
674*7bded2dbSJung-uk Kim
675*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$E0_
676*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,$E1_
677*7bded2dbSJung-uk Kim	sha1rnds4	\$3,$E0,$ABCD0		# 72-75
678*7bded2dbSJung-uk Kim	sha1nexte	@MSG0[1],$E0_
679*7bded2dbSJung-uk Kim	 sha1rnds4	\$3,$E1,$ABCD1		# 72-75
680*7bded2dbSJung-uk Kim	 sha1nexte	@MSG1[1],$E1_
681*7bded2dbSJung-uk Kim
682*7bded2dbSJung-uk Kim	pshufd		\$0x00,@MSG0[0],@MSG1[2]
683*7bded2dbSJung-uk Kim	pshufd		\$0x55,@MSG0[0],@MSG1[3]
684*7bded2dbSJung-uk Kim	movdqa		@MSG0[0],@MSG0[1]
685*7bded2dbSJung-uk Kim	pcmpgtd		@MSG0[2],@MSG1[2]
686*7bded2dbSJung-uk Kim	pcmpgtd		@MSG0[2],@MSG1[3]
687*7bded2dbSJung-uk Kim
688*7bded2dbSJung-uk Kim	movdqa		$ABCD0,$E0
689*7bded2dbSJung-uk Kim	 movdqa		$ABCD1,$E1
690*7bded2dbSJung-uk Kim	sha1rnds4	\$3,$E0_,$ABCD0		# 76-79
691*7bded2dbSJung-uk Kim	sha1nexte	$MSG0[2],$E0
692*7bded2dbSJung-uk Kim	 sha1rnds4	\$3,$E1_,$ABCD1		# 76-79
693*7bded2dbSJung-uk Kim	 sha1nexte	$MSG0[2],$E1
694*7bded2dbSJung-uk Kim
695*7bded2dbSJung-uk Kim	pcmpgtd		@MSG0[2],@MSG0[1]	# counter mask
696*7bded2dbSJung-uk Kim	pand		@MSG1[2],$ABCD0
697*7bded2dbSJung-uk Kim	pand		@MSG1[2],$E0
698*7bded2dbSJung-uk Kim	 pand		@MSG1[3],$ABCD1
699*7bded2dbSJung-uk Kim	 pand		@MSG1[3],$E1
700*7bded2dbSJung-uk Kim	paddd		@MSG0[1],@MSG0[0]	# counters--
701*7bded2dbSJung-uk Kim
702*7bded2dbSJung-uk Kim	paddd		0x40(%rsp),$ABCD0
703*7bded2dbSJung-uk Kim	paddd		0x50(%rsp),$E0
704*7bded2dbSJung-uk Kim	 paddd		0x60(%rsp),$ABCD1
705*7bded2dbSJung-uk Kim	 paddd		0x70(%rsp),$E1
706*7bded2dbSJung-uk Kim
707*7bded2dbSJung-uk Kim	movq		@MSG0[0],(%rbx)		# save counters
708*7bded2dbSJung-uk Kim	dec		$num
709*7bded2dbSJung-uk Kim	jnz		.Loop_shaext
710*7bded2dbSJung-uk Kim
711*7bded2dbSJung-uk Kim	mov		`$REG_SZ*17+8`(%rsp),$num
712*7bded2dbSJung-uk Kim
713*7bded2dbSJung-uk Kim	pshufd		\$0b00011011,$ABCD0,$ABCD0
714*7bded2dbSJung-uk Kim	pshufd		\$0b00011011,$ABCD1,$ABCD1
715*7bded2dbSJung-uk Kim
716*7bded2dbSJung-uk Kim	movdqa		$ABCD0,@MSG0[0]
717*7bded2dbSJung-uk Kim	punpckldq	$ABCD1,$ABCD0		# b1.b0.a1.a0
718*7bded2dbSJung-uk Kim	punpckhdq	$ABCD1,@MSG0[0]		# d1.d0.c1.c0
719*7bded2dbSJung-uk Kim	punpckhdq	$E1,$E0			# e1.e0.xx.xx
720*7bded2dbSJung-uk Kim	movq		$ABCD0,0x00-0x40($ctx)	# a1.a0
721*7bded2dbSJung-uk Kim	psrldq		\$8,$ABCD0
722*7bded2dbSJung-uk Kim	movq		@MSG0[0],0x40-0x40($ctx)# c1.c0
723*7bded2dbSJung-uk Kim	psrldq		\$8,@MSG0[0]
724*7bded2dbSJung-uk Kim	movq		$ABCD0,0x20-0x40($ctx)	# b1.b0
725*7bded2dbSJung-uk Kim	psrldq		\$8,$E0
726*7bded2dbSJung-uk Kim	movq		@MSG0[0],0x60-0x40($ctx)# d1.d0
727*7bded2dbSJung-uk Kim	movq		$E0,0x80-0x40($ctx)	# e1.e0
728*7bded2dbSJung-uk Kim
729*7bded2dbSJung-uk Kim	lea	`$REG_SZ/2`($ctx),$ctx
730*7bded2dbSJung-uk Kim	lea	`16*2`($inp),$inp
731*7bded2dbSJung-uk Kim	dec	$num
732*7bded2dbSJung-uk Kim	jnz	.Loop_grande_shaext
733*7bded2dbSJung-uk Kim
734*7bded2dbSJung-uk Kim.Ldone_shaext:
735*7bded2dbSJung-uk Kim	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
736*7bded2dbSJung-uk Kim___
737*7bded2dbSJung-uk Kim$code.=<<___ if ($win64);
738*7bded2dbSJung-uk Kim	movaps	-0xb8(%rax),%xmm6
739*7bded2dbSJung-uk Kim	movaps	-0xa8(%rax),%xmm7
740*7bded2dbSJung-uk Kim	movaps	-0x98(%rax),%xmm8
741*7bded2dbSJung-uk Kim	movaps	-0x88(%rax),%xmm9
742*7bded2dbSJung-uk Kim	movaps	-0x78(%rax),%xmm10
743*7bded2dbSJung-uk Kim	movaps	-0x68(%rax),%xmm11
744*7bded2dbSJung-uk Kim	movaps	-0x58(%rax),%xmm12
745*7bded2dbSJung-uk Kim	movaps	-0x48(%rax),%xmm13
746*7bded2dbSJung-uk Kim	movaps	-0x38(%rax),%xmm14
747*7bded2dbSJung-uk Kim	movaps	-0x28(%rax),%xmm15
748*7bded2dbSJung-uk Kim___
749*7bded2dbSJung-uk Kim$code.=<<___;
750*7bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
751*7bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
752*7bded2dbSJung-uk Kim	lea	(%rax),%rsp
753*7bded2dbSJung-uk Kim.Lepilogue_shaext:
754*7bded2dbSJung-uk Kim	ret
755*7bded2dbSJung-uk Kim.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
756*7bded2dbSJung-uk Kim___
757*7bded2dbSJung-uk Kim						}}}
758*7bded2dbSJung-uk Kim
759*7bded2dbSJung-uk Kim						if ($avx) {{{
# BODY_00_19_avx($i,$a,$b,$c,$d,$e)
#
# Appends to $code the AVX (REG_SZ==16, pointers @ptr[0..3]) or AVX2
# (REG_SZ==32, pointers @ptr[0..7]) instructions for SHA-1 round $i of
# the 0..19 group, i.e. the rounds that use Ch(b,c,d) and K_00_19.
#
#   $i          round number
#   $a..$e      register names currently holding the working variables
#
# Rounds 0..14 also gather the next message word ($j=$i+1) from every
# input pointer, rounds 0..13 additionally start the gather for word
# $k=$i+2, round 14 issues prefetches, and rounds >=15 compute the next
# schedule word with the Xupdate xor/rotate sequence.  The sub finishes
# by rotating @Xi so that the word prepared here becomes @Xi[0] for the
# following round.  Nothing is returned; the only effects are on $code
# and @Xi.
sub BODY_00_19_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;
# Instruction that merges the two partial gathers into one register.
my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
# Pointer feeding the second partial gather ($t3).
my $ptr_n = $REG_SZ==16 ? $ptr[1] : $ptr[4];

# Round 0, 4 lanes: gather word 0, step every input pointer past the
# 64-byte block and start gathering word 1.
$code.=<<___ if ($i==0 && $REG_SZ==16);
	vmovd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		(@ptr[1]),@Xi[2]	# borrow Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,(@ptr[3]),@Xi[2],@Xi[2]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	vpunpckldq	@Xi[2],@Xi[0],@Xi[0]
	 vmovd		`4*$j-16*4`($ptr_n),$t3
	vpshufb		$tx,@Xi[0],@Xi[0]
___
# Rounds 0..14, 4 lanes: finish the gather of word $j.
$code.=<<___ if ($i<15 && $REG_SZ==16);		# just load input
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
___
# Round 0, 8 lanes: same as above, but two 4-lane halves are gathered
# and then joined with vinserti128.
$code.=<<___ if ($i==0 && $REG_SZ==32);
	vmovd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		(@ptr[4]),@Xi[2]	# borrow Xi[2]
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		(@ptr[5]),$t1
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,(@ptr[6]),@Xi[2],@Xi[2]
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,@Xi[0],@Xi[0]
	vpinsrd		\$1,(@ptr[7]),$t1,$t1
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t1,@Xi[2],@Xi[2]
	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	vinserti128	@Xi[2],@Xi[0],@Xi[0]
	 vmovd		`4*$j-16*4`($ptr_n),$t3
	vpshufb		$tx,@Xi[0],@Xi[0]
___
# Rounds 0..14, 8 lanes: finish the gather of word $j.
$code.=<<___ if ($i<15 && $REG_SZ==32);		# just load input
	 vmovd		`4*$j-16*4`(@ptr[1]),$t2
	 vmovd		`4*$j-16*4`(@ptr[5]),$t1
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
	 vpunpckldq	$t2,@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
	 vpunpckldq	$t1,$t3,$t3
___
# Rounds 0..13: round arithmetic interleaved with packing word $j and
# starting the gather of word $k.
$code.=<<___ if ($i<14);
	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 $vpack		$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	 vmovd		`4*$k-16*4`(@ptr[0]),@Xi[2]

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	 vmovd		`4*$k-16*4`($ptr_n),$t3
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Round 14: no further word to gather, so the load slots are used to
# prefetch through the first four input pointers instead.
$code.=<<___ if ($i==14);
	vpaddd	$K,$e,$e			# e+=K_00_19
	 prefetcht0	63(@ptr[0])
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 $vpack		$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	 prefetcht0	63(@ptr[1])
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	 prefetcht0	63(@ptr[2])
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 prefetcht0	63(@ptr[3])
	 vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rounds 13 and 14: preload the stored word that the first Xupdate
# rounds will need.
$code.=<<___ if ($i>=13 && $i<15);
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
# Rounds 15..19: round arithmetic interleaved with Xupdate of the next
# word; in round 15 of the 8-lane build the remaining four pointers are
# prefetched as well.  Note that $tx is reused as a scratch register here.
$code.=<<___ if ($i>=15);			# apply Xupdate
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	 vpxor	@Xi[3],@Xi[1],@Xi[1]
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	 vpsrld	\$31,@Xi[1],$tx
	 vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpsrld	\$2,$b,$b
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpor	$tx,@Xi[1],@Xi[1]		# rol	\$1,@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rotate the schedule registers: the word prepared above becomes @Xi[0].
push(@Xi,shift(@Xi));
}
903*7bded2dbSJung-uk Kim
# BODY_20_39_avx($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for one SHA-1 round that uses
# Parity(b,c,d).  The driver calls it for rounds 20..39 and again for
# rounds 60..79 after loading the matching constant into $K.
#
#   $i          round number
#   $a..$e      register names currently holding the working variables
#
# For $i<79 the round is interleaved with the Xupdate of the next
# schedule word; the store of @Xi[0] back to its Xi_off() slot is only
# emitted while $i<72.  Round 79 emits the bare round.  @Xi is rotated
# on exit.  Nothing is returned.
sub BODY_20_39_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

# Start of Xupdate plus first part of the round.
$code.=<<___ if ($i<79);
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e			# e+=K_20_39
	vpxor	$b,$d,$t0
___
# Write the current word back only while $i<72; later rounds skip the store.
$code.=<<___ if ($i<72);
	vmovdqa	@Xi[0],`&Xi_off($i)`
___
# Remainder of the round and of Xupdate ($tx serves as scratch).
$code.=<<___ if ($i<79);
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$c,$t0,$t0			# Parity(b,c,d)
	 vpxor	@Xi[3],@Xi[1],@Xi[1]

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)
	 vpsrld	\$31,@Xi[1],$tx
	 vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@Xi[1],1)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Final round: no further schedule word to prepare.
$code.=<<___ if ($i==79);
	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e			# e+=K_20_39
	vpxor	$b,$d,$t0

	vpsrld	\$27,$a,$t3
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpxor	$c,$t0,$t0			# Parity(b,c,d)

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rotate the schedule registers for the next round.
push(@Xi,shift(@Xi));
}
956*7bded2dbSJung-uk Kim
# BODY_40_59_avx($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for one SHA-1 round of the 40..59
# group.  Maj(b,c,d) is accumulated into e in two steps, first (c & d)
# and then ((c ^ d) & b), interleaved with the Xupdate of the next
# schedule word.
#
#   $i          round number
#   $a..$e      register names currently holding the working variables
#
# @Xi[0] is stored to its Xi_off() slot with vmovdqu, $tx is used as
# scratch for the rotate, and @Xi is rotated on exit.  Nothing is
# returned.
sub BODY_40_59_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___;
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_40_59
	vpslld	\$5,$a,$t2
	vpand	$c,$d,$t1
	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]

	vpaddd	$t1,$e,$e
	vpsrld	\$27,$a,$t3
	vpxor	$c,$d,$t0
	 vpxor	@Xi[3],@Xi[1],@Xi[1]

	vmovdqu	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpor	$t3,$t2,$t2			# rol(a,5)
	 vpsrld	\$31,@Xi[1],$tx
	vpand	$b,$t0,$t0
	 vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpslld	\$30,$b,$t1
	vpaddd	$t0,$e,$e			# e+=Maj(b,d,c)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@X[1],1)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rotate the schedule registers for the next round.
push(@Xi,shift(@Xi));
}
992*7bded2dbSJung-uk Kim
# ---------------------------------------------------------------------
# Emit sha1_multi_block_avx, the AVX variant that walks four input
# descriptors per outer iteration.  All heredoc text below is emitted
# assembly and is kept verbatim.
# ---------------------------------------------------------------------

# Entry point and the _avx_shortcut alias label.
$code.=<<___;
.type	sha1_multi_block_avx,\@function,3
.align	32
sha1_multi_block_avx:
_avx_shortcut:
___
# When the AVX2 body is generated as well, hand over to _avx2_shortcut
# if $num is at least 2 and bit 5 of the upper half of %rcx is set.
# NOTE(review): %rcx presumably holds capability bits loaded by the
# dispatcher outside this view - confirm there.
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
# Prologue: remember the incoming %rsp in %rax, save %rbx and %rbp.
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 only: save %xmm6-%xmm15 directly below the two pushed registers
# (the epilogue restores them from the same %rax-relative offsets).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Carve out a 256-byte aligned frame of $REG_SZ*18 bytes; the original
# %rsp is kept at $REG_SZ*17 and %rbx points at the per-lane counters
# at $REG_SZ*16.  Each outer ("grande") iteration first saves $num.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

	vzeroupper
.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane: load the input pointer and block count, keep the maximum
# count in $num, record the count as the lane counter and redirect lanes
# with a non-positive count to the constant table.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Nothing to hash in this group -> done; otherwise load the five state
# vectors and the byte-swap mask and enter the block loop.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20($ctx),$B
	vmovdqu	0x40($ctx),$C
	vmovdqu	0x60($ctx),$D
	vmovdqu	0x80($ctx),$E
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	jmp	.Loop_avx

.align	32
.Loop_avx:
___
# The 80 rounds, unrolled.  @V is rotated after every round so register
# roles shift instead of data; $K is reloaded at each 20-round boundary.
$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
# Lanes whose counter is <=1 have just consumed their last block (or
# were already idle): point them at the constant table from now on.
$code.=<<___;
	mov	\$1,%ecx
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Build a mask from counters>0, decrement those counters, zero the
# round output of masked-off lanes, add the previous state, store the
# state and counters back, restore the byte-swap mask (the round bodies
# reuse $tx as scratch) and loop while blocks remain.  Then advance $ctx
# and $inp and repeat the outer loop for the saved $num.
$code.=<<___;
	vmovdqu	(%rbx),$t0			# pull counters
	vpxor	$t2,$t2,$t2
	vmovdqa	$t0,$t1
	vpcmpgtd $t2,$t1,$t1			# mask value
	vpaddd	$t1,$t0,$t0			# counters--

	vpand	$t1,$A,$A
	vpand	$t1,$B,$B
	vpaddd	0x00($ctx),$A,$A
	vpand	$t1,$C,$C
	vpaddd	0x20($ctx),$B,$B
	vpand	$t1,$D,$D
	vpaddd	0x40($ctx),$C,$C
	vpand	$t1,$E,$E
	vpaddd	0x60($ctx),$D,$D
	vpaddd	0x80($ctx),$E,$E
	vmovdqu	$A,0x00($ctx)
	vmovdqu	$B,0x20($ctx)
	vmovdqu	$C,0x40($ctx)
	vmovdqu	$D,0x60($ctx)
	vmovdqu	$E,0x80($ctx)

	vmovdqu	$t0,(%rbx)			# save counters
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
# Win64 only: restore %xmm6-%xmm15 from the slots filled in the prologue.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Epilogue: restore %rbp, %rbx and the original stack pointer.
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha1_multi_block_avx,.-sha1_multi_block_avx
___
1142*7bded2dbSJung-uk Kim
# ---------------------------------------------------------------------
# Emit sha1_multi_block_avx2, the AVX2 variant that processes eight
# input descriptors at once.  Generated only when $avx>1.  All heredoc
# text below is emitted assembly and is kept verbatim.
# ---------------------------------------------------------------------
						if ($avx>1) {
# Expand every pending `...` expression before the register set and
# $REG_SZ change below.
# NOTE(review): presumably needed because helpers such as Xi_off()
# (defined outside this view) depend on $REG_SZ - confirm.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;

# Eight input pointers; %r12-%r15 are callee-saved, hence the extra
# pushes in the prologue below.
@ptr=map("%r$_",(12..15,8..11));

# Same register roles as the AVX code, but 256 bits wide.
@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
@Xi=map("%ymm$_",(10..14));
$K="%ymm15";

# Entry point, _avx2_shortcut alias label and callee-saved register pushes.
$code.=<<___;
.type	sha1_multi_block_avx2,\@function,3
.align	32
sha1_multi_block_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 only: save %xmm6-%xmm15 directly below the six pushed registers
# (the epilogue restores them from the same %rax-relative offsets).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Carve out a 256-byte aligned frame of $REG_SZ*18 bytes with the
# original %rsp kept at $REG_SZ*17, halve $num, and point %rbx at the
# per-lane counters at $REG_SZ*16.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx2:
	lea	K_XX_XX(%rip),$Tbl
	shr	\$1,$num

	vzeroupper
.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Per lane: load the input pointer and block count, keep the maximum
# count in $num, record the count as the lane counter and redirect lanes
# with a non-positive count to the constant table.
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Load the five state vectors and the byte-swap mask.  %rax and %rbx
# are set to 128(%rsp) and 256+128(%rsp) for the round bodies.
# NOTE(review): presumably the two bases used by Xi_off() - confirm.
$code.=<<___;
	vmovdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40($ctx),$C
	vmovdqu	0x60($ctx),$D
	vmovdqu	0x80($ctx),$E
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
___
# The 80 rounds, unrolled.  @V is rotated after every round so register
# roles shift instead of data; $K is reloaded at each 20-round boundary.
$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
# Point %rbx back at the counters; lanes whose counter is <=1 are
# redirected to the constant table from now on.
$code.=<<___;
	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Build a mask from counters>0, decrement those counters, zero the
# round output of masked-off lanes, add the previous state, store the
# state and counters back, restore %rbx and the byte-swap mask for the
# round bodies and loop while blocks remain.  The outer-loop tail is
# commented out in the emitted assembly, so control falls through to
# .Ldone_avx2.
$code.=<<___;
	vmovdqu	(%rbx),$t0		# pull counters
	vpxor	$t2,$t2,$t2
	vmovdqa	$t0,$t1
	vpcmpgtd $t2,$t1,$t1			# mask value
	vpaddd	$t1,$t0,$t0			# counters--

	vpand	$t1,$A,$A
	vpand	$t1,$B,$B
	vpaddd	0x00($ctx),$A,$A
	vpand	$t1,$C,$C
	vpaddd	0x20($ctx),$B,$B
	vpand	$t1,$D,$D
	vpaddd	0x40($ctx),$C,$C
	vpand	$t1,$E,$E
	vpaddd	0x60($ctx),$D,$D
	vpaddd	0x80($ctx),$E,$E
	vmovdqu	$A,0x00($ctx)
	vmovdqu	$B,0x20($ctx)
	vmovdqu	$C,0x40($ctx)
	vmovdqu	$D,0x60($ctx)
	vmovdqu	$E,0x80($ctx)

	vmovdqu	$t0,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
# Win64 only: restore %xmm6-%xmm15 from the slots filled in the prologue.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Epilogue: restore the six callee-saved registers and the original
# stack pointer.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
						}	}}}
# Constant pool.  Every SHA-1 round constant is replicated over two
# 16-byte rows (32 bytes), so one table serves both 128-bit and 256-bit
# loads.  The K_XX_XX label sits on the K_20_39 rows, which puts
# K_00_19 at -0x20, K_40_59 at +0x20, K_60_79 at +0x40 and the dword
# byte-swap mask at +0x60 relative to the label.  The trailing .byte
# row is a full 16-byte reversal mask.
# NOTE(review): the consumer of the .byte row is outside this view.
$code.=<<___;

.align	256
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
K_XX_XX:
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
1319*7bded2dbSJung-uk Kim
# Win64 structured-exception-handling (SEH) support.  Emitted only when
# targeting Win64: two language-specific handlers plus the .pdata/.xdata
# tables that bind them to the multi-block routines.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: handler shared by the non-AVX2 entry points.
#  - Compares context->Rip with the two RVAs in HandlerData[] (end of
#    prologue, start of epilogue).  Outside that window only Rsp/Rsi/Rdi are
#    fixed up, using %rax as the frame base (context->Rax before the body,
#    context->Rsp at/after the epilogue label).
#  - Inside the window it fetches the saved stack pointer from 16*17(%rax),
#    restores Rbx/Rbp from below it and copies 20 quadwords into the CONTEXT
#    record starting at offset 512 (Xmm6).
#  - The shared tail at .Lin_prologue then copies the CONTEXT record (154
#    quadwords) to disp->ContextRecord, calls RtlVirtualUnwind and returns
#    ExceptionContinueSearch.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# avx2_handler: variant for the AVX2 entry point, which additionally has
# r12-r15 to restore.  It finishes by jumping into se_handler's shared tail
# at .Lin_prologue.
#
# The saved stack pointer is read relative to %rax (= context->Rsp), exactly
# as se_handler does.  Using $context as the base would index into the
# CONTEXT record itself (offset 512 is already Xmm6), not into the
# interrupted routine's frame.
# NOTE(review): assumes the AVX2 prologue stores the original %rsp at
# 32*17(%rsp); that prologue is outside this section -- confirm.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# .pdata: one (begin, end, unwind-info) RVA triple per routine.  The AVX and
# AVX2 entries are emitted only when those code paths are generated.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___
# .xdata: per-routine unwind info.  Each record is the 4-byte header
# 9,0,0,0 (version 1 with the exception-handler flag, no unwind codes),
# the handler's RVA, and the two HandlerData[] RVAs (body label, epilogue
# label) that the handlers compare context->Rip against.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
1515*7bded2dbSJung-uk Kim####################################################################
1516*7bded2dbSJung-uk Kim
# rex(\@opcode, $dst, $src)
#
# Prepends a REX prefix byte to the opcode byte list referenced by the first
# argument when either register number is 8 or higher; otherwise the list is
# left untouched.
#   $dst - number of the register encoded in the ModR/M "reg" field (REX.R)
#   $src - number of the register encoded in the ModR/M "r/m" field (REX.B)
# The list is modified in place; callers do not use the return value.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04			if ($dst>=8);	# REX.R
    $rex|=0x01			if ($src>=8);	# REX.B
    # 0x40 is the fixed high nibble of every REX prefix; it is only emitted
    # when at least one extension bit is actually needed.
    unshift @$opcode,$rex|0x40	if ($rex);
}
1526*7bded2dbSJung-uk Kim
# sha1rnds4($operands)
#
# Hand-encodes "sha1rnds4 $imm,%xmmSRC,%xmmDST" as a ".byte" directive so the
# output does not depend on assembler support for the mnemonic.
# When the operand string is not of the immediate/register/register form
# (e.g. a memory operand), the instruction is returned as text, with the
# mnemonic and operands separated by a single tab.
sub sha1rnds4 {
    my $args=$_[0];

    if ($args =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures right away: the immediate test below runs
	# another match and would clobber $1..$3.
	my ($imm,$src,$dst)=($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);

	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# A leading "0" (e.g. 0x3 or 07) is converted with oct(), which
	# understands hex and octal notation; anything else passes through.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }

    return "sha1rnds4\t".$args;
}
1539*7bded2dbSJung-uk Kim
# sha1op38($instr, $operands)
#
# Hand-encodes the 0F 38 xx SHA-1 instructions (sha1nexte, sha1msg1,
# sha1msg2) in their "%xmmSRC,%xmmDST" form as a ".byte" directive.
# Unknown mnemonics and non register/register operand strings are returned
# as text, with mnemonic and operands separated by a single tab.
sub sha1op38 {
    my ($instr,$args)=@_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (exists($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($src,$dst)=($1,$2);
	my @opcode=(0x0f,0x38);

	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }

    return $instr."\t".$args;
}
1557*7bded2dbSJung-uk Kim
# Post-process the accumulated assembly text line by line and print it.
#
# 1. Every `...` span is replaced by the result of evaluating it as Perl,
#    which resolves constant arithmetic such as `16*17`.
# 2. At most one of the following rewrites is then applied (the "or" chain
#    stops at the first substitution that succeeds):
#      - sha1rnds4 and the other sha1* mnemonics are handed to the encoder
#        subs, which may turn them into .byte sequences;
#      - vmovd/vmovq, vmovdqu written with the %x%ymmN notation,
#        vpinsr[qd], vpextr[qd], vinserti128 and vpbroadcast[qd] get their
#        %ymmN operand(s) renamed to %xmmN (vinserti128 additionally gains a
#        leading "$1," immediate).
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

# Buffered write errors (full disk, closed pipe) only surface when the handle
# is closed; fail loudly rather than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
1575