xref: /freebsd/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl (revision 396c556d77189a5c474d35cec6f44a762e310b7d)
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Multi-buffer SHA1 procedure processes n buffers in parallel by
11# placing buffer data to designated lane of SIMD register. n is
12# naturally limited to 4 on pre-AVX2 processors and to 8 on
13# AVX2-capable processors such as Haswell.
14#
15#		this	+aesni(i)	sha1	aesni-sha1	gain(iv)
16# -------------------------------------------------------------------
17# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
18# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
19# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
20# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
21# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
22# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
23#
24# (i)	multi-block CBC encrypt with 128-bit key;
25# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
26#	because of lower AES-NI instruction throughput;
27# (iii)	"this" is for n=8, when we gather twice as much data, result
28#	for n=4 is 8.00+4.44=12.4;
29# (iv)	presented improvement coefficients are asymptotic limits and
30#	in real-life application are somewhat lower, e.g. for 2KB
31#	fragments they range from 30% to 100% (on Haswell);
32
# Command line: [flavour] output.  When the first argument contains a
# dot it is taken to be the output file name and no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI handling is enabled for nasm/masm/mingw64 flavours, or when
# the output file name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl, first next to this script and then under
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx ends up as 0, 1 or 2 depending on the detected assembler version.
# The AVX code is generated when it is non-zero and the AVX2 code when it
# is greater than 1.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# Accept multi-digit major versions: a single-digit class such as [3-9]
# never matches "clang version 10.0.0" and later, which would silently
# leave $avx at 0 and drop the AVX/AVX2 code paths.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the
# translator; fail loudly if the pipe cannot be started.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
67
68# void sha1_multi_block (
69#     struct {	unsigned int A[8];
70#		unsigned int B[8];
71#		unsigned int C[8];
72#		unsigned int D[8];
73#		unsigned int E[8];	} *ctx,
74#     struct {	void *ptr; int blocks;	} inp[8],
75#     int num);		/* 1 or 2 */
76#
# Argument and scratch general-purpose registers.
$ctx = "%rdi";                          # 1st arg
$inp = "%rsi";                          # 2nd arg
$num = "%edx";
@ptr = map { "%r$_" } 8 .. 11;
$Tbl = "%rbp";

# Baseline SIMD register assignment.
@V = ($A, $B, $C, $D, $E) = map { "%xmm$_" } 0 .. 4;
($t0, $t1, $t2, $t3, $tx) = map { "%xmm$_" } 5 .. 9;
@Xi = map { "%xmm$_" } 10 .. 14;
$K = "%xmm15";

if (1) {
    # Atom-oriented layout: keep the registers that go through pshufb in
    # the low half of the register file, which removes the accumulated
    # 48-cycle penalty of using pshufb with high registers.
    @Xi = map { "%xmm$_" } 0 .. 4;
    ($tx, $t0, $t1, $t2, $t3) = map { "%xmm$_" } 5 .. 9;
    @V = ($A, $B, $C, $D, $E) = map { "%xmm$_" } 10 .. 14;
}

# Width in bytes of one SIMD register for the code being generated.
$REG_SZ = 16;
97
# Xi_off(index)
#
# Returns the assembler operand addressing the stack slot of message
# schedule word "index".  The index is reduced modulo 16 and scaled by
# the current $REG_SZ; byte offsets below 256 are addressed relative to
# %rax, the rest relative to %rbx, both with a -128 bias.
sub Xi_off {
    my ($idx) = @_;
    my $bytes = ($idx % 16) * $REG_SZ;

    return "$bytes-128(%rax)" if $bytes < 256;
    return "$bytes-256-128(%rbx)";
}
104
# BODY_00_19(i, a, b, c, d, e)
#
# Appends to $code the SSE instructions for SHA-1 round i (0..19, the
# Ch(b,c,d) / K_00_19 rounds).  a..e are the names of the registers that
# currently hold the five working variables; the caller rotates them
# between rounds.  Rounds below 15 are interleaved with gathering message
# words from the input pointers in @ptr and byte-swapping them with the
# mask in $tx; rounds 15 and up apply Xupdate to produce the next
# schedule word instead.  On exit @Xi is rotated by one element.
105sub BODY_00_19 {
106my ($i,$a,$b,$c,$d,$e)=@_;
107my $j=$i+1;	# index of the word whose load is completed in this round
108my $k=$i+2;	# index of the word whose load is started in this round
109
110# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
111# of 4 words you would expect to be loaded per given iteration one is
112# spilled to next iteration. In other words indices in four input
113# streams are distributed as following:
114#
115# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
116# $i==1:	2,3,3,3,
117# $i==2:	3,4,4,4,
118# ...
119# $i==13:	14,15,15,15,
120# $i==14:	15
121#
122# Then at $i==15 Xupdate is applied one iteration in advance...
# First round only: load word 0 from all four inputs, advance every input
# pointer by one 64-byte block and start loading word 1.
123$code.=<<___ if ($i==0);
124	movd		(@ptr[0]),@Xi[0]
125	 lea		`16*4`(@ptr[0]),@ptr[0]
126	movd		(@ptr[1]),@Xi[2]	# borrow @Xi[2]
127	 lea		`16*4`(@ptr[1]),@ptr[1]
128	movd		(@ptr[2]),@Xi[3]	# borrow @Xi[3]
129	 lea		`16*4`(@ptr[2]),@ptr[2]
130	movd		(@ptr[3]),@Xi[4]	# borrow @Xi[4]
131	 lea		`16*4`(@ptr[3]),@ptr[3]
132	punpckldq	@Xi[3],@Xi[0]
133	 movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
134	punpckldq	@Xi[4],@Xi[2]
135	 movd		`4*$j-16*4`(@ptr[1]),$t3
136	punpckldq	@Xi[2],@Xi[0]
137	 movd		`4*$j-16*4`(@ptr[2]),$t2
138	pshufb		$tx,@Xi[0]
139___
140$code.=<<___ if ($i<14);			# just load input
141	 movd		`4*$j-16*4`(@ptr[3]),$t1
142	 punpckldq	$t2,@Xi[1]
143	movdqa	$a,$t2
144	paddd	$K,$e				# e+=K_00_19
145	 punpckldq	$t1,$t3
146	movdqa	$b,$t1
147	movdqa	$b,$t0
148	pslld	\$5,$t2
149	pandn	$d,$t1
150	pand	$c,$t0
151	 punpckldq	$t3,@Xi[1]
152	movdqa	$a,$t3
153
154	movdqa	@Xi[0],`&Xi_off($i)`
155	paddd	@Xi[0],$e			# e+=X[i]
156	 movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
157	psrld	\$27,$t3
158	pxor	$t1,$t0				# Ch(b,c,d)
159	movdqa	$b,$t1
160
161	por	$t3,$t2				# rol(a,5)
162	 movd		`4*$k-16*4`(@ptr[1]),$t3
163	pslld	\$30,$t1
164	paddd	$t0,$e				# e+=Ch(b,c,d)
165
166	psrld	\$2,$b
167	paddd	$t2,$e				# e+=rol(a,5)
168	 pshufb	$tx,@Xi[1]
169	 movd		`4*$k-16*4`(@ptr[2]),$t2
170	por	$t1,$b				# b=rol(b,30)
171___
# Round 14 finishes loading the last word and, in place of starting
# further loads, prefetches from each of the four input pointers.
172$code.=<<___ if ($i==14);			# just load input
173	 movd		`4*$j-16*4`(@ptr[3]),$t1
174	 punpckldq	$t2,@Xi[1]
175	movdqa	$a,$t2
176	paddd	$K,$e				# e+=K_00_19
177	 punpckldq	$t1,$t3
178	movdqa	$b,$t1
179	movdqa	$b,$t0
180	pslld	\$5,$t2
181	 prefetcht0	63(@ptr[0])
182	pandn	$d,$t1
183	pand	$c,$t0
184	 punpckldq	$t3,@Xi[1]
185	movdqa	$a,$t3
186
187	movdqa	@Xi[0],`&Xi_off($i)`
188	paddd	@Xi[0],$e			# e+=X[i]
189	psrld	\$27,$t3
190	pxor	$t1,$t0				# Ch(b,c,d)
191	movdqa	$b,$t1
192	 prefetcht0	63(@ptr[1])
193
194	por	$t3,$t2				# rol(a,5)
195	pslld	\$30,$t1
196	paddd	$t0,$e				# e+=Ch(b,c,d)
197	 prefetcht0	63(@ptr[2])
198
199	psrld	\$2,$b
200	paddd	$t2,$e				# e+=rol(a,5)
201	 pshufb	$tx,@Xi[1]
202	 prefetcht0	63(@ptr[3])
203	por	$t1,$b				# b=rol(b,30)
204___
205$code.=<<___ if ($i>=13 && $i<15);
206	movdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
207___
208$code.=<<___ if ($i>=15);			# apply Xupdate
209	pxor	@Xi[-2],@Xi[1]			# "X[13]"
210	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
211
212	movdqa	$a,$t2
213	 pxor	`&Xi_off($j+8)`,@Xi[1]
214	paddd	$K,$e				# e+=K_00_19
215	movdqa	$b,$t1
216	pslld	\$5,$t2
217	 pxor	@Xi[3],@Xi[1]
218	movdqa	$b,$t0
219	pandn	$d,$t1
220	 movdqa	@Xi[1],$tx
221	pand	$c,$t0
222	movdqa	$a,$t3
223	 psrld	\$31,$tx
224	 paddd	@Xi[1],@Xi[1]
225
226	movdqa	@Xi[0],`&Xi_off($i)`
227	paddd	@Xi[0],$e			# e+=X[i]
228	psrld	\$27,$t3
229	pxor	$t1,$t0				# Ch(b,c,d)
230
231	movdqa	$b,$t1
232	por	$t3,$t2				# rol(a,5)
233	pslld	\$30,$t1
234	paddd	$t0,$e				# e+=Ch(b,c,d)
235
236	psrld	\$2,$b
237	paddd	$t2,$e				# e+=rol(a,5)
238	 por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
239	por	$t1,$b				# b=rol(b,30)
240___
# Rotate the schedule registers so that @Xi[0] names the next round's word.
241push(@Xi,shift(@Xi));
242}
243
# BODY_20_39(i, a, b, c, d, e)
#
# Appends to $code the SSE instructions for one Parity(b,c,d) round.  The
# driver code calls it for rounds 20..39 and again for rounds 60..79 after
# loading a different constant into $K.  a..e are the names of the
# registers currently holding the working variables.  Every round except
# 79 also computes the next schedule word via Xupdate; round 79 emits the
# plain round only.  On exit @Xi is rotated by one element.
244sub BODY_20_39 {
245my ($i,$a,$b,$c,$d,$e)=@_;
246my $j=$i+1;	# index of the schedule word computed in this round
247
248$code.=<<___ if ($i<79);
249	pxor	@Xi[-2],@Xi[1]			# "X[13]"
250	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
251
252	movdqa	$a,$t2
253	movdqa	$d,$t0
254	 pxor	`&Xi_off($j+8)`,@Xi[1]
255	paddd	$K,$e				# e+=K_20_39
256	pslld	\$5,$t2
257	pxor	$b,$t0
258
259	movdqa	$a,$t3
260___
# The store of X[i] to its stack slot is emitted only for rounds below 72.
261$code.=<<___ if ($i<72);
262	movdqa	@Xi[0],`&Xi_off($i)`
263___
264$code.=<<___ if ($i<79);
265	paddd	@Xi[0],$e			# e+=X[i]
266	 pxor	@Xi[3],@Xi[1]
267	psrld	\$27,$t3
268	pxor	$c,$t0				# Parity(b,c,d)
269	movdqa	$b,$t1
270
271	pslld	\$30,$t1
272	 movdqa	@Xi[1],$tx
273	por	$t3,$t2				# rol(a,5)
274	 psrld	\$31,$tx
275	paddd	$t0,$e				# e+=Parity(b,c,d)
276	 paddd	@Xi[1],@Xi[1]
277
278	psrld	\$2,$b
279	paddd	$t2,$e				# e+=rol(a,5)
280	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
281	por	$t1,$b				# b=rol(b,30)
282___
# Final round: same round function, with no schedule update emitted.
283$code.=<<___ if ($i==79);
284	movdqa	$a,$t2
285	paddd	$K,$e				# e+=K_20_39
286	movdqa	$d,$t0
287	pslld	\$5,$t2
288	pxor	$b,$t0
289
290	movdqa	$a,$t3
291	paddd	@Xi[0],$e			# e+=X[i]
292	psrld	\$27,$t3
293	movdqa	$b,$t1
294	pxor	$c,$t0				# Parity(b,c,d)
295
296	pslld	\$30,$t1
297	por	$t3,$t2				# rol(a,5)
298	paddd	$t0,$e				# e+=Parity(b,c,d)
299
300	psrld	\$2,$b
301	paddd	$t2,$e				# e+=rol(a,5)
302	por	$t1,$b				# b=rol(b,30)
303___
# Rotate the schedule registers so that @Xi[0] names the next round's word.
304push(@Xi,shift(@Xi));
305}
306
# BODY_40_59(i, a, b, c, d, e)
#
# Appends to $code the SSE instructions for one Maj round (the driver
# calls it for rounds 40..59), together with the Xupdate that produces
# the next schedule word.  a..e are the names of the registers currently
# holding the working variables.  The majority value is accumulated into
# e in two additions: first (c AND d), then (b AND (c XOR d)).  On exit
# @Xi is rotated by one element.
307sub BODY_40_59 {
308my ($i,$a,$b,$c,$d,$e)=@_;
309my $j=$i+1;	# index of the schedule word computed in this round
310
311$code.=<<___;
312	pxor	@Xi[-2],@Xi[1]			# "X[13]"
313	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
314
315	movdqa	$a,$t2
316	movdqa	$d,$t1
317	 pxor	`&Xi_off($j+8)`,@Xi[1]
318	pxor	@Xi[3],@Xi[1]
319	paddd	$K,$e				# e+=K_40_59
320	pslld	\$5,$t2
321	movdqa	$a,$t3
322	pand	$c,$t1
323
324	movdqa	$d,$t0
325	 movdqa	@Xi[1],$tx
326	psrld	\$27,$t3
327	paddd	$t1,$e
328	pxor	$c,$t0
329
330	movdqa	@Xi[0],`&Xi_off($i)`
331	paddd	@Xi[0],$e			# e+=X[i]
332	por	$t3,$t2				# rol(a,5)
333	 psrld	\$31,$tx
334	pand	$b,$t0
335	movdqa	$b,$t1
336
337	pslld	\$30,$t1
338	 paddd	@Xi[1],@Xi[1]
339	paddd	$t0,$e				# e+=Maj(b,d,c)
340
341	psrld	\$2,$b
342	paddd	$t2,$e				# e+=rol(a,5)
343	 por	$tx,@Xi[1]			# rol(@X[1],1)
344	por	$t1,$b				# b=rol(b,30)
345___
# Rotate the schedule registers so that @Xi[0] names the next round's word.
346push(@Xi,shift(@Xi));
347}
348
# Emit sha1_multi_block, the public entry point.  It first tests
# OPENSSL_ia32cap_P and jumps to the SHA-extension code or (when AVX code
# is being generated) to the AVX code; otherwise it falls through to the
# 4-lane SSE implementation generated below.
349$code.=<<___;
350.text
351
352.extern	OPENSSL_ia32cap_P
353
354.globl	sha1_multi_block
355.type	sha1_multi_block,\@function,3
356.align	32
357sha1_multi_block:
358	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
359	bt	\$61,%rcx			# check SHA bit
360	jc	_shaext_shortcut
361___
# The AVX dispatch is emitted only when the assembler can handle AVX.
362$code.=<<___ if ($avx);
363	test	\$`1<<28`,%ecx
364	jnz	_avx_shortcut
365___
# Prologue: %rax keeps the incoming stack pointer for the epilogue.
366$code.=<<___;
367	mov	%rsp,%rax
368	push	%rbx
369	push	%rbp
370___
# Win64 only: save %xmm6-%xmm15 below the two pushed registers.
371$code.=<<___ if ($win64);
372	lea	-0xa8(%rsp),%rsp
373	movaps	%xmm6,(%rsp)
374	movaps	%xmm7,0x10(%rsp)
375	movaps	%xmm8,0x20(%rsp)
376	movaps	%xmm9,0x30(%rsp)
377	movaps	%xmm10,-0x78(%rax)
378	movaps	%xmm11,-0x68(%rax)
379	movaps	%xmm12,-0x58(%rax)
380	movaps	%xmm13,-0x48(%rax)
381	movaps	%xmm14,-0x38(%rax)
382	movaps	%xmm15,-0x28(%rax)
383___
# Allocate the 256-byte aligned frame: 16 register-sized slots for the
# message schedule, then the per-lane counters (addressed through %rbx),
# then the saved stack pointer and the saved $num.
384$code.=<<___;
385	sub	\$`$REG_SZ*18`,%rsp
386	and	\$-256,%rsp
387	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
388.Lbody:
389	lea	K_XX_XX(%rip),$Tbl
390	lea	`$REG_SZ*16`(%rsp),%rbx
391
392.Loop_grande:
393	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
394	xor	$num,$num
395___
# For each of the four lanes: fetch the input pointer and block count,
# keep the largest count in $num, store the count as that lane's counter
# and point a lane with no blocks at $Tbl instead of its input.
396for($i=0;$i<4;$i++) {
397    $code.=<<___;
398	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
399	mov	`16*$i+8`($inp),%ecx		# number of blocks
400	cmp	$num,%ecx
401	cmovg	%ecx,$num			# find maximum
402	test	%ecx,%ecx
403	mov	%ecx,`4*$i`(%rbx)		# initialize counters
404	cmovle	$Tbl,@ptr[$i]			# cancel input
405___
406}
407$code.=<<___;
408	test	$num,$num
409	jz	.Ldone
410
411	movdqu	0x00($ctx),$A			# load context
412	 lea	128(%rsp),%rax
413	movdqu	0x20($ctx),$B
414	movdqu	0x40($ctx),$C
415	movdqu	0x60($ctx),$D
416	movdqu	0x80($ctx),$E
417	movdqa	0x60($Tbl),$tx			# pbswap_mask
418	movdqa	-0x20($Tbl),$K			# K_00_19
419	jmp	.Loop
420
421.align	32
422.Loop:
423___
# Unroll all 80 rounds.  $i runs on across the four loops, @V is rotated
# after every round so the roles of a..e shift by one register, and $K is
# reloaded with the next constant between the groups of 20 rounds.
424for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
425$code.="	movdqa	0x00($Tbl),$K\n";	# K_20_39
426for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
427$code.="	movdqa	0x20($Tbl),$K\n";	# K_40_59
428for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
429$code.="	movdqa	0x40($Tbl),$K\n";	# K_60_79
430for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# End of one block: update the lane counters, mask the new state with the
# per-lane counter mask before adding it to the stored context, write the
# context back and loop while $num is non-zero.  The outer loop then
# advances $ctx and $inp and repeats for the saved $num.
431$code.=<<___;
432	movdqa	(%rbx),@Xi[0]			# pull counters
433	mov	\$1,%ecx
434	cmp	4*0(%rbx),%ecx			# examinte counters
435	pxor	$t2,$t2
436	cmovge	$Tbl,@ptr[0]			# cancel input
437	cmp	4*1(%rbx),%ecx
438	movdqa	@Xi[0],@Xi[1]
439	cmovge	$Tbl,@ptr[1]
440	cmp	4*2(%rbx),%ecx
441	pcmpgtd	$t2,@Xi[1]			# mask value
442	cmovge	$Tbl,@ptr[2]
443	cmp	4*3(%rbx),%ecx
444	paddd	@Xi[1],@Xi[0]			# counters--
445	cmovge	$Tbl,@ptr[3]
446
447	movdqu	0x00($ctx),$t0
448	pand	@Xi[1],$A
449	movdqu	0x20($ctx),$t1
450	pand	@Xi[1],$B
451	paddd	$t0,$A
452	movdqu	0x40($ctx),$t2
453	pand	@Xi[1],$C
454	paddd	$t1,$B
455	movdqu	0x60($ctx),$t3
456	pand	@Xi[1],$D
457	paddd	$t2,$C
458	movdqu	0x80($ctx),$tx
459	pand	@Xi[1],$E
460	movdqu	$A,0x00($ctx)
461	paddd	$t3,$D
462	movdqu	$B,0x20($ctx)
463	paddd	$tx,$E
464	movdqu	$C,0x40($ctx)
465	movdqu	$D,0x60($ctx)
466	movdqu	$E,0x80($ctx)
467
468	movdqa	@Xi[0],(%rbx)			# save counters
469	movdqa	0x60($Tbl),$tx			# pbswap_mask
470	movdqa	-0x20($Tbl),$K			# K_00_19
471	dec	$num
472	jnz	.Loop
473
474	mov	`$REG_SZ*17+8`(%rsp),$num
475	lea	$REG_SZ($ctx),$ctx
476	lea	`16*$REG_SZ/4`($inp),$inp
477	dec	$num
478	jnz	.Loop_grande
479
480.Ldone:
481	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
482___
# Win64 only: restore %xmm6-%xmm15 relative to the original stack pointer.
483$code.=<<___ if ($win64);
484	movaps	-0xb8(%rax),%xmm6
485	movaps	-0xa8(%rax),%xmm7
486	movaps	-0x98(%rax),%xmm8
487	movaps	-0x88(%rax),%xmm9
488	movaps	-0x78(%rax),%xmm10
489	movaps	-0x68(%rax),%xmm11
490	movaps	-0x58(%rax),%xmm12
491	movaps	-0x48(%rax),%xmm13
492	movaps	-0x38(%rax),%xmm14
493	movaps	-0x28(%rax),%xmm15
494___
# Epilogue: restore %rbp/%rbx and the stack pointer, then return.
495$code.=<<___;
496	mov	-16(%rax),%rbp
497	mov	-8(%rax),%rbx
498	lea	(%rax),%rsp
499.Lepilogue:
500	ret
501.size	sha1_multi_block,.-sha1_multi_block
502___
# Emit sha1_multi_block_shaext, the code path reached through
# _shaext_shortcut.  It uses the sha1rnds4/sha1nexte/sha1msg1/sha1msg2
# instructions and processes two input buffers at a time; variables with
# a 0 suffix belong to the first buffer and those with a 1 suffix to the
# second.
503						{{{
504my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
505my @MSG0=map("%xmm$_",(4..7));
506my @MSG1=map("%xmm$_",(11..14));
507
508$code.=<<___;
509.type	sha1_multi_block_shaext,\@function,3
510.align	32
511sha1_multi_block_shaext:
512_shaext_shortcut:
513	mov	%rsp,%rax
514	push	%rbx
515	push	%rbp
516___
# Win64 only: save %xmm6-%xmm15 below the two pushed registers.
517$code.=<<___ if ($win64);
518	lea	-0xa8(%rsp),%rsp
519	movaps	%xmm6,(%rsp)
520	movaps	%xmm7,0x10(%rsp)
521	movaps	%xmm8,0x20(%rsp)
522	movaps	%xmm9,0x30(%rsp)
523	movaps	%xmm10,-0x78(%rax)
524	movaps	%xmm11,-0x68(%rax)
525	movaps	%xmm12,-0x58(%rax)
526	movaps	%xmm13,-0x48(%rax)
527	movaps	%xmm14,-0x38(%rax)
528	movaps	%xmm15,-0x28(%rax)
529___
# Frame setup.  $num is doubled because each outer iteration handles a
# pair of buffers, and $ctx is biased by 0x40 so that the context accesses
# below can be written with offsets relative to that bias.
530$code.=<<___;
531	sub	\$`$REG_SZ*18`,%rsp
532	shl	\$1,$num			# we process pair at a time
533	and	\$-256,%rsp
534	lea	0x40($ctx),$ctx			# size optimization
535	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
536.Lbody_shaext:
537	lea	`$REG_SZ*16`(%rsp),%rbx
538	movdqa	K_XX_XX+0x80(%rip),$BSWAP	# byte-n-word swap
539
540.Loop_grande_shaext:
541	mov	$num,`$REG_SZ*17+8`(%rsp)	# orignal $num
542	xor	$num,$num
543___
# For each of the two lanes: fetch the input pointer and block count, keep
# the largest count in $num, store the count as that lane's counter and
# point a lane with no blocks at %rsp instead of its input.
544for($i=0;$i<2;$i++) {
545    $code.=<<___;
546	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
547	mov	`16*$i+8`($inp),%ecx		# number of blocks
548	cmp	$num,%ecx
549	cmovg	%ecx,$num			# find maximum
550	test	%ecx,%ecx
551	mov	%ecx,`4*$i`(%rbx)		# initialize counters
552	cmovle	%rsp,@ptr[$i]			# cancel input
553___
554}
# Load the two lanes' state from the interleaved context into per-lane
# ABCD/E registers, then emit rounds 0-7 of the block loop, which also
# load and byte-swap the 64 input bytes of each lane and save the
# incoming state at 0x40..0x70(%rsp) for the final addition.
555$code.=<<___;
556	test	$num,$num
557	jz	.Ldone_shaext
558
559	movq		0x00-0x40($ctx),$ABCD0	# a1.a0
560	movq		0x20-0x40($ctx),@MSG0[0]# b1.b0
561	movq		0x40-0x40($ctx),@MSG0[1]# c1.c0
562	movq		0x60-0x40($ctx),@MSG0[2]# d1.d0
563	movq		0x80-0x40($ctx),@MSG0[3]# e1.e0
564
565	punpckldq	@MSG0[0],$ABCD0		# b1.a1.b0.a0
566	punpckldq	@MSG0[2],@MSG0[1]	# d1.c1.d0.c0
567
568	movdqa		$ABCD0,$ABCD1
569	punpcklqdq	@MSG0[1],$ABCD0		# d0.c0.b0.a0
570	punpckhqdq	@MSG0[1],$ABCD1		# d1.c1.b1.a1
571
572	pshufd		\$0b00111111,@MSG0[3],$E0
573	pshufd		\$0b01111111,@MSG0[3],$E1
574	pshufd		\$0b00011011,$ABCD0,$ABCD0
575	pshufd		\$0b00011011,$ABCD1,$ABCD1
576	jmp		.Loop_shaext
577
578.align	32
579.Loop_shaext:
580	movdqu		0x00(@ptr[0]),@MSG0[0]
581	 movdqu		0x00(@ptr[1]),@MSG1[0]
582	movdqu		0x10(@ptr[0]),@MSG0[1]
583	 movdqu		0x10(@ptr[1]),@MSG1[1]
584	movdqu		0x20(@ptr[0]),@MSG0[2]
585	pshufb		$BSWAP,@MSG0[0]
586	 movdqu		0x20(@ptr[1]),@MSG1[2]
587	 pshufb		$BSWAP,@MSG1[0]
588	movdqu		0x30(@ptr[0]),@MSG0[3]
589	lea		0x40(@ptr[0]),@ptr[0]
590	pshufb		$BSWAP,@MSG0[1]
591	 movdqu		0x30(@ptr[1]),@MSG1[3]
592	 lea		0x40(@ptr[1]),@ptr[1]
593	 pshufb		$BSWAP,@MSG1[1]
594
595	movdqa		$E0,0x50(%rsp)		# offload
596	paddd		@MSG0[0],$E0
597	 movdqa		$E1,0x70(%rsp)
598	 paddd		@MSG1[0],$E1
599	movdqa		$ABCD0,0x40(%rsp)	# offload
600	movdqa		$ABCD0,$E0_
601	 movdqa		$ABCD1,0x60(%rsp)
602	 movdqa		$ABCD1,$E1_
603	sha1rnds4	\$0,$E0,$ABCD0		# 0-3
604	sha1nexte	@MSG0[1],$E0_
605	 sha1rnds4	\$0,$E1,$ABCD1		# 0-3
606	 sha1nexte	@MSG1[1],$E1_
607	pshufb		$BSWAP,@MSG0[2]
608	prefetcht0	127(@ptr[0])
609	sha1msg1	@MSG0[1],@MSG0[0]
610	 pshufb		$BSWAP,@MSG1[2]
611	 prefetcht0	127(@ptr[1])
612	 sha1msg1	@MSG1[1],@MSG1[0]
613
614	pshufb		$BSWAP,@MSG0[3]
615	movdqa		$ABCD0,$E0
616	 pshufb		$BSWAP,@MSG1[3]
617	 movdqa		$ABCD1,$E1
618	sha1rnds4	\$0,$E0_,$ABCD0		# 4-7
619	sha1nexte	@MSG0[2],$E0
620	 sha1rnds4	\$0,$E1_,$ABCD1		# 4-7
621	 sha1nexte	@MSG1[2],$E1
622	pxor		@MSG0[2],@MSG0[0]
623	sha1msg1	@MSG0[2],@MSG0[1]
624	 pxor		@MSG1[2],@MSG1[0]
625	 sha1msg1	@MSG1[2],@MSG1[1]
626___
# Middle of the block: each iteration emits one four-round group per lane
# with immediate int($i/5), then swaps the E registers with their
# alternates and rotates the message registers for the next group.  The
# "# 8-11" annotation in the emitted text is literal, so it is repeated
# unchanged for every group.
627for($i=2;$i<20-4;$i++) {
628$code.=<<___;
629	movdqa		$ABCD0,$E0_
630	 movdqa		$ABCD1,$E1_
631	sha1rnds4	\$`int($i/5)`,$E0,$ABCD0	# 8-11
632	sha1nexte	@MSG0[3],$E0_
633	 sha1rnds4	\$`int($i/5)`,$E1,$ABCD1	# 8-11
634	 sha1nexte	@MSG1[3],$E1_
635	sha1msg2	@MSG0[3],@MSG0[0]
636	 sha1msg2	@MSG1[3],@MSG1[0]
637	pxor		@MSG0[3],@MSG0[1]
638	sha1msg1	@MSG0[3],@MSG0[2]
639	 pxor		@MSG1[3],@MSG1[1]
640	 sha1msg1	@MSG1[3],@MSG1[2]
641___
642	($E0,$E0_)=($E0_,$E0);		($E1,$E1_)=($E1_,$E1);
643	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
644}
# Rounds 64-79, interleaved with the end-of-block bookkeeping: examine
# and decrement the lane counters, cancel the input of a finished lane,
# mask the new state with the counter mask and add the state saved at the
# top of the loop.  After the block loop the two lanes' state is
# interleaved again and stored back to the context, and the outer loop
# advances $ctx and $inp.
645$code.=<<___;
646	movdqa		$ABCD0,$E0_
647	 movdqa		$ABCD1,$E1_
648	sha1rnds4	\$3,$E0,$ABCD0		# 64-67
649	sha1nexte	@MSG0[3],$E0_
650	 sha1rnds4	\$3,$E1,$ABCD1		# 64-67
651	 sha1nexte	@MSG1[3],$E1_
652	sha1msg2	@MSG0[3],@MSG0[0]
653	 sha1msg2	@MSG1[3],@MSG1[0]
654	pxor		@MSG0[3],@MSG0[1]
655	 pxor		@MSG1[3],@MSG1[1]
656
657	mov		\$1,%ecx
658	pxor		@MSG0[2],@MSG0[2]	# zero
659	cmp		4*0(%rbx),%ecx		# examine counters
660	cmovge		%rsp,@ptr[0]		# cancel input
661
662	movdqa		$ABCD0,$E0
663	 movdqa		$ABCD1,$E1
664	sha1rnds4	\$3,$E0_,$ABCD0		# 68-71
665	sha1nexte	@MSG0[0],$E0
666	 sha1rnds4	\$3,$E1_,$ABCD1		# 68-71
667	 sha1nexte	@MSG1[0],$E1
668	sha1msg2	@MSG0[0],@MSG0[1]
669	 sha1msg2	@MSG1[0],@MSG1[1]
670
671	cmp		4*1(%rbx),%ecx
672	cmovge		%rsp,@ptr[1]
673	movq		(%rbx),@MSG0[0]		# pull counters
674
675	movdqa		$ABCD0,$E0_
676	 movdqa		$ABCD1,$E1_
677	sha1rnds4	\$3,$E0,$ABCD0		# 72-75
678	sha1nexte	@MSG0[1],$E0_
679	 sha1rnds4	\$3,$E1,$ABCD1		# 72-75
680	 sha1nexte	@MSG1[1],$E1_
681
682	pshufd		\$0x00,@MSG0[0],@MSG1[2]
683	pshufd		\$0x55,@MSG0[0],@MSG1[3]
684	movdqa		@MSG0[0],@MSG0[1]
685	pcmpgtd		@MSG0[2],@MSG1[2]
686	pcmpgtd		@MSG0[2],@MSG1[3]
687
688	movdqa		$ABCD0,$E0
689	 movdqa		$ABCD1,$E1
690	sha1rnds4	\$3,$E0_,$ABCD0		# 76-79
691	sha1nexte	$MSG0[2],$E0
692	 sha1rnds4	\$3,$E1_,$ABCD1		# 76-79
693	 sha1nexte	$MSG0[2],$E1
694
695	pcmpgtd		@MSG0[2],@MSG0[1]	# counter mask
696	pand		@MSG1[2],$ABCD0
697	pand		@MSG1[2],$E0
698	 pand		@MSG1[3],$ABCD1
699	 pand		@MSG1[3],$E1
700	paddd		@MSG0[1],@MSG0[0]	# counters--
701
702	paddd		0x40(%rsp),$ABCD0
703	paddd		0x50(%rsp),$E0
704	 paddd		0x60(%rsp),$ABCD1
705	 paddd		0x70(%rsp),$E1
706
707	movq		@MSG0[0],(%rbx)		# save counters
708	dec		$num
709	jnz		.Loop_shaext
710
711	mov		`$REG_SZ*17+8`(%rsp),$num
712
713	pshufd		\$0b00011011,$ABCD0,$ABCD0
714	pshufd		\$0b00011011,$ABCD1,$ABCD1
715
716	movdqa		$ABCD0,@MSG0[0]
717	punpckldq	$ABCD1,$ABCD0		# b1.b0.a1.a0
718	punpckhdq	$ABCD1,@MSG0[0]		# d1.d0.c1.c0
719	punpckhdq	$E1,$E0			# e1.e0.xx.xx
720	movq		$ABCD0,0x00-0x40($ctx)	# a1.a0
721	psrldq		\$8,$ABCD0
722	movq		@MSG0[0],0x40-0x40($ctx)# c1.c0
723	psrldq		\$8,@MSG0[0]
724	movq		$ABCD0,0x20-0x40($ctx)	# b1.b0
725	psrldq		\$8,$E0
726	movq		@MSG0[0],0x60-0x40($ctx)# d1.d0
727	movq		$E0,0x80-0x40($ctx)	# e1.e0
728
729	lea	`$REG_SZ/2`($ctx),$ctx
730	lea	`16*2`($inp),$inp
731	dec	$num
732	jnz	.Loop_grande_shaext
733
734.Ldone_shaext:
735	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
736___
# Win64 only: restore %xmm6-%xmm15.  This relies on %rax still holding
# the original stack pointer: the reload above is commented out and no
# instruction emitted for this function writes %rax after the prologue.
737$code.=<<___ if ($win64);
738	movaps	-0xb8(%rax),%xmm6
739	movaps	-0xa8(%rax),%xmm7
740	movaps	-0x98(%rax),%xmm8
741	movaps	-0x88(%rax),%xmm9
742	movaps	-0x78(%rax),%xmm10
743	movaps	-0x68(%rax),%xmm11
744	movaps	-0x58(%rax),%xmm12
745	movaps	-0x48(%rax),%xmm13
746	movaps	-0x38(%rax),%xmm14
747	movaps	-0x28(%rax),%xmm15
748___
# Epilogue: restore %rbp/%rbx and the stack pointer, then return.
749$code.=<<___;
750	mov	-16(%rax),%rbp
751	mov	-8(%rax),%rbx
752	lea	(%rax),%rsp
753.Lepilogue_shaext:
754	ret
755.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
756___
757						}}}
758
759						if ($avx) {{{
# BODY_00_19_avx(i, a, b, c, d, e)
#
# AVX counterpart of BODY_00_19: appends to $code the instructions for
# SHA-1 round i (0..19, the Ch(b,c,d) / K_00_19 rounds).  The same sub
# serves both register widths and selects its code by the current
# $REG_SZ: 16 loads from @ptr[0..3], 32 loads from @ptr[0..7].  Rounds
# below 15 gather and byte-swap input words; rounds 15 and up apply
# Xupdate.  On exit @Xi is rotated by one element.
760sub BODY_00_19_avx {
761my ($i,$a,$b,$c,$d,$e)=@_;
762my $j=$i+1;	# index of the word whose load is completed in this round
763my $k=$i+2;	# index of the word whose load is started in this round
# Instruction that merges $t3 into @Xi[1], chosen by register width.
764my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
# Pointer used for the second "start of load": @ptr[1] or @ptr[4].
765my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
766
# 128-bit variant, first round: load word 0 from the four inputs, advance
# every input pointer by one 64-byte block and start loading word 1.
767$code.=<<___ if ($i==0 && $REG_SZ==16);
768	vmovd		(@ptr[0]),@Xi[0]
769	 lea		`16*4`(@ptr[0]),@ptr[0]
770	vmovd		(@ptr[1]),@Xi[2]	# borrow Xi[2]
771	 lea		`16*4`(@ptr[1]),@ptr[1]
772	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
773	 lea		`16*4`(@ptr[2]),@ptr[2]
774	vpinsrd		\$1,(@ptr[3]),@Xi[2],@Xi[2]
775	 lea		`16*4`(@ptr[3]),@ptr[3]
776	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
777	vpunpckldq	@Xi[2],@Xi[0],@Xi[0]
778	 vmovd		`4*$j-16*4`($ptr_n),$t3
779	vpshufb		$tx,@Xi[0],@Xi[0]
780___
781$code.=<<___ if ($i<15 && $REG_SZ==16);		# just load input
782	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
783	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
784___
# 256-bit variant, first round: the same scheme over eight input pointers.
785$code.=<<___ if ($i==0 && $REG_SZ==32);
786	vmovd		(@ptr[0]),@Xi[0]
787	 lea		`16*4`(@ptr[0]),@ptr[0]
788	vmovd		(@ptr[4]),@Xi[2]	# borrow Xi[2]
789	 lea		`16*4`(@ptr[4]),@ptr[4]
790	vmovd		(@ptr[1]),$t2
791	 lea		`16*4`(@ptr[1]),@ptr[1]
792	vmovd		(@ptr[5]),$t1
793	 lea		`16*4`(@ptr[5]),@ptr[5]
794	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
795	 lea		`16*4`(@ptr[2]),@ptr[2]
796	vpinsrd		\$1,(@ptr[6]),@Xi[2],@Xi[2]
797	 lea		`16*4`(@ptr[6]),@ptr[6]
798	vpinsrd		\$1,(@ptr[3]),$t2,$t2
799	 lea		`16*4`(@ptr[3]),@ptr[3]
800	vpunpckldq	$t2,@Xi[0],@Xi[0]
801	vpinsrd		\$1,(@ptr[7]),$t1,$t1
802	 lea		`16*4`(@ptr[7]),@ptr[7]
803	vpunpckldq	$t1,@Xi[2],@Xi[2]
804	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
805	vinserti128	@Xi[2],@Xi[0],@Xi[0]
806	 vmovd		`4*$j-16*4`($ptr_n),$t3
807	vpshufb		$tx,@Xi[0],@Xi[0]
808___
809$code.=<<___ if ($i<15 && $REG_SZ==32);		# just load input
810	 vmovd		`4*$j-16*4`(@ptr[1]),$t2
811	 vmovd		`4*$j-16*4`(@ptr[5]),$t1
812	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
813	 vpinsrd	\$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
814	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
815	 vpunpckldq	$t2,@Xi[1],@Xi[1]
816	 vpinsrd	\$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
817	 vpunpckldq	$t1,$t3,$t3
818___
# Round computation, interleaved with finishing the load of word $j and
# starting the load of word $k.
819$code.=<<___ if ($i<14);
820	vpaddd	$K,$e,$e			# e+=K_00_19
821	vpslld	\$5,$a,$t2
822	vpandn	$d,$b,$t1
823	vpand	$c,$b,$t0
824
825	vmovdqa	@Xi[0],`&Xi_off($i)`
826	vpaddd	@Xi[0],$e,$e			# e+=X[i]
827	 $vpack		$t3,@Xi[1],@Xi[1]
828	vpsrld	\$27,$a,$t3
829	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
830	 vmovd		`4*$k-16*4`(@ptr[0]),@Xi[2]
831
832	vpslld	\$30,$b,$t1
833	vpor	$t3,$t2,$t2			# rol(a,5)
834	 vmovd		`4*$k-16*4`($ptr_n),$t3
835	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
836
837	vpsrld	\$2,$b,$b
838	vpaddd	$t2,$e,$e			# e+=rol(a,5)
839	 vpshufb	$tx,@Xi[1],@Xi[1]
840	vpor	$t1,$b,$b			# b=rol(b,30)
841___
# Round 14 starts no further loads and instead prefetches from the first
# four input pointers.
842$code.=<<___ if ($i==14);
843	vpaddd	$K,$e,$e			# e+=K_00_19
844	 prefetcht0	63(@ptr[0])
845	vpslld	\$5,$a,$t2
846	vpandn	$d,$b,$t1
847	vpand	$c,$b,$t0
848
849	vmovdqa	@Xi[0],`&Xi_off($i)`
850	vpaddd	@Xi[0],$e,$e			# e+=X[i]
851	 $vpack		$t3,@Xi[1],@Xi[1]
852	vpsrld	\$27,$a,$t3
853	 prefetcht0	63(@ptr[1])
854	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
855
856	vpslld	\$30,$b,$t1
857	vpor	$t3,$t2,$t2			# rol(a,5)
858	 prefetcht0	63(@ptr[2])
859	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
860
861	vpsrld	\$2,$b,$b
862	vpaddd	$t2,$e,$e			# e+=rol(a,5)
863	 prefetcht0	63(@ptr[3])
864	 vpshufb	$tx,@Xi[1],@Xi[1]
865	vpor	$t1,$b,$b			# b=rol(b,30)
866___
867$code.=<<___ if ($i>=13 && $i<15);
868	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
869___
# Rounds 15..19.  The backtick expressions inside this heredoc yield the
# prefetches of @ptr[4..7] only for round 15 of the 256-bit variant.
870$code.=<<___ if ($i>=15);			# apply Xupdate
871	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
872	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
873
874	vpaddd	$K,$e,$e			# e+=K_00_19
875	vpslld	\$5,$a,$t2
876	vpandn	$d,$b,$t1
877	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
878	vpand	$c,$b,$t0
879
880	vmovdqa	@Xi[0],`&Xi_off($i)`
881	vpaddd	@Xi[0],$e,$e			# e+=X[i]
882	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
883	vpsrld	\$27,$a,$t3
884	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
885	 vpxor	@Xi[3],@Xi[1],@Xi[1]
886	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
887
888	vpslld	\$30,$b,$t1
889	vpor	$t3,$t2,$t2			# rol(a,5)
890	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
891	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
892	 vpsrld	\$31,@Xi[1],$tx
893	 vpaddd	@Xi[1],@Xi[1],@Xi[1]
894
895	vpsrld	\$2,$b,$b
896	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
897	vpaddd	$t2,$e,$e			# e+=rol(a,5)
898	 vpor	$tx,@Xi[1],@Xi[1]		# rol	\$1,@Xi[1]
899	vpor	$t1,$b,$b			# b=rol(b,30)
900___
# Rotate the schedule registers so that @Xi[0] names the next round's word.
901push(@Xi,shift(@Xi));
902}
903
# BODY_20_39_avx(i, a, b, c, d, e)
#
# AVX counterpart of BODY_20_39: appends to $code the instructions for
# one Parity(b,c,d) round.  The driver code calls it for rounds 20..39 and
# again for rounds 60..79 after loading a different constant into $K.
# Every round except 79 also computes the next schedule word via Xupdate.
# On exit @Xi is rotated by one element.
904sub BODY_20_39_avx {
905my ($i,$a,$b,$c,$d,$e)=@_;
906my $j=$i+1;	# index of the schedule word computed in this round
907
908$code.=<<___ if ($i<79);
909	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
910	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
911
912	vpslld	\$5,$a,$t2
913	vpaddd	$K,$e,$e			# e+=K_20_39
914	vpxor	$b,$d,$t0
915___
# The store of X[i] to its stack slot is emitted only for rounds below 72.
916$code.=<<___ if ($i<72);
917	vmovdqa	@Xi[0],`&Xi_off($i)`
918___
919$code.=<<___ if ($i<79);
920	vpaddd	@Xi[0],$e,$e			# e+=X[i]
921	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
922	vpsrld	\$27,$a,$t3
923	vpxor	$c,$t0,$t0			# Parity(b,c,d)
924	 vpxor	@Xi[3],@Xi[1],@Xi[1]
925
926	vpslld	\$30,$b,$t1
927	vpor	$t3,$t2,$t2			# rol(a,5)
928	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)
929	 vpsrld	\$31,@Xi[1],$tx
930	 vpaddd	@Xi[1],@Xi[1],@Xi[1]
931
932	vpsrld	\$2,$b,$b
933	vpaddd	$t2,$e,$e			# e+=rol(a,5)
934	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@Xi[1],1)
935	vpor	$t1,$b,$b			# b=rol(b,30)
936___
# Final round: same round function, with no schedule update emitted.
937$code.=<<___ if ($i==79);
938	vpslld	\$5,$a,$t2
939	vpaddd	$K,$e,$e			# e+=K_20_39
940	vpxor	$b,$d,$t0
941
942	vpsrld	\$27,$a,$t3
943	vpaddd	@Xi[0],$e,$e			# e+=X[i]
944	vpxor	$c,$t0,$t0			# Parity(b,c,d)
945
946	vpslld	\$30,$b,$t1
947	vpor	$t3,$t2,$t2			# rol(a,5)
948	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)
949
950	vpsrld	\$2,$b,$b
951	vpaddd	$t2,$e,$e			# e+=rol(a,5)
952	vpor	$t1,$b,$b			# b=rol(b,30)
953___
# Rotate the schedule registers so that @Xi[0] names the next round's word.
954push(@Xi,shift(@Xi));
955}
956
# BODY_40_59_avx(i, a, b, c, d, e)
#
# AVX counterpart of BODY_40_59: appends to $code the instructions for
# one Maj round (the driver calls it for rounds 40..59), together with
# the Xupdate that produces the next schedule word.  The majority value
# is accumulated into e in two additions: first (c AND d), then
# (b AND (c XOR d)).  Note that X[i] is stored with vmovdqu here, whereas
# the other AVX round bodies use vmovdqa for the same store.  On exit @Xi
# is rotated by one element.
957sub BODY_40_59_avx {
958my ($i,$a,$b,$c,$d,$e)=@_;
959my $j=$i+1;	# index of the schedule word computed in this round
960
961$code.=<<___;
962	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
963	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
964
965	vpaddd	$K,$e,$e			# e+=K_40_59
966	vpslld	\$5,$a,$t2
967	vpand	$c,$d,$t1
968	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
969
970	vpaddd	$t1,$e,$e
971	vpsrld	\$27,$a,$t3
972	vpxor	$c,$d,$t0
973	 vpxor	@Xi[3],@Xi[1],@Xi[1]
974
975	vmovdqu	@Xi[0],`&Xi_off($i)`
976	vpaddd	@Xi[0],$e,$e			# e+=X[i]
977	vpor	$t3,$t2,$t2			# rol(a,5)
978	 vpsrld	\$31,@Xi[1],$tx
979	vpand	$b,$t0,$t0
980	 vpaddd	@Xi[1],@Xi[1],@Xi[1]
981
982	vpslld	\$30,$b,$t1
983	vpaddd	$t0,$e,$e			# e+=Maj(b,d,c)
984
985	vpsrld	\$2,$b,$b
986	vpaddd	$t2,$e,$e			# e+=rol(a,5)
987	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@X[1],1)
988	vpor	$t1,$b,$b			# b=rol(b,30)
989___
# Rotate the schedule registers so that @Xi[0] names the next round's word.
990push(@Xi,shift(@Xi));
991}
992
# Emit sha1_multi_block_avx, the 4-lane AVX code path reached through
# _avx_shortcut.  Its structure mirrors the SSE function: prologue, lane
# setup, 80 unrolled rounds, context update and epilogue.
993$code.=<<___;
994.type	sha1_multi_block_avx,\@function,3
995.align	32
996sha1_multi_block_avx:
997_avx_shortcut:
998___
# When AVX2 code is generated as well, jump to _avx2_shortcut if $num is
# at least 2 and bit 5 of the upper half of %rcx is set; otherwise
# continue with the AVX code at .Lavx.
999$code.=<<___ if ($avx>1);
1000	shr	\$32,%rcx
1001	cmp	\$2,$num
1002	jb	.Lavx
1003	test	\$`1<<5`,%ecx
1004	jnz	_avx2_shortcut
1005	jmp	.Lavx
1006.align	32
1007.Lavx:
1008___
# Prologue: %rax keeps the incoming stack pointer for the epilogue.
1009$code.=<<___;
1010	mov	%rsp,%rax
1011	push	%rbx
1012	push	%rbp
1013___
# Win64 only: save %xmm6-%xmm15 below the two pushed registers.
1014$code.=<<___ if ($win64);
1015	lea	-0xa8(%rsp),%rsp
1016	movaps	%xmm6,(%rsp)
1017	movaps	%xmm7,0x10(%rsp)
1018	movaps	%xmm8,0x20(%rsp)
1019	movaps	%xmm9,0x30(%rsp)
1020	movaps	%xmm10,-0x78(%rax)
1021	movaps	%xmm11,-0x68(%rax)
1022	movaps	%xmm12,-0x58(%rax)
1023	movaps	%xmm13,-0x48(%rax)
1024	movaps	%xmm14,-0x38(%rax)
1025	movaps	%xmm15,-0x28(%rax)
1026___
# Allocate the 256-byte aligned frame: 16 register-sized slots for the
# message schedule, then the per-lane counters (addressed through %rbx),
# then the saved stack pointer and the saved $num.
1027$code.=<<___;
1028	sub	\$`$REG_SZ*18`, %rsp
1029	and	\$-256,%rsp
1030	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
1031.Lbody_avx:
1032	lea	K_XX_XX(%rip),$Tbl
1033	lea	`$REG_SZ*16`(%rsp),%rbx
1034
1035	vzeroupper
1036.Loop_grande_avx:
1037	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
1038	xor	$num,$num
1039___
# For each of the four lanes: fetch the input pointer and block count,
# keep the largest count in $num, store the count as that lane's counter
# and point a lane with no blocks at $Tbl instead of its input.
1040for($i=0;$i<4;$i++) {
1041    $code.=<<___;
1042	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
1043	mov	`16*$i+8`($inp),%ecx		# number of blocks
1044	cmp	$num,%ecx
1045	cmovg	%ecx,$num			# find maximum
1046	test	%ecx,%ecx
1047	mov	%ecx,`4*$i`(%rbx)		# initialize counters
1048	cmovle	$Tbl,@ptr[$i]			# cancel input
1049___
1050}
1051$code.=<<___;
1052	test	$num,$num
1053	jz	.Ldone_avx
1054
1055	vmovdqu	0x00($ctx),$A			# load context
1056	 lea	128(%rsp),%rax
1057	vmovdqu	0x20($ctx),$B
1058	vmovdqu	0x40($ctx),$C
1059	vmovdqu	0x60($ctx),$D
1060	vmovdqu	0x80($ctx),$E
1061	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1062	jmp	.Loop_avx
1063
1064.align	32
1065.Loop_avx:
1066___
# Unroll all 80 rounds.  $i runs on across the four loops, @V is rotated
# after every round so the roles of a..e shift by one register, and $K is
# loaded with the matching constant before each group of 20 rounds.
1067$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
1068for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1069$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
1070for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1071$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
1072for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1073$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
1074for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
# End of one block: %ecx is set to 1 and compared against each lane's
# counter, redirecting the lane's input pointer to $Tbl when 1 >= counter.
1075$code.=<<___;
1076	mov	\$1,%ecx
1077___
1078for($i=0;$i<4;$i++) {
1079    $code.=<<___;
1080	cmp	`4*$i`(%rbx),%ecx		# examine counters
1081	cmovge	$Tbl,@ptr[$i]			# cancel input
1082___
1083}
# Decrement the counters through the mask, mask the new state before
# adding it to the stored context, write the context back and loop while
# $num is non-zero.  The outer loop then advances $ctx and $inp and
# repeats for the saved $num.
1084$code.=<<___;
1085	vmovdqu	(%rbx),$t0			# pull counters
1086	vpxor	$t2,$t2,$t2
1087	vmovdqa	$t0,$t1
1088	vpcmpgtd $t2,$t1,$t1			# mask value
1089	vpaddd	$t1,$t0,$t0			# counters--
1090
1091	vpand	$t1,$A,$A
1092	vpand	$t1,$B,$B
1093	vpaddd	0x00($ctx),$A,$A
1094	vpand	$t1,$C,$C
1095	vpaddd	0x20($ctx),$B,$B
1096	vpand	$t1,$D,$D
1097	vpaddd	0x40($ctx),$C,$C
1098	vpand	$t1,$E,$E
1099	vpaddd	0x60($ctx),$D,$D
1100	vpaddd	0x80($ctx),$E,$E
1101	vmovdqu	$A,0x00($ctx)
1102	vmovdqu	$B,0x20($ctx)
1103	vmovdqu	$C,0x40($ctx)
1104	vmovdqu	$D,0x60($ctx)
1105	vmovdqu	$E,0x80($ctx)
1106
1107	vmovdqu	$t0,(%rbx)			# save counters
1108	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1109	dec	$num
1110	jnz	.Loop_avx
1111
1112	mov	`$REG_SZ*17+8`(%rsp),$num
1113	lea	$REG_SZ($ctx),$ctx
1114	lea	`16*$REG_SZ/4`($inp),$inp
1115	dec	$num
1116	jnz	.Loop_grande_avx
1117
1118.Ldone_avx:
1119	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
1120	vzeroupper
1121___
# Win64 only: restore %xmm6-%xmm15 relative to the original stack pointer.
1122$code.=<<___ if ($win64);
1123	movaps	-0xb8(%rax),%xmm6
1124	movaps	-0xa8(%rax),%xmm7
1125	movaps	-0x98(%rax),%xmm8
1126	movaps	-0x88(%rax),%xmm9
1127	movaps	-0x78(%rax),%xmm10
1128	movaps	-0x68(%rax),%xmm11
1129	movaps	-0x58(%rax),%xmm12
1130	movaps	-0x48(%rax),%xmm13
1131	movaps	-0x38(%rax),%xmm14
1132	movaps	-0x28(%rax),%xmm15
1133___
# Epilogue: restore %rbp/%rbx and the stack pointer, then return.
1134$code.=<<___;
1135	mov	-16(%rax),%rbp
1136	mov	-8(%rax),%rbx
1137	lea	(%rax),%rsp
1138.Lepilogue_avx:
1139	ret
1140.size	sha1_multi_block_avx,.-sha1_multi_block_avx
1141___
1142
# AVX2 flavour of the multi-block routine.  Everything down to the closing
# brace of this block is generated only when $avx>1.
1143						if ($avx>1) {
# Expand every `...` expression already accumulated in $code right now,
# i.e. before $REG_SZ and the register names are reassigned below; text
# appended after this point is expanded later with the new values.
1144$code =~ s/\`([^\`]*)\`/eval $1/gem;
1145
# Unit used for all frame and offset arithmetic in this section
# ($REG_SZ*16, $REG_SZ*17, $REG_SZ*18 below); the vector registers named
# here are the 32-byte %ymm ones.
1146$REG_SZ=32;
1147
# Eight general-purpose registers, one input pointer per lane.
1148@ptr=map("%r$_",(12..15,8..11));
1149
# Vector register allocation: @V is the five-word working state (rotated
# after every round below), $t0..$t3/$tx are temporaries ($tx receives the
# byte-swap mask), @Xi is passed on to the BODY_* generators implicitly,
# and $K is loaded with the round constant of the current group.
1150@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
1151($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
1152@Xi=map("%ymm$_",(10..14));
1153$K="%ymm15";
1154
# Function header and prologue: remember the incoming %rsp in %rax and push
# the six general-purpose registers that the epilogue restores.
1155$code.=<<___;
1156.type	sha1_multi_block_avx2,\@function,3
1157.align	32
1158sha1_multi_block_avx2:
1159_avx2_shortcut:
1160	mov	%rsp,%rax
1161	push	%rbx
1162	push	%rbp
1163	push	%r12
1164	push	%r13
1165	push	%r14
1166	push	%r15
1167___
# Win64 only: spill %xmm6-%xmm15 directly below the pushed registers.  The
# last four stores are addressed relative to %rax (the original %rsp); they
# land in the same 0xa8-byte area reserved by the lea.
1168$code.=<<___ if ($win64);
1169	lea	-0xa8(%rsp),%rsp
1170	movaps	%xmm6,(%rsp)
1171	movaps	%xmm7,0x10(%rsp)
1172	movaps	%xmm8,0x20(%rsp)
1173	movaps	%xmm9,0x30(%rsp)
1174	movaps	%xmm10,0x40(%rsp)
1175	movaps	%xmm11,0x50(%rsp)
1176	movaps	%xmm12,-0x78(%rax)
1177	movaps	%xmm13,-0x68(%rax)
1178	movaps	%xmm14,-0x58(%rax)
1179	movaps	%xmm15,-0x48(%rax)
1180___
# Frame setup: reserve $REG_SZ*18 bytes, round %rsp down to a 256-byte
# boundary and store the original %rsp at $REG_SZ*17(%rsp), which is where
# the epilogue (and the Win64 unwind handler) fetch it from.  $Tbl is
# pointed at K_XX_XX and $num is halved.  The lane counters live at
# $REG_SZ*16(%rsp), addressed through %rbx.
1181$code.=<<___;
1182	sub	\$`$REG_SZ*18`, %rsp
1183	and	\$-256,%rsp
1184	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
1185.Lbody_avx2:
1186	lea	K_XX_XX(%rip),$Tbl
1187	shr	\$1,$num
1188
1189	vzeroupper
1190.Loop_grande_avx2:
1191	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
1192	xor	$num,$num
1193	lea	`$REG_SZ*16`(%rsp),%rbx
1194___
# One 16-byte descriptor per lane is read from $inp: the input pointer at
# offset 0 and a 32-bit block count at offset 8.  $num ends up holding the
# largest count, every count is stored into the counter array at %rbx, and
# a lane whose count is <= 0 has its pointer replaced by $Tbl.
1195for($i=0;$i<8;$i++) {
1196    $code.=<<___;
1197	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
1198	mov	`16*$i+8`($inp),%ecx		# number of blocks
1199	cmp	$num,%ecx
1200	cmovg	%ecx,$num			# find maximum
1201	test	%ecx,%ecx
1202	mov	%ecx,`4*$i`(%rbx)		# initialize counters
1203	cmovle	$Tbl,@ptr[$i]			# cancel input
1204___
1205}
# Load the five state vectors from $ctx (0x20 bytes apart), set %rax/%rbx
# to fixed offsets inside the stack frame, load the byte-swap mask from
# 0x60($Tbl) and enter the per-block loop.
1206$code.=<<___;
1207	vmovdqu	0x00($ctx),$A			# load context
1208	 lea	128(%rsp),%rax
1209	vmovdqu	0x20($ctx),$B
1210	 lea	256+128(%rsp),%rbx
1211	vmovdqu	0x40($ctx),$C
1212	vmovdqu	0x60($ctx),$D
1213	vmovdqu	0x80($ctx),$E
1214	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1215	jmp	.Loop_avx2
1216
1217.align	32
1218.Loop_avx2:
1219___
# Eighty rounds in four groups of twenty.  Before each group $K is reloaded
# from the constant table, and after every round @V is rotated by one so
# the next round sees the variables in their new roles.  The BODY_*_avx
# generators are defined outside this excerpt; note that rounds 60..79
# reuse BODY_20_39_avx with a different constant.
1220$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
1221for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1222$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
1223for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1224$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
1225for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1226$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
1227for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
# After a block has been processed, re-point %rbx at the counter array and
# put the constant 1 in %ecx for the comparisons generated next.
1228$code.=<<___;
1229	mov	\$1,%ecx
1230	lea	`$REG_SZ*16`(%rsp),%rbx
1231___
# For each lane: if its remaining count is <= 1 (cmovge after comparing 1
# against the counter) the lane has no further input, so its pointer is
# replaced by $Tbl.
1232for($i=0;$i<8;$i++) {
1233    $code.=<<___;
1234	cmp	`4*$i`(%rbx),%ecx		# examine counters
1235	cmovge	$Tbl,@ptr[$i]			# cancel input
1236___
1237}
# Counter update and masked accumulation.  A per-lane mask (all ones where
# the counter is > 0) is added to the counters, which decrements exactly the
# active lanes.  The freshly computed state is ANDed with the same mask
# before being added to the context, so lanes that were already finished
# add zero.  The loop repeats $num times (the maximum count found above).
# The outer "grande" loop is commented out in the generated code, so the
# routine makes a single pass; .Ldone_avx2 starts the epilogue by fetching
# the original %rsp.
1238$code.=<<___;
1239	vmovdqu	(%rbx),$t0		# pull counters
1240	vpxor	$t2,$t2,$t2
1241	vmovdqa	$t0,$t1
1242	vpcmpgtd $t2,$t1,$t1			# mask value
1243	vpaddd	$t1,$t0,$t0			# counters--
1244
1245	vpand	$t1,$A,$A
1246	vpand	$t1,$B,$B
1247	vpaddd	0x00($ctx),$A,$A
1248	vpand	$t1,$C,$C
1249	vpaddd	0x20($ctx),$B,$B
1250	vpand	$t1,$D,$D
1251	vpaddd	0x40($ctx),$C,$C
1252	vpand	$t1,$E,$E
1253	vpaddd	0x60($ctx),$D,$D
1254	vpaddd	0x80($ctx),$E,$E
1255	vmovdqu	$A,0x00($ctx)
1256	vmovdqu	$B,0x20($ctx)
1257	vmovdqu	$C,0x40($ctx)
1258	vmovdqu	$D,0x60($ctx)
1259	vmovdqu	$E,0x80($ctx)
1260
1261	vmovdqu	$t0,(%rbx)			# save counters
1262	lea	256+128(%rsp),%rbx
1263	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1264	dec	$num
1265	jnz	.Loop_avx2
1266
1267	#mov	`$REG_SZ*17+8`(%rsp),$num
1268	#lea	$REG_SZ($ctx),$ctx
1269	#lea	`16*$REG_SZ/4`($inp),$inp
1270	#dec	$num
1271	#jnz	.Loop_grande_avx2
1272
1273.Ldone_avx2:
1274	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
1275	vzeroupper
1276___
# Win64 only: reload %xmm6-%xmm15 from the slots written in the prologue,
# now addressed relative to the original %rsp held in %rax
# (-0xd8 = -(6 pushes * 8 + 0xa8)).
1277$code.=<<___ if ($win64);
1278	movaps	-0xd8(%rax),%xmm6
1279	movaps	-0xc8(%rax),%xmm7
1280	movaps	-0xb8(%rax),%xmm8
1281	movaps	-0xa8(%rax),%xmm9
1282	movaps	-0x98(%rax),%xmm10
1283	movaps	-0x88(%rax),%xmm11
1284	movaps	-0x78(%rax),%xmm12
1285	movaps	-0x68(%rax),%xmm13
1286	movaps	-0x58(%rax),%xmm14
1287	movaps	-0x48(%rax),%xmm15
1288___
# Common epilogue: restore the six pushed registers from below the original
# %rsp, switch back to the caller's stack and return.
1289$code.=<<___;
1290	mov	-48(%rax),%r15
1291	mov	-40(%rax),%r14
1292	mov	-32(%rax),%r13
1293	mov	-24(%rax),%r12
1294	mov	-16(%rax),%rbp
1295	mov	-8(%rax),%rbx
1296	lea	(%rax),%rsp
1297.Lepilogue_avx2:
1298	ret
1299.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
1300___
# The first brace ends the $avx>1 section; the remaining three close scopes
# that were opened earlier in the file, outside this excerpt.
1301						}	}}}
# Constant table addressed through $Tbl.  The K_XX_XX label sits on the
# K_20_39 rows, so K_00_19 is reached at -0x20, K_40_59 at +0x20, K_60_79 at
# +0x40 and the byte-swap mask at +0x60.  Every constant is written out as
# eight 32-bit words (two rows of four).  The table ends with a descending
# byte-index row, which is not referenced anywhere in the AVX2 code above,
# and an identification string.
1302$code.=<<___;
1303
1304.align	256
1305	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1306	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1307K_XX_XX:
1308	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1309	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1310	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1311	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1312	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1313	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1314	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
1315	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
1316	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1317	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1318___
1319
if ($win64) {
# Win64 structured exception handling support: two unwind handlers plus the
# .pdata/.xdata records that attach them to the generated functions.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# The four handler arguments arrive in these registers.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler serves the functions whose prologue pushes only %rbx and %rbp
# and keeps the original %rsp at 16*17(%rsp).  HandlerData[0]/[1] are the
# "body" and "epilogue" labels of the faulting function:
#   - Rip before the body label: nothing is saved yet, context->Rax is
#     still the caller's %rsp;
#   - Rip at or past the epilogue label: context->Rsp already is it;
#   - otherwise the saved stack pointer is fetched from the frame, Rbx/Rbp
#     are restored and 20 quadwords (%xmm6-%xmm15) are copied back into the
#     context starting at offset 512.
# The tail at .Lin_prologue restores Rsp/Rsi/Rdi, copies the context to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# avx2_handler is the counterpart for sha1_multi_block_avx2, whose prologue
# pushes six registers (%rbx, %rbp, %r12-%r15) and keeps the original %rsp
# at 32*17(%rsp).  It performs the same Rip range checks, restores the four
# extra registers as well, and then joins the shared tail at .Lin_prologue
# inside se_handler.
#
# The saved stack pointer has to be read through %rax, which holds
# context->Rsp at that point.  Indexing $context instead would fetch offset
# 544 of the CONTEXT record itself, i.e. part of the saved XMM area that
# begins at offset 512, and the unwinder would continue with a bogus frame.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14
	mov	%r15,240($context)	# restore cotnext->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# .pdata: one (begin, end, unwind-info) triple of image-relative addresses
# per generated function.  The AVX and AVX2 entries are emitted only when
# the corresponding code paths were generated.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___
# .xdata: each unwind-info record names its handler and supplies the
# HandlerData[] pair (body label, epilogue label) that the handler compares
# context->Rip against.  Only the AVX2 function uses avx2_handler.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
1515####################################################################
1516
# Prepend a REX prefix byte to a hand-assembled instruction when one of its
# register operands needs it.
#
#   $opcode - reference to the array of opcode bytes; modified in place
#   $dst    - number of the register that goes into the ModR/M reg field
#   $src    - number of the register that goes into the ModR/M r/m field
#
# Register numbers 8 and above require REX.R (0x04, for $dst) or REX.B
# (0x01, for $src).  When neither bit is needed the array is left untouched.
# The array is reached through the reference rather than by aliasing a
# package typeglob, so no global named "opcode" is involved.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @$opcode,$rex|0x40	if ($rex);
}
1526
# Hand-assemble "sha1rnds4 $imm,%xmmSRC,%xmmDST" into a .byte directive.
#
#   $operands - the operand text that followed the mnemonic
#
# Returns ".byte\t<comma-separated bytes>" (optional REX prefix, the
# 0x0f,0x3a,0xcc opcode, ModR/M, immediate) when the operands have the
# register-register form; otherwise returns the instruction text unchanged.
sub sha1rnds4 {
    my ($operands)=@_;

    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures right away so they cannot be disturbed by
	# anything that runs before they are used.
	my ($imm,$src,$dst)=($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
	# An immediate with a leading 0 (0x.., 0..) is converted by oct();
	# anything else is passed through as written.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$operands;
    }
}
1539
# Hand-assemble the two-operand SHA-1 instructions that live in the 0x0f,0x38
# opcode map into a .byte directive.
#
#   $instr    - mnemonic: sha1nexte, sha1msg1 or sha1msg2
#   $operands - the operand text that followed the mnemonic
#
# Returns ".byte\t<comma-separated bytes>" (optional REX prefix, 0x0f,0x38,
# the per-instruction opcode byte, ModR/M) for a known mnemonic with two
# %xmm register operands; otherwise returns "$instr\t$operands" unchanged.
sub sha1op38 {
    my ($instr,$operands)=@_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures right away so they cannot be disturbed by
	# anything that runs before they are used.
	my ($src,$dst)=($1,$2);
	my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
}
1557
# Final pass over the generated text, one line at a time:
#   1. evaluate any remaining `...` expression;
#   2. apply at most one rewrite per line (the substitutions are chained with
#      "or", so the first one that matches wins): the SHA instructions are
#      turned into .byte sequences by the helpers above, and a handful of
#      AVX instructions written with %ymm names are rewritten to use the
#      %xmm name of the same register;
#   3. print the line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

# Buffered write errors only surface when the handle is closed, and if
# STDOUT is a pipe (presumably to the x86_64-xlate.pl translator located at
# the top of the file - confirm there) close also reports a failing child.
# Abort instead of silently leaving a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";
1575