xref: /freebsd/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision 3fc36ee018bb836bd1796067cf4ef8683f166ebc)
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Multi-buffer AES-NI procedures process several independent buffers
11# in parallel by interleaving independent instructions.
12#
13# Cycles per byte for interleave factor 4:
14#
15#			asymptotic	measured
16#			---------------------------
17# Westmere		5.00/4=1.25	5.13/4=1.28
18# Atom			15.0/4=3.75	?15.7/4=3.93
19# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
20# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
21# Haswell		4.44/4=1.11	4.44/4=1.11
22# Bulldozer		5.75/4=1.44	5.76/4=1.44
23#
24# Cycles per byte for interleave factor 8 (not implemented for
25# pre-AVX processors, where higher interleave factor incidentally
26# doesn't result in improvement):
27#
28#			asymptotic	measured
29#			---------------------------
30# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
31# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
32# Haswell		5.00/8=0.63	5.00/8=0.63
33# Bulldozer		5.75/8=0.72	5.77/8=0.72
34#
35# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
36#	suboptimally;
37
# Command line: <flavour> <output>.  When the first argument contains a dot
# it is taken to be the output file name and the flavour is left undefined.
38$flavour = shift;
39$output  = shift;
40if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
41
# Win64 targets are recognised by flavour (nasm, masm, mingw64) or by an
# output name ending in ".asm".  $win64 gates the %xmm6-%xmm15 save/restore
# sequences emitted further down.
42$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
43
# Look for the x86_64-xlate.pl translator next to this script, then under
# ../../perlasm/ relative to it; die if neither candidate is a plain file.
44$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
45( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47die "can't locate x86_64-xlate.pl";
48
# $avx records what the assembler is able to encode.  Each probe below sets
# it to the sum of two version comparisons, i.e. 0, 1 or 2; the code visible
# in this file only tests it for non-zero to decide whether the AVX (8x)
# routines and the run-time dispatch to them are emitted.
49$avx=0;
50
# Probe 1: GNU assembler, queried through the compiler named by $ENV{CC}.
51if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
53	$avx = ($1>=2.19) + ($1>=2.22);
54}
55
# Probe 2 (only if still 0, Win64, nasm flavour or $ENV{ASM} naming nasm).
56if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
57	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
58	$avx = ($1>=2.09) + ($1>=2.10);
59}
60
# Probe 3 (only if still 0, Win64, masm flavour or $ENV{ASM} naming ml64).
61if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
62	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
63	$avx = ($1>=10) + ($1>=11);
64}
65
# Probe 4 (only if still 0): clang/LLVM, version taken from "$ENV{CC} -v".
# The version is the second capture group, hence $2.
66if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
67	$avx = ($2>=3.0) + ($2>3.0);
68}
69
# Route everything printed to STDOUT through x86_64-xlate.pl, run with the
# same perl interpreter ($^X) and given the flavour and output file name.
# The open is checked: if the translator cannot be started the script stops
# with a diagnostic instead of silently producing no output.
70open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
71*STDOUT=*OUT;
72
73# void aesni_multi_cbc_encrypt (
74#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
75#     const AES_KEY *key,
76#     int num);		/* 1 or 2 */
77#
# Register assignment for the 4x (non-AVX) code paths.  The descriptor
# layout above gives the field offsets used below: inp at 0, out at 8,
# blocks at 16, iv at 24, 40 bytes per descriptor.
78$inp="%rdi";	# 1st arg
79$key="%rsi";	# 2nd arg
80$num="%edx";
81
# One input and one output pointer per stream: %r8-%r11 and %r12-%r15.
82@inptr=map("%r$_",(8..11));
83@outptr=map("%r$_",(12..15));
84
# Two alternating round-key registers, four cipher states (@out), four
# input blocks (@inp), then the packed per-stream block counters, a scratch
# mask and the register holding the round-0 key ($zero).
85($rndkey0,$rndkey1)=("%xmm0","%xmm1");
86@out=map("%xmm$_",(2..5));
87@inp=map("%xmm$_",(6..9));
88($counters,$mask,$zero)=map("%xmm$_",(10..12));
89
# Scalars: key-schedule rounds field, the constant 1 (also borrowed for
# block counts), the sink pointer and the running byte offset.
90($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
91
# Section header and the public entry point aesni_multi_cbc_encrypt.
92$code.=<<___;
93.text
94
95.extern	OPENSSL_ia32cap_P
96
97.globl	aesni_multi_cbc_encrypt
98.type	aesni_multi_cbc_encrypt,\@function,3
99.align	32
100aesni_multi_cbc_encrypt:
101___
# Emitted only when the assembler can encode AVX: if $num is at least 2 and
# bit 28 of the dword at OPENSSL_ia32cap_P+4 is set, jump to the 8x AVX
# implementation; otherwise fall into the 4x code below.
102$code.=<<___ if ($avx);
103	cmp	\$2,$num
104	jb	.Lenc_non_avx
105	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
106	test	\$`1<<28`,%ecx			# AVX bit
107	jnz	_avx_cbc_enc_shortcut
108	jmp	.Lenc_non_avx
109.align	16
110.Lenc_non_avx:
111___
# Prologue: keep the incoming %rsp in %rax and push %rbx, %rbp, %r12-%r15
# (used as $offset, $sink and @outptr).
112$code.=<<___;
113	mov	%rsp,%rax
114	push	%rbx
115	push	%rbp
116	push	%r12
117	push	%r13
118	push	%r14
119	push	%r15
120___
# Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15.
121$code.=<<___ if ($win64);
122	lea	-0xa8(%rsp),%rsp
123	movaps	%xmm6,(%rsp)
124	movaps	%xmm7,0x10(%rsp)
125	movaps	%xmm8,0x20(%rsp)
126	movaps	%xmm9,0x30(%rsp)
127	movaps	%xmm10,0x40(%rsp)
128	movaps	%xmm11,0x50(%rsp)
129	movaps	%xmm12,0x60(%rsp)
130	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
131	movaps	%xmm14,-0x58(%rax)
132	movaps	%xmm15,-0x48(%rax)
133___
# Build a 64-byte aligned scratch frame, remember the original %rsp at
# 16(%rsp), load the round-0 key into $zero, and bias $key by 0x78 and $inp
# by two descriptors (40*2) so that later displacements are written
# relative to those biased bases.  The outer "grande" loop saves $num at
# 24(%rsp) and reuses the register for the maximum block count.
134$code.=<<___;
135	# stack layout
136	#
137	# +0	output sink
138	# +16	input sink [original %rsp and $num]
139	# +32	counters
140
141	sub	\$48,%rsp
142	and	\$-64,%rsp
143	mov	%rax,16(%rsp)			# original %rsp
144
145.Lenc4x_body:
146	movdqu	($key),$zero			# 0-round key
147	lea	0x78($key),$key			# size optimization
148	lea	40*2($inp),$inp
149
150.Lenc4x_loop_grande:
151	mov	$num,24(%rsp)			# original $num
152	xor	$num,$num
153___
# For each of the four streams read the descriptor: block count (+16),
# input pointer (+0), output pointer (+8) and IV (+24).  $num accumulates
# the largest block count, each count is stored in the counters area at
# 32+4*i(%rsp), and a stream whose count is <= 0 has its input pointer
# replaced by %rsp.
154for($i=0;$i<4;$i++) {
155    $code.=<<___;
156	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
157	mov	`40*$i+0-40*2`($inp),@inptr[$i]
158	cmp	$num,$one
159	mov	`40*$i+8-40*2`($inp),@outptr[$i]
160	cmovg	$one,$num			# find maximum
161	test	$one,$one
162	movdqu	`40*$i+24-40*2`($inp),@out[$i]	# load IV
163	mov	$one,`32+4*$i`(%rsp)		# initialize counters
164	cmovle	%rsp,@inptr[$i]			# cancel input
165___
166}
# Start of the 4x CBC-encrypt main loop.
#
# Skip everything when the largest block count is zero.  Otherwise load
# round keys 1 and 2 and the key-schedule rounds field, XOR each stream's IV
# with the round-0 key and with its first input block, load the packed
# counters and clear $offset.
#
# Each pass of .Loop_enc4x advances $offset by 16 and points $sink at
# 16(%rsp)-$offset, so that ($sink,$offset) addresses the input sink and
# -16($sink,$offset) the output sink of the scratch frame.  Round 1 is
# applied to all four states, interleaved with one input prefetch per
# stream (@inptr[0] through @inptr[3]), and round key 3 is loaded.
167$code.=<<___;
168	test	$num,$num
169	jz	.Lenc4x_done
170
171	movups	0x10-0x78($key),$rndkey1
172	 pxor	$zero,@out[0]
173	movups	0x20-0x78($key),$rndkey0
174	 pxor	$zero,@out[1]
175	mov	0xf0-0x78($key),$rounds
176	 pxor	$zero,@out[2]
177	movdqu	(@inptr[0]),@inp[0]		# load inputs
178	 pxor	$zero,@out[3]
179	movdqu	(@inptr[1]),@inp[1]
180	 pxor	@inp[0],@out[0]
181	movdqu	(@inptr[2]),@inp[2]
182	 pxor	@inp[1],@out[1]
183	movdqu	(@inptr[3]),@inp[3]
184	 pxor	@inp[2],@out[2]
185	 pxor	@inp[3],@out[3]
186	movdqa	32(%rsp),$counters		# load counters
187	xor	$offset,$offset
188	jmp	.Loop_enc4x
189
190.align	32
191.Loop_enc4x:
192	add	\$16,$offset
193	lea	16(%rsp),$sink			# sink pointer
194	mov	\$1,$one			# constant of 1
195	sub	$offset,$sink
196
197	aesenc		$rndkey1,@out[0]
198	prefetcht0	31(@inptr[0],$offset)	# prefetch input
199	prefetcht0	31(@inptr[1],$offset)
200	aesenc		$rndkey1,@out[1]
201	prefetcht0	31(@inptr[2],$offset)
202	prefetcht0	31(@inptr[3],$offset)
203	aesenc		$rndkey1,@out[2]
204	aesenc		$rndkey1,@out[3]
205	movups		0x30-0x78($key),$rndkey1
206___
# Rounds 2 to 5, one per iteration, alternating between $rndkey0 and
# $rndkey1.  Iteration i also compares stream i's remaining-block counter
# with the constant 1: when the counter is <= 1 the input pointer is
# redirected to $sink, and when it is < 1 the output pointer is redirected
# as well.  The register just used is then refilled with a later round key.
207for($i=0;$i<4;$i++) {
208my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
209$code.=<<___;
210	 cmp		`32+4*$i`(%rsp),$one
211	aesenc		$rndkey,@out[0]
212	aesenc		$rndkey,@out[1]
213	aesenc		$rndkey,@out[2]
214	 cmovge		$sink,@inptr[$i]	# cancel input
215	 cmovg		$sink,@outptr[$i]	# sink output
216	aesenc		$rndkey,@out[3]
217	movups		`0x40+16*$i-0x78`($key),$rndkey
218___
219}
# Remaining rounds and loop tail.
#  - Output prefetches are interleaved with the next round.
#  - Counter update: $mask becomes all-ones in each lane whose counter is
#    greater than zero and is added to $counters, so only positive counters
#    are decremented; the result is written back to 32(%rsp).
#  - $rounds is compared with 11 once; the flags are consumed by the later
#    jb/je, which skip the extra rounds (the intervening SSE instructions
#    are relied upon not to modify them).
#  - .Lenc4x_tail: final aesenc/aesenclast, next input blocks are loaded and
#    XORed with the round-0 key, ciphertext is stored at -16(out,$offset)
#    and then XORed with the prepared next input to form the next state.
#  - After $num passes the original %rsp and outer count are reloaded, $inp
#    advances by four descriptors and the "grande" loop repeats.
220$code.=<<___;
221	 movdqa		$counters,$mask
222	aesenc		$rndkey0,@out[0]
223	prefetcht0	15(@outptr[0],$offset)	# prefetch output
224	prefetcht0	15(@outptr[1],$offset)
225	aesenc		$rndkey0,@out[1]
226	prefetcht0	15(@outptr[2],$offset)
227	prefetcht0	15(@outptr[3],$offset)
228	aesenc		$rndkey0,@out[2]
229	aesenc		$rndkey0,@out[3]
230	movups		0x80-0x78($key),$rndkey0
231	 pxor		$zero,$zero
232
233	aesenc		$rndkey1,@out[0]
234	 pcmpgtd	$zero,$mask
235	 movdqu		-0x78($key),$zero	# reload 0-round key
236	aesenc		$rndkey1,@out[1]
237	 paddd		$mask,$counters		# decrement counters
238	 movdqa		$counters,32(%rsp)	# update counters
239	aesenc		$rndkey1,@out[2]
240	aesenc		$rndkey1,@out[3]
241	movups		0x90-0x78($key),$rndkey1
242
243	cmp	\$11,$rounds
244
245	aesenc		$rndkey0,@out[0]
246	aesenc		$rndkey0,@out[1]
247	aesenc		$rndkey0,@out[2]
248	aesenc		$rndkey0,@out[3]
249	movups		0xa0-0x78($key),$rndkey0
250
251	jb	.Lenc4x_tail
252
253	aesenc		$rndkey1,@out[0]
254	aesenc		$rndkey1,@out[1]
255	aesenc		$rndkey1,@out[2]
256	aesenc		$rndkey1,@out[3]
257	movups		0xb0-0x78($key),$rndkey1
258
259	aesenc		$rndkey0,@out[0]
260	aesenc		$rndkey0,@out[1]
261	aesenc		$rndkey0,@out[2]
262	aesenc		$rndkey0,@out[3]
263	movups		0xc0-0x78($key),$rndkey0
264
265	je	.Lenc4x_tail
266
267	aesenc		$rndkey1,@out[0]
268	aesenc		$rndkey1,@out[1]
269	aesenc		$rndkey1,@out[2]
270	aesenc		$rndkey1,@out[3]
271	movups		0xd0-0x78($key),$rndkey1
272
273	aesenc		$rndkey0,@out[0]
274	aesenc		$rndkey0,@out[1]
275	aesenc		$rndkey0,@out[2]
276	aesenc		$rndkey0,@out[3]
277	movups		0xe0-0x78($key),$rndkey0
278	jmp	.Lenc4x_tail
279
280.align	32
281.Lenc4x_tail:
282	aesenc		$rndkey1,@out[0]
283	aesenc		$rndkey1,@out[1]
284	aesenc		$rndkey1,@out[2]
285	aesenc		$rndkey1,@out[3]
286	 movdqu		(@inptr[0],$offset),@inp[0]
287	movdqu		0x10-0x78($key),$rndkey1
288
289	aesenclast	$rndkey0,@out[0]
290	 movdqu		(@inptr[1],$offset),@inp[1]
291	 pxor		$zero,@inp[0]
292	aesenclast	$rndkey0,@out[1]
293	 movdqu		(@inptr[2],$offset),@inp[2]
294	 pxor		$zero,@inp[1]
295	aesenclast	$rndkey0,@out[2]
296	 movdqu		(@inptr[3],$offset),@inp[3]
297	 pxor		$zero,@inp[2]
298	aesenclast	$rndkey0,@out[3]
299	movdqu		0x20-0x78($key),$rndkey0
300	 pxor		$zero,@inp[3]
301
302	movups		@out[0],-16(@outptr[0],$offset)
303	 pxor		@inp[0],@out[0]
304	movups		@out[1],-16(@outptr[1],$offset)
305	 pxor		@inp[1],@out[1]
306	movups		@out[2],-16(@outptr[2],$offset)
307	 pxor		@inp[2],@out[2]
308	movups		@out[3],-16(@outptr[3],$offset)
309	 pxor		@inp[3],@out[3]
310
311	dec	$num
312	jnz	.Loop_enc4x
313
314	mov	16(%rsp),%rax			# original %rsp
315	mov	24(%rsp),$num
316
317	#pxor	@inp[0],@out[0]
318	#pxor	@inp[1],@out[1]
319	#movdqu	@out[0],`40*0+24-40*2`($inp)	# output iv FIX ME!
320	#pxor	@inp[2],@out[2]
321	#movdqu	@out[1],`40*1+24-40*2`($inp)
322	#pxor	@inp[3],@out[3]
323	#movdqu	@out[2],`40*2+24-40*2`($inp)	# won't fix, let caller
324	#movdqu	@out[3],`40*3+24-40*2`($inp)	# figure this out...
325
326	lea	`40*4`($inp),$inp
327	dec	$num
328	jnz	.Lenc4x_loop_grande
329
330.Lenc4x_done:
331___
# Win64 only: restore %xmm6-%xmm12 from below the original %rsp (held in
# %rax).  The restores of %xmm13-%xmm15 are commented out.
332$code.=<<___ if ($win64);
333	movaps	-0xd8(%rax),%xmm6
334	movaps	-0xc8(%rax),%xmm7
335	movaps	-0xb8(%rax),%xmm8
336	movaps	-0xa8(%rax),%xmm9
337	movaps	-0x98(%rax),%xmm10
338	movaps	-0x88(%rax),%xmm11
339	movaps	-0x78(%rax),%xmm12
340	#movaps	-0x68(%rax),%xmm13
341	#movaps	-0x58(%rax),%xmm14
342	#movaps	-0x48(%rax),%xmm15
343___
# Epilogue of aesni_multi_cbc_encrypt: reload the six pushed registers from
# below the original %rsp (in %rax), restore %rsp and return.  The same
# heredoc then opens the public entry point aesni_multi_cbc_decrypt.
344$code.=<<___;
345	mov	-48(%rax),%r15
346	mov	-40(%rax),%r14
347	mov	-32(%rax),%r13
348	mov	-24(%rax),%r12
349	mov	-16(%rax),%rbp
350	mov	-8(%rax),%rbx
351	lea	(%rax),%rsp
352.Lenc4x_epilogue:
353	ret
354.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
355
356.globl	aesni_multi_cbc_decrypt
357.type	aesni_multi_cbc_decrypt,\@function,3
358.align	32
359aesni_multi_cbc_decrypt:
360___
# Emitted only when the assembler can encode AVX: with $num >= 2 and bit 28
# of the dword at OPENSSL_ia32cap_P+4 set, jump to the 8x AVX decrypt code;
# otherwise continue with the 4x code below.
361$code.=<<___ if ($avx);
362	cmp	\$2,$num
363	jb	.Ldec_non_avx
364	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
365	test	\$`1<<28`,%ecx			# AVX bit
366	jnz	_avx_cbc_dec_shortcut
367	jmp	.Ldec_non_avx
368.align	16
369.Ldec_non_avx:
370___
# Prologue: keep the incoming %rsp in %rax and push %rbx, %rbp, %r12-%r15.
371$code.=<<___;
372	mov	%rsp,%rax
373	push	%rbx
374	push	%rbp
375	push	%r12
376	push	%r13
377	push	%r14
378	push	%r15
379___
# Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15.
380$code.=<<___ if ($win64);
381	lea	-0xa8(%rsp),%rsp
382	movaps	%xmm6,(%rsp)
383	movaps	%xmm7,0x10(%rsp)
384	movaps	%xmm8,0x20(%rsp)
385	movaps	%xmm9,0x30(%rsp)
386	movaps	%xmm10,0x40(%rsp)
387	movaps	%xmm11,0x50(%rsp)
388	movaps	%xmm12,0x60(%rsp)
389	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
390	movaps	%xmm14,-0x58(%rax)
391	movaps	%xmm15,-0x48(%rax)
392___
# Same frame set-up as the encrypt routine: 64-byte aligned scratch area,
# original %rsp at 16(%rsp), round-0 key in $zero, $key biased by 0x78 and
# $inp by two descriptors.
393$code.=<<___;
394	# stack layout
395	#
396	# +0	output sink
397	# +16	input sink [original %rsp and $num]
398	# +32	counters
399
400	sub	\$48,%rsp
401	and	\$-64,%rsp
402	mov	%rax,16(%rsp)			# original %rsp
403
404.Ldec4x_body:
405	movdqu	($key),$zero			# 0-round key
406	lea	0x78($key),$key			# size optimization
407	lea	40*2($inp),$inp
408
409.Ldec4x_loop_grande:
410	mov	$num,24(%rsp)			# original $num
411	xor	$num,$num
412___
# Descriptor load for the four streams.  Unlike the encrypt routine, the IV
# is loaded into @inp[i], where it serves as the value XORed into the first
# decrypted block.
413for($i=0;$i<4;$i++) {
414    $code.=<<___;
415	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
416	mov	`40*$i+0-40*2`($inp),@inptr[$i]
417	cmp	$num,$one
418	mov	`40*$i+8-40*2`($inp),@outptr[$i]
419	cmovg	$one,$num			# find maximum
420	test	$one,$one
421	movdqu	`40*$i+24-40*2`($inp),@inp[$i]	# load IV
422	mov	$one,`32+4*$i`(%rsp)		# initialize counters
423	cmovle	%rsp,@inptr[$i]			# cancel input
424___
425}
# Start of the 4x CBC-decrypt main loop: bail out when the largest block
# count is zero, load round keys 1 and 2 and the rounds field, load the
# first ciphertext block of every stream and XOR it with the round-0 key.
# Each pass of .Loop_dec4x advances $offset by 16, re-derives $sink as
# 16(%rsp)-$offset, applies round 1 and prefetches the input of all four
# streams.
426$code.=<<___;
427	test	$num,$num
428	jz	.Ldec4x_done
429
430	movups	0x10-0x78($key),$rndkey1
431	movups	0x20-0x78($key),$rndkey0
432	mov	0xf0-0x78($key),$rounds
433	movdqu	(@inptr[0]),@out[0]		# load inputs
434	movdqu	(@inptr[1]),@out[1]
435	 pxor	$zero,@out[0]
436	movdqu	(@inptr[2]),@out[2]
437	 pxor	$zero,@out[1]
438	movdqu	(@inptr[3]),@out[3]
439	 pxor	$zero,@out[2]
440	 pxor	$zero,@out[3]
441	movdqa	32(%rsp),$counters		# load counters
442	xor	$offset,$offset
443	jmp	.Loop_dec4x
444
445.align	32
446.Loop_dec4x:
447	add	\$16,$offset
448	lea	16(%rsp),$sink			# sink pointer
449	mov	\$1,$one			# constant of 1
450	sub	$offset,$sink
451
452	aesdec		$rndkey1,@out[0]
453	prefetcht0	31(@inptr[0],$offset)	# prefetch input
454	prefetcht0	31(@inptr[1],$offset)
455	aesdec		$rndkey1,@out[1]
456	prefetcht0	31(@inptr[2],$offset)
457	prefetcht0	31(@inptr[3],$offset)
458	aesdec		$rndkey1,@out[2]
459	aesdec		$rndkey1,@out[3]
460	movups		0x30-0x78($key),$rndkey1
461___
# Rounds 2 to 5 with alternating round-key registers.  Iteration i compares
# stream i's counter with 1 and redirects its input pointer (counter <= 1)
# and output pointer (counter < 1) to $sink.
462for($i=0;$i<4;$i++) {
463my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
464$code.=<<___;
465	 cmp		`32+4*$i`(%rsp),$one
466	aesdec		$rndkey,@out[0]
467	aesdec		$rndkey,@out[1]
468	aesdec		$rndkey,@out[2]
469	 cmovge		$sink,@inptr[$i]	# cancel input
470	 cmovg		$sink,@outptr[$i]	# sink output
471	aesdec		$rndkey,@out[3]
472	movups		`0x40+16*$i-0x78`($key),$rndkey
473___
474}
# Remaining rounds and loop tail of the 4x decrypt path.
#  - Counter update: same pcmpgtd/paddd scheme as the encrypt routine, so
#    only positive counters are decremented.
#  - $rounds is compared with 11 once; the later jb/je consume those flags
#    to skip the extra rounds.
#  - .Ldec4x_tail: the last round key ($rndkey0) is XORed into @inp[i] (the
#    IV or previous ciphertext block) and the result is used as the
#    aesdeclast key operand, folding the CBC XOR into the final round.
#    Next, @inp[i] is reloaded from -16(in,$offset) for the following block,
#    the plaintext is stored at -16(out,$offset), and the next ciphertext
#    block is loaded and XORed with the round-0 key.
#  - After $num passes the original %rsp and outer count are reloaded, $inp
#    advances by four descriptors and the "grande" loop repeats.
475$code.=<<___;
476	 movdqa		$counters,$mask
477	aesdec		$rndkey0,@out[0]
478	prefetcht0	15(@outptr[0],$offset)	# prefetch output
479	prefetcht0	15(@outptr[1],$offset)
480	aesdec		$rndkey0,@out[1]
481	prefetcht0	15(@outptr[2],$offset)
482	prefetcht0	15(@outptr[3],$offset)
483	aesdec		$rndkey0,@out[2]
484	aesdec		$rndkey0,@out[3]
485	movups		0x80-0x78($key),$rndkey0
486	 pxor		$zero,$zero
487
488	aesdec		$rndkey1,@out[0]
489	 pcmpgtd	$zero,$mask
490	 movdqu		-0x78($key),$zero	# reload 0-round key
491	aesdec		$rndkey1,@out[1]
492	 paddd		$mask,$counters		# decrement counters
493	 movdqa		$counters,32(%rsp)	# update counters
494	aesdec		$rndkey1,@out[2]
495	aesdec		$rndkey1,@out[3]
496	movups		0x90-0x78($key),$rndkey1
497
498	cmp	\$11,$rounds
499
500	aesdec		$rndkey0,@out[0]
501	aesdec		$rndkey0,@out[1]
502	aesdec		$rndkey0,@out[2]
503	aesdec		$rndkey0,@out[3]
504	movups		0xa0-0x78($key),$rndkey0
505
506	jb	.Ldec4x_tail
507
508	aesdec		$rndkey1,@out[0]
509	aesdec		$rndkey1,@out[1]
510	aesdec		$rndkey1,@out[2]
511	aesdec		$rndkey1,@out[3]
512	movups		0xb0-0x78($key),$rndkey1
513
514	aesdec		$rndkey0,@out[0]
515	aesdec		$rndkey0,@out[1]
516	aesdec		$rndkey0,@out[2]
517	aesdec		$rndkey0,@out[3]
518	movups		0xc0-0x78($key),$rndkey0
519
520	je	.Ldec4x_tail
521
522	aesdec		$rndkey1,@out[0]
523	aesdec		$rndkey1,@out[1]
524	aesdec		$rndkey1,@out[2]
525	aesdec		$rndkey1,@out[3]
526	movups		0xd0-0x78($key),$rndkey1
527
528	aesdec		$rndkey0,@out[0]
529	aesdec		$rndkey0,@out[1]
530	aesdec		$rndkey0,@out[2]
531	aesdec		$rndkey0,@out[3]
532	movups		0xe0-0x78($key),$rndkey0
533	jmp	.Ldec4x_tail
534
535.align	32
536.Ldec4x_tail:
537	aesdec		$rndkey1,@out[0]
538	aesdec		$rndkey1,@out[1]
539	aesdec		$rndkey1,@out[2]
540	 pxor		$rndkey0,@inp[0]
541	 pxor		$rndkey0,@inp[1]
542	aesdec		$rndkey1,@out[3]
543	movdqu		0x10-0x78($key),$rndkey1
544	 pxor		$rndkey0,@inp[2]
545	 pxor		$rndkey0,@inp[3]
546	movdqu		0x20-0x78($key),$rndkey0
547
548	aesdeclast	@inp[0],@out[0]
549	aesdeclast	@inp[1],@out[1]
550	 movdqu		-16(@inptr[0],$offset),@inp[0]	# load next IV
551	 movdqu		-16(@inptr[1],$offset),@inp[1]
552	aesdeclast	@inp[2],@out[2]
553	aesdeclast	@inp[3],@out[3]
554	 movdqu		-16(@inptr[2],$offset),@inp[2]
555	 movdqu		-16(@inptr[3],$offset),@inp[3]
556
557	movups		@out[0],-16(@outptr[0],$offset)
558	 movdqu		(@inptr[0],$offset),@out[0]
559	movups		@out[1],-16(@outptr[1],$offset)
560	 movdqu		(@inptr[1],$offset),@out[1]
561	 pxor		$zero,@out[0]
562	movups		@out[2],-16(@outptr[2],$offset)
563	 movdqu		(@inptr[2],$offset),@out[2]
564	 pxor		$zero,@out[1]
565	movups		@out[3],-16(@outptr[3],$offset)
566	 movdqu		(@inptr[3],$offset),@out[3]
567	 pxor		$zero,@out[2]
568	 pxor		$zero,@out[3]
569
570	dec	$num
571	jnz	.Loop_dec4x
572
573	mov	16(%rsp),%rax			# original %rsp
574	mov	24(%rsp),$num
575
576	lea	`40*4`($inp),$inp
577	dec	$num
578	jnz	.Ldec4x_loop_grande
579
580.Ldec4x_done:
581___
# Win64 only: restore %xmm6-%xmm12; the restores of %xmm13-%xmm15 are
# commented out.
582$code.=<<___ if ($win64);
583	movaps	-0xd8(%rax),%xmm6
584	movaps	-0xc8(%rax),%xmm7
585	movaps	-0xb8(%rax),%xmm8
586	movaps	-0xa8(%rax),%xmm9
587	movaps	-0x98(%rax),%xmm10
588	movaps	-0x88(%rax),%xmm11
589	movaps	-0x78(%rax),%xmm12
590	#movaps	-0x68(%rax),%xmm13
591	#movaps	-0x58(%rax),%xmm14
592	#movaps	-0x48(%rax),%xmm15
593___
# Epilogue of aesni_multi_cbc_decrypt: reload the six pushed registers,
# restore %rsp from %rax and return.
594$code.=<<___;
595	mov	-48(%rax),%r15
596	mov	-40(%rax),%r14
597	mov	-32(%rax),%r13
598	mov	-24(%rax),%r12
599	mov	-16(%rax),%rbp
600	mov	-8(%rax),%rbx
601	lea	(%rax),%rsp
602.Ldec4x_epilogue:
603	ret
604.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
605___
606
607						if ($avx) {{{
# Register assignment for the 8x AVX code paths.  These lexicals shadow the
# package variables of the same names used by the 4x code.  Each stream has
# a single pointer register (%r8-%r15) that is switched between its input
# and its output; $offload reuses the $sink register.
608my @ptr=map("%r$_",(8..15));
609my $offload=$sink;
610
# Eight cipher states, four input registers reused for streams 4-7, the
# packed counters and the round-0 key.
611my @out=map("%xmm$_",(2..9));
612my @inp=map("%xmm$_",(10..13));
613my ($counters,$zero)=("%xmm14","%xmm15");
614
# Entry point of the 8x encrypt routine.  _avx_cbc_enc_shortcut is the
# label the 4x entry point jumps to.  The prologue keeps the incoming %rsp
# in %rax and pushes %rbx, %rbp, %r12-%r15.
615$code.=<<___;
616.type	aesni_multi_cbc_encrypt_avx,\@function,3
617.align	32
618aesni_multi_cbc_encrypt_avx:
619_avx_cbc_enc_shortcut:
620	mov	%rsp,%rax
621	push	%rbx
622	push	%rbp
623	push	%r12
624	push	%r13
625	push	%r14
626	push	%r15
627___
# Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15.
628$code.=<<___ if ($win64);
629	lea	-0xa8(%rsp),%rsp
630	movaps	%xmm6,(%rsp)
631	movaps	%xmm7,0x10(%rsp)
632	movaps	%xmm8,0x20(%rsp)
633	movaps	%xmm9,0x30(%rsp)
634	movaps	%xmm10,0x40(%rsp)
635	movaps	%xmm11,0x50(%rsp)
636	movaps	%xmm12,-0x78(%rax)
637	movaps	%xmm13,-0x68(%rax)
638	movaps	%xmm14,-0x58(%rax)
639	movaps	%xmm15,-0x48(%rax)
640___
# Build a 128-byte aligned scratch frame, remember the original %rsp at
# 16(%rsp), load the round-0 key, and bias $key by 0x78 and $inp by four
# descriptors (40*4).  $num is shifted right by one and then cleared by the
# xor, because the save of the outer count is commented out here.
641$code.=<<___;
642	# stack layout
643	#
644	# +0	output sink
645	# +16	input sink [original %rsp and $num]
646	# +32	counters
647	# +64	distances between inputs and outputs
648	# +128	off-load area for @inp[0..3]
649
650	sub	\$192,%rsp
651	and	\$-128,%rsp
652	mov	%rax,16(%rsp)			# original %rsp
653
654.Lenc8x_body:
655	vzeroupper
656	vmovdqu	($key),$zero			# 0-round key
657	lea	0x78($key),$key			# size optimization
658	lea	40*4($inp),$inp
659	shr	\$1,$num
660
661.Lenc8x_loop_grande:
662	#mov	$num,24(%rsp)			# original $num
663	xor	$num,$num
664___
# Descriptor load for the eight streams.  The output pointer is read into a
# temporary ($offset for stream 0, $offload for the others), turned into
# the distance "output - input" and stored at 64+8*i(%rsp).  A stream whose
# block count is <= 0 gets %rsp as its input pointer before the distance is
# computed.
665for($i=0;$i<8;$i++) {
666  my $temp = $i ? $offload : $offset;
667    $code.=<<___;
668	mov	`40*$i+16-40*4`($inp),$one	# borrow $one for number of blocks
669	mov	`40*$i+0-40*4`($inp),@ptr[$i]	# input pointer
670	cmp	$num,$one
671	mov	`40*$i+8-40*4`($inp),$temp	# output pointer
672	cmovg	$one,$num			# find maximum
673	test	$one,$one
674	vmovdqu	`40*$i+24-40*4`($inp),@out[$i]	# load IV
675	mov	$one,`32+4*$i`(%rsp)		# initialize counters
676	cmovle	%rsp,@ptr[$i]			# cancel input
677	sub	@ptr[$i],$temp			# distance between input and output
678	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
679___
680}
# Loop set-up for the 8x encrypt path: bail out when the largest block
# count is zero, load round keys 1 and 2 and the rounds field, point
# $offload at 128(%rsp), and XOR every stream's IV with its first input
# block and the round-0 key.  Only four @inp registers exist, so they are
# loaded for streams 0-3, consumed, and then reloaded for streams 4-7.
681$code.=<<___;
682	test	$num,$num
683	jz	.Lenc8x_done
684
685	vmovups	0x10-0x78($key),$rndkey1
686	vmovups	0x20-0x78($key),$rndkey0
687	mov	0xf0-0x78($key),$rounds
688
689	vpxor	(@ptr[0]),$zero,@inp[0]		# load inputs and xor with 0-round
690	 lea	128(%rsp),$offload		# offload area
691	vpxor	(@ptr[1]),$zero,@inp[1]
692	vpxor	(@ptr[2]),$zero,@inp[2]
693	vpxor	(@ptr[3]),$zero,@inp[3]
694	 vpxor	@inp[0],@out[0],@out[0]
695	vpxor	(@ptr[4]),$zero,@inp[0]
696	 vpxor	@inp[1],@out[1],@out[1]
697	vpxor	(@ptr[5]),$zero,@inp[1]
698	 vpxor	@inp[2],@out[2],@out[2]
699	vpxor	(@ptr[6]),$zero,@inp[2]
700	 vpxor	@inp[3],@out[3],@out[3]
701	vpxor	(@ptr[7]),$zero,@inp[3]
702	 vpxor	@inp[0],@out[4],@out[4]
703	mov	\$1,$one			# constant of 1
704	 vpxor	@inp[1],@out[5],@out[5]
705	 vpxor	@inp[2],@out[6],@out[6]
706	 vpxor	@inp[3],@out[7],@out[7]
707	jmp	.Loop_enc8x
708
709.align	32
710.Loop_enc8x:
711___
# Rounds 1 to 8, one per iteration, starting with $rndkey1 and alternating.
# Iteration i also services stream i:
#  - compares its counter with 1 and, from i = 1 on, loads its stored
#    distance into $offset (stream 0's distance is pre-loaded elsewhere);
#  - forms the output address in $offset, redirects the input pointer
#    (counter <= 1) and the output address (counter < 1) to %rsp, then
#    turns $offset back into a distance and stores it;
#  - loads the next input block XORed with the round-0 key into
#    @inp[i%4]; for i < 4 that value is parked in the off-load area since
#    the register is reused for stream i+4;
#  - finally switches @ptr[i] to "output + 16".
# $i is a package variable on purpose: its final value is used after the
# loop.
712for($i=0;$i<8;$i++) {
713my $rndkey=($i&1)?$rndkey0:$rndkey1;
714$code.=<<___;
715	vaesenc		$rndkey,@out[0],@out[0]
716	 cmp		32+4*$i(%rsp),$one
717___
718$code.=<<___ if ($i);
719	 mov		64+8*$i(%rsp),$offset
720___
721$code.=<<___;
722	vaesenc		$rndkey,@out[1],@out[1]
723	prefetcht0	31(@ptr[$i])			# prefetch input
724	vaesenc		$rndkey,@out[2],@out[2]
725___
726$code.=<<___ if ($i>1);
727	prefetcht0	15(@ptr[$i-2])			# prefetch output
728___
729$code.=<<___;
730	vaesenc		$rndkey,@out[3],@out[3]
731	 lea		(@ptr[$i],$offset),$offset
732	 cmovge		%rsp,@ptr[$i]			# cancel input
733	vaesenc		$rndkey,@out[4],@out[4]
734	 cmovg		%rsp,$offset			# sink output
735	vaesenc		$rndkey,@out[5],@out[5]
736	 sub		@ptr[$i],$offset
737	vaesenc		$rndkey,@out[6],@out[6]
738	 vpxor		16(@ptr[$i]),$zero,@inp[$i%4]	# load input and xor with 0-round
739	 mov		$offset,64+8*$i(%rsp)
740	vaesenc		$rndkey,@out[7],@out[7]
741	vmovups		`16*(3+$i)-0x78`($key),$rndkey
742	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
743___
744$code.=<<___ if ($i<4)
745	 vmovdqu	@inp[$i%4],`16*$i`($offload)	# off-load
746___
747}
# Remaining rounds and loop tail of the 8x encrypt path.
#  - $i is 8 here (left over from the loop above), so the two output
#    prefetches touch @ptr[6] and @ptr[7].
#  - $rounds is compared with 11 once; jb and the later je consume those
#    flags to skip the extra rounds.
#  - .Lenc8x_tail updates the counters in two halves (32(%rsp) and
#    48(%rsp)), each time building a "counter > 0" mask in $zero and adding
#    it, then reloads the round-0 key into $zero.
#  - Ciphertext is written at -16(@ptr[i]); each pointer is switched back to
#    the input side by subtracting the stored distance, and the state is
#    XORed with the prepared next input (off-load area for streams 0-3,
#    @inp registers for streams 4-7).
#  - The outer "grande" loop is commented out, so after $num passes the
#    routine reloads the original %rsp and finishes.
748$code.=<<___;
749	 vmovdqu	32(%rsp),$counters
750	prefetcht0	15(@ptr[$i-2])			# prefetch output
751	prefetcht0	15(@ptr[$i-1])
752	cmp	\$11,$rounds
753	jb	.Lenc8x_tail
754
755	vaesenc		$rndkey1,@out[0],@out[0]
756	vaesenc		$rndkey1,@out[1],@out[1]
757	vaesenc		$rndkey1,@out[2],@out[2]
758	vaesenc		$rndkey1,@out[3],@out[3]
759	vaesenc		$rndkey1,@out[4],@out[4]
760	vaesenc		$rndkey1,@out[5],@out[5]
761	vaesenc		$rndkey1,@out[6],@out[6]
762	vaesenc		$rndkey1,@out[7],@out[7]
763	vmovups		0xb0-0x78($key),$rndkey1
764
765	vaesenc		$rndkey0,@out[0],@out[0]
766	vaesenc		$rndkey0,@out[1],@out[1]
767	vaesenc		$rndkey0,@out[2],@out[2]
768	vaesenc		$rndkey0,@out[3],@out[3]
769	vaesenc		$rndkey0,@out[4],@out[4]
770	vaesenc		$rndkey0,@out[5],@out[5]
771	vaesenc		$rndkey0,@out[6],@out[6]
772	vaesenc		$rndkey0,@out[7],@out[7]
773	vmovups		0xc0-0x78($key),$rndkey0
774	je	.Lenc8x_tail
775
776	vaesenc		$rndkey1,@out[0],@out[0]
777	vaesenc		$rndkey1,@out[1],@out[1]
778	vaesenc		$rndkey1,@out[2],@out[2]
779	vaesenc		$rndkey1,@out[3],@out[3]
780	vaesenc		$rndkey1,@out[4],@out[4]
781	vaesenc		$rndkey1,@out[5],@out[5]
782	vaesenc		$rndkey1,@out[6],@out[6]
783	vaesenc		$rndkey1,@out[7],@out[7]
784	vmovups		0xd0-0x78($key),$rndkey1
785
786	vaesenc		$rndkey0,@out[0],@out[0]
787	vaesenc		$rndkey0,@out[1],@out[1]
788	vaesenc		$rndkey0,@out[2],@out[2]
789	vaesenc		$rndkey0,@out[3],@out[3]
790	vaesenc		$rndkey0,@out[4],@out[4]
791	vaesenc		$rndkey0,@out[5],@out[5]
792	vaesenc		$rndkey0,@out[6],@out[6]
793	vaesenc		$rndkey0,@out[7],@out[7]
794	vmovups		0xe0-0x78($key),$rndkey0
795
796.Lenc8x_tail:
797	vaesenc		$rndkey1,@out[0],@out[0]
798	 vpxor		$zero,$zero,$zero
799	vaesenc		$rndkey1,@out[1],@out[1]
800	vaesenc		$rndkey1,@out[2],@out[2]
801	 vpcmpgtd	$zero,$counters,$zero
802	vaesenc		$rndkey1,@out[3],@out[3]
803	vaesenc		$rndkey1,@out[4],@out[4]
804	 vpaddd		$counters,$zero,$zero		# decrement counters
805	 vmovdqu	48(%rsp),$counters
806	vaesenc		$rndkey1,@out[5],@out[5]
807	 mov		64(%rsp),$offset		# pre-load 1st offset
808	vaesenc		$rndkey1,@out[6],@out[6]
809	vaesenc		$rndkey1,@out[7],@out[7]
810	vmovups		0x10-0x78($key),$rndkey1
811
812	vaesenclast	$rndkey0,@out[0],@out[0]
813	 vmovdqa	$zero,32(%rsp)			# update counters
814	 vpxor		$zero,$zero,$zero
815	vaesenclast	$rndkey0,@out[1],@out[1]
816	vaesenclast	$rndkey0,@out[2],@out[2]
817	 vpcmpgtd	$zero,$counters,$zero
818	vaesenclast	$rndkey0,@out[3],@out[3]
819	vaesenclast	$rndkey0,@out[4],@out[4]
820	 vpaddd		$zero,$counters,$counters	# decrement counters
821	 vmovdqu	-0x78($key),$zero		# 0-round
822	vaesenclast	$rndkey0,@out[5],@out[5]
823	vaesenclast	$rndkey0,@out[6],@out[6]
824	 vmovdqa	$counters,48(%rsp)		# update counters
825	vaesenclast	$rndkey0,@out[7],@out[7]
826	vmovups		0x20-0x78($key),$rndkey0
827
828	vmovups		@out[0],-16(@ptr[0])		# write output
829	 sub		$offset,@ptr[0]			# switch to input
830	 vpxor		0x00($offload),@out[0],@out[0]
831	vmovups		@out[1],-16(@ptr[1])
832	 sub		`64+1*8`(%rsp),@ptr[1]
833	 vpxor		0x10($offload),@out[1],@out[1]
834	vmovups		@out[2],-16(@ptr[2])
835	 sub		`64+2*8`(%rsp),@ptr[2]
836	 vpxor		0x20($offload),@out[2],@out[2]
837	vmovups		@out[3],-16(@ptr[3])
838	 sub		`64+3*8`(%rsp),@ptr[3]
839	 vpxor		0x30($offload),@out[3],@out[3]
840	vmovups		@out[4],-16(@ptr[4])
841	 sub		`64+4*8`(%rsp),@ptr[4]
842	 vpxor		@inp[0],@out[4],@out[4]
843	vmovups		@out[5],-16(@ptr[5])
844	 sub		`64+5*8`(%rsp),@ptr[5]
845	 vpxor		@inp[1],@out[5],@out[5]
846	vmovups		@out[6],-16(@ptr[6])
847	 sub		`64+6*8`(%rsp),@ptr[6]
848	 vpxor		@inp[2],@out[6],@out[6]
849	vmovups		@out[7],-16(@ptr[7])
850	 sub		`64+7*8`(%rsp),@ptr[7]
851	 vpxor		@inp[3],@out[7],@out[7]
852
853	dec	$num
854	jnz	.Loop_enc8x
855
856	mov	16(%rsp),%rax			# original %rsp
857	#mov	24(%rsp),$num
858	#lea	`40*8`($inp),$inp
859	#dec	$num
860	#jnz	.Lenc8x_loop_grande
861
862.Lenc8x_done:
863	vzeroupper
864___
# Win64 only: restore all of %xmm6-%xmm15 (this routine uses them all).
865$code.=<<___ if ($win64);
866	movaps	-0xd8(%rax),%xmm6
867	movaps	-0xc8(%rax),%xmm7
868	movaps	-0xb8(%rax),%xmm8
869	movaps	-0xa8(%rax),%xmm9
870	movaps	-0x98(%rax),%xmm10
871	movaps	-0x88(%rax),%xmm11
872	movaps	-0x78(%rax),%xmm12
873	movaps	-0x68(%rax),%xmm13
874	movaps	-0x58(%rax),%xmm14
875	movaps	-0x48(%rax),%xmm15
876___
# Epilogue of aesni_multi_cbc_encrypt_avx.  The same heredoc then opens
# aesni_multi_cbc_decrypt_avx (entered via _avx_cbc_dec_shortcut) and emits
# its register-saving prologue.
877$code.=<<___;
878	mov	-48(%rax),%r15
879	mov	-40(%rax),%r14
880	mov	-32(%rax),%r13
881	mov	-24(%rax),%r12
882	mov	-16(%rax),%rbp
883	mov	-8(%rax),%rbx
884	lea	(%rax),%rsp
885.Lenc8x_epilogue:
886	ret
887.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
888
889.type	aesni_multi_cbc_decrypt_avx,\@function,3
890.align	32
891aesni_multi_cbc_decrypt_avx:
892_avx_cbc_dec_shortcut:
893	mov	%rsp,%rax
894	push	%rbx
895	push	%rbp
896	push	%r12
897	push	%r13
898	push	%r14
899	push	%r15
900___
901$code.=<<___ if ($win64);
902	lea	-0xa8(%rsp),%rsp
903	movaps	%xmm6,(%rsp)
904	movaps	%xmm7,0x10(%rsp)
905	movaps	%xmm8,0x20(%rsp)
906	movaps	%xmm9,0x30(%rsp)
907	movaps	%xmm10,0x40(%rsp)
908	movaps	%xmm11,0x50(%rsp)
909	movaps	%xmm12,-0x78(%rax)
910	movaps	%xmm13,-0x68(%rax)
911	movaps	%xmm14,-0x58(%rax)
912	movaps	%xmm15,-0x48(%rax)
913___
914$code.=<<___;
915	# stack layout
916	#
917	# +0	output sink
918	# +16	input sink [original %rsp and $num]
919	# +32	counters
920	# +64	distances between inputs and outputs
921	# +128	off-load area for @inp[0..3]
922	# +192	IV/input offload
923
924	sub	\$256,%rsp
925	and	\$-256,%rsp
926	sub	\$192,%rsp
927	mov	%rax,16(%rsp)			# original %rsp
928
929.Ldec8x_body:
930	vzeroupper
931	vmovdqu	($key),$zero			# 0-round key
932	lea	0x78($key),$key			# size optimization
933	lea	40*4($inp),$inp
934	shr	\$1,$num
935
936.Ldec8x_loop_grande:
937	#mov	$num,24(%rsp)			# original $num
938	xor	$num,$num
939___
940for($i=0;$i<8;$i++) {
941  my $temp = $i ? $offload : $offset;
942    $code.=<<___;
943	mov	`40*$i+16-40*4`($inp),$one	# borrow $one for number of blocks
944	mov	`40*$i+0-40*4`($inp),@ptr[$i]	# input pointer
945	cmp	$num,$one
946	mov	`40*$i+8-40*4`($inp),$temp	# output pointer
947	cmovg	$one,$num			# find maximum
948	test	$one,$one
949	vmovdqu	`40*$i+24-40*4`($inp),@out[$i]	# load IV
950	mov	$one,`32+4*$i`(%rsp)		# initialize counters
951	cmovle	%rsp,@ptr[$i]			# cancel input
952	sub	@ptr[$i],$temp			# distance between input and output
953	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
954	vmovdqu	@out[$i],`192+16*$i`(%rsp)	# offload IV
955___
956}
957$code.=<<___;
958	test	$num,$num
959	jz	.Ldec8x_done
960
961	vmovups	0x10-0x78($key),$rndkey1
962	vmovups	0x20-0x78($key),$rndkey0
963	mov	0xf0-0x78($key),$rounds
964	 lea	192+128(%rsp),$offload		# offload area
965
966	vmovdqu	(@ptr[0]),@out[0]		# load inputs
967	vmovdqu	(@ptr[1]),@out[1]
968	vmovdqu	(@ptr[2]),@out[2]
969	vmovdqu	(@ptr[3]),@out[3]
970	vmovdqu	(@ptr[4]),@out[4]
971	vmovdqu	(@ptr[5]),@out[5]
972	vmovdqu	(@ptr[6]),@out[6]
973	vmovdqu	(@ptr[7]),@out[7]
974	vmovdqu	@out[0],0x00($offload)		# offload inputs
975	vpxor	$zero,@out[0],@out[0]		# xor inputs with 0-round
976	vmovdqu	@out[1],0x10($offload)
977	vpxor	$zero,@out[1],@out[1]
978	vmovdqu	@out[2],0x20($offload)
979	vpxor	$zero,@out[2],@out[2]
980	vmovdqu	@out[3],0x30($offload)
981	vpxor	$zero,@out[3],@out[3]
982	vmovdqu	@out[4],0x40($offload)
983	vpxor	$zero,@out[4],@out[4]
984	vmovdqu	@out[5],0x50($offload)
985	vpxor	$zero,@out[5],@out[5]
986	vmovdqu	@out[6],0x60($offload)
987	vpxor	$zero,@out[6],@out[6]
988	vmovdqu	@out[7],0x70($offload)
989	vpxor	$zero,@out[7],@out[7]
990	xor	\$0x80,$offload
991	mov	\$1,$one			# constant of 1
992	jmp	.Loop_dec8x
993
994.align	32
995.Loop_dec8x:
996___
997for($i=0;$i<8;$i++) {
998my $rndkey=($i&1)?$rndkey0:$rndkey1;
999$code.=<<___;
1000	vaesdec		$rndkey,@out[0],@out[0]
1001	 cmp		32+4*$i(%rsp),$one
1002___
1003$code.=<<___ if ($i);
1004	 mov		64+8*$i(%rsp),$offset
1005___
1006$code.=<<___;
1007	vaesdec		$rndkey,@out[1],@out[1]
1008	prefetcht0	31(@ptr[$i])			# prefetch input
1009	vaesdec		$rndkey,@out[2],@out[2]
1010___
1011$code.=<<___ if ($i>1);
1012	prefetcht0	15(@ptr[$i-2])			# prefetch output
1013___
1014$code.=<<___;
1015	vaesdec		$rndkey,@out[3],@out[3]
1016	 lea		(@ptr[$i],$offset),$offset
1017	 cmovge		%rsp,@ptr[$i]			# cancel input
1018	vaesdec		$rndkey,@out[4],@out[4]
1019	 cmovg		%rsp,$offset			# sink output
1020	vaesdec		$rndkey,@out[5],@out[5]
1021	 sub		@ptr[$i],$offset
1022	vaesdec		$rndkey,@out[6],@out[6]
1023	 vmovdqu	16(@ptr[$i]),@inp[$i%4]		# load input
1024	 mov		$offset,64+8*$i(%rsp)
1025	vaesdec		$rndkey,@out[7],@out[7]
1026	vmovups		`16*(3+$i)-0x78`($key),$rndkey
1027	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
1028___
1029$code.=<<___ if ($i<4);
1030	 vmovdqu	@inp[$i%4],`128+16*$i`(%rsp)	# off-load
1031___
1032}
1033$code.=<<___;
1034	 vmovdqu	32(%rsp),$counters
1035	prefetcht0	15(@ptr[$i-2])			# prefetch output
1036	prefetcht0	15(@ptr[$i-1])
1037	cmp	\$11,$rounds
1038	jb	.Ldec8x_tail
1039
1040	vaesdec		$rndkey1,@out[0],@out[0]
1041	vaesdec		$rndkey1,@out[1],@out[1]
1042	vaesdec		$rndkey1,@out[2],@out[2]
1043	vaesdec		$rndkey1,@out[3],@out[3]
1044	vaesdec		$rndkey1,@out[4],@out[4]
1045	vaesdec		$rndkey1,@out[5],@out[5]
1046	vaesdec		$rndkey1,@out[6],@out[6]
1047	vaesdec		$rndkey1,@out[7],@out[7]
1048	vmovups		0xb0-0x78($key),$rndkey1
1049
1050	vaesdec		$rndkey0,@out[0],@out[0]
1051	vaesdec		$rndkey0,@out[1],@out[1]
1052	vaesdec		$rndkey0,@out[2],@out[2]
1053	vaesdec		$rndkey0,@out[3],@out[3]
1054	vaesdec		$rndkey0,@out[4],@out[4]
1055	vaesdec		$rndkey0,@out[5],@out[5]
1056	vaesdec		$rndkey0,@out[6],@out[6]
1057	vaesdec		$rndkey0,@out[7],@out[7]
1058	vmovups		0xc0-0x78($key),$rndkey0
1059	je	.Ldec8x_tail
1060
1061	vaesdec		$rndkey1,@out[0],@out[0]
1062	vaesdec		$rndkey1,@out[1],@out[1]
1063	vaesdec		$rndkey1,@out[2],@out[2]
1064	vaesdec		$rndkey1,@out[3],@out[3]
1065	vaesdec		$rndkey1,@out[4],@out[4]
1066	vaesdec		$rndkey1,@out[5],@out[5]
1067	vaesdec		$rndkey1,@out[6],@out[6]
1068	vaesdec		$rndkey1,@out[7],@out[7]
1069	vmovups		0xd0-0x78($key),$rndkey1
1070
1071	vaesdec		$rndkey0,@out[0],@out[0]
1072	vaesdec		$rndkey0,@out[1],@out[1]
1073	vaesdec		$rndkey0,@out[2],@out[2]
1074	vaesdec		$rndkey0,@out[3],@out[3]
1075	vaesdec		$rndkey0,@out[4],@out[4]
1076	vaesdec		$rndkey0,@out[5],@out[5]
1077	vaesdec		$rndkey0,@out[6],@out[6]
1078	vaesdec		$rndkey0,@out[7],@out[7]
1079	vmovups		0xe0-0x78($key),$rndkey0
1080
1081.Ldec8x_tail:
1082	vaesdec		$rndkey1,@out[0],@out[0]
1083	 vpxor		$zero,$zero,$zero
1084	vaesdec		$rndkey1,@out[1],@out[1]
1085	vaesdec		$rndkey1,@out[2],@out[2]
1086	 vpcmpgtd	$zero,$counters,$zero
1087	vaesdec		$rndkey1,@out[3],@out[3]
1088	vaesdec		$rndkey1,@out[4],@out[4]
1089	 vpaddd		$counters,$zero,$zero		# decrement counters
1090	 vmovdqu	48(%rsp),$counters
1091	vaesdec		$rndkey1,@out[5],@out[5]
1092	 mov		64(%rsp),$offset		# pre-load 1st offset
1093	vaesdec		$rndkey1,@out[6],@out[6]
1094	vaesdec		$rndkey1,@out[7],@out[7]
1095	vmovups		0x10-0x78($key),$rndkey1
1096
1097	vaesdeclast	$rndkey0,@out[0],@out[0]
1098	 vmovdqa	$zero,32(%rsp)			# update counters
1099	 vpxor		$zero,$zero,$zero
1100	vaesdeclast	$rndkey0,@out[1],@out[1]
1101	vpxor		0x00($offload),@out[0],@out[0]	# xor with IV
1102	vaesdeclast	$rndkey0,@out[2],@out[2]
1103	vpxor		0x10($offload),@out[1],@out[1]
1104	 vpcmpgtd	$zero,$counters,$zero
1105	vaesdeclast	$rndkey0,@out[3],@out[3]
1106	vpxor		0x20($offload),@out[2],@out[2]
1107	vaesdeclast	$rndkey0,@out[4],@out[4]
1108	vpxor		0x30($offload),@out[3],@out[3]
1109	 vpaddd		$zero,$counters,$counters	# decrement counters
1110	 vmovdqu	-0x78($key),$zero		# 0-round
1111	vaesdeclast	$rndkey0,@out[5],@out[5]
1112	vpxor		0x40($offload),@out[4],@out[4]
1113	vaesdeclast	$rndkey0,@out[6],@out[6]
1114	vpxor		0x50($offload),@out[5],@out[5]
1115	 vmovdqa	$counters,48(%rsp)		# update counters
1116	vaesdeclast	$rndkey0,@out[7],@out[7]
1117	vpxor		0x60($offload),@out[6],@out[6]
1118	vmovups		0x20-0x78($key),$rndkey0
1119
1120	vmovups		@out[0],-16(@ptr[0])		# write output
1121	 sub		$offset,@ptr[0]			# switch to input
1122	 vmovdqu	128+0(%rsp),@out[0]
1123	vpxor		0x70($offload),@out[7],@out[7]
1124	vmovups		@out[1],-16(@ptr[1])
1125	 sub		`64+1*8`(%rsp),@ptr[1]
1126	 vmovdqu	@out[0],0x00($offload)
1127	 vpxor		$zero,@out[0],@out[0]
1128	 vmovdqu	128+16(%rsp),@out[1]
1129	vmovups		@out[2],-16(@ptr[2])
1130	 sub		`64+2*8`(%rsp),@ptr[2]
1131	 vmovdqu	@out[1],0x10($offload)
1132	 vpxor		$zero,@out[1],@out[1]
1133	 vmovdqu	128+32(%rsp),@out[2]
1134	vmovups		@out[3],-16(@ptr[3])
1135	 sub		`64+3*8`(%rsp),@ptr[3]
1136	 vmovdqu	@out[2],0x20($offload)
1137	 vpxor		$zero,@out[2],@out[2]
1138	 vmovdqu	128+48(%rsp),@out[3]
1139	vmovups		@out[4],-16(@ptr[4])
1140	 sub		`64+4*8`(%rsp),@ptr[4]
1141	 vmovdqu	@out[3],0x30($offload)
1142	 vpxor		$zero,@out[3],@out[3]
1143	 vmovdqu	@inp[0],0x40($offload)
1144	 vpxor		@inp[0],$zero,@out[4]
1145	vmovups		@out[5],-16(@ptr[5])
1146	 sub		`64+5*8`(%rsp),@ptr[5]
1147	 vmovdqu	@inp[1],0x50($offload)
1148	 vpxor		@inp[1],$zero,@out[5]
1149	vmovups		@out[6],-16(@ptr[6])
1150	 sub		`64+6*8`(%rsp),@ptr[6]
1151	 vmovdqu	@inp[2],0x60($offload)
1152	 vpxor		@inp[2],$zero,@out[6]
1153	vmovups		@out[7],-16(@ptr[7])
1154	 sub		`64+7*8`(%rsp),@ptr[7]
1155	 vmovdqu	@inp[3],0x70($offload)
1156	 vpxor		@inp[3],$zero,@out[7]
1157
1158	xor	\$128,$offload
1159	dec	$num
1160	jnz	.Loop_dec8x
1161
1162	mov	16(%rsp),%rax			# original %rsp
1163	#mov	24(%rsp),$num
1164	#lea	`40*8`($inp),$inp
1165	#dec	$num
1166	#jnz	.Ldec8x_loop_grande
1167
1168.Ldec8x_done:
1169	vzeroupper
1170___
1171$code.=<<___ if ($win64);
1172	movaps	-0xd8(%rax),%xmm6
1173	movaps	-0xc8(%rax),%xmm7
1174	movaps	-0xb8(%rax),%xmm8
1175	movaps	-0xa8(%rax),%xmm9
1176	movaps	-0x98(%rax),%xmm10
1177	movaps	-0x88(%rax),%xmm11
1178	movaps	-0x78(%rax),%xmm12
1179	movaps	-0x68(%rax),%xmm13
1180	movaps	-0x58(%rax),%xmm14
1181	movaps	-0x48(%rax),%xmm15
1182___
1183$code.=<<___;
1184	mov	-48(%rax),%r15
1185	mov	-40(%rax),%r14
1186	mov	-32(%rax),%r13
1187	mov	-24(%rax),%r12
1188	mov	-16(%rax),%rbp
1189	mov	-8(%rax),%rbx
1190	lea	(%rax),%rsp
1191.Ldec8x_epilogue:
1192	ret
1193.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1194___
1195						}}}
1196
# Win64 structured exception handling support.  Everything in this block
# is appended to $code only when $win64 is set: the se_handler routine
# followed by the .pdata/.xdata records that reference it for each of the
# aesni_multi_cbc_* entry points.
1197if ($win64) {
1198# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1199#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Registers used below to name the four handler arguments listed above.
1200$rec="%rcx";
1201$frame="%rdx";
1202$context="%r8";
1203$disp="%r9";
1204
# se_handler compares context->Rip with the two labels stored in
# HandlerData[] (see the .xdata records at the end of this block):
#   - below HandlerData[0]: the frame base is taken from context->Rax;
#   - at or above HandlerData[1]: the frame base is taken from context->Rsp;
#   - in between: the saved stack pointer is loaded from 16(context->Rsp),
#     %rbx, %rbp and %r12-%r15 are read from just below it and written to
#     the context, and 20 quadwords starting at -56-10*16 from it are copied
#     to context.Xmm6 onward.
# All three paths then store Rsp/Rsi/Rdi into the context, copy the context
# (154 quadwords) to disp->ContextRecord, call RtlVirtualUnwind and return
# 1 (ExceptionContinueSearch).  The same heredoc also opens the .pdata
# section with the begin/end/info triples of the two non-AVX routines.
1205$code.=<<___;
1206.extern	__imp_RtlVirtualUnwind
1207.type	se_handler,\@abi-omnipotent
1208.align	16
1209se_handler:
1210	push	%rsi
1211	push	%rdi
1212	push	%rbx
1213	push	%rbp
1214	push	%r12
1215	push	%r13
1216	push	%r14
1217	push	%r15
1218	pushfq
1219	sub	\$64,%rsp
1220
1221	mov	120($context),%rax	# pull context->Rax
1222	mov	248($context),%rbx	# pull context->Rip
1223
1224	mov	8($disp),%rsi		# disp->ImageBase
1225	mov	56($disp),%r11		# disp->HandlerData
1226
1227	mov	0(%r11),%r10d		# HandlerData[0]
1228	lea	(%rsi,%r10),%r10	# prologue label
1229	cmp	%r10,%rbx		# context->Rip<.Lprologue
1230	jb	.Lin_prologue
1231
1232	mov	152($context),%rax	# pull context->Rsp
1233
1234	mov	4(%r11),%r10d		# HandlerData[1]
1235	lea	(%rsi,%r10),%r10	# epilogue label
1236	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1237	jae	.Lin_prologue
1238
1239	mov	16(%rax),%rax		# pull saved stack pointer
1240
1241	mov	-8(%rax),%rbx
1242	mov	-16(%rax),%rbp
1243	mov	-24(%rax),%r12
1244	mov	-32(%rax),%r13
1245	mov	-40(%rax),%r14
1246	mov	-48(%rax),%r15
1247	mov	%rbx,144($context)	# restore context->Rbx
1248	mov	%rbp,160($context)	# restore context->Rbp
1249	mov	%r12,216($context)	# restore cotnext->R12
1250	mov	%r13,224($context)	# restore cotnext->R13
1251	mov	%r14,232($context)	# restore cotnext->R14
1252	mov	%r15,240($context)	# restore cotnext->R15
1253
1254	lea	-56-10*16(%rax),%rsi
1255	lea	512($context),%rdi	# &context.Xmm6
1256	mov	\$20,%ecx
1257	.long	0xa548f3fc		# cld; rep movsq
1258
1259.Lin_prologue:
1260	mov	8(%rax),%rdi
1261	mov	16(%rax),%rsi
1262	mov	%rax,152($context)	# restore context->Rsp
1263	mov	%rsi,168($context)	# restore context->Rsi
1264	mov	%rdi,176($context)	# restore context->Rdi
1265
1266	mov	40($disp),%rdi		# disp->ContextRecord
1267	mov	$context,%rsi		# context
1268	mov	\$154,%ecx		# sizeof(CONTEXT)
1269	.long	0xa548f3fc		# cld; rep movsq
1270
1271	mov	$disp,%rsi
1272	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1273	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1274	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1275	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1276	mov	40(%rsi),%r10		# disp->ContextRecord
1277	lea	56(%rsi),%r11		# &disp->HandlerData
1278	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1279	mov	%r10,32(%rsp)		# arg5
1280	mov	%r11,40(%rsp)		# arg6
1281	mov	%r12,48(%rsp)		# arg7
1282	mov	%rcx,56(%rsp)		# arg8, (NULL)
1283	call	*__imp_RtlVirtualUnwind(%rip)
1284
1285	mov	\$1,%eax		# ExceptionContinueSearch
1286	add	\$64,%rsp
1287	popfq
1288	pop	%r15
1289	pop	%r14
1290	pop	%r13
1291	pop	%r12
1292	pop	%rbp
1293	pop	%rbx
1294	pop	%rdi
1295	pop	%rsi
1296	ret
1297.size	se_handler,.-se_handler
1298
1299.section	.pdata
1300.align	4
1301	.rva	.LSEH_begin_aesni_multi_cbc_encrypt
1302	.rva	.LSEH_end_aesni_multi_cbc_encrypt
1303	.rva	.LSEH_info_aesni_multi_cbc_encrypt
1304	.rva	.LSEH_begin_aesni_multi_cbc_decrypt
1305	.rva	.LSEH_end_aesni_multi_cbc_decrypt
1306	.rva	.LSEH_info_aesni_multi_cbc_decrypt
1307___
# .pdata triples for the *_avx routines, emitted only when $avx is set.
1308$code.=<<___ if ($avx);
1309	.rva	.LSEH_begin_aesni_multi_cbc_encrypt_avx
1310	.rva	.LSEH_end_aesni_multi_cbc_encrypt_avx
1311	.rva	.LSEH_info_aesni_multi_cbc_encrypt_avx
1312	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
1313	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
1314	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
1315___
# .xdata: one info record per routine.  Each names se_handler and supplies
# the body/epilogue label pair that the handler reads as HandlerData[0]
# and HandlerData[1].
1316$code.=<<___;
1317.section	.xdata
1318.align	8
1319.LSEH_info_aesni_multi_cbc_encrypt:
1320	.byte	9,0,0,0
1321	.rva	se_handler
1322	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
1323.LSEH_info_aesni_multi_cbc_decrypt:
1324	.byte	9,0,0,0
1325	.rva	se_handler
1326	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
1327___
# Matching info records for the *_avx routines, again only when $avx is set.
1328$code.=<<___ if ($avx);
1329.LSEH_info_aesni_multi_cbc_encrypt_avx:
1330	.byte	9,0,0,0
1331	.rva	se_handler
1332	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
1333.LSEH_info_aesni_multi_cbc_decrypt_avx:
1334	.byte	9,0,0,0
1335	.rva	se_handler
1336	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
1337___
1338}
1339####################################################################
1340
# rex(\@opcode, $dst, $src)
#
# Append a REX prefix byte to an instruction under construction when
# either register operand is numbered 8 or above.
#
#   \@opcode	reference to the array of opcode bytes collected so far;
#		the prefix, if one is needed, is pushed onto its end
#   $dst	register number that sets bit 0x04 (REX.R) when >= 8
#   $src	register number that sets bit 0x01 (REX.B) when >= 8
#
# Nothing is appended when both register numbers are below 8.  The array
# is reached through a lexical reference instead of aliasing a package
# typeglob with "local *opcode", so no global symbol is disturbed.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);	# 0x40 marks the byte as REX
    return;
}
1350
# aesni($line)
#
# Re-encode one AES-NI instruction, given in AT&T syntax, as a ".byte"
# directive.  Three operand shapes are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD	-> 66 [REX] 0f 3a df ModR/M imm
#   aes* %xmmS,%xmmD			-> 66 [REX] 0f 38 xx ModR/M
#   aes* off(%rsp),%xmmD		-> 66 [REX] 0f 38 xx ModR/M SIB disp
#
# Returns the ".byte\t..." string on success, undef when the operands
# match but the mnemonic is not in the opcode table, and $line unchanged
# when no pattern matches.
#
# For the %rsp form a displacement up to 0x7f is emitted as a single
# byte, exactly as before.  Larger displacements used to be masked to one
# byte, which the processor sign-extends and so addressed the wrong stack
# slot; they are now emitted with the four-byte displacement form.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# copy the captures before calling anything else
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# oct() handles 0x.. too
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$mnemonic}));
	my $disp=($off=~/^0/?oct($off):$off);
	push @opcode,0x44 if ($dst>=8);			# REX.R
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	if ($disp<=0x7f) {
	    push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M (disp8), SIB
	    push @opcode,$disp&0xff;
	}
	else {
	    push @opcode,0x84|(($dst&7)<<3),0x24;	# ModR/M (disp32), SIB
	    push @opcode,map { ($disp>>(8*$_))&0xff } (0..3);	# little-endian
	}
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1390
# Evaluate every `...` expression embedded in the assembly template and
# substitute its result.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand each AES-NI instruction with %xmm operands to aesni() for re-encoding.
# The leading \b keeps v-prefixed mnemonics such as vaesdec from matching,
# because there is no word boundary between the "v" and "aes".
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# Buffered write errors only surface at close, so a failure here must not
# be ignored; otherwise truncated output could pass for success.
close STDOUT or die "error closing STDOUT: $!";
1396