# xref: /freebsd/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer AES-NI procedures process several independent buffers
# in parallel by interleaving independent instructions.
#
# Cycles per byte for interleave factor 4:
#
#			asymptotic	measured
#			---------------------------
# Westmere		5.00/4=1.25	5.13/4=1.28
# Atom			15.0/4=3.75	?15.7/4=3.93
# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
# Haswell		4.44/4=1.11	4.44/4=1.11
# Bulldozer		5.75/4=1.44	5.76/4=1.44
#
# Cycles per byte for interleave factor 8 (not implemented for
# pre-AVX processors, where higher interleave factor incidentally
# doesn't result in improvement):
#
#			asymptotic	measured
#			---------------------------
# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
# Haswell		5.00/8=0.63	5.00/8=0.63
# Bulldozer		5.75/8=0.72	5.77/8=0.72
#
# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
#	suboptimally.

# Command line: [flavour] output.  If the first argument contains a dot it
# is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by a [nm]asm/mingw64 flavour or by an
# output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first in this script's own
# directory, then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx ends up as 0, 1 or 2 depending on the assembler version that is
# detected below.  The AVX (8x) code paths are generated only when it is
# non-zero.
$avx=0;

# GNU assembler, probed through the compiler driver named by $ENV{CC}.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# NASM, consulted only for Win64 builds when nothing was detected so far.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# Microsoft ml64, likewise Win64 only.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang/LLVM integrated assembler, again probed through $ENV{CC}.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the translator.
# A failure to start it is fatal: without the check the script would carry
# on and silently produce no output at all.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void aesni_multi_cbc_encrypt (
#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
#     const AES_KEY *key,
#     int num);		/* 1 or 2 */
#
# Register aliases used by the 4x (SSE) code below.  Each descriptor in
# the inp[] array is 40 bytes: input pointer at +0, output pointer at +8,
# block count at +16 and IV at +24.
$inp="%rdi";	# 1st arg
$key="%rsi";	# 2nd arg
$num="%edx";

# Per-lane input and output pointers.
@inptr=map("%r$_",(8..11));
@outptr=map("%r$_",(12..15));

# Two alternating round-key registers, four cipher states, four staged
# input blocks, then the lane counters, a scratch mask and the 0-round key.
($rndkey0,$rndkey1)=("%xmm0","%xmm1");
@out=map("%xmm$_",(2..5));
@inp=map("%xmm$_",(6..9));
($counters,$mask,$zero)=map("%xmm$_",(10..12));

($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");

# Section header and the public entry point aesni_multi_cbc_encrypt,
# declared as a three-argument function.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,\@function,3
.align	32
aesni_multi_cbc_encrypt:
.cfi_startproc
___
# Emitted only when the assembler can handle AVX.  Calls with $num >= 2
# branch to the 8x AVX code at _avx_cbc_enc_shortcut when bit 28 of the
# second OPENSSL_ia32cap_P word is set; all other calls continue with the
# 4x code at .Lenc_non_avx.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Lenc_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx
.align	16
.Lenc_non_avx:
___
# Prologue: %rax holds the entry %rsp while the six callee-saved
# general-purpose registers used below are pushed.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 in a 0xa8-byte area directly below the
# pushed registers, i.e. at -0xd8(%rax) .. -0x48(%rax).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Build the 64-byte-aligned scratch frame, remember the entry %rsp at
# 16(%rsp), load the 0-round key and bias $key by 0x78 and $inp by 2*40 so
# that later displacements stay small.  .Lenc4x_loop_grande starts one
# pass over four descriptors: the remaining $num is parked at 24(%rsp) and
# $num is cleared so it can accumulate the largest block count.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Lenc4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*2($inp),$inp

.Lenc4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup, generated once for each of the four lanes: fetch block
# count, input pointer, output pointer and IV from descriptor $i, track the
# maximum count in $num, store the count as the lane counter, and point
# the input at %rsp when the count is not positive.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*2`($inp),@inptr[$i]
	cmp	$num,$one
	mov	`40*$i+8-40*2`($inp),@outptr[$i]
	cmovg	$one,$num			# find maximum
	test	$one,$one
	movdqu	`40*$i+24-40*2`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Leave immediately when no lane has work.  Otherwise load the first two
# round keys and the round count (at 0xf0 from the original key pointer),
# XOR every IV with the 0-round key and with its lane's first input block,
# and enter .Loop_enc4x.
#
# At the top of each iteration $offset advances by 16 and $sink is set to
# 16(%rsp)-$offset, so that ($sink,$offset) addresses the input sink and
# -16($sink,$offset) the output sink of the stack layout.
#
# Fix: the input prefetches used to name @inptr[2] twice and never touched
# @inptr[3]; lane 3 is now prefetched like the others, matching the
# decrypt loop.
$code.=<<___;
	test	$num,$num
	jz	.Lenc4x_done

	movups	0x10-0x78($key),$rndkey1
	 pxor	$zero,@out[0]
	movups	0x20-0x78($key),$rndkey0
	 pxor	$zero,@out[1]
	mov	0xf0-0x78($key),$rounds
	 pxor	$zero,@out[2]
	movdqu	(@inptr[0]),@inp[0]		# load inputs
	 pxor	$zero,@out[3]
	movdqu	(@inptr[1]),@inp[1]
	 pxor	@inp[0],@out[0]
	movdqu	(@inptr[2]),@inp[2]
	 pxor	@inp[1],@out[1]
	movdqu	(@inptr[3]),@inp[3]
	 pxor	@inp[2],@out[2]
	 pxor	@inp[3],@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesenc		$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesenc		$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0x30-0x78($key),$rndkey1
___
# Four more rounds, one per generated iteration, alternating between the
# two round-key registers and reloading the one just used.  Iteration $i
# also compares lane $i's counter with 1: the input pointer is redirected
# to the sink when 1 >= counter, the output pointer when 1 > counter.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	 cmp		`32+4*$i`(%rsp),$one
	aesenc		$rndkey,@out[0]
	aesenc		$rndkey,@out[1]
	aesenc		$rndkey,@out[2]
	 cmovge		$sink,@inptr[$i]	# cancel input
	 cmovg		$sink,@outptr[$i]	# sink output
	aesenc		$rndkey,@out[3]
	movups		`0x40+16*$i-0x78`($key),$rndkey
___
}
# Rest of the 4x encrypt loop body.
#  - Prefetch the output locations and run the following rounds.
#  - Decrement the lane counters: $mask becomes all-ones in every lane
#    whose counter is greater than zero (pcmpgtd against a zeroed $zero)
#    and is added to $counters; $zero is then reloaded with the 0-round key.
#  - $rounds is compared with 11: below 11 jumps straight to the tail,
#    equal to 11 runs two more rounds first, anything else runs four more.
#  - .Lenc4x_tail performs the final two rounds while loading the next
#    input blocks and XORing them with the 0-round key, stores the four
#    ciphertext blocks, and XORs them into the pre-whitened inputs to form
#    the next CBC state.
#  - After $num iterations the entry %rsp and the parked $num are reloaded,
#    $inp advances by four descriptors and the outer loop repeats while the
#    decremented $num is non-zero.
$code.=<<___;
	 movdqa		$counters,$mask
	aesenc		$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesenc		$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0x80-0x78($key),$rndkey0
	 pxor		$zero,$zero

	aesenc		$rndkey1,@out[0]
	 pcmpgtd	$zero,$mask
	 movdqu		-0x78($key),$zero	# reload 0-round key
	aesenc		$rndkey1,@out[1]
	 paddd		$mask,$counters		# decrement counters
	 movdqa		$counters,32(%rsp)	# update counters
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xa0-0x78($key),$rndkey0

	jb	.Lenc4x_tail

	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0xb0-0x78($key),$rndkey1

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xc0-0x78($key),$rndkey0

	je	.Lenc4x_tail

	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0xd0-0x78($key),$rndkey1

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xe0-0x78($key),$rndkey0
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	 movdqu		(@inptr[0],$offset),@inp[0]
	movdqu		0x10-0x78($key),$rndkey1

	aesenclast	$rndkey0,@out[0]
	 movdqu		(@inptr[1],$offset),@inp[1]
	 pxor		$zero,@inp[0]
	aesenclast	$rndkey0,@out[1]
	 movdqu		(@inptr[2],$offset),@inp[2]
	 pxor		$zero,@inp[1]
	aesenclast	$rndkey0,@out[2]
	 movdqu		(@inptr[3],$offset),@inp[3]
	 pxor		$zero,@inp[2]
	aesenclast	$rndkey0,@out[3]
	movdqu		0x20-0x78($key),$rndkey0
	 pxor		$zero,@inp[3]

	movups		@out[0],-16(@outptr[0],$offset)
	 pxor		@inp[0],@out[0]
	movups		@out[1],-16(@outptr[1],$offset)
	 pxor		@inp[1],@out[1]
	movups		@out[2],-16(@outptr[2],$offset)
	 pxor		@inp[2],@out[2]
	movups		@out[3],-16(@outptr[3],$offset)
	 pxor		@inp[3],@out[3]

	dec	$num
	jnz	.Loop_enc4x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	mov	24(%rsp),$num

	#pxor	@inp[0],@out[0]
	#pxor	@inp[1],@out[1]
	#movdqu	@out[0],`40*0+24-40*2`($inp)	# output iv FIX ME!
	#pxor	@inp[2],@out[2]
	#movdqu	@out[1],`40*1+24-40*2`($inp)
	#pxor	@inp[3],@out[3]
	#movdqu	@out[2],`40*2+24-40*2`($inp)	# won't fix, let caller
	#movdqu	@out[3],`40*3+24-40*2`($inp)	# figure this out...

	lea	`40*4`($inp),$inp
	dec	$num
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
___
# Win64 only: restore %xmm6-%xmm12 from the save area addressed relative
# to the entry %rsp kept in %rax.  %xmm13-%xmm15 are not modified by the 4x
# code, so their reloads stay commented out.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
# Epilogue of aesni_multi_cbc_encrypt: reload the six callee-saved
# registers from below the entry %rsp, restore %rsp and return.  The same
# heredoc then opens the public entry point aesni_multi_cbc_decrypt.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc4x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt

.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,\@function,3
.align	32
aesni_multi_cbc_decrypt:
.cfi_startproc
___
# Emitted only when the assembler can handle AVX.  Calls with $num >= 2
# branch to the 8x AVX code at _avx_cbc_dec_shortcut when bit 28 of the
# second OPENSSL_ia32cap_P word is set; all other calls continue with the
# 4x code at .Ldec_non_avx.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Ldec_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx
.align	16
.Ldec_non_avx:
___
# Prologue: %rax holds the entry %rsp while the six callee-saved
# general-purpose registers used below are pushed.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 in a 0xa8-byte area directly below the
# pushed registers, i.e. at -0xd8(%rax) .. -0x48(%rax).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Build the 64-byte-aligned scratch frame, remember the entry %rsp at
# 16(%rsp), load the 0-round key and bias $key by 0x78 and $inp by 2*40 so
# that later displacements stay small.  .Ldec4x_loop_grande starts one
# pass over four descriptors: the remaining $num is parked at 24(%rsp) and
# $num is cleared so it can accumulate the largest block count.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Ldec4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*2($inp),$inp

.Ldec4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup, generated once for each of the four lanes.  It matches
# the encrypt path except that the IV is loaded into @inp[$i]; in the
# decrypt loop @inp[] carries the value XORed into the lane's next
# decrypted block.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*2`($inp),@inptr[$i]
	cmp	$num,$one
	mov	`40*$i+8-40*2`($inp),@outptr[$i]
	cmovg	$one,$num			# find maximum
	test	$one,$one
	movdqu	`40*$i+24-40*2`($inp),@inp[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Leave immediately when no lane has work.  Otherwise load the first two
# round keys and the round count, load each lane's first ciphertext block
# and XOR it with the 0-round key, and enter .Loop_dec4x.
#
# At the top of each iteration $offset advances by 16 and $sink is set to
# 16(%rsp)-$offset, so that ($sink,$offset) addresses the input sink and
# -16($sink,$offset) the output sink of the stack layout.
$code.=<<___;
	test	$num,$num
	jz	.Ldec4x_done

	movups	0x10-0x78($key),$rndkey1
	movups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds
	movdqu	(@inptr[0]),@out[0]		# load inputs
	movdqu	(@inptr[1]),@out[1]
	 pxor	$zero,@out[0]
	movdqu	(@inptr[2]),@out[2]
	 pxor	$zero,@out[1]
	movdqu	(@inptr[3]),@out[3]
	 pxor	$zero,@out[2]
	 pxor	$zero,@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_dec4x

.align	32
.Loop_dec4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesdec		$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesdec		$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0x30-0x78($key),$rndkey1
___
# Four more rounds, one per generated iteration, alternating between the
# two round-key registers and reloading the one just used.  Iteration $i
# also compares lane $i's counter with 1: the input pointer is redirected
# to the sink when 1 >= counter, the output pointer when 1 > counter.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	 cmp		`32+4*$i`(%rsp),$one
	aesdec		$rndkey,@out[0]
	aesdec		$rndkey,@out[1]
	aesdec		$rndkey,@out[2]
	 cmovge		$sink,@inptr[$i]	# cancel input
	 cmovg		$sink,@outptr[$i]	# sink output
	aesdec		$rndkey,@out[3]
	movups		`0x40+16*$i-0x78`($key),$rndkey
___
}
# Rest of the 4x decrypt loop body.
#  - Prefetch the output locations and run the following rounds.
#  - Decrement the lane counters: $mask becomes all-ones in every lane
#    whose counter is greater than zero and is added to $counters; $zero is
#    then reloaded with the 0-round key.
#  - $rounds is compared with 11: below 11 jumps straight to the tail,
#    equal to 11 runs two more rounds first, anything else runs four more.
#  - .Ldec4x_tail XORs the last round key into @inp[] (the previous
#    ciphertext block or IV) and passes the result to aesdeclast as its
#    key operand, so the CBC chaining XOR is folded into the final round.
#    @inp[] is then reloaded from -16(input,$offset), which is the
#    ciphertext block just consumed, and the next blocks are loaded and
#    pre-whitened with the 0-round key.
#  - After $num iterations the entry %rsp and the parked $num are reloaded,
#    $inp advances by four descriptors and the outer loop repeats while the
#    decremented $num is non-zero.
$code.=<<___;
	 movdqa		$counters,$mask
	aesdec		$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesdec		$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0x80-0x78($key),$rndkey0
	 pxor		$zero,$zero

	aesdec		$rndkey1,@out[0]
	 pcmpgtd	$zero,$mask
	 movdqu		-0x78($key),$zero	# reload 0-round key
	aesdec		$rndkey1,@out[1]
	 paddd		$mask,$counters		# decrement counters
	 movdqa		$counters,32(%rsp)	# update counters
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xa0-0x78($key),$rndkey0

	jb	.Ldec4x_tail

	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0xb0-0x78($key),$rndkey1

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xc0-0x78($key),$rndkey0

	je	.Ldec4x_tail

	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0xd0-0x78($key),$rndkey1

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xe0-0x78($key),$rndkey0
	jmp	.Ldec4x_tail

.align	32
.Ldec4x_tail:
	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	 pxor		$rndkey0,@inp[0]
	 pxor		$rndkey0,@inp[1]
	aesdec		$rndkey1,@out[3]
	movdqu		0x10-0x78($key),$rndkey1
	 pxor		$rndkey0,@inp[2]
	 pxor		$rndkey0,@inp[3]
	movdqu		0x20-0x78($key),$rndkey0

	aesdeclast	@inp[0],@out[0]
	aesdeclast	@inp[1],@out[1]
	 movdqu		-16(@inptr[0],$offset),@inp[0]	# load next IV
	 movdqu		-16(@inptr[1],$offset),@inp[1]
	aesdeclast	@inp[2],@out[2]
	aesdeclast	@inp[3],@out[3]
	 movdqu		-16(@inptr[2],$offset),@inp[2]
	 movdqu		-16(@inptr[3],$offset),@inp[3]

	movups		@out[0],-16(@outptr[0],$offset)
	 movdqu		(@inptr[0],$offset),@out[0]
	movups		@out[1],-16(@outptr[1],$offset)
	 movdqu		(@inptr[1],$offset),@out[1]
	 pxor		$zero,@out[0]
	movups		@out[2],-16(@outptr[2],$offset)
	 movdqu		(@inptr[2],$offset),@out[2]
	 pxor		$zero,@out[1]
	movups		@out[3],-16(@outptr[3],$offset)
	 movdqu		(@inptr[3],$offset),@out[3]
	 pxor		$zero,@out[2]
	 pxor		$zero,@out[3]

	dec	$num
	jnz	.Loop_dec4x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	mov	24(%rsp),$num

	lea	`40*4`($inp),$inp
	dec	$num
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
___
# Win64 only: restore %xmm6-%xmm12 from the save area addressed relative
# to the entry %rsp kept in %rax.  %xmm13-%xmm15 are not modified by the 4x
# code, so their reloads stay commented out.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
# Epilogue of aesni_multi_cbc_decrypt: reload the six callee-saved
# registers from below the entry %rsp, restore %rsp and return.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec4x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
___

650						if ($avx) {{{
# Register aliases for the 8x (AVX) code.  These lexicals shadow the 4x
# definitions of @out, @inp, $counters and $zero inside the enclosing block.
# Each lane has a single pointer register that is switched between input
# and output; $offload reuses the register the 4x code calls $sink.
my @ptr=map("%r$_",(8..15));
my $offload=$sink;

# Eight cipher states, four staging registers for freshly loaded input,
# then the lane counters and the 0-round key.
my @out=map("%xmm$_",(2..9));
my @inp=map("%xmm$_",(10..13));
my ($counters,$zero)=("%xmm14","%xmm15");

# Entry of the 8x AVX encrypt routine.  The symbol is not declared global;
# the 4x entry point jumps to the _avx_cbc_enc_shortcut label.  The
# prologue keeps the entry %rsp in %rax and pushes the six callee-saved
# general-purpose registers.
$code.=<<___;
.type	aesni_multi_cbc_encrypt_avx,\@function,3
.align	32
aesni_multi_cbc_encrypt_avx:
.cfi_startproc
_avx_cbc_enc_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 in a 0xa8-byte area directly below the
# pushed registers, i.e. at -0xd8(%rax) .. -0x48(%rax).  The 8x code uses
# all of them.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Build the 128-byte-aligned scratch frame, remember the entry %rsp at
# 16(%rsp), load the 0-round key and bias $key by 0x78 and $inp by 4*40.
# $num is halved and then cleared at .Lenc8x_loop_grande so it can
# accumulate the largest block count; the store that would park the outer
# count is commented out, so a single pass over eight descriptors is made.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters
	# +64	distances between inputs and outputs
	# +128	off-load area for @inp[0..3]

	sub	\$192,%rsp
	and	\$-128,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Lenc8x_body:
	vzeroupper
	vmovdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*4($inp),$inp
	shr	\$1,$num

.Lenc8x_loop_grande:
	#mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup, generated once for each of the eight lanes: fetch block
# count, input pointer, output pointer and IV from descriptor $i, track the
# maximum count in $num, store the lane counter, point the input at %rsp
# when the count is not positive, and record output-minus-input at
# 64+8*$i(%rsp).  Lane 0 uses $offset as the temporary; the other lanes
# use $offload.
for($i=0;$i<8;$i++) {
  my $temp = $i ? $offload : $offset;
    $code.=<<___;
	mov	`40*$i+16-40*4`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*4`($inp),@ptr[$i]	# input pointer
	cmp	$num,$one
	mov	`40*$i+8-40*4`($inp),$temp	# output pointer
	cmovg	$one,$num			# find maximum
	test	$one,$one
	vmovdqu	`40*$i+24-40*4`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
	sub	@ptr[$i],$temp			# distance between input and output
	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
___
}
# Leave immediately when no lane has work.  Otherwise load the first two
# round keys and the round count, point $offload at 128(%rsp), and XOR
# each lane's IV with its first input block pre-whitened with the 0-round
# key.  The four @inp registers are reused for lanes 4..7.
$code.=<<___;
	test	$num,$num
	jz	.Lenc8x_done

	vmovups	0x10-0x78($key),$rndkey1
	vmovups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds

	vpxor	(@ptr[0]),$zero,@inp[0]		# load inputs and xor with 0-round
	 lea	128(%rsp),$offload		# offload area
	vpxor	(@ptr[1]),$zero,@inp[1]
	vpxor	(@ptr[2]),$zero,@inp[2]
	vpxor	(@ptr[3]),$zero,@inp[3]
	 vpxor	@inp[0],@out[0],@out[0]
	vpxor	(@ptr[4]),$zero,@inp[0]
	 vpxor	@inp[1],@out[1],@out[1]
	vpxor	(@ptr[5]),$zero,@inp[1]
	 vpxor	@inp[2],@out[2],@out[2]
	vpxor	(@ptr[6]),$zero,@inp[2]
	 vpxor	@inp[3],@out[3],@out[3]
	vpxor	(@ptr[7]),$zero,@inp[3]
	 vpxor	@inp[0],@out[4],@out[4]
	mov	\$1,$one			# constant of 1
	 vpxor	@inp[1],@out[5],@out[5]
	 vpxor	@inp[2],@out[6],@out[6]
	 vpxor	@inp[3],@out[7],@out[7]
	jmp	.Loop_enc8x

.align	32
.Loop_enc8x:
___
# Eight generated iterations.  Each one runs a single AES round over all
# eight states, using $rndkey1 for even $i and $rndkey0 for odd $i, and
# reloads that register with a later round key.  Interleaved with the
# round, iteration $i services lane $i:
#  - compare its counter with 1 and redirect the input pointer (1 >= counter)
#    or the output address (1 > counter) to %rsp,
#  - load the next input block pre-whitened with the 0-round key,
#  - store the updated output-minus-input distance,
#  - switch @ptr[$i] from the input to the output address.
# Lane 0's distance is already in $offset on entry, hence the `if ($i)`.
# Lanes 0..3 spill their input block to the off-load area because @inp[]
# has only four registers.
for($i=0;$i<8;$i++) {
my $rndkey=($i&1)?$rndkey0:$rndkey1;
$code.=<<___;
	vaesenc		$rndkey,@out[0],@out[0]
	 cmp		32+4*$i(%rsp),$one
___
$code.=<<___ if ($i);
	 mov		64+8*$i(%rsp),$offset
___
$code.=<<___;
	vaesenc		$rndkey,@out[1],@out[1]
	prefetcht0	31(@ptr[$i])			# prefetch input
	vaesenc		$rndkey,@out[2],@out[2]
___
$code.=<<___ if ($i>1);
	prefetcht0	15(@ptr[$i-2])			# prefetch output
___
$code.=<<___;
	vaesenc		$rndkey,@out[3],@out[3]
	 lea		(@ptr[$i],$offset),$offset
	 cmovge		%rsp,@ptr[$i]			# cancel input
	vaesenc		$rndkey,@out[4],@out[4]
	 cmovg		%rsp,$offset			# sink output
	vaesenc		$rndkey,@out[5],@out[5]
	 sub		@ptr[$i],$offset
	vaesenc		$rndkey,@out[6],@out[6]
	 vpxor		16(@ptr[$i]),$zero,@inp[$i%4]	# load input and xor with 0-round
	 mov		$offset,64+8*$i(%rsp)
	vaesenc		$rndkey,@out[7],@out[7]
	vmovups		`16*(3+$i)-0x78`($key),$rndkey
	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
___
# The statement below used to lack its terminating semicolon.  That was
# legal only because it is the last statement in the loop body.
$code.=<<___ if ($i<4);
	 vmovdqu	@inp[$i%4],`16*$i`($offload)	# off-load
___
}
# Rest of the 8x encrypt loop body.  $i is 8 here, left over from the loop
# that generated the rounds above, so @ptr[$i-2] and @ptr[$i-1] name lanes
# 6 and 7.
#  - $rounds is compared with 11: below 11 jumps straight to the tail,
#    equal to 11 runs two more rounds first, anything else runs four more.
#  - .Lenc8x_tail performs the final two rounds.  Interleaved with them,
#    the eight 32-bit lane counters are decremented in two halves
#    (32(%rsp) and 48(%rsp)): a compare-greater-than-zero mask is added to
#    each, and $zero is then reloaded with the 0-round key.
#  - Every ciphertext block is stored at -16(@ptr[]), the pointer is
#    switched back to its input by subtracting the stored distance, and
#    the block is XORed with the next pre-whitened input (lanes 0..3 from
#    the off-load area, lanes 4..7 from @inp[]).
#  - The outer-loop tail is commented out, so control falls into
#    .Lenc8x_done after $num iterations.
$code.=<<___;
	 vmovdqu	32(%rsp),$counters
	prefetcht0	15(@ptr[$i-2])			# prefetch output
	prefetcht0	15(@ptr[$i-1])
	cmp	\$11,$rounds
	jb	.Lenc8x_tail

	vaesenc		$rndkey1,@out[0],@out[0]
	vaesenc		$rndkey1,@out[1],@out[1]
	vaesenc		$rndkey1,@out[2],@out[2]
	vaesenc		$rndkey1,@out[3],@out[3]
	vaesenc		$rndkey1,@out[4],@out[4]
	vaesenc		$rndkey1,@out[5],@out[5]
	vaesenc		$rndkey1,@out[6],@out[6]
	vaesenc		$rndkey1,@out[7],@out[7]
	vmovups		0xb0-0x78($key),$rndkey1

	vaesenc		$rndkey0,@out[0],@out[0]
	vaesenc		$rndkey0,@out[1],@out[1]
	vaesenc		$rndkey0,@out[2],@out[2]
	vaesenc		$rndkey0,@out[3],@out[3]
	vaesenc		$rndkey0,@out[4],@out[4]
	vaesenc		$rndkey0,@out[5],@out[5]
	vaesenc		$rndkey0,@out[6],@out[6]
	vaesenc		$rndkey0,@out[7],@out[7]
	vmovups		0xc0-0x78($key),$rndkey0
	je	.Lenc8x_tail

	vaesenc		$rndkey1,@out[0],@out[0]
	vaesenc		$rndkey1,@out[1],@out[1]
	vaesenc		$rndkey1,@out[2],@out[2]
	vaesenc		$rndkey1,@out[3],@out[3]
	vaesenc		$rndkey1,@out[4],@out[4]
	vaesenc		$rndkey1,@out[5],@out[5]
	vaesenc		$rndkey1,@out[6],@out[6]
	vaesenc		$rndkey1,@out[7],@out[7]
	vmovups		0xd0-0x78($key),$rndkey1

	vaesenc		$rndkey0,@out[0],@out[0]
	vaesenc		$rndkey0,@out[1],@out[1]
	vaesenc		$rndkey0,@out[2],@out[2]
	vaesenc		$rndkey0,@out[3],@out[3]
	vaesenc		$rndkey0,@out[4],@out[4]
	vaesenc		$rndkey0,@out[5],@out[5]
	vaesenc		$rndkey0,@out[6],@out[6]
	vaesenc		$rndkey0,@out[7],@out[7]
	vmovups		0xe0-0x78($key),$rndkey0

.Lenc8x_tail:
	vaesenc		$rndkey1,@out[0],@out[0]
	 vpxor		$zero,$zero,$zero
	vaesenc		$rndkey1,@out[1],@out[1]
	vaesenc		$rndkey1,@out[2],@out[2]
	 vpcmpgtd	$zero,$counters,$zero
	vaesenc		$rndkey1,@out[3],@out[3]
	vaesenc		$rndkey1,@out[4],@out[4]
	 vpaddd		$counters,$zero,$zero		# decrement counters
	 vmovdqu	48(%rsp),$counters
	vaesenc		$rndkey1,@out[5],@out[5]
	 mov		64(%rsp),$offset		# pre-load 1st offset
	vaesenc		$rndkey1,@out[6],@out[6]
	vaesenc		$rndkey1,@out[7],@out[7]
	vmovups		0x10-0x78($key),$rndkey1

	vaesenclast	$rndkey0,@out[0],@out[0]
	 vmovdqa	$zero,32(%rsp)			# update counters
	 vpxor		$zero,$zero,$zero
	vaesenclast	$rndkey0,@out[1],@out[1]
	vaesenclast	$rndkey0,@out[2],@out[2]
	 vpcmpgtd	$zero,$counters,$zero
	vaesenclast	$rndkey0,@out[3],@out[3]
	vaesenclast	$rndkey0,@out[4],@out[4]
	 vpaddd		$zero,$counters,$counters	# decrement counters
	 vmovdqu	-0x78($key),$zero		# 0-round
	vaesenclast	$rndkey0,@out[5],@out[5]
	vaesenclast	$rndkey0,@out[6],@out[6]
	 vmovdqa	$counters,48(%rsp)		# update counters
	vaesenclast	$rndkey0,@out[7],@out[7]
	vmovups		0x20-0x78($key),$rndkey0

	vmovups		@out[0],-16(@ptr[0])		# write output
	 sub		$offset,@ptr[0]			# switch to input
	 vpxor		0x00($offload),@out[0],@out[0]
	vmovups		@out[1],-16(@ptr[1])
	 sub		`64+1*8`(%rsp),@ptr[1]
	 vpxor		0x10($offload),@out[1],@out[1]
	vmovups		@out[2],-16(@ptr[2])
	 sub		`64+2*8`(%rsp),@ptr[2]
	 vpxor		0x20($offload),@out[2],@out[2]
	vmovups		@out[3],-16(@ptr[3])
	 sub		`64+3*8`(%rsp),@ptr[3]
	 vpxor		0x30($offload),@out[3],@out[3]
	vmovups		@out[4],-16(@ptr[4])
	 sub		`64+4*8`(%rsp),@ptr[4]
	 vpxor		@inp[0],@out[4],@out[4]
	vmovups		@out[5],-16(@ptr[5])
	 sub		`64+5*8`(%rsp),@ptr[5]
	 vpxor		@inp[1],@out[5],@out[5]
	vmovups		@out[6],-16(@ptr[6])
	 sub		`64+6*8`(%rsp),@ptr[6]
	 vpxor		@inp[2],@out[6],@out[6]
	vmovups		@out[7],-16(@ptr[7])
	 sub		`64+7*8`(%rsp),@ptr[7]
	 vpxor		@inp[3],@out[7],@out[7]

	dec	$num
	jnz	.Loop_enc8x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	#mov	24(%rsp),$num
	#lea	`40*8`($inp),$inp
	#dec	$num
	#jnz	.Lenc8x_loop_grande

.Lenc8x_done:
	vzeroupper
___
# Win64 only: restore %xmm6-%xmm15 from the save area addressed relative
# to the entry %rsp kept in %rax.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Epilogue of aesni_multi_cbc_encrypt_avx: reload the six callee-saved
# registers, restore %rsp and return.  The same heredoc then opens
# aesni_multi_cbc_decrypt_avx, reached from the 4x entry point through the
# _avx_cbc_dec_shortcut label, and emits its register-saving prologue.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc8x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx

.type	aesni_multi_cbc_decrypt_avx,\@function,3
.align	32
aesni_multi_cbc_decrypt_avx:
.cfi_startproc
_avx_cbc_dec_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 in a 0xa8-byte area directly below the
# pushed registers, i.e. at -0xd8(%rax) .. -0x48(%rax).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Build the scratch frame.  %rsp is aligned down to 256 bytes and then
# lowered by a further 192, which puts 192(%rsp) on a 256-byte boundary.
# The entry %rsp is kept at 16(%rsp), the 0-round key is loaded, $key is
# biased by 0x78 and $inp by 4*40.  $num is halved and then cleared at
# .Ldec8x_loop_grande so it can accumulate the largest block count; the
# store that would park the outer count is commented out.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters
	# +64	distances between inputs and outputs
	# +128	off-load area for @inp[0..3]
	# +192	IV/input offload

	sub	\$256,%rsp
	and	\$-256,%rsp
	sub	\$192,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Ldec8x_body:
	vzeroupper
	vmovdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*4($inp),$inp
	shr	\$1,$num

.Ldec8x_loop_grande:
	#mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup, generated once for each of the eight lanes.  It matches
# the 8x encrypt path, with one extra store that parks each lane's IV at
# 192+16*$i(%rsp) for the first chaining XOR.
for($i=0;$i<8;$i++) {
  my $temp = $i ? $offload : $offset;
    $code.=<<___;
	mov	`40*$i+16-40*4`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*4`($inp),@ptr[$i]	# input pointer
	cmp	$num,$one
	mov	`40*$i+8-40*4`($inp),$temp	# output pointer
	cmovg	$one,$num			# find maximum
	test	$one,$one
	vmovdqu	`40*$i+24-40*4`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
	sub	@ptr[$i],$temp			# distance between input and output
	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
	vmovdqu	@out[$i],`192+16*$i`(%rsp)	# offload IV
___
}
# Leave immediately when no lane has work.  Otherwise load the first two
# round keys and the round count and point $offload at 192+128(%rsp).
# Each lane's first ciphertext block is loaded, copied to the off-load
# area (it serves as chaining value for the following block) and XORed
# with the 0-round key.  `xor 0x80` then flips $offload to the other
# 128-byte half of the 256-byte-aligned area, which is 192(%rsp) where the
# IVs were parked.
$code.=<<___;
	test	$num,$num
	jz	.Ldec8x_done

	vmovups	0x10-0x78($key),$rndkey1
	vmovups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds
	 lea	192+128(%rsp),$offload		# offload area

	vmovdqu	(@ptr[0]),@out[0]		# load inputs
	vmovdqu	(@ptr[1]),@out[1]
	vmovdqu	(@ptr[2]),@out[2]
	vmovdqu	(@ptr[3]),@out[3]
	vmovdqu	(@ptr[4]),@out[4]
	vmovdqu	(@ptr[5]),@out[5]
	vmovdqu	(@ptr[6]),@out[6]
	vmovdqu	(@ptr[7]),@out[7]
	vmovdqu	@out[0],0x00($offload)		# offload inputs
	vpxor	$zero,@out[0],@out[0]		# xor inputs with 0-round
	vmovdqu	@out[1],0x10($offload)
	vpxor	$zero,@out[1],@out[1]
	vmovdqu	@out[2],0x20($offload)
	vpxor	$zero,@out[2],@out[2]
	vmovdqu	@out[3],0x30($offload)
	vpxor	$zero,@out[3],@out[3]
	vmovdqu	@out[4],0x40($offload)
	vpxor	$zero,@out[4],@out[4]
	vmovdqu	@out[5],0x50($offload)
	vpxor	$zero,@out[5],@out[5]
	vmovdqu	@out[6],0x60($offload)
	vpxor	$zero,@out[6],@out[6]
	vmovdqu	@out[7],0x70($offload)
	vpxor	$zero,@out[7],@out[7]
	xor	\$0x80,$offload
	mov	\$1,$one			# constant of 1
	jmp	.Loop_dec8x

.align	32
.Loop_dec8x:
___
# Eight generated iterations.  Each one runs a single AES decryption round
# over all eight states, using $rndkey1 for even $i and $rndkey0 for odd
# $i, and reloads that register with a later round key.  Interleaved with
# the round, iteration $i services lane $i:
#  - compare its counter with 1 and redirect the input pointer (1 >= counter)
#    or the output address (1 > counter) to %rsp,
#  - load the next ciphertext block, without whitening,
#  - store the updated output-minus-input distance,
#  - switch @ptr[$i] from the input to the output address.
# Lane 0's distance is already in $offset on entry, hence the `if ($i)`.
# Lanes 0..3 spill the loaded block to 128+16*$i(%rsp) because @inp[] has
# only four registers.
for($i=0;$i<8;$i++) {
my $rndkey=($i&1)?$rndkey0:$rndkey1;
$code.=<<___;
	vaesdec		$rndkey,@out[0],@out[0]
	 cmp		32+4*$i(%rsp),$one
___
$code.=<<___ if ($i);
	 mov		64+8*$i(%rsp),$offset
___
$code.=<<___;
	vaesdec		$rndkey,@out[1],@out[1]
	prefetcht0	31(@ptr[$i])			# prefetch input
	vaesdec		$rndkey,@out[2],@out[2]
___
$code.=<<___ if ($i>1);
	prefetcht0	15(@ptr[$i-2])			# prefetch output
___
$code.=<<___;
	vaesdec		$rndkey,@out[3],@out[3]
	 lea		(@ptr[$i],$offset),$offset
	 cmovge		%rsp,@ptr[$i]			# cancel input
	vaesdec		$rndkey,@out[4],@out[4]
	 cmovg		%rsp,$offset			# sink output
	vaesdec		$rndkey,@out[5],@out[5]
	 sub		@ptr[$i],$offset
	vaesdec		$rndkey,@out[6],@out[6]
	 vmovdqu	16(@ptr[$i]),@inp[$i%4]		# load input
	 mov		$offset,64+8*$i(%rsp)
	vaesdec		$rndkey,@out[7],@out[7]
	vmovups		`16*(3+$i)-0x78`($key),$rndkey
	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
___
$code.=<<___ if ($i<4);
	 vmovdqu	@inp[$i%4],`128+16*$i`(%rsp)	# off-load
___
}
1103$code.=<<___;
1104	 vmovdqu	32(%rsp),$counters
1105	prefetcht0	15(@ptr[$i-2])			# prefetch output
1106	prefetcht0	15(@ptr[$i-1])
1107	cmp	\$11,$rounds
1108	jb	.Ldec8x_tail
1109
1110	vaesdec		$rndkey1,@out[0],@out[0]
1111	vaesdec		$rndkey1,@out[1],@out[1]
1112	vaesdec		$rndkey1,@out[2],@out[2]
1113	vaesdec		$rndkey1,@out[3],@out[3]
1114	vaesdec		$rndkey1,@out[4],@out[4]
1115	vaesdec		$rndkey1,@out[5],@out[5]
1116	vaesdec		$rndkey1,@out[6],@out[6]
1117	vaesdec		$rndkey1,@out[7],@out[7]
1118	vmovups		0xb0-0x78($key),$rndkey1
1119
1120	vaesdec		$rndkey0,@out[0],@out[0]
1121	vaesdec		$rndkey0,@out[1],@out[1]
1122	vaesdec		$rndkey0,@out[2],@out[2]
1123	vaesdec		$rndkey0,@out[3],@out[3]
1124	vaesdec		$rndkey0,@out[4],@out[4]
1125	vaesdec		$rndkey0,@out[5],@out[5]
1126	vaesdec		$rndkey0,@out[6],@out[6]
1127	vaesdec		$rndkey0,@out[7],@out[7]
1128	vmovups		0xc0-0x78($key),$rndkey0
1129	je	.Ldec8x_tail
1130
1131	vaesdec		$rndkey1,@out[0],@out[0]
1132	vaesdec		$rndkey1,@out[1],@out[1]
1133	vaesdec		$rndkey1,@out[2],@out[2]
1134	vaesdec		$rndkey1,@out[3],@out[3]
1135	vaesdec		$rndkey1,@out[4],@out[4]
1136	vaesdec		$rndkey1,@out[5],@out[5]
1137	vaesdec		$rndkey1,@out[6],@out[6]
1138	vaesdec		$rndkey1,@out[7],@out[7]
1139	vmovups		0xd0-0x78($key),$rndkey1
1140
1141	vaesdec		$rndkey0,@out[0],@out[0]
1142	vaesdec		$rndkey0,@out[1],@out[1]
1143	vaesdec		$rndkey0,@out[2],@out[2]
1144	vaesdec		$rndkey0,@out[3],@out[3]
1145	vaesdec		$rndkey0,@out[4],@out[4]
1146	vaesdec		$rndkey0,@out[5],@out[5]
1147	vaesdec		$rndkey0,@out[6],@out[6]
1148	vaesdec		$rndkey0,@out[7],@out[7]
1149	vmovups		0xe0-0x78($key),$rndkey0
1150
1151.Ldec8x_tail:
1152	vaesdec		$rndkey1,@out[0],@out[0]
1153	 vpxor		$zero,$zero,$zero
1154	vaesdec		$rndkey1,@out[1],@out[1]
1155	vaesdec		$rndkey1,@out[2],@out[2]
1156	 vpcmpgtd	$zero,$counters,$zero
1157	vaesdec		$rndkey1,@out[3],@out[3]
1158	vaesdec		$rndkey1,@out[4],@out[4]
1159	 vpaddd		$counters,$zero,$zero		# decrement counters
1160	 vmovdqu	48(%rsp),$counters
1161	vaesdec		$rndkey1,@out[5],@out[5]
1162	 mov		64(%rsp),$offset		# pre-load 1st offset
1163	vaesdec		$rndkey1,@out[6],@out[6]
1164	vaesdec		$rndkey1,@out[7],@out[7]
1165	vmovups		0x10-0x78($key),$rndkey1
1166
1167	vaesdeclast	$rndkey0,@out[0],@out[0]
1168	 vmovdqa	$zero,32(%rsp)			# update counters
1169	 vpxor		$zero,$zero,$zero
1170	vaesdeclast	$rndkey0,@out[1],@out[1]
1171	vpxor		0x00($offload),@out[0],@out[0]	# xor with IV
1172	vaesdeclast	$rndkey0,@out[2],@out[2]
1173	vpxor		0x10($offload),@out[1],@out[1]
1174	 vpcmpgtd	$zero,$counters,$zero
1175	vaesdeclast	$rndkey0,@out[3],@out[3]
1176	vpxor		0x20($offload),@out[2],@out[2]
1177	vaesdeclast	$rndkey0,@out[4],@out[4]
1178	vpxor		0x30($offload),@out[3],@out[3]
1179	 vpaddd		$zero,$counters,$counters	# decrement counters
1180	 vmovdqu	-0x78($key),$zero		# 0-round
1181	vaesdeclast	$rndkey0,@out[5],@out[5]
1182	vpxor		0x40($offload),@out[4],@out[4]
1183	vaesdeclast	$rndkey0,@out[6],@out[6]
1184	vpxor		0x50($offload),@out[5],@out[5]
1185	 vmovdqa	$counters,48(%rsp)		# update counters
1186	vaesdeclast	$rndkey0,@out[7],@out[7]
1187	vpxor		0x60($offload),@out[6],@out[6]
1188	vmovups		0x20-0x78($key),$rndkey0
1189
1190	vmovups		@out[0],-16(@ptr[0])		# write output
1191	 sub		$offset,@ptr[0]			# switch to input
1192	 vmovdqu	128+0(%rsp),@out[0]
1193	vpxor		0x70($offload),@out[7],@out[7]
1194	vmovups		@out[1],-16(@ptr[1])
1195	 sub		`64+1*8`(%rsp),@ptr[1]
1196	 vmovdqu	@out[0],0x00($offload)
1197	 vpxor		$zero,@out[0],@out[0]
1198	 vmovdqu	128+16(%rsp),@out[1]
1199	vmovups		@out[2],-16(@ptr[2])
1200	 sub		`64+2*8`(%rsp),@ptr[2]
1201	 vmovdqu	@out[1],0x10($offload)
1202	 vpxor		$zero,@out[1],@out[1]
1203	 vmovdqu	128+32(%rsp),@out[2]
1204	vmovups		@out[3],-16(@ptr[3])
1205	 sub		`64+3*8`(%rsp),@ptr[3]
1206	 vmovdqu	@out[2],0x20($offload)
1207	 vpxor		$zero,@out[2],@out[2]
1208	 vmovdqu	128+48(%rsp),@out[3]
1209	vmovups		@out[4],-16(@ptr[4])
1210	 sub		`64+4*8`(%rsp),@ptr[4]
1211	 vmovdqu	@out[3],0x30($offload)
1212	 vpxor		$zero,@out[3],@out[3]
1213	 vmovdqu	@inp[0],0x40($offload)
1214	 vpxor		@inp[0],$zero,@out[4]
1215	vmovups		@out[5],-16(@ptr[5])
1216	 sub		`64+5*8`(%rsp),@ptr[5]
1217	 vmovdqu	@inp[1],0x50($offload)
1218	 vpxor		@inp[1],$zero,@out[5]
1219	vmovups		@out[6],-16(@ptr[6])
1220	 sub		`64+6*8`(%rsp),@ptr[6]
1221	 vmovdqu	@inp[2],0x60($offload)
1222	 vpxor		@inp[2],$zero,@out[6]
1223	vmovups		@out[7],-16(@ptr[7])
1224	 sub		`64+7*8`(%rsp),@ptr[7]
1225	 vmovdqu	@inp[3],0x70($offload)
1226	 vpxor		@inp[3],$zero,@out[7]
1227
1228	xor	\$128,$offload
1229	dec	$num
1230	jnz	.Loop_dec8x
1231
1232	mov	16(%rsp),%rax			# original %rsp
1233.cfi_def_cfa	%rax,8
1234	#mov	24(%rsp),$num
1235	#lea	`40*8`($inp),$inp
1236	#dec	$num
1237	#jnz	.Ldec8x_loop_grande
1238
1239.Ldec8x_done:
1240	vzeroupper
1241___
1242$code.=<<___ if ($win64);
1243	movaps	-0xd8(%rax),%xmm6
1244	movaps	-0xc8(%rax),%xmm7
1245	movaps	-0xb8(%rax),%xmm8
1246	movaps	-0xa8(%rax),%xmm9
1247	movaps	-0x98(%rax),%xmm10
1248	movaps	-0x88(%rax),%xmm11
1249	movaps	-0x78(%rax),%xmm12
1250	movaps	-0x68(%rax),%xmm13
1251	movaps	-0x58(%rax),%xmm14
1252	movaps	-0x48(%rax),%xmm15
1253___
1254$code.=<<___;
1255	mov	-48(%rax),%r15
1256.cfi_restore	%r15
1257	mov	-40(%rax),%r14
1258.cfi_restore	%r14
1259	mov	-32(%rax),%r13
1260.cfi_restore	%r13
1261	mov	-24(%rax),%r12
1262.cfi_restore	%r12
1263	mov	-16(%rax),%rbp
1264.cfi_restore	%rbp
1265	mov	-8(%rax),%rbx
1266.cfi_restore	%rbx
1267	lea	(%rax),%rsp
1268.cfi_def_cfa_register	%rsp
1269.Ldec8x_epilogue:
1270	ret
1271.cfi_endproc
1272.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1273___
1274						}}}
1275
if ($win64) {
# Win64 only: emit the structured-exception handler shared by every entry
# point in this module, followed by the .pdata/.xdata records that refer
# to it.  The handler is entered with the following C-level signature:
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# The four arguments arrive in the registers named here; only $context
# and $disp are interpolated into the assembly text below.
($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");

# se_handler proper.  HandlerData[0] and HandlerData[1] are the "body" and
# "epilogue" label RVAs listed in the .xdata records further down.  When
# context->Rip lies between them, the handler follows the stack pointer
# saved at 16(%rsp), reloads %rbx/%rbp/%r12-%r15 from just below it into
# the CONTEXT and copies 20 quadwords to context+512 (&context.Xmm6).  In
# every case it then fixes up Rsp/Rsi/Rdi, copies the CONTEXT into
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	16(%rax),%rax		# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

___

# .pdata: one begin/end/info RVA triple per function.  The AVX entry
# points get their triples only when the AVX code path was generated.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt
	.rva	.LSEH_end_aesni_multi_cbc_encrypt
	.rva	.LSEH_info_aesni_multi_cbc_encrypt
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt
	.rva	.LSEH_end_aesni_multi_cbc_decrypt
	.rva	.LSEH_info_aesni_multi_cbc_decrypt
___
if ($avx) {
$code.=<<___;
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
___
}

# .xdata: each info record names se_handler and supplies the two label
# RVAs the handler reads back as HandlerData[0] and HandlerData[1].
# NOTE(review): the leading ".byte 9,0,0,0" is presumably the UNWIND_INFO
# header (version 1 with a handler flag set) - confirm against the x64
# unwind-data documentation.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_multi_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
___
if ($avx) {
$code.=<<___;
.LSEH_info_aesni_multi_cbc_encrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
___
}
}
1418####################################################################
1419
# rex OPCODE_REF, REG, RM
#
# Append a REX prefix byte to the array referenced by OPCODE_REF when at
# least one of the two register numbers is 8 or above; otherwise leave
# the array untouched.
#   REG >= 8 contributes bit 0x04 (callers put REG into ModR/M bits 3-5);
#   RM  >= 8 contributes bit 0x01 (callers put RM into ModR/M bits 0-2).
# The byte pushed is those bits OR-ed with 0x40.
sub rex {
  my ($bytes,$reg,$rm)=@_;
  my $prefix=($reg>=8 ? 0x04 : 0)|($rm>=8 ? 0x01 : 0);

    push @$bytes,$prefix|0x40	if($prefix);
}
1429
# aesni LINE
#
# Hand-assemble one legacy (non-VEX) AES-NI instruction.  LINE is the text
# of a single instruction.  When it has one of the shapes below, the
# return value is a ".byte" directive listing the encoded bytes in
# decimal; in every other case LINE is returned unchanged:
#
#   aeskeygenassist \$imm,%xmmS,%xmmD
#   aesimc|aesenc|aesenclast|aesdec|aesdeclast %xmmS,%xmmD
#   aesenc|aesenclast|aesdec|aesdeclast disp(%rsp),%xmmD
#
# A line that matches a shape but uses a mnemonic missing from the opcode
# table is also returned unchanged, so the instruction reaches the
# assembler instead of being replaced by an empty string by the caller's
# s///e.
#
# For the disp(%rsp) shape a displacement up to 0x7f is emitted as an
# 8-bit displacement; anything larger uses the 32-bit form, because an
# 8-bit displacement is sign-extended and would address below %rsp.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);			# prefix common to every encoding here

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# imm8; oct() handles 0x.. and 0..
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: pass the line through untouched
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: pass the line through untouched
	return $line if (!defined($opcodelet{$mnemonic}));
	# an empty displacement, as in "(%rsp)", counts as 0
	my $disp=int(($off=~/^0/?oct($off):$off)||0);
	push @opcode,0x44 if ($dst>=8);		# REX with the 0x04 bit for %xmm8+
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	if ($disp<=0x7f) {
	    push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M (disp8), SIB
	    push @opcode,$disp;
	}
	else {
	    push @opcode,0x84|(($dst&7)<<3),0x24;	# ModR/M (disp32), SIB
	    push @opcode,map { ($disp>>(8*$_))&0xff } (0..3);	# little-endian
	}
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1469
# Post-process the accumulated assembly text, then write it out.
#  1. Every `expr` span is replaced by the result of evaluating expr as
#     Perl.
#  2. Every instruction whose mnemonic starts with "aes" at a word
#     boundary and that names an %xmm register is handed to aesni().  The
#     \b keeps the VEX-encoded vaes* instructions untouched.  Anything
#     after the last %xmm operand on the line is dropped by the
#     substitution.
for ($code) {
    s/\`([^\`]*)\`/eval($1)/gem;
    s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
}

print($code);
# A failed close is fatal, so buffered write errors are not lost.
close(STDOUT) || die "error closing STDOUT: $!";
1475