xref: /freebsd/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# Multi-buffer AES-NI procedures process several independent buffers
18# in parallel by interleaving independent instructions.
19#
20# Cycles per byte for interleave factor 4:
21#
22#			asymptotic	measured
23#			---------------------------
24# Westmere		5.00/4=1.25	5.13/4=1.28
25# Atom			15.0/4=3.75	?15.7/4=3.93
26# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
27# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
28# Haswell		4.44/4=1.11	4.44/4=1.11
29# Bulldozer		5.75/4=1.44	5.76/4=1.44
30#
31# Cycles per byte for interleave factor 8 (not implemented for
32# pre-AVX processors, where higher interleave factor incidentally
33# doesn't result in improvement):
34#
35#			asymptotic	measured
36#			---------------------------
37# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
38# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
39# Haswell		5.00/8=0.63	5.00/8=0.63
40# Bulldozer		5.75/8=0.72	5.77/8=0.72
41#
42# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
43#	suboptimally.
44
45# $output is the last argument if it looks like a file (it has an extension)
46# $flavour is the first argument if it doesn't look like a file
# Both variables stay undef when the corresponding argument is absent.
47$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
48$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
49
# $win64 is set for nasm/masm/mingw64 flavours or when the output file is
# named *.asm; it gates the %xmm6-%xmm15 save/restore sequences below.
50$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
51
# Take the directory part of $0 (up to the last '/' or '\') and look for the
# x86_64-xlate.pl translator next to this script, then under
# ../../perlasm/; give up if neither file exists.
52$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
53( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
54( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
55die "can't locate x86_64-xlate.pl";
56
# Make the same two directories searchable for the helper library.
# NOTE(review): pointer_size() and pointer_register() are not defined in
# this file; presumably x86_64-support.pl provides them -- confirm.
57push(@INC,"${dir}","${dir}../../perlasm");
58require "x86_64-support.pl";
59
# Pointer width for the selected flavour; used to compute the field
# offsets inside each buffer descriptor.
60$ptr_size=&pointer_size($flavour);
61
# $avx records whether the assembler in use is recent enough for the AVX
# code path.  Each probe assigns the sum of two version comparisons (0, 1
# or 2); later probes run only while $avx is still 0.  The code visible in
# this file only tests $avx for truth.
62$avx=0;
63
# GNU assembler, queried through the compiler driver named by $ENV{CC}.
64if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66	$avx = ($1>=2.19) + ($1>=2.22);
67}
68
# NASM, only considered for Win64 builds that ask for nasm.
69if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
71	$avx = ($1>=2.09) + ($1>=2.10);
72}
73
# Microsoft ml64, only considered for Win64 builds that ask for masm/ml64.
74if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
75	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
76	$avx = ($1>=10) + ($1>=11);
77}
78
# clang / LLVM-based compilers, identified from the `$CC -v` banner.
79if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
80	$avx = ($2>=3.0) + ($2>3.0);
81}
82
# Start the translator under the same perl interpreter ($^X) with the
# flavour and output file as arguments, and alias STDOUT to the pipe so
# that anything printed to STDOUT is fed to the translator.
83open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
84    or die "can't call $xlate: $!";
85*STDOUT=*OUT;
86
87# void aesni_multi_cbc_encrypt (
88#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
89#     const AES_KEY *key,
90#     int num);		/* 1 or 2 */
91#
# Argument registers of the generated routines.
92$inp="%rdi";	# 1st arg
93$key="%rsi";	# 2nd arg
94$num="%edx";
95
# Size in bytes of one buffer descriptor.  The code below reads the input
# pointer at offset 0, the output pointer at $ptr_size, the 32-bit block
# count at 2*$ptr_size and the 16-byte IV at 2*$ptr_size+8.
96$inp_elm_size=2*$ptr_size+8+16;
97
# Per-lane input and output pointers for the 4-lane code path.
98@inptr=map("%r$_",(8..11));
99@outptr=map("%r$_",(12..15));
100
# Two round-key registers used alternately, four cipher states (@out),
# four input blocks (@inp), the packed per-lane block counters, a scratch
# mask and the register holding the 0-round key.
101($rndkey0,$rndkey1)=("%xmm0","%xmm1");
102@out=map("%xmm$_",(2..5));
103@inp=map("%xmm$_",(6..9));
104($counters,$mask,$zero)=map("%xmm$_",(10..12));
105
# Scalar registers: round count, the constant 1 (also borrowed as a
# temporary), pointer to the on-stack sink, running byte offset.
106($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
107
# Section header and public entry label of the 4-lane CBC encrypt routine.
108$code.=<<___;
109.text
110
111.extern	OPENSSL_ia32cap_P
112
113.globl	aesni_multi_cbc_encrypt
114.type	aesni_multi_cbc_encrypt,\@function,3
115.align	32
116aesni_multi_cbc_encrypt:
117.cfi_startproc
118___
# Emitted only when the assembler supports AVX: if $num is at least 2 and
# bit 28 of the second OPENSSL_ia32cap_P word is set, jump to the 8-lane
# AVX implementation; otherwise fall through to the 4-lane code.
119$code.=<<___ if ($avx);
120	cmp	\$2,$num
121	jb	.Lenc_non_avx
122	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
123	test	\$`1<<28`,%ecx			# AVX bit
124	jnz	_avx_cbc_enc_shortcut
125	jmp	.Lenc_non_avx
126.align	16
127.Lenc_non_avx:
128___
# Keep the entry %rsp in %rax and push the six registers that the body
# uses as lane pointers, sink and offset.
129$code.=<<___;
130	mov	%rsp,%rax
131.cfi_def_cfa_register	%rax
132	push	%rbx
133.cfi_push	%rbx
134	push	%rbp
135.cfi_push	%rbp
136	push	%r12
137.cfi_push	%r12
138	push	%r13
139.cfi_push	%r13
140	push	%r14
141.cfi_push	%r14
142	push	%r15
143.cfi_push	%r15
144___
# Win64 only: reserve 0xa8 bytes below the pushed registers and save
# %xmm6-%xmm15 there.  The last three stores are addressed from %rax but
# land in the same area.
145$code.=<<___ if ($win64);
146	lea	-0xa8(%rsp),%rsp
147	movaps	%xmm6,(%rsp)
148	movaps	%xmm7,0x10(%rsp)
149	movaps	%xmm8,0x20(%rsp)
150	movaps	%xmm9,0x30(%rsp)
151	movaps	%xmm10,0x40(%rsp)
152	movaps	%xmm11,0x50(%rsp)
153	movaps	%xmm12,0x60(%rsp)
154	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
155	movaps	%xmm14,-0x58(%rax)
156	movaps	%xmm15,-0x48(%rax)
157___
# Carve out a 64-byte-aligned scratch frame, remember the entry %rsp in
# it, load the 0-round key and bias $key by 0x78 and $inp by two
# descriptors so that later displacements are relative to those biases.
# The outer loop saves $num in the frame and clears it so that it can be
# reused to hold the largest block count.
158$code.=<<___;
159	# stack layout
160	#
161	# +0	output sink
162	# +16	input sink [original %rsp and $num]
163	# +32	counters
164
165	sub	\$48,%rsp
166	and	\$-64,%rsp
167	mov	%rax,16(%rsp)			# original %rsp
168.cfi_cfa_expression	%rsp+16,deref,+8
169
170.Lenc4x_body:
171	movdqu	($key),$zero			# 0-round key
172	lea	0x78($key),$key			# size optimization
173	lea	$inp_elm_size*2($inp),$inp
174
175.Lenc4x_loop_grande:
176	mov	$num,24(%rsp)			# original $num
177	xor	$num,$num
178___
# For each of the four lanes emit code that reads one descriptor: block
# count into $one, input and output pointers into the lane registers, IV
# into @out[$i].  $num accumulates the largest block count, the count is
# stored as the lane's counter at 32+4*$i(%rsp), and a lane whose count is
# not positive gets its input pointer redirected to %rsp.
# Displacements carry -$inp_elm_size*2 because $inp was advanced by two
# descriptors beforehand.
179for($i=0;$i<4;$i++) {
    # Names of the lane registers as used for pointer-sized loads.
180    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
181    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
182    $code.=<<___;
183	# borrow $one for number of blocks
184	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
185	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
186	cmp	$num,$one
187	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
188	cmovg	$one,$num			# find maximum
189	test	$one,$one
190	# load IV
191	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
192	mov	$one,`32+4*$i`(%rsp)		# initialize counters
193	cmovle	%rsp,@inptr[$i]			# cancel input
194___
195}
# Start of the 4-lane encrypt loop.
#
# Before the loop: leave if no lane has any blocks, load round keys 1 and
# 2 and the round count, fold the 0-round key and the first input block of
# every lane into the IVs held in @out, load the packed counters and clear
# $offset.
#
# At the top of every iteration $offset advances by 16 and $sink is set to
# 16(%rsp)-$offset, so a lane pointer replaced by $sink addresses the
# on-stack sink when indexed with $offset.  The first round is interleaved
# with prefetches of the next input block of each of the four lanes.
#
# Fix: the fourth prefetch used @inptr[2] a second time, so lane 3's input
# was never prefetched; it now uses @inptr[3], matching the decrypt loop.
196$code.=<<___;
197	test	$num,$num
198	jz	.Lenc4x_done
199
200	movups	0x10-0x78($key),$rndkey1
201	 pxor	$zero,@out[0]
202	movups	0x20-0x78($key),$rndkey0
203	 pxor	$zero,@out[1]
204	mov	0xf0-0x78($key),$rounds
205	 pxor	$zero,@out[2]
206	movdqu	(@inptr[0]),@inp[0]		# load inputs
207	 pxor	$zero,@out[3]
208	movdqu	(@inptr[1]),@inp[1]
209	 pxor	@inp[0],@out[0]
210	movdqu	(@inptr[2]),@inp[2]
211	 pxor	@inp[1],@out[1]
212	movdqu	(@inptr[3]),@inp[3]
213	 pxor	@inp[2],@out[2]
214	 pxor	@inp[3],@out[3]
215	movdqa	32(%rsp),$counters		# load counters
216	xor	$offset,$offset
217	jmp	.Loop_enc4x
218
219.align	32
220.Loop_enc4x:
221	add	\$16,$offset
222	lea	16(%rsp),$sink			# sink pointer
223	mov	\$1,$one			# constant of 1
224	sub	$offset,$sink
225
226	aesenc		$rndkey1,@out[0]
227	prefetcht0	31(@inptr[0],$offset)	# prefetch input
228	prefetcht0	31(@inptr[1],$offset)
229	aesenc		$rndkey1,@out[1]
230	prefetcht0	31(@inptr[2],$offset)
231	prefetcht0	31(@inptr[3],$offset)
232	aesenc		$rndkey1,@out[2]
233	aesenc		$rndkey1,@out[3]
234	movups		0x30-0x78($key),$rndkey1
235___
# Emit four more AES rounds, alternating between the two round-key
# registers and reloading each with the key at 0x40+16*$i after use.
# Round $i also handles lane $i's end-of-buffer check: the constant 1 is
# compared with the lane counter; when 1 >= counter the input pointer is
# replaced by $sink, and when 1 > counter the output pointer is as well.
236for($i=0;$i<4;$i++) {
237my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
238$code.=<<___;
239	 cmp		`32+4*$i`(%rsp),$one
240	aesenc		$rndkey,@out[0]
241	aesenc		$rndkey,@out[1]
242	aesenc		$rndkey,@out[2]
243	 cmovge		$sink,@inptr[$i]	# cancel input
244	 cmovg		$sink,@outptr[$i]	# sink output
245	aesenc		$rndkey,@out[3]
246	movups		`0x40+16*$i-0x78`($key),$rndkey
247___
248}
# Remainder of one 4-lane encrypt iteration.
#  - Two more rounds, interleaved with output prefetches and the counter
#    update: $mask becomes all-ones in lanes whose counter is positive and
#    is added to the counters, decrementing exactly those lanes.  $zero is
#    cleared for the comparison and then reloaded with the 0-round key.
#  - The round count is compared with 11 once; the two conditional jumps
#    that follow (the intervening SSE instructions are not expected to
#    touch the flags) skip two or four of the extra rounds.
#  - .Lenc4x_tail does the final rounds while loading the next input
#    block of each lane and xoring it with the 0-round key, stores the
#    finished blocks, and xors the next input into the state.
#  - After $num inner iterations the saved %rsp and outer count are
#    reloaded, $inp advances by four descriptors and the outer loop
#    repeats.  Writing the IVs back is left commented out.
249$code.=<<___;
250	 movdqa		$counters,$mask
251	aesenc		$rndkey0,@out[0]
252	prefetcht0	15(@outptr[0],$offset)	# prefetch output
253	prefetcht0	15(@outptr[1],$offset)
254	aesenc		$rndkey0,@out[1]
255	prefetcht0	15(@outptr[2],$offset)
256	prefetcht0	15(@outptr[3],$offset)
257	aesenc		$rndkey0,@out[2]
258	aesenc		$rndkey0,@out[3]
259	movups		0x80-0x78($key),$rndkey0
260	 pxor		$zero,$zero
261
262	aesenc		$rndkey1,@out[0]
263	 pcmpgtd	$zero,$mask
264	 movdqu		-0x78($key),$zero	# reload 0-round key
265	aesenc		$rndkey1,@out[1]
266	 paddd		$mask,$counters		# decrement counters
267	 movdqa		$counters,32(%rsp)	# update counters
268	aesenc		$rndkey1,@out[2]
269	aesenc		$rndkey1,@out[3]
270	movups		0x90-0x78($key),$rndkey1
271
272	cmp	\$11,$rounds
273
274	aesenc		$rndkey0,@out[0]
275	aesenc		$rndkey0,@out[1]
276	aesenc		$rndkey0,@out[2]
277	aesenc		$rndkey0,@out[3]
278	movups		0xa0-0x78($key),$rndkey0
279
280	jb	.Lenc4x_tail
281
282	aesenc		$rndkey1,@out[0]
283	aesenc		$rndkey1,@out[1]
284	aesenc		$rndkey1,@out[2]
285	aesenc		$rndkey1,@out[3]
286	movups		0xb0-0x78($key),$rndkey1
287
288	aesenc		$rndkey0,@out[0]
289	aesenc		$rndkey0,@out[1]
290	aesenc		$rndkey0,@out[2]
291	aesenc		$rndkey0,@out[3]
292	movups		0xc0-0x78($key),$rndkey0
293
294	je	.Lenc4x_tail
295
296	aesenc		$rndkey1,@out[0]
297	aesenc		$rndkey1,@out[1]
298	aesenc		$rndkey1,@out[2]
299	aesenc		$rndkey1,@out[3]
300	movups		0xd0-0x78($key),$rndkey1
301
302	aesenc		$rndkey0,@out[0]
303	aesenc		$rndkey0,@out[1]
304	aesenc		$rndkey0,@out[2]
305	aesenc		$rndkey0,@out[3]
306	movups		0xe0-0x78($key),$rndkey0
307	jmp	.Lenc4x_tail
308
309.align	32
310.Lenc4x_tail:
311	aesenc		$rndkey1,@out[0]
312	aesenc		$rndkey1,@out[1]
313	aesenc		$rndkey1,@out[2]
314	aesenc		$rndkey1,@out[3]
315	 movdqu		(@inptr[0],$offset),@inp[0]
316	movdqu		0x10-0x78($key),$rndkey1
317
318	aesenclast	$rndkey0,@out[0]
319	 movdqu		(@inptr[1],$offset),@inp[1]
320	 pxor		$zero,@inp[0]
321	aesenclast	$rndkey0,@out[1]
322	 movdqu		(@inptr[2],$offset),@inp[2]
323	 pxor		$zero,@inp[1]
324	aesenclast	$rndkey0,@out[2]
325	 movdqu		(@inptr[3],$offset),@inp[3]
326	 pxor		$zero,@inp[2]
327	aesenclast	$rndkey0,@out[3]
328	movdqu		0x20-0x78($key),$rndkey0
329	 pxor		$zero,@inp[3]
330
331	movups		@out[0],-16(@outptr[0],$offset)
332	 pxor		@inp[0],@out[0]
333	movups		@out[1],-16(@outptr[1],$offset)
334	 pxor		@inp[1],@out[1]
335	movups		@out[2],-16(@outptr[2],$offset)
336	 pxor		@inp[2],@out[2]
337	movups		@out[3],-16(@outptr[3],$offset)
338	 pxor		@inp[3],@out[3]
339
340	dec	$num
341	jnz	.Loop_enc4x
342
343	mov	16(%rsp),%rax			# original %rsp
344.cfi_def_cfa	%rax,8
345	mov	24(%rsp),$num
346
347	#pxor	@inp[0],@out[0]
348	#pxor	@inp[1],@out[1]
349	# output iv FIX ME!
350	#movdqu	@out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
351	#pxor	@inp[2],@out[2]
352	#movdqu	@out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
353	#pxor	@inp[3],@out[3]
354	#movdqu	@out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp)	# won't fix, let caller
355	#movdqu	@out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp)	# figure this out...
356
357	lea	`$inp_elm_size*4`($inp),$inp
358	dec	$num
359	jnz	.Lenc4x_loop_grande
360
361.Lenc4x_done:
362___
# Win64 only: restore %xmm6-%xmm12 from the save area, addressed relative
# to the entry %rsp held in %rax.  %xmm13-%xmm15 were saved by the
# prologue but their restores are commented out here.
363$code.=<<___ if ($win64);
364	movaps	-0xd8(%rax),%xmm6
365	movaps	-0xc8(%rax),%xmm7
366	movaps	-0xb8(%rax),%xmm8
367	movaps	-0xa8(%rax),%xmm9
368	movaps	-0x98(%rax),%xmm10
369	movaps	-0x88(%rax),%xmm11
370	movaps	-0x78(%rax),%xmm12
371	#movaps	-0x68(%rax),%xmm13
372	#movaps	-0x58(%rax),%xmm14
373	#movaps	-0x48(%rax),%xmm15
374___
# Reload the six pushed registers from below the entry %rsp, restore %rsp
# and return.  The same heredoc then opens the 4-lane CBC decrypt routine.
375$code.=<<___;
376	mov	-48(%rax),%r15
377.cfi_restore	%r15
378	mov	-40(%rax),%r14
379.cfi_restore	%r14
380	mov	-32(%rax),%r13
381.cfi_restore	%r13
382	mov	-24(%rax),%r12
383.cfi_restore	%r12
384	mov	-16(%rax),%rbp
385.cfi_restore	%rbp
386	mov	-8(%rax),%rbx
387.cfi_restore	%rbx
388	lea	(%rax),%rsp
389.cfi_def_cfa_register	%rsp
390.Lenc4x_epilogue:
391	ret
392.cfi_endproc
393.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
394
395.globl	aesni_multi_cbc_decrypt
396.type	aesni_multi_cbc_decrypt,\@function,3
397.align	32
398aesni_multi_cbc_decrypt:
399.cfi_startproc
400___
# Emitted only when the assembler supports AVX: if $num is at least 2 and
# bit 28 of the second OPENSSL_ia32cap_P word is set, jump to the 8-lane
# AVX decrypt implementation; otherwise fall through to the 4-lane code.
401$code.=<<___ if ($avx);
402	cmp	\$2,$num
403	jb	.Ldec_non_avx
404	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
405	test	\$`1<<28`,%ecx			# AVX bit
406	jnz	_avx_cbc_dec_shortcut
407	jmp	.Ldec_non_avx
408.align	16
409.Ldec_non_avx:
410___
# Keep the entry %rsp in %rax and push the six registers the body uses.
411$code.=<<___;
412	mov	%rsp,%rax
413.cfi_def_cfa_register	%rax
414	push	%rbx
415.cfi_push	%rbx
416	push	%rbp
417.cfi_push	%rbp
418	push	%r12
419.cfi_push	%r12
420	push	%r13
421.cfi_push	%r13
422	push	%r14
423.cfi_push	%r14
424	push	%r15
425.cfi_push	%r15
426___
# Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15, using the same
# layout as the encrypt prologue.
427$code.=<<___ if ($win64);
428	lea	-0xa8(%rsp),%rsp
429	movaps	%xmm6,(%rsp)
430	movaps	%xmm7,0x10(%rsp)
431	movaps	%xmm8,0x20(%rsp)
432	movaps	%xmm9,0x30(%rsp)
433	movaps	%xmm10,0x40(%rsp)
434	movaps	%xmm11,0x50(%rsp)
435	movaps	%xmm12,0x60(%rsp)
436	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
437	movaps	%xmm14,-0x58(%rax)
438	movaps	%xmm15,-0x48(%rax)
439___
# Carve out a 64-byte-aligned scratch frame, remember the entry %rsp in
# it, load the 0-round key and bias $key by 0x78 and $inp by two
# descriptors.  The outer loop saves $num in the frame and clears it so
# that it can hold the largest block count.
440$code.=<<___;
441	# stack layout
442	#
443	# +0	output sink
444	# +16	input sink [original %rsp and $num]
445	# +32	counters
446
447	sub	\$48,%rsp
448	and	\$-64,%rsp
449	mov	%rax,16(%rsp)			# original %rsp
450.cfi_cfa_expression	%rsp+16,deref,+8
451
452.Ldec4x_body:
453	movdqu	($key),$zero			# 0-round key
454	lea	0x78($key),$key			# size optimization
455	lea	$inp_elm_size*2($inp),$inp
456
457.Ldec4x_loop_grande:
458	mov	$num,24(%rsp)			# original $num
459	xor	$num,$num
460___
# Same descriptor walk as in the encrypt routine, except that the IV is
# loaded into @inp[$i] rather than @out[$i]: block count into $one,
# pointers into the lane registers, maximum count into $num, count stored
# as the lane counter, and the input pointer of a lane with a non-positive
# count redirected to %rsp.
461for($i=0;$i<4;$i++) {
    # Names of the lane registers as used for pointer-sized loads.
462    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
463    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
464    $code.=<<___;
465	# borrow $one for number of blocks
466	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
467	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
468	cmp	$num,$one
469	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
470	cmovg	$one,$num			# find maximum
471	test	$one,$one
472	# load IV
473	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
474	mov	$one,`32+4*$i`(%rsp)		# initialize counters
475	cmovle	%rsp,@inptr[$i]			# cancel input
476___
477}
# Start of the 4-lane decrypt loop.  Before the loop: leave if no lane has
# any blocks, load round keys 1 and 2 and the round count, load the first
# input block of every lane into @out and xor it with the 0-round key,
# load the packed counters and clear $offset.  Each iteration advances
# $offset by 16, recomputes $sink as 16(%rsp)-$offset and runs the first
# round interleaved with prefetches of all four lanes' next input block.
478$code.=<<___;
479	test	$num,$num
480	jz	.Ldec4x_done
481
482	movups	0x10-0x78($key),$rndkey1
483	movups	0x20-0x78($key),$rndkey0
484	mov	0xf0-0x78($key),$rounds
485	movdqu	(@inptr[0]),@out[0]		# load inputs
486	movdqu	(@inptr[1]),@out[1]
487	 pxor	$zero,@out[0]
488	movdqu	(@inptr[2]),@out[2]
489	 pxor	$zero,@out[1]
490	movdqu	(@inptr[3]),@out[3]
491	 pxor	$zero,@out[2]
492	 pxor	$zero,@out[3]
493	movdqa	32(%rsp),$counters		# load counters
494	xor	$offset,$offset
495	jmp	.Loop_dec4x
496
497.align	32
498.Loop_dec4x:
499	add	\$16,$offset
500	lea	16(%rsp),$sink			# sink pointer
501	mov	\$1,$one			# constant of 1
502	sub	$offset,$sink
503
504	aesdec		$rndkey1,@out[0]
505	prefetcht0	31(@inptr[0],$offset)	# prefetch input
506	prefetcht0	31(@inptr[1],$offset)
507	aesdec		$rndkey1,@out[1]
508	prefetcht0	31(@inptr[2],$offset)
509	prefetcht0	31(@inptr[3],$offset)
510	aesdec		$rndkey1,@out[2]
511	aesdec		$rndkey1,@out[3]
512	movups		0x30-0x78($key),$rndkey1
513___
# Emit four more decrypt rounds, alternating between the two round-key
# registers and reloading each with the key at 0x40+16*$i after use.
# Round $i also compares the constant 1 with lane $i's counter: when
# 1 >= counter the input pointer is replaced by $sink, and when
# 1 > counter the output pointer is as well.
514for($i=0;$i<4;$i++) {
515my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
516$code.=<<___;
517	 cmp		`32+4*$i`(%rsp),$one
518	aesdec		$rndkey,@out[0]
519	aesdec		$rndkey,@out[1]
520	aesdec		$rndkey,@out[2]
521	 cmovge		$sink,@inptr[$i]	# cancel input
522	 cmovg		$sink,@outptr[$i]	# sink output
523	aesdec		$rndkey,@out[3]
524	movups		`0x40+16*$i-0x78`($key),$rndkey
525___
526}
# Remainder of one 4-lane decrypt iteration.
#  - Two more rounds with output prefetches and the counter update:
#    $mask becomes all-ones in lanes whose counter is positive and is
#    added to the counters; $zero is cleared for the comparison and then
#    reloaded with the 0-round key.
#  - The round count is compared with 11 once and the two conditional
#    jumps that follow skip two or four of the extra rounds.
#  - .Ldec4x_tail xors the last round key into @inp (the IV or previous
#    input block) and passes the result to aesdeclast as its key operand.
#    @inp is then reloaded from the input at -16+$offset as the next IV,
#    the finished blocks are stored, and the next input blocks are loaded
#    into @out and xored with the 0-round key.
#  - After $num inner iterations the saved %rsp and outer count are
#    reloaded, $inp advances by four descriptors and the outer loop
#    repeats.
527$code.=<<___;
528	 movdqa		$counters,$mask
529	aesdec		$rndkey0,@out[0]
530	prefetcht0	15(@outptr[0],$offset)	# prefetch output
531	prefetcht0	15(@outptr[1],$offset)
532	aesdec		$rndkey0,@out[1]
533	prefetcht0	15(@outptr[2],$offset)
534	prefetcht0	15(@outptr[3],$offset)
535	aesdec		$rndkey0,@out[2]
536	aesdec		$rndkey0,@out[3]
537	movups		0x80-0x78($key),$rndkey0
538	 pxor		$zero,$zero
539
540	aesdec		$rndkey1,@out[0]
541	 pcmpgtd	$zero,$mask
542	 movdqu		-0x78($key),$zero	# reload 0-round key
543	aesdec		$rndkey1,@out[1]
544	 paddd		$mask,$counters		# decrement counters
545	 movdqa		$counters,32(%rsp)	# update counters
546	aesdec		$rndkey1,@out[2]
547	aesdec		$rndkey1,@out[3]
548	movups		0x90-0x78($key),$rndkey1
549
550	cmp	\$11,$rounds
551
552	aesdec		$rndkey0,@out[0]
553	aesdec		$rndkey0,@out[1]
554	aesdec		$rndkey0,@out[2]
555	aesdec		$rndkey0,@out[3]
556	movups		0xa0-0x78($key),$rndkey0
557
558	jb	.Ldec4x_tail
559
560	aesdec		$rndkey1,@out[0]
561	aesdec		$rndkey1,@out[1]
562	aesdec		$rndkey1,@out[2]
563	aesdec		$rndkey1,@out[3]
564	movups		0xb0-0x78($key),$rndkey1
565
566	aesdec		$rndkey0,@out[0]
567	aesdec		$rndkey0,@out[1]
568	aesdec		$rndkey0,@out[2]
569	aesdec		$rndkey0,@out[3]
570	movups		0xc0-0x78($key),$rndkey0
571
572	je	.Ldec4x_tail
573
574	aesdec		$rndkey1,@out[0]
575	aesdec		$rndkey1,@out[1]
576	aesdec		$rndkey1,@out[2]
577	aesdec		$rndkey1,@out[3]
578	movups		0xd0-0x78($key),$rndkey1
579
580	aesdec		$rndkey0,@out[0]
581	aesdec		$rndkey0,@out[1]
582	aesdec		$rndkey0,@out[2]
583	aesdec		$rndkey0,@out[3]
584	movups		0xe0-0x78($key),$rndkey0
585	jmp	.Ldec4x_tail
586
587.align	32
588.Ldec4x_tail:
589	aesdec		$rndkey1,@out[0]
590	aesdec		$rndkey1,@out[1]
591	aesdec		$rndkey1,@out[2]
592	 pxor		$rndkey0,@inp[0]
593	 pxor		$rndkey0,@inp[1]
594	aesdec		$rndkey1,@out[3]
595	movdqu		0x10-0x78($key),$rndkey1
596	 pxor		$rndkey0,@inp[2]
597	 pxor		$rndkey0,@inp[3]
598	movdqu		0x20-0x78($key),$rndkey0
599
600	aesdeclast	@inp[0],@out[0]
601	aesdeclast	@inp[1],@out[1]
602	 movdqu		-16(@inptr[0],$offset),@inp[0]	# load next IV
603	 movdqu		-16(@inptr[1],$offset),@inp[1]
604	aesdeclast	@inp[2],@out[2]
605	aesdeclast	@inp[3],@out[3]
606	 movdqu		-16(@inptr[2],$offset),@inp[2]
607	 movdqu		-16(@inptr[3],$offset),@inp[3]
608
609	movups		@out[0],-16(@outptr[0],$offset)
610	 movdqu		(@inptr[0],$offset),@out[0]
611	movups		@out[1],-16(@outptr[1],$offset)
612	 movdqu		(@inptr[1],$offset),@out[1]
613	 pxor		$zero,@out[0]
614	movups		@out[2],-16(@outptr[2],$offset)
615	 movdqu		(@inptr[2],$offset),@out[2]
616	 pxor		$zero,@out[1]
617	movups		@out[3],-16(@outptr[3],$offset)
618	 movdqu		(@inptr[3],$offset),@out[3]
619	 pxor		$zero,@out[2]
620	 pxor		$zero,@out[3]
621
622	dec	$num
623	jnz	.Loop_dec4x
624
625	mov	16(%rsp),%rax			# original %rsp
626.cfi_def_cfa	%rax,8
627	mov	24(%rsp),$num
628
629	lea	`$inp_elm_size*4`($inp),$inp
630	dec	$num
631	jnz	.Ldec4x_loop_grande
632
633.Ldec4x_done:
634___
# Win64 only: restore %xmm6-%xmm12 from the save area, addressed relative
# to the entry %rsp held in %rax.  %xmm13-%xmm15 were saved by the
# prologue but their restores are commented out here.
635$code.=<<___ if ($win64);
636	movaps	-0xd8(%rax),%xmm6
637	movaps	-0xc8(%rax),%xmm7
638	movaps	-0xb8(%rax),%xmm8
639	movaps	-0xa8(%rax),%xmm9
640	movaps	-0x98(%rax),%xmm10
641	movaps	-0x88(%rax),%xmm11
642	movaps	-0x78(%rax),%xmm12
643	#movaps	-0x68(%rax),%xmm13
644	#movaps	-0x58(%rax),%xmm14
645	#movaps	-0x48(%rax),%xmm15
646___
# Reload the six pushed registers from below the entry %rsp, restore %rsp
# and return from the 4-lane decrypt routine.
647$code.=<<___;
648	mov	-48(%rax),%r15
649.cfi_restore	%r15
650	mov	-40(%rax),%r14
651.cfi_restore	%r14
652	mov	-32(%rax),%r13
653.cfi_restore	%r13
654	mov	-24(%rax),%r12
655.cfi_restore	%r12
656	mov	-16(%rax),%rbp
657.cfi_restore	%rbp
658	mov	-8(%rax),%rbx
659.cfi_restore	%rbx
660	lea	(%rax),%rsp
661.cfi_def_cfa_register	%rsp
662.Ldec4x_epilogue:
663	ret
664.cfi_endproc
665.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
666___
667
668						if ($avx) {{{
# Register map for the 8-lane AVX code path.  These lexicals shadow the
# 4-lane globals of the same name for the rest of the enclosing block.
# Each lane has one pointer register that alternates between input and
# output; $offload reuses the register the 4-lane code calls $sink.
669my @ptr=map("%r$_",(8..15));
670my $offload=$sink;
671
# Eight cipher states, four input registers shared by lanes i and i+4,
# the packed counters and the register holding the 0-round key.
672my @out=map("%xmm$_",(2..9));
673my @inp=map("%xmm$_",(10..13));
674my ($counters,$zero)=("%xmm14","%xmm15");
675
# Entry of the AVX encrypt routine.  It has no .globl directive; the 4-lane
# entry point jumps to the _avx_cbc_enc_shortcut label.  As before, the
# entry %rsp is kept in %rax and six registers are pushed.
676$code.=<<___;
677.type	aesni_multi_cbc_encrypt_avx,\@function,3
678.align	32
679aesni_multi_cbc_encrypt_avx:
680.cfi_startproc
681_avx_cbc_enc_shortcut:
682	mov	%rsp,%rax
683.cfi_def_cfa_register	%rax
684	push	%rbx
685.cfi_push	%rbx
686	push	%rbp
687.cfi_push	%rbp
688	push	%r12
689.cfi_push	%r12
690	push	%r13
691.cfi_push	%r13
692	push	%r14
693.cfi_push	%r14
694	push	%r15
695.cfi_push	%r15
696___
# Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15.
697$code.=<<___ if ($win64);
698	lea	-0xa8(%rsp),%rsp
699	movaps	%xmm6,(%rsp)
700	movaps	%xmm7,0x10(%rsp)
701	movaps	%xmm8,0x20(%rsp)
702	movaps	%xmm9,0x30(%rsp)
703	movaps	%xmm10,0x40(%rsp)
704	movaps	%xmm11,0x50(%rsp)
705	movaps	%xmm12,-0x78(%rax)
706	movaps	%xmm13,-0x68(%rax)
707	movaps	%xmm14,-0x58(%rax)
708	movaps	%xmm15,-0x48(%rax)
709___
# Carve out a 128-byte-aligned scratch frame, remember the entry %rsp in
# it, load the 0-round key, bias $key by 0x78 and $inp by four
# descriptors, and halve $num.  The store of $num is commented out, and
# $num is cleared to receive the largest block count.
710$code.=<<___;
711	# stack layout
712	#
713	# +0	output sink
714	# +16	input sink [original %rsp and $num]
715	# +32	counters
716	# +64	distances between inputs and outputs
717	# +128	off-load area for @inp[0..3]
718
719	sub	\$192,%rsp
720	and	\$-128,%rsp
721	mov	%rax,16(%rsp)			# original %rsp
722.cfi_cfa_expression	%rsp+16,deref,+8
723
724.Lenc8x_body:
725	vzeroupper
726	vmovdqu	($key),$zero			# 0-round key
727	lea	0x78($key),$key			# size optimization
728	lea	`$inp_elm_size*4`($inp),$inp
729	shr	\$1,$num
730
731.Lenc8x_loop_grande:
732	#mov	$num,24(%rsp)			# original $num
733	xor	$num,$num
734___
# For each of the eight lanes emit code that reads one descriptor: block
# count into $one, input pointer into @ptr[$i], output pointer into a
# temporary, IV into @out[$i].  $num accumulates the largest block count
# and the count is stored as the lane counter.  A lane whose count is not
# positive gets its input pointer redirected to %rsp.  The output pointer
# is turned into a distance from the input pointer and stored at
# 64+8*$i(%rsp).  Lane 0 uses $offset as the temporary, so its distance
# is still in $offset when the loop starts; the other lanes use $offload.
735for($i=0;$i<8;$i++) {
736  my $temp = $i ? $offload : $offset;
    # Names of the registers as used for pointer-sized loads.
737    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
738    $temp_reg=&pointer_register($flavour,$temp);
739    $code.=<<___;
740	# borrow $one for number of blocks
741	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
742	# input pointer
743	mov	`$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
744	cmp	$num,$one
745	# output pointer
746	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
747	cmovg	$one,$num			# find maximum
748	test	$one,$one
749	# load IV
750	vmovdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
751	mov	$one,`32+4*$i`(%rsp)		# initialize counters
752	cmovle	%rsp,@ptr[$i]			# cancel input
753	sub	@ptr[$i],$temp			# distance between input and output
754	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
755___
756}
# Leave if no lane has any blocks; otherwise load round keys 1 and 2 and
# the round count, point $offload at the off-load area, and fold the
# first input block of every lane (xored with the 0-round key) into the
# IVs in @out.  The four @inp registers are used twice, for lanes 0-3 and
# then lanes 4-7.  $one is set to the constant 1 for the counter
# comparisons in the loop.
757$code.=<<___;
758	test	$num,$num
759	jz	.Lenc8x_done
760
761	vmovups	0x10-0x78($key),$rndkey1
762	vmovups	0x20-0x78($key),$rndkey0
763	mov	0xf0-0x78($key),$rounds
764
765	vpxor	(@ptr[0]),$zero,@inp[0]		# load inputs and xor with 0-round
766	 lea	128(%rsp),$offload		# offload area
767	vpxor	(@ptr[1]),$zero,@inp[1]
768	vpxor	(@ptr[2]),$zero,@inp[2]
769	vpxor	(@ptr[3]),$zero,@inp[3]
770	 vpxor	@inp[0],@out[0],@out[0]
771	vpxor	(@ptr[4]),$zero,@inp[0]
772	 vpxor	@inp[1],@out[1],@out[1]
773	vpxor	(@ptr[5]),$zero,@inp[1]
774	 vpxor	@inp[2],@out[2],@out[2]
775	vpxor	(@ptr[6]),$zero,@inp[2]
776	 vpxor	@inp[3],@out[3],@out[3]
777	vpxor	(@ptr[7]),$zero,@inp[3]
778	 vpxor	@inp[0],@out[4],@out[4]
779	mov	\$1,$one			# constant of 1
780	 vpxor	@inp[1],@out[5],@out[5]
781	 vpxor	@inp[2],@out[6],@out[6]
782	 vpxor	@inp[3],@out[7],@out[7]
783	jmp	.Loop_enc8x
784
785.align	32
786.Loop_enc8x:
787___
# Emit eight AES rounds over all eight states; round $i additionally does
# the pointer bookkeeping for lane $i:
#  - compare the constant 1 with the lane counter;
#  - fetch the lane's input-to-output distance into $offset (lane 0's is
#    already there) and turn it into the output pointer;
#  - when 1 >= counter replace the input pointer by %rsp, when
#    1 > counter replace the output pointer by %rsp, then recompute and
#    store the distance;
#  - load the next input block, xored with the 0-round key, into
#    @inp[$i%4] and leave @ptr[$i] pointing 16 bytes past the output.
# Lanes 0-3 park their input block in the off-load area because the same
# four @inp registers are reused for lanes 4-7.
#
# $i is deliberately not lexical: the code emitted after this loop uses
# @ptr[$i-2] and @ptr[$i-1] with $i left at 8.
#
# Fix: the final conditional statement lacked its terminating semicolon.
788for($i=0;$i<8;$i++) {
789my $rndkey=($i&1)?$rndkey0:$rndkey1;
790$code.=<<___;
791	vaesenc		$rndkey,@out[0],@out[0]
792	 cmp		32+4*$i(%rsp),$one
793___
794$code.=<<___ if ($i);
795	 mov		64+8*$i(%rsp),$offset
796___
797$code.=<<___;
798	vaesenc		$rndkey,@out[1],@out[1]
799	prefetcht0	31(@ptr[$i])			# prefetch input
800	vaesenc		$rndkey,@out[2],@out[2]
801___
802$code.=<<___ if ($i>1);
803	prefetcht0	15(@ptr[$i-2])			# prefetch output
804___
805$code.=<<___;
806	vaesenc		$rndkey,@out[3],@out[3]
807	 lea		(@ptr[$i],$offset),$offset
808	 cmovge		%rsp,@ptr[$i]			# cancel input
809	vaesenc		$rndkey,@out[4],@out[4]
810	 cmovg		%rsp,$offset			# sink output
811	vaesenc		$rndkey,@out[5],@out[5]
812	 sub		@ptr[$i],$offset
813	vaesenc		$rndkey,@out[6],@out[6]
814	 vpxor		16(@ptr[$i]),$zero,@inp[$i%4]	# load input and xor with 0-round
815	 mov		$offset,64+8*$i(%rsp)
816	vaesenc		$rndkey,@out[7],@out[7]
817	vmovups		`16*(3+$i)-0x78`($key),$rndkey
818	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
819___
820$code.=<<___ if ($i<4);
821	 vmovdqu	@inp[$i%4],`16*$i`($offload)	# off-load
822___
823}
# Remainder of one 8-lane encrypt iteration.  $i is 8 here, so the two
# leading prefetches address lanes 6 and 7.
#  - The round count is compared with 11; the two conditional jumps skip
#    two or four of the extra rounds.
#  - .Lenc8x_tail interleaves the final rounds with the counter update,
#    done in two halves (counters at 32(%rsp) and at 48(%rsp)): a mask of
#    the lanes with a positive counter is added to the counters.  $zero
#    is used as scratch for this and then reloaded with the 0-round key.
#  - Each finished block is stored at -16(@ptr), the pointer is switched
#    back to input by subtracting the stored distance, and the next input
#    block is xored into the state (from the off-load area for lanes 0-3,
#    from @inp for lanes 4-7).
#  - The outer-loop tail is commented out, so one pass handles eight
#    descriptors.
824$code.=<<___;
825	 vmovdqu	32(%rsp),$counters
826	prefetcht0	15(@ptr[$i-2])			# prefetch output
827	prefetcht0	15(@ptr[$i-1])
828	cmp	\$11,$rounds
829	jb	.Lenc8x_tail
830
831	vaesenc		$rndkey1,@out[0],@out[0]
832	vaesenc		$rndkey1,@out[1],@out[1]
833	vaesenc		$rndkey1,@out[2],@out[2]
834	vaesenc		$rndkey1,@out[3],@out[3]
835	vaesenc		$rndkey1,@out[4],@out[4]
836	vaesenc		$rndkey1,@out[5],@out[5]
837	vaesenc		$rndkey1,@out[6],@out[6]
838	vaesenc		$rndkey1,@out[7],@out[7]
839	vmovups		0xb0-0x78($key),$rndkey1
840
841	vaesenc		$rndkey0,@out[0],@out[0]
842	vaesenc		$rndkey0,@out[1],@out[1]
843	vaesenc		$rndkey0,@out[2],@out[2]
844	vaesenc		$rndkey0,@out[3],@out[3]
845	vaesenc		$rndkey0,@out[4],@out[4]
846	vaesenc		$rndkey0,@out[5],@out[5]
847	vaesenc		$rndkey0,@out[6],@out[6]
848	vaesenc		$rndkey0,@out[7],@out[7]
849	vmovups		0xc0-0x78($key),$rndkey0
850	je	.Lenc8x_tail
851
852	vaesenc		$rndkey1,@out[0],@out[0]
853	vaesenc		$rndkey1,@out[1],@out[1]
854	vaesenc		$rndkey1,@out[2],@out[2]
855	vaesenc		$rndkey1,@out[3],@out[3]
856	vaesenc		$rndkey1,@out[4],@out[4]
857	vaesenc		$rndkey1,@out[5],@out[5]
858	vaesenc		$rndkey1,@out[6],@out[6]
859	vaesenc		$rndkey1,@out[7],@out[7]
860	vmovups		0xd0-0x78($key),$rndkey1
861
862	vaesenc		$rndkey0,@out[0],@out[0]
863	vaesenc		$rndkey0,@out[1],@out[1]
864	vaesenc		$rndkey0,@out[2],@out[2]
865	vaesenc		$rndkey0,@out[3],@out[3]
866	vaesenc		$rndkey0,@out[4],@out[4]
867	vaesenc		$rndkey0,@out[5],@out[5]
868	vaesenc		$rndkey0,@out[6],@out[6]
869	vaesenc		$rndkey0,@out[7],@out[7]
870	vmovups		0xe0-0x78($key),$rndkey0
871
872.Lenc8x_tail:
873	vaesenc		$rndkey1,@out[0],@out[0]
874	 vpxor		$zero,$zero,$zero
875	vaesenc		$rndkey1,@out[1],@out[1]
876	vaesenc		$rndkey1,@out[2],@out[2]
877	 vpcmpgtd	$zero,$counters,$zero
878	vaesenc		$rndkey1,@out[3],@out[3]
879	vaesenc		$rndkey1,@out[4],@out[4]
880	 vpaddd		$counters,$zero,$zero		# decrement counters
881	 vmovdqu	48(%rsp),$counters
882	vaesenc		$rndkey1,@out[5],@out[5]
883	 mov		64(%rsp),$offset		# pre-load 1st offset
884	vaesenc		$rndkey1,@out[6],@out[6]
885	vaesenc		$rndkey1,@out[7],@out[7]
886	vmovups		0x10-0x78($key),$rndkey1
887
888	vaesenclast	$rndkey0,@out[0],@out[0]
889	 vmovdqa	$zero,32(%rsp)			# update counters
890	 vpxor		$zero,$zero,$zero
891	vaesenclast	$rndkey0,@out[1],@out[1]
892	vaesenclast	$rndkey0,@out[2],@out[2]
893	 vpcmpgtd	$zero,$counters,$zero
894	vaesenclast	$rndkey0,@out[3],@out[3]
895	vaesenclast	$rndkey0,@out[4],@out[4]
896	 vpaddd		$zero,$counters,$counters	# decrement counters
897	 vmovdqu	-0x78($key),$zero		# 0-round
898	vaesenclast	$rndkey0,@out[5],@out[5]
899	vaesenclast	$rndkey0,@out[6],@out[6]
900	 vmovdqa	$counters,48(%rsp)		# update counters
901	vaesenclast	$rndkey0,@out[7],@out[7]
902	vmovups		0x20-0x78($key),$rndkey0
903
904	vmovups		@out[0],-16(@ptr[0])		# write output
905	 sub		$offset,@ptr[0]			# switch to input
906	 vpxor		0x00($offload),@out[0],@out[0]
907	vmovups		@out[1],-16(@ptr[1])
908	 sub		`64+1*8`(%rsp),@ptr[1]
909	 vpxor		0x10($offload),@out[1],@out[1]
910	vmovups		@out[2],-16(@ptr[2])
911	 sub		`64+2*8`(%rsp),@ptr[2]
912	 vpxor		0x20($offload),@out[2],@out[2]
913	vmovups		@out[3],-16(@ptr[3])
914	 sub		`64+3*8`(%rsp),@ptr[3]
915	 vpxor		0x30($offload),@out[3],@out[3]
916	vmovups		@out[4],-16(@ptr[4])
917	 sub		`64+4*8`(%rsp),@ptr[4]
918	 vpxor		@inp[0],@out[4],@out[4]
919	vmovups		@out[5],-16(@ptr[5])
920	 sub		`64+5*8`(%rsp),@ptr[5]
921	 vpxor		@inp[1],@out[5],@out[5]
922	vmovups		@out[6],-16(@ptr[6])
923	 sub		`64+6*8`(%rsp),@ptr[6]
924	 vpxor		@inp[2],@out[6],@out[6]
925	vmovups		@out[7],-16(@ptr[7])
926	 sub		`64+7*8`(%rsp),@ptr[7]
927	 vpxor		@inp[3],@out[7],@out[7]
928
929	dec	$num
930	jnz	.Loop_enc8x
931
932	mov	16(%rsp),%rax			# original %rsp
933.cfi_def_cfa	%rax,8
934	#mov	24(%rsp),$num
935	#lea	`$inp_elm_size*8`($inp),$inp
936	#dec	$num
937	#jnz	.Lenc8x_loop_grande
938
939.Lenc8x_done:
940	vzeroupper
941___
# Win64 only: restore all of %xmm6-%xmm15 from the save area, addressed
# relative to the entry %rsp held in %rax.
942$code.=<<___ if ($win64);
943	movaps	-0xd8(%rax),%xmm6
944	movaps	-0xc8(%rax),%xmm7
945	movaps	-0xb8(%rax),%xmm8
946	movaps	-0xa8(%rax),%xmm9
947	movaps	-0x98(%rax),%xmm10
948	movaps	-0x88(%rax),%xmm11
949	movaps	-0x78(%rax),%xmm12
950	movaps	-0x68(%rax),%xmm13
951	movaps	-0x58(%rax),%xmm14
952	movaps	-0x48(%rax),%xmm15
953___
# Reload the six pushed registers, restore %rsp and return.  The same
# heredoc then opens the AVX decrypt routine, which is entered through
# the _avx_cbc_dec_shortcut label and starts with the same register
# pushes.
954$code.=<<___;
955	mov	-48(%rax),%r15
956.cfi_restore	%r15
957	mov	-40(%rax),%r14
958.cfi_restore	%r14
959	mov	-32(%rax),%r13
960.cfi_restore	%r13
961	mov	-24(%rax),%r12
962.cfi_restore	%r12
963	mov	-16(%rax),%rbp
964.cfi_restore	%rbp
965	mov	-8(%rax),%rbx
966.cfi_restore	%rbx
967	lea	(%rax),%rsp
968.cfi_def_cfa_register	%rsp
969.Lenc8x_epilogue:
970	ret
971.cfi_endproc
972.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
973
974.type	aesni_multi_cbc_decrypt_avx,\@function,3
975.align	32
976aesni_multi_cbc_decrypt_avx:
977.cfi_startproc
978_avx_cbc_dec_shortcut:
979	mov	%rsp,%rax
980.cfi_def_cfa_register	%rax
981	push	%rbx
982.cfi_push	%rbx
983	push	%rbp
984.cfi_push	%rbp
985	push	%r12
986.cfi_push	%r12
987	push	%r13
988.cfi_push	%r13
989	push	%r14
990.cfi_push	%r14
991	push	%r15
992.cfi_push	%r15
993___
# Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15.
994$code.=<<___ if ($win64);
995	lea	-0xa8(%rsp),%rsp
996	movaps	%xmm6,(%rsp)
997	movaps	%xmm7,0x10(%rsp)
998	movaps	%xmm8,0x20(%rsp)
999	movaps	%xmm9,0x30(%rsp)
1000	movaps	%xmm10,0x40(%rsp)
1001	movaps	%xmm11,0x50(%rsp)
1002	movaps	%xmm12,-0x78(%rax)
1003	movaps	%xmm13,-0x68(%rax)
1004	movaps	%xmm14,-0x58(%rax)
1005	movaps	%xmm15,-0x48(%rax)
1006___
# Build the scratch frame: drop %rsp by 256, align it down to 256 bytes,
# then drop it a further 192 bytes.  That places the area at +192(%rsp)
# on a 256-byte boundary.  Remember the entry %rsp, load the 0-round key,
# bias $key by 0x78 and $inp by four descriptors, and halve $num.  The
# store of $num is commented out, and $num is cleared to receive the
# largest block count.
1007$code.=<<___;
1008	# stack layout
1009	#
1010	# +0	output sink
1011	# +16	input sink [original %rsp and $num]
1012	# +32	counters
1013	# +64	distances between inputs and outputs
1014	# +128	off-load area for @inp[0..3]
1015	# +192	IV/input offload
1016
1017	sub	\$256,%rsp
1018	and	\$-256,%rsp
1019	sub	\$192,%rsp
1020	mov	%rax,16(%rsp)			# original %rsp
1021.cfi_cfa_expression	%rsp+16,deref,+8
1022
1023.Ldec8x_body:
1024	vzeroupper
1025	vmovdqu	($key),$zero			# 0-round key
1026	lea	0x78($key),$key			# size optimization
1027	lea	`$inp_elm_size*4`($inp),$inp
1028	shr	\$1,$num
1029
1030.Ldec8x_loop_grande:
1031	#mov	$num,24(%rsp)			# original $num
1032	xor	$num,$num
1033___
# Same descriptor walk as in the AVX encrypt routine (block count,
# pointers, maximum count, lane counter, input cancellation, distance
# between input and output), with one addition: the IV loaded into
# @out[$i] is also stored at 192+16*$i(%rsp).
1034for($i=0;$i<8;$i++) {
1035  my $temp = $i ? $offload : $offset;
    # Names of the registers as used for pointer-sized loads.
1036    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1037    $temp_reg=&pointer_register($flavour,$temp);
1038    $code.=<<___;
1039	# borrow $one for number of blocks
1040	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
1041	# input pointer
1042	mov	`$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
1043	cmp	$num,$one
1044	# output pointer
1045	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
1046	cmovg	$one,$num			# find maximum
1047	test	$one,$one
1048	# load IV
1049	vmovdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
1050	mov	$one,`32+4*$i`(%rsp)		# initialize counters
1051	cmovle	%rsp,@ptr[$i]			# cancel input
1052	sub	@ptr[$i],$temp			# distance between input and output
1053	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
1054	vmovdqu	@out[$i],`192+16*$i`(%rsp)	# offload IV
1055___
1056}
# Leave if no lane has any blocks; otherwise load round keys 1 and 2 and
# the round count and point $offload at 192+128(%rsp), the 128 bytes
# after the stored IVs.  The first input block of every lane is loaded,
# saved unmodified in that area and then xored with the 0-round key.
# $offload is then xored with 0x80, which moves it to the other 128-byte
# area (192(%rsp), where the IVs were stored).  $one is set to the
# constant 1 for the counter comparisons in the loop.
1057$code.=<<___;
1058	test	$num,$num
1059	jz	.Ldec8x_done
1060
1061	vmovups	0x10-0x78($key),$rndkey1
1062	vmovups	0x20-0x78($key),$rndkey0
1063	mov	0xf0-0x78($key),$rounds
1064	 lea	192+128(%rsp),$offload		# offload area
1065
1066	vmovdqu	(@ptr[0]),@out[0]		# load inputs
1067	vmovdqu	(@ptr[1]),@out[1]
1068	vmovdqu	(@ptr[2]),@out[2]
1069	vmovdqu	(@ptr[3]),@out[3]
1070	vmovdqu	(@ptr[4]),@out[4]
1071	vmovdqu	(@ptr[5]),@out[5]
1072	vmovdqu	(@ptr[6]),@out[6]
1073	vmovdqu	(@ptr[7]),@out[7]
1074	vmovdqu	@out[0],0x00($offload)		# offload inputs
1075	vpxor	$zero,@out[0],@out[0]		# xor inputs with 0-round
1076	vmovdqu	@out[1],0x10($offload)
1077	vpxor	$zero,@out[1],@out[1]
1078	vmovdqu	@out[2],0x20($offload)
1079	vpxor	$zero,@out[2],@out[2]
1080	vmovdqu	@out[3],0x30($offload)
1081	vpxor	$zero,@out[3],@out[3]
1082	vmovdqu	@out[4],0x40($offload)
1083	vpxor	$zero,@out[4],@out[4]
1084	vmovdqu	@out[5],0x50($offload)
1085	vpxor	$zero,@out[5],@out[5]
1086	vmovdqu	@out[6],0x60($offload)
1087	vpxor	$zero,@out[6],@out[6]
1088	vmovdqu	@out[7],0x70($offload)
1089	vpxor	$zero,@out[7],@out[7]
1090	xor	\$0x80,$offload
1091	mov	\$1,$one			# constant of 1
1092	jmp	.Loop_dec8x
1093
1094.align	32
1095.Loop_dec8x:
1096___
# Emit eight decrypt rounds over all eight states; round $i additionally
# does the pointer bookkeeping for lane $i, as in the AVX encrypt loop:
# compare 1 with the lane counter, rebuild the output pointer from the
# stored distance, replace the input pointer by %rsp when 1 >= counter
# and the output pointer when 1 > counter, store the new distance, load
# the next input block unmodified into @inp[$i%4], and leave @ptr[$i]
# pointing 16 bytes past the output.  Lanes 0-3 park their input block at
# 128+16*$i(%rsp) because the four @inp registers are reused for lanes
# 4-7.
#
# $i is deliberately not lexical: the code emitted after this loop uses
# @ptr[$i-2] and @ptr[$i-1] with $i left at 8.
1097for($i=0;$i<8;$i++) {
1098my $rndkey=($i&1)?$rndkey0:$rndkey1;
1099$code.=<<___;
1100	vaesdec		$rndkey,@out[0],@out[0]
1101	 cmp		32+4*$i(%rsp),$one
1102___
1103$code.=<<___ if ($i);
1104	 mov		64+8*$i(%rsp),$offset
1105___
1106$code.=<<___;
1107	vaesdec		$rndkey,@out[1],@out[1]
1108	prefetcht0	31(@ptr[$i])			# prefetch input
1109	vaesdec		$rndkey,@out[2],@out[2]
1110___
1111$code.=<<___ if ($i>1);
1112	prefetcht0	15(@ptr[$i-2])			# prefetch output
1113___
1114$code.=<<___;
1115	vaesdec		$rndkey,@out[3],@out[3]
1116	 lea		(@ptr[$i],$offset),$offset
1117	 cmovge		%rsp,@ptr[$i]			# cancel input
1118	vaesdec		$rndkey,@out[4],@out[4]
1119	 cmovg		%rsp,$offset			# sink output
1120	vaesdec		$rndkey,@out[5],@out[5]
1121	 sub		@ptr[$i],$offset
1122	vaesdec		$rndkey,@out[6],@out[6]
1123	 vmovdqu	16(@ptr[$i]),@inp[$i%4]		# load input
1124	 mov		$offset,64+8*$i(%rsp)
1125	vaesdec		$rndkey,@out[7],@out[7]
1126	vmovups		`16*(3+$i)-0x78`($key),$rndkey
1127	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
1128___
1129$code.=<<___ if ($i<4);
1130	 vmovdqu	@inp[$i%4],`128+16*$i`(%rsp)	# off-load
1131___
1132}
1133$code.=<<___;
1134	 vmovdqu	32(%rsp),$counters
1135	prefetcht0	15(@ptr[$i-2])			# prefetch output
1136	prefetcht0	15(@ptr[$i-1])
1137	cmp	\$11,$rounds
1138	jb	.Ldec8x_tail
1139
1140	vaesdec		$rndkey1,@out[0],@out[0]
1141	vaesdec		$rndkey1,@out[1],@out[1]
1142	vaesdec		$rndkey1,@out[2],@out[2]
1143	vaesdec		$rndkey1,@out[3],@out[3]
1144	vaesdec		$rndkey1,@out[4],@out[4]
1145	vaesdec		$rndkey1,@out[5],@out[5]
1146	vaesdec		$rndkey1,@out[6],@out[6]
1147	vaesdec		$rndkey1,@out[7],@out[7]
1148	vmovups		0xb0-0x78($key),$rndkey1
1149
1150	vaesdec		$rndkey0,@out[0],@out[0]
1151	vaesdec		$rndkey0,@out[1],@out[1]
1152	vaesdec		$rndkey0,@out[2],@out[2]
1153	vaesdec		$rndkey0,@out[3],@out[3]
1154	vaesdec		$rndkey0,@out[4],@out[4]
1155	vaesdec		$rndkey0,@out[5],@out[5]
1156	vaesdec		$rndkey0,@out[6],@out[6]
1157	vaesdec		$rndkey0,@out[7],@out[7]
1158	vmovups		0xc0-0x78($key),$rndkey0
1159	je	.Ldec8x_tail
1160
1161	vaesdec		$rndkey1,@out[0],@out[0]
1162	vaesdec		$rndkey1,@out[1],@out[1]
1163	vaesdec		$rndkey1,@out[2],@out[2]
1164	vaesdec		$rndkey1,@out[3],@out[3]
1165	vaesdec		$rndkey1,@out[4],@out[4]
1166	vaesdec		$rndkey1,@out[5],@out[5]
1167	vaesdec		$rndkey1,@out[6],@out[6]
1168	vaesdec		$rndkey1,@out[7],@out[7]
1169	vmovups		0xd0-0x78($key),$rndkey1
1170
1171	vaesdec		$rndkey0,@out[0],@out[0]
1172	vaesdec		$rndkey0,@out[1],@out[1]
1173	vaesdec		$rndkey0,@out[2],@out[2]
1174	vaesdec		$rndkey0,@out[3],@out[3]
1175	vaesdec		$rndkey0,@out[4],@out[4]
1176	vaesdec		$rndkey0,@out[5],@out[5]
1177	vaesdec		$rndkey0,@out[6],@out[6]
1178	vaesdec		$rndkey0,@out[7],@out[7]
1179	vmovups		0xe0-0x78($key),$rndkey0
1180
1181.Ldec8x_tail:
1182	vaesdec		$rndkey1,@out[0],@out[0]
1183	 vpxor		$zero,$zero,$zero
1184	vaesdec		$rndkey1,@out[1],@out[1]
1185	vaesdec		$rndkey1,@out[2],@out[2]
1186	 vpcmpgtd	$zero,$counters,$zero
1187	vaesdec		$rndkey1,@out[3],@out[3]
1188	vaesdec		$rndkey1,@out[4],@out[4]
1189	 vpaddd		$counters,$zero,$zero		# decrement counters
1190	 vmovdqu	48(%rsp),$counters
1191	vaesdec		$rndkey1,@out[5],@out[5]
1192	 mov		64(%rsp),$offset		# pre-load 1st offset
1193	vaesdec		$rndkey1,@out[6],@out[6]
1194	vaesdec		$rndkey1,@out[7],@out[7]
1195	vmovups		0x10-0x78($key),$rndkey1
1196
1197	vaesdeclast	$rndkey0,@out[0],@out[0]
1198	 vmovdqa	$zero,32(%rsp)			# update counters
1199	 vpxor		$zero,$zero,$zero
1200	vaesdeclast	$rndkey0,@out[1],@out[1]
1201	vpxor		0x00($offload),@out[0],@out[0]	# xor with IV
1202	vaesdeclast	$rndkey0,@out[2],@out[2]
1203	vpxor		0x10($offload),@out[1],@out[1]
1204	 vpcmpgtd	$zero,$counters,$zero
1205	vaesdeclast	$rndkey0,@out[3],@out[3]
1206	vpxor		0x20($offload),@out[2],@out[2]
1207	vaesdeclast	$rndkey0,@out[4],@out[4]
1208	vpxor		0x30($offload),@out[3],@out[3]
1209	 vpaddd		$zero,$counters,$counters	# decrement counters
1210	 vmovdqu	-0x78($key),$zero		# 0-round
1211	vaesdeclast	$rndkey0,@out[5],@out[5]
1212	vpxor		0x40($offload),@out[4],@out[4]
1213	vaesdeclast	$rndkey0,@out[6],@out[6]
1214	vpxor		0x50($offload),@out[5],@out[5]
1215	 vmovdqa	$counters,48(%rsp)		# update counters
1216	vaesdeclast	$rndkey0,@out[7],@out[7]
1217	vpxor		0x60($offload),@out[6],@out[6]
1218	vmovups		0x20-0x78($key),$rndkey0
1219
1220	vmovups		@out[0],-16(@ptr[0])		# write output
1221	 sub		$offset,@ptr[0]			# switch to input
1222	 vmovdqu	128+0(%rsp),@out[0]
1223	vpxor		0x70($offload),@out[7],@out[7]
1224	vmovups		@out[1],-16(@ptr[1])
1225	 sub		`64+1*8`(%rsp),@ptr[1]
1226	 vmovdqu	@out[0],0x00($offload)
1227	 vpxor		$zero,@out[0],@out[0]
1228	 vmovdqu	128+16(%rsp),@out[1]
1229	vmovups		@out[2],-16(@ptr[2])
1230	 sub		`64+2*8`(%rsp),@ptr[2]
1231	 vmovdqu	@out[1],0x10($offload)
1232	 vpxor		$zero,@out[1],@out[1]
1233	 vmovdqu	128+32(%rsp),@out[2]
1234	vmovups		@out[3],-16(@ptr[3])
1235	 sub		`64+3*8`(%rsp),@ptr[3]
1236	 vmovdqu	@out[2],0x20($offload)
1237	 vpxor		$zero,@out[2],@out[2]
1238	 vmovdqu	128+48(%rsp),@out[3]
1239	vmovups		@out[4],-16(@ptr[4])
1240	 sub		`64+4*8`(%rsp),@ptr[4]
1241	 vmovdqu	@out[3],0x30($offload)
1242	 vpxor		$zero,@out[3],@out[3]
1243	 vmovdqu	@inp[0],0x40($offload)
1244	 vpxor		@inp[0],$zero,@out[4]
1245	vmovups		@out[5],-16(@ptr[5])
1246	 sub		`64+5*8`(%rsp),@ptr[5]
1247	 vmovdqu	@inp[1],0x50($offload)
1248	 vpxor		@inp[1],$zero,@out[5]
1249	vmovups		@out[6],-16(@ptr[6])
1250	 sub		`64+6*8`(%rsp),@ptr[6]
1251	 vmovdqu	@inp[2],0x60($offload)
1252	 vpxor		@inp[2],$zero,@out[6]
1253	vmovups		@out[7],-16(@ptr[7])
1254	 sub		`64+7*8`(%rsp),@ptr[7]
1255	 vmovdqu	@inp[3],0x70($offload)
1256	 vpxor		@inp[3],$zero,@out[7]
1257
1258	xor	\$128,$offload
1259	dec	$num
1260	jnz	.Loop_dec8x
1261
1262	mov	16(%rsp),%rax			# original %rsp
1263.cfi_def_cfa	%rax,8
1264	#mov	24(%rsp),$num
1265	#lea	`$inp_elm_size*8`($inp),$inp
1266	#dec	$num
1267	#jnz	.Ldec8x_loop_grande
1268
1269.Ldec8x_done:
1270	vzeroupper
1271___
1272$code.=<<___ if ($win64);
1273	movaps	-0xd8(%rax),%xmm6
1274	movaps	-0xc8(%rax),%xmm7
1275	movaps	-0xb8(%rax),%xmm8
1276	movaps	-0xa8(%rax),%xmm9
1277	movaps	-0x98(%rax),%xmm10
1278	movaps	-0x88(%rax),%xmm11
1279	movaps	-0x78(%rax),%xmm12
1280	movaps	-0x68(%rax),%xmm13
1281	movaps	-0x58(%rax),%xmm14
1282	movaps	-0x48(%rax),%xmm15
1283___
1284$code.=<<___;
1285	mov	-48(%rax),%r15
1286.cfi_restore	%r15
1287	mov	-40(%rax),%r14
1288.cfi_restore	%r14
1289	mov	-32(%rax),%r13
1290.cfi_restore	%r13
1291	mov	-24(%rax),%r12
1292.cfi_restore	%r12
1293	mov	-16(%rax),%rbp
1294.cfi_restore	%rbp
1295	mov	-8(%rax),%rbx
1296.cfi_restore	%rbx
1297	lea	(%rax),%rsp
1298.cfi_def_cfa_register	%rsp
1299.Ldec8x_epilogue:
1300	ret
1301.cfi_endproc
1302.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1303___
1304						}}}
1305
# Win64 only: emit a structured-exception handler (se_handler) together
# with the .pdata/.xdata tables that reference it for each multi-buffer
# entry point.
1306if ($win64) {
1307# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1308#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Registers in which the four handler arguments above are referenced.
1309$rec="%rcx";
1310$frame="%rdx";
1311$context="%r8";
1312$disp="%r9";
1313
# se_handler outline (details are in the assembly comments below):
#  - context->Rip below the label in HandlerData[0]: %rax keeps
#    context->Rax and control goes straight to .Lin_prologue.
#  - context->Rip at or above the label in HandlerData[1]: %rax is
#    context->Rsp and control goes to .Lin_prologue.
#  - otherwise: the saved stack pointer is loaded from 16(context->Rsp);
#    rbx, rbp and r12-r15 are read from just below it and stored into the
#    context, and 20 quadwords are copied to &context.Xmm6.
# From .Lin_prologue on, Rsp/Rsi/Rdi are written back to the context, the
# context is copied to disp->ContextRecord, RtlVirtualUnwind is called and
# the handler returns 1 (ExceptionContinueSearch).
1314$code.=<<___;
1315.extern	__imp_RtlVirtualUnwind
1316.type	se_handler,\@abi-omnipotent
1317.align	16
1318se_handler:
1319	push	%rsi
1320	push	%rdi
1321	push	%rbx
1322	push	%rbp
1323	push	%r12
1324	push	%r13
1325	push	%r14
1326	push	%r15
1327	pushfq
1328	sub	\$64,%rsp
1329
1330	mov	120($context),%rax	# pull context->Rax
1331	mov	248($context),%rbx	# pull context->Rip
1332
1333	mov	8($disp),%rsi		# disp->ImageBase
1334	mov	56($disp),%r11		# disp->HandlerData
1335
1336	mov	0(%r11),%r10d		# HandlerData[0]
1337	lea	(%rsi,%r10),%r10	# prologue label
1338	cmp	%r10,%rbx		# context->Rip<.Lprologue
1339	jb	.Lin_prologue
1340
1341	mov	152($context),%rax	# pull context->Rsp
1342
1343	mov	4(%r11),%r10d		# HandlerData[1]
1344	lea	(%rsi,%r10),%r10	# epilogue label
1345	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1346	jae	.Lin_prologue
1347
1348	mov	16(%rax),%rax		# pull saved stack pointer
1349
1350	mov	-8(%rax),%rbx
1351	mov	-16(%rax),%rbp
1352	mov	-24(%rax),%r12
1353	mov	-32(%rax),%r13
1354	mov	-40(%rax),%r14
1355	mov	-48(%rax),%r15
1356	mov	%rbx,144($context)	# restore context->Rbx
1357	mov	%rbp,160($context)	# restore context->Rbp
1358	mov	%r12,216($context)	# restore context->R12
1359	mov	%r13,224($context)	# restore context->R13
1360	mov	%r14,232($context)	# restore context->R14
1361	mov	%r15,240($context)	# restore context->R15
1362
1363	lea	-56-10*16(%rax),%rsi
1364	lea	512($context),%rdi	# &context.Xmm6
1365	mov	\$20,%ecx
1366	.long	0xa548f3fc		# cld; rep movsq
1367
1368.Lin_prologue:
1369	mov	8(%rax),%rdi
1370	mov	16(%rax),%rsi
1371	mov	%rax,152($context)	# restore context->Rsp
1372	mov	%rsi,168($context)	# restore context->Rsi
1373	mov	%rdi,176($context)	# restore context->Rdi
1374
1375	mov	40($disp),%rdi		# disp->ContextRecord
1376	mov	$context,%rsi		# context
1377	mov	\$154,%ecx		# sizeof(CONTEXT)
1378	.long	0xa548f3fc		# cld; rep movsq
1379
1380	mov	$disp,%rsi
1381	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1382	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1383	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1384	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1385	mov	40(%rsi),%r10		# disp->ContextRecord
1386	lea	56(%rsi),%r11		# &disp->HandlerData
1387	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1388	mov	%r10,32(%rsp)		# arg5
1389	mov	%r11,40(%rsp)		# arg6
1390	mov	%r12,48(%rsp)		# arg7
1391	mov	%rcx,56(%rsp)		# arg8, (NULL)
1392	call	*__imp_RtlVirtualUnwind(%rip)
1393
1394	mov	\$1,%eax		# ExceptionContinueSearch
1395	add	\$64,%rsp
1396	popfq
1397	pop	%r15
1398	pop	%r14
1399	pop	%r13
1400	pop	%r12
1401	pop	%rbp
1402	pop	%rbx
1403	pop	%rdi
1404	pop	%rsi
1405	ret
1406.size	se_handler,.-se_handler
1407
1408.section	.pdata
1409.align	4
1410	.rva	.LSEH_begin_aesni_multi_cbc_encrypt
1411	.rva	.LSEH_end_aesni_multi_cbc_encrypt
1412	.rva	.LSEH_info_aesni_multi_cbc_encrypt
1413	.rva	.LSEH_begin_aesni_multi_cbc_decrypt
1414	.rva	.LSEH_end_aesni_multi_cbc_decrypt
1415	.rva	.LSEH_info_aesni_multi_cbc_decrypt
1416___
# .pdata entries for the *_avx entry points are emitted only when $avx is set.
1417$code.=<<___ if ($avx);
1418	.rva	.LSEH_begin_aesni_multi_cbc_encrypt_avx
1419	.rva	.LSEH_end_aesni_multi_cbc_encrypt_avx
1420	.rva	.LSEH_info_aesni_multi_cbc_encrypt_avx
1421	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
1422	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
1423	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
1424___
# .xdata: one info record per function. Each names se_handler and supplies
# two RVAs (body label, epilogue label) that se_handler reads back as
# HandlerData[0] and HandlerData[1].
1425$code.=<<___;
1426.section	.xdata
1427.align	8
1428.LSEH_info_aesni_multi_cbc_encrypt:
1429	.byte	9,0,0,0
1430	.rva	se_handler
1431	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
1432.LSEH_info_aesni_multi_cbc_decrypt:
1433	.byte	9,0,0,0
1434	.rva	se_handler
1435	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
1436___
# Matching .xdata records for the *_avx entry points, again only when $avx is set.
1437$code.=<<___ if ($avx);
1438.LSEH_info_aesni_multi_cbc_encrypt_avx:
1439	.byte	9,0,0,0
1440	.rva	se_handler
1441	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
1442.LSEH_info_aesni_multi_cbc_decrypt_avx:
1443	.byte	9,0,0,0
1444	.rva	se_handler
1445	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
1446___
1447}
1448####################################################################
1449
# Append a REX prefix byte to an opcode list when either register operand
# is numbered 8 or above.
#
#   rex(\@bytes, $reg, $rm)
#
# \@bytes  reference to the opcode byte list being assembled (modified
#          in place)
# $reg     register number that ends up in ModR/M.reg (sets REX.R, 0x04)
# $rm      register number that ends up in ModR/M.r/m (sets REX.B, 0x01)
#
# Nothing is appended when both numbers are below 8.
sub rex {
  my ($bytes,$reg,$rm)=@_;
  my $rex=($reg>=8 ? 0x04 : 0)|($rm>=8 ? 0x01 : 0);

    # 0x40 is the fixed high nibble that marks a REX prefix.
    push @{$bytes},$rex|0x40	if($rex);
}
1459
# Hand-encode one AES-NI instruction (AT&T syntax) as a ".byte" directive.
#
# Argument: the instruction text, from the mnemonic up to its last %xmm
#           operand.
# Returns:  ".byte\t<comma-separated byte values>" for a recognized form,
#           otherwise the argument unchanged so the instruction is left
#           for the assembler to deal with.
#
# Recognized forms:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# An unrecognized mnemonic used to yield undef. The caller is an s///e
# substitution, where undef becomes an empty string, so the instruction
# was silently deleted from the output (e.g. "aesimc disp(%rsp),%xmmN",
# which the memory-operand table does not list). It is now passed through.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);		# every encoding below starts with the 0x66 prefix

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures first: they are globals that any later match resets.
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# leading 0: let oct() parse 0x.. or octal
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return $line if (!defined($opcodelet{$mnemonic}));	# unknown: pass through
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return $line if (!defined($opcodelet{$mnemonic}));	# unknown: pass through
	push @opcode,0x44 if ($dst>=8);			# REX.R for %xmm8..15
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M
	# Only the low 8 bits of the displacement are emitted.
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1499
# Post-process the accumulated assembly text, then write it out.
#
# Step 1: replace every `...` span with the value of eval'ing its contents
# (the templates above use this for computed operands such as `64+8*$i`).
1500$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Step 2: pass each fragment that starts at a word-boundary "aes" and runs
# to the last %xmm operand on the line to aesni(). The \b keeps v-prefixed
# mnemonics such as vaesdec from matching, and the trailing .*$ drops
# whatever follows the operand on that line.
1501$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1502
1503print $code;
# Abort with a diagnostic if closing STDOUT reports an error.
1504close STDOUT or die "error closing STDOUT: $!";
1505