xref: /freebsd/crypto/openssl/crypto/aes/asm/aesni-x86_64.pl (revision af6a5351a1fdb1130f18be6c782c4d48916eb971)
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13# details].
14#
15# Performance.
16#
17# Given aes(enc|dec) instructions' latency asymptotic performance for
18# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
19# processed with 128-bit key. And given their throughput asymptotic
20# performance for parallelizable modes is 1.25 cycles per byte. Being
21# asymptotic limit it's not something you commonly achieve in reality,
22# but how close does one get? Below are results collected for
23# different modes and block sizes. Pairs of numbers are for en-/
24# decryption.
25#
26#	16-byte     64-byte     256-byte    1-KB        8-KB
27# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
28# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
29# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
30# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
31# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
32# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
33#
34# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
35# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
36# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
37# The results were collected with specially crafted speed.c benchmark
38# in order to compare them with results reported in "Intel Advanced
39# Encryption Standard (AES) New Instruction Set" White Paper Revision
40# 3.0 dated May 2010. All above results are consistently better. This
41# module also provides better performance for block sizes smaller than
42# 128 bytes in points *not* represented in the above table.
43#
44# Looking at the results for 8-KB buffer.
45#
46# CFB and OFB results are far from the limit, because implementation
47# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
48# single-block aesni_encrypt, which is not the most optimal way to go.
49# CBC encrypt result is unexpectedly high and there is no documented
50# explanation for it. Seemingly there is a small penalty for feeding
51# the result back to AES unit the way it's done in CBC mode. There is
52# nothing one can do and the result appears optimal. CCM result is
53# identical to CBC, because CBC-MAC is essentially CBC encrypt without
54# saving output. CCM CTR "stays invisible," because it's neatly
55# interleaved with CBC-MAC. This provides ~30% improvement over
56# "straightforward" CCM implementation with CTR and CBC-MAC performed
57# disjointly. Parallelizable modes practically achieve the theoretical
58# limit.
59#
60# Looking at how results vary with buffer size.
61#
62# Curves are practically saturated at 1-KB buffer size. In most cases
63# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
64# CTR curve doesn't follow this pattern and is "slowest" changing one
65# with "256-byte" result being 87% of "8-KB." This is because overhead
66# in CTR mode is most computationally intensive. Small-block CCM
67# decrypt is slower than encrypt, because first CTR and last CBC-MAC
68# iterations can't be interleaved.
69#
70# Results for 192- and 256-bit keys.
71#
72# EVP-free results were observed to scale perfectly with number of
73# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
74# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
75# are a tad smaller, because the above mentioned penalty biases all
76# results by same constant value. In similar way function call
77# overhead affects small-block performance, as well as OFB and CFB
78# results. Differences are not large, most common coefficients are
79# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
80# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
81
82# January 2011
83#
84# While Westmere processor features 6 cycles latency for aes[enc|dec]
85# instructions, which can be scheduled every second cycle, Sandy
86# Bridge spends 8 cycles per instruction, but it can schedule them
87# every cycle. This means that code targeting Westmere would perform
88# suboptimally on Sandy Bridge. Therefore this update.
89#
90# In addition, non-parallelizable CBC encrypt (as well as CCM) is
91# optimized. Relative improvement might appear modest, 8% on Westmere,
92# but in absolute terms it's 3.77 cycles per byte encrypted with
93# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
94# should be compared to asymptotic limits of 3.75 for Westmere and
95# 5.00 for Sandy Bridge. Actually, the fact that they get this close
96# to asymptotic limits is quite amazing. Indeed, the limit is
97# calculated as latency times number of rounds, 10 for 128-bit key,
98# and divided by 16, the number of bytes in block, or in other words
99# it accounts *solely* for aesenc instructions. But there are extra
100# instructions, and numbers so close to the asymptotic limits mean
101# that it's as if it takes as little as *one* additional cycle to
102# execute all of them. How is it possible? It is possible thanks to
103# out-of-order execution logic, which manages to overlap post-
104# processing of previous block, things like saving the output, with
105# actual encryption of current block, as well as pre-processing of
106# current block, things like fetching input and xor-ing it with
107# 0-round element of the key schedule, with actual encryption of
108# previous block. Keep this in mind...
109#
110# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
111# performance is achieved by interleaving instructions working on
112# independent blocks. In which case asymptotic limit for such modes
113# can be obtained by dividing above mentioned numbers by AES
114# instructions' interleave factor. Westmere can execute at most 3
115# instructions at a time, meaning that optimal interleave factor is 3,
116# and that's where the "magic" number of 1.25 comes from. "Optimal
117# interleave factor" means that increase of interleave factor does
118# not improve performance. The formula has proven to reflect reality
119# pretty well on Westmere... Sandy Bridge on the other hand can
120# execute up to 8 AES instructions at a time, so how does varying
121# interleave factor affect the performance? Here is table for ECB
122# (numbers are cycles per byte processed with 128-bit key):
123#
124# instruction interleave factor		3x	6x	8x
125# theoretical asymptotic limit		1.67	0.83	0.625
126# measured performance for 8KB block	1.05	0.86	0.84
127#
128# "as if" interleave factor		4.7x	5.8x	6.0x
129#
130# Further data for other parallelizable modes:
131#
132# CBC decrypt				1.16	0.93	0.74
133# CTR					1.14	0.91	0.74
134#
135# Well, given 3x column it's probably inappropriate to call the limit
136# asymptotic, if it can be surpassed, isn't it? What happens there?
137# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
138# magic is responsible for this. Processor overlaps not only the
139# additional instructions with AES ones, but even AES instructions
140# processing adjacent triplets of independent blocks. In the 6x case
141# additional instructions still claim disproportionally small amount
142# of additional cycles, but in 8x case number of instructions must be
143# a tad too high for out-of-order logic to cope with, and AES unit
144# remains underutilized... As you can see 8x interleave is hardly
145# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
146# utilizes 6x interleave because of limited register bank capacity.
147#
148# Higher interleave factors do have negative impact on Westmere
149# performance. While for ECB mode it's negligible ~1.5%, other
150# parallelizables perform ~5% worse, which is outweighed by ~25%
151# improvement on Sandy Bridge. To balance regression on Westmere
152# CTR mode was implemented with 6x aesenc interleave factor.
153
154# April 2011
155#
156# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
157# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
158# in CTR mode AES instruction interleave factor was chosen to be 6x.
159
160######################################################################
161# Current large-block performance in cycles per byte processed with
162# 128-bit key (less is better).
163#
164#		CBC en-/decrypt	CTR	XTS	ECB
165# Westmere	3.77/1.25	1.25	1.25	1.26
166# * Bridge	5.07/0.74	0.75	0.90	0.85
167# Haswell	4.44/0.63	0.63	0.73	0.63
168# Silvermont	5.75/3.54	3.56	4.12	3.87(*)
169# Bulldozer	5.77/0.70	0.72	0.90	0.70
170#
171# (*)	Atom Silvermont ECB result is suboptimal because of penalties
172#	incurred by operations on %xmm8-15. As ECB is not considered
173#	critical, nothing was done to mitigate the problem.
174
175$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
176			# generates drop-in replacement for
177			# crypto/aes/asm/aes-x86_64.pl:-)
178
# Command line: [flavour] output.  A first argument containing a dot is
# taken to be the output file name, which leaves the flavour undefined.
179$flavour = shift;
180$output  = shift;
181if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
182
# Win64 code paths are selected by the flavour name or by an .asm output
# suffix.
183$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
184
# Look for the perlasm translator next to this script first, then under
# ../../perlasm relative to it.
185$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
186( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
187( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
188die "can't locate x86_64-xlate.pl";
189
# Everything printed to STDOUT from here on is piped through the translator.
# The translator path is quoted so that a script directory containing spaces
# survives the shell, and a failure to start the pipe is fatal instead of
# being ignored.  $flavour and $output are passed exactly as before.
190open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";
191*STDOUT=*OUT;
192
# Mnemonic used for every round-key load; both $PREFIX flavours use the
# unaligned form.
$movkey = "movups";

# First four integer argument registers of the target calling convention.
@_4args = ("%rdi","%rsi","%rdx","%rcx");		# Unix order
@_4args = ("%rcx","%rdx","%r8", "%r9") if ($win64);	# Win64 order

# Accumulator for the generated assembly text.
$code  = ".text\n";
$code .= ".extern	OPENSSL_ia32cap_P\n";

# General-purpose registers.  The first five follow the natural Unix
# argument order for public $PREFIX_[ecb|cbc]_encrypt ...
($inp,$out,$len,$key,$ivp) = ("%rdi","%rsi","%rdx","%rcx","%r8");
	# $key is input to and changed by aesni_[en|de]cryptN !!!
	# $ivp is used by cbc, ctr, ...
$rounds = "%eax";	# input to and changed by aesni_[en|de]cryptN !!!
$rnds_  = "%r10d";	# backup copy for $rounds
$key_   = "%r11";	# backup copy for $key

# %xmm register layout: %xmm0-1 carry round keys, %xmm2-9 carry data blocks.
($rndkey0,$rndkey1,
 $inout0,$inout1,$inout2,$inout3,
 $inout4,$inout5,$inout6,$inout7) = map("%xmm$_", 0..9);

# Names used in CBC decrypt, CTR, ...; they share %xmm6-9 with
# $inout4..$inout7.
($in2,$in1,$in0,$iv) = ($inout4,$inout5,$inout6,$inout7);
220
221# Inline version of internal aesni_[en|de]crypt1.
222#
223# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
224# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a single-block
# cipher sequence to $code.
#   $p      - "enc" or "dec"; spliced into the aes${p}/aes${p}last
#             mnemonics and into the loop label.
#   $key    - register pointing at the key schedule; the emitted code
#             advances it, so the pointer is not preserved.
#   $rounds - register holding the loop count; the emitted code
#             decrements it to zero.
#   $inout  - %xmm register holding the block; defaults to $inout0.
#   $ivec   - optional %xmm register.  When given, round key 0 is xor-ed
#             into $ivec and $ivec is then xor-ed into $inout, so $ivec
#             is modified as well.
# $rndkey0 and $rndkey1 are used as scratch.  $sn counts expansions so
# that every emitted loop label is unique.
225{ my $sn;
226sub aesni_generate1 {
227my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
228++$sn;
# Load round keys 0 and 1.
229$code.=<<___;
230	$movkey	($key),$rndkey0
231	$movkey	16($key),$rndkey1
232___
# Initial whitening, with $ivec folded in when one was supplied.
233$code.=<<___ if (defined($ivec));
234	xorps	$rndkey0,$ivec
235	lea	32($key),$key
236	xorps	$ivec,$inout
237___
238$code.=<<___ if (!defined($ivec));
239	lea	32($key),$key
240	xorps	$rndkey0,$inout
241___
# One round per iteration; the key loaded in the final iteration is
# consumed by the aes${p}last that follows the loop.
242$code.=<<___;
243.Loop_${p}1_$sn:
244	aes${p}	$rndkey1,$inout
245	dec	$rounds
246	$movkey	($key),$rndkey1
247	lea	16($key),$key
248	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
249	aes${p}last	$rndkey1,$inout
250___
251}}
252# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
253#
# Emits the public single-block routines ${PREFIX}_encrypt and
# ${PREFIX}_decrypt.  Each one loads 16 bytes from inp, reads the round
# count from offset 240 of key, runs the inline one-block sequence from
# aesni_generate1 and stores the result to out.  Before returning,
# $rndkey0, $rndkey1 and $inout0 are zeroed.
#
# $inp, $out and $key are block-local and taken from @_4args, so the
# emitted code reads its arguments from the registers of whichever calling
# convention was selected.
# NOTE(review): \@abi-omnipotent presumably tells x86_64-xlate.pl not to
# add any ABI-specific prologue - confirm against the translator.
254{ my ($inp,$out,$key) = @_4args;
255
256$code.=<<___;
257.globl	${PREFIX}_encrypt
258.type	${PREFIX}_encrypt,\@abi-omnipotent
259.align	16
260${PREFIX}_encrypt:
261	movups	($inp),$inout0		# load input
262	mov	240($key),$rounds	# key->rounds
263___
264	&aesni_generate1("enc",$key,$rounds);
265$code.=<<___;
266	 pxor	$rndkey0,$rndkey0	# clear register bank
267	 pxor	$rndkey1,$rndkey1
268	movups	$inout0,($out)		# output
269	 pxor	$inout0,$inout0
270	ret
271.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
272
273.globl	${PREFIX}_decrypt
274.type	${PREFIX}_decrypt,\@abi-omnipotent
275.align	16
276${PREFIX}_decrypt:
277	movups	($inp),$inout0		# load input
278	mov	240($key),$rounds	# key->rounds
279___
280	&aesni_generate1("dec",$key,$rounds);
281$code.=<<___;
282	 pxor	$rndkey0,$rndkey0	# clear register bank
283	 pxor	$rndkey1,$rndkey1
284	movups	$inout0,($out)		# output
285	 pxor	$inout0,$inout0
286	ret
287.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
288___
289}
290
291# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
292# factor. Why were 3x subroutines originally used in loops? Even though
293# aes[enc|dec] latency was originally 6, it could be scheduled only
294# every *2nd* cycle. Thus 3x interleave was the one providing optimal
295# utilization, i.e. when subroutine's throughput is virtually same as
296# of non-interleaved subroutine [for number of input blocks up to 3].
297# This is why it originally made no sense to implement 2x subroutine.
298# But times change and it became appropriate to spend extra 192 bytes
299# on 2x subroutine on Atom Silvermont account. For processors that
300# can schedule aes[enc|dec] every cycle optimal interleave factor
301# equals the corresponding instruction's latency. 8x is optimal for
302# * Bridge and "super-optimal" for other Intel CPUs...
303
304sub aesni_generate2 {
305my $dir=shift;
306# As already mentioned it takes in $key and $rounds, which are *not*
307# preserved. $inout[0-1] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt2 to $code.  $dir is "enc"
# or "dec" and is spliced into the aes${dir}/aes${dir}last mnemonics and
# into the labels.  The emitted code:
#  - xors round key 0 into $inout0-1;
#  - turns $rounds into a byte count (shl 4), moves $key to 32 bytes past
#    that offset in the schedule and leaves in %rax a negative index
#    (16 minus the byte count) that is stepped up by 32 per iteration;
#    that add is the only arithmetic instruction in the loop, so it is
#    what the closing jnz tests;
#  - applies two rounds per iteration, alternating $rndkey1 and $rndkey0,
#    then one more round and aes${dir}last after the loop.
# %rax is named directly because $rounds is %eax.
308$code.=<<___;
309.type	_aesni_${dir}rypt2,\@abi-omnipotent
310.align	16
311_aesni_${dir}rypt2:
312	$movkey	($key),$rndkey0
313	shl	\$4,$rounds
314	$movkey	16($key),$rndkey1
315	xorps	$rndkey0,$inout0
316	xorps	$rndkey0,$inout1
317	$movkey	32($key),$rndkey0
318	lea	32($key,$rounds),$key
319	neg	%rax				# $rounds
320	add	\$16,%rax
321
322.L${dir}_loop2:
323	aes${dir}	$rndkey1,$inout0
324	aes${dir}	$rndkey1,$inout1
325	$movkey		($key,%rax),$rndkey1
326	add		\$32,%rax
327	aes${dir}	$rndkey0,$inout0
328	aes${dir}	$rndkey0,$inout1
329	$movkey		-16($key,%rax),$rndkey0
330	jnz		.L${dir}_loop2
331
332	aes${dir}	$rndkey1,$inout0
333	aes${dir}	$rndkey1,$inout1
334	aes${dir}last	$rndkey0,$inout0
335	aes${dir}last	$rndkey0,$inout1
336	ret
337.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
338___
339}
340sub aesni_generate3 {
341my $dir=shift;
342# As already mentioned it takes in $key and $rounds, which are *not*
343# preserved. $inout[0-2] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt3 to $code.  $dir is "enc"
# or "dec" and is spliced into the aes${dir}/aes${dir}last mnemonics and
# into the labels.  The emitted code xors round key 0 into $inout0-2,
# converts $rounds to a byte count (shl 4), moves $key to 32 bytes past
# that offset and keeps in %rax (the full register behind $rounds) a
# negative index that rises by 32 per iteration until it reaches zero.
# Each iteration applies two rounds to all three blocks, alternating
# $rndkey1 and $rndkey0; one more round and aes${dir}last follow the loop.
344$code.=<<___;
345.type	_aesni_${dir}rypt3,\@abi-omnipotent
346.align	16
347_aesni_${dir}rypt3:
348	$movkey	($key),$rndkey0
349	shl	\$4,$rounds
350	$movkey	16($key),$rndkey1
351	xorps	$rndkey0,$inout0
352	xorps	$rndkey0,$inout1
353	xorps	$rndkey0,$inout2
354	$movkey	32($key),$rndkey0
355	lea	32($key,$rounds),$key
356	neg	%rax				# $rounds
357	add	\$16,%rax
358
359.L${dir}_loop3:
360	aes${dir}	$rndkey1,$inout0
361	aes${dir}	$rndkey1,$inout1
362	aes${dir}	$rndkey1,$inout2
363	$movkey		($key,%rax),$rndkey1
364	add		\$32,%rax
365	aes${dir}	$rndkey0,$inout0
366	aes${dir}	$rndkey0,$inout1
367	aes${dir}	$rndkey0,$inout2
368	$movkey		-16($key,%rax),$rndkey0
369	jnz		.L${dir}_loop3
370
371	aes${dir}	$rndkey1,$inout0
372	aes${dir}	$rndkey1,$inout1
373	aes${dir}	$rndkey1,$inout2
374	aes${dir}last	$rndkey0,$inout0
375	aes${dir}last	$rndkey0,$inout1
376	aes${dir}last	$rndkey0,$inout2
377	ret
378.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
379___
380}
381# 4x interleave is implemented to improve small block performance,
382# most notably [and naturally] 4 block by ~30%. One can argue that one
383# should have implemented 5x as well, but improvement would be <20%,
384# so it's not worth it...
385sub aesni_generate4 {
386my $dir=shift;
387# As already mentioned it takes in $key and $rounds, which are *not*
388# preserved. $inout[0-3] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt4 to $code.  $dir is "enc"
# or "dec" and is spliced into the aes${dir}/aes${dir}last mnemonics and
# into the labels.  The emitted code xors round key 0 into $inout0-3,
# converts $rounds to a byte count (shl 4), moves $key to 32 bytes past
# that offset and keeps in %rax (the full register behind $rounds) a
# negative index that rises by 32 per iteration until it reaches zero.
# Each iteration applies two rounds to all four blocks, alternating
# $rndkey1 and $rndkey0; one more round and aes${dir}last follow the loop.
#
# The ".byte 0x0f,0x1f,0x00" in the setup is the encoding of a 3-byte NOP.
# NOTE(review): presumably padding that positions the loop head - confirm.
389$code.=<<___;
390.type	_aesni_${dir}rypt4,\@abi-omnipotent
391.align	16
392_aesni_${dir}rypt4:
393	$movkey	($key),$rndkey0
394	shl	\$4,$rounds
395	$movkey	16($key),$rndkey1
396	xorps	$rndkey0,$inout0
397	xorps	$rndkey0,$inout1
398	xorps	$rndkey0,$inout2
399	xorps	$rndkey0,$inout3
400	$movkey	32($key),$rndkey0
401	lea	32($key,$rounds),$key
402	neg	%rax				# $rounds
403	.byte	0x0f,0x1f,0x00
404	add	\$16,%rax
405
406.L${dir}_loop4:
407	aes${dir}	$rndkey1,$inout0
408	aes${dir}	$rndkey1,$inout1
409	aes${dir}	$rndkey1,$inout2
410	aes${dir}	$rndkey1,$inout3
411	$movkey		($key,%rax),$rndkey1
412	add		\$32,%rax
413	aes${dir}	$rndkey0,$inout0
414	aes${dir}	$rndkey0,$inout1
415	aes${dir}	$rndkey0,$inout2
416	aes${dir}	$rndkey0,$inout3
417	$movkey		-16($key,%rax),$rndkey0
418	jnz		.L${dir}_loop4
419
420	aes${dir}	$rndkey1,$inout0
421	aes${dir}	$rndkey1,$inout1
422	aes${dir}	$rndkey1,$inout2
423	aes${dir}	$rndkey1,$inout3
424	aes${dir}last	$rndkey0,$inout0
425	aes${dir}last	$rndkey0,$inout1
426	aes${dir}last	$rndkey0,$inout2
427	aes${dir}last	$rndkey0,$inout3
428	ret
429.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
430___
431}
432sub aesni_generate6 {
433my $dir=shift;
434# As already mentioned it takes in $key and $rounds, which are *not*
435# preserved. $inout[0-5] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt6 to $code.  $dir is "enc"
# or "dec" and is spliced into the aes${dir}/aes${dir}last mnemonics and
# into the labels.  Indexing is the same negative-offset scheme used by the
# narrower variants: $rounds becomes a byte count, $key is moved 32 bytes
# past that offset and %rax (the full register behind $rounds) counts up
# to zero in steps of 32.
#
# The setup interleaves the round-key-0 xor of all six blocks with the
# first $rndkey1 round of $inout0-2, then jumps to .L${dir}_loop6_enter in
# the middle of the loop body so that $inout3-5 receive that same round
# there.  Every later iteration applies two rounds to all six blocks; one
# more round and aes${dir}last follow the loop.
436$code.=<<___;
437.type	_aesni_${dir}rypt6,\@abi-omnipotent
438.align	16
439_aesni_${dir}rypt6:
440	$movkey		($key),$rndkey0
441	shl		\$4,$rounds
442	$movkey		16($key),$rndkey1
443	xorps		$rndkey0,$inout0
444	pxor		$rndkey0,$inout1
445	pxor		$rndkey0,$inout2
446	aes${dir}	$rndkey1,$inout0
447	lea		32($key,$rounds),$key
448	neg		%rax			# $rounds
449	aes${dir}	$rndkey1,$inout1
450	pxor		$rndkey0,$inout3
451	pxor		$rndkey0,$inout4
452	aes${dir}	$rndkey1,$inout2
453	pxor		$rndkey0,$inout5
454	$movkey		($key,%rax),$rndkey0
455	add		\$16,%rax
456	jmp		.L${dir}_loop6_enter
457.align	16
458.L${dir}_loop6:
459	aes${dir}	$rndkey1,$inout0
460	aes${dir}	$rndkey1,$inout1
461	aes${dir}	$rndkey1,$inout2
462.L${dir}_loop6_enter:
463	aes${dir}	$rndkey1,$inout3
464	aes${dir}	$rndkey1,$inout4
465	aes${dir}	$rndkey1,$inout5
466	$movkey		($key,%rax),$rndkey1
467	add		\$32,%rax
468	aes${dir}	$rndkey0,$inout0
469	aes${dir}	$rndkey0,$inout1
470	aes${dir}	$rndkey0,$inout2
471	aes${dir}	$rndkey0,$inout3
472	aes${dir}	$rndkey0,$inout4
473	aes${dir}	$rndkey0,$inout5
474	$movkey		-16($key,%rax),$rndkey0
475	jnz		.L${dir}_loop6
476
477	aes${dir}	$rndkey1,$inout0
478	aes${dir}	$rndkey1,$inout1
479	aes${dir}	$rndkey1,$inout2
480	aes${dir}	$rndkey1,$inout3
481	aes${dir}	$rndkey1,$inout4
482	aes${dir}	$rndkey1,$inout5
483	aes${dir}last	$rndkey0,$inout0
484	aes${dir}last	$rndkey0,$inout1
485	aes${dir}last	$rndkey0,$inout2
486	aes${dir}last	$rndkey0,$inout3
487	aes${dir}last	$rndkey0,$inout4
488	aes${dir}last	$rndkey0,$inout5
489	ret
490.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
491___
492}
493sub aesni_generate8 {
494my $dir=shift;
495# As already mentioned it takes in $key and $rounds, which are *not*
496# preserved. $inout[0-7] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt8 to $code.  $dir is "enc"
# or "dec" and is spliced into the aes${dir}/aes${dir}last mnemonics and
# into the labels.  Indexing is the same negative-offset scheme used by the
# narrower variants: $rounds becomes a byte count, $key is moved 32 bytes
# past that offset and %rax (the full register behind $rounds) counts up
# to zero in steps of 32.
#
# The setup interleaves the round-key-0 xor of all eight blocks with the
# first $rndkey1 round of $inout0-1, then jumps to .L${dir}_loop8_inner so
# that $inout2-7 receive that same round inside the loop body.  Every later
# iteration applies two rounds to all eight blocks; one more round and
# aes${dir}last follow the loop.
#
# .L${dir}_loop8_enter is defined here but not referenced by this routine.
# NOTE(review): it may be a jump target for code elsewhere in the file -
# confirm before removing.
497$code.=<<___;
498.type	_aesni_${dir}rypt8,\@abi-omnipotent
499.align	16
500_aesni_${dir}rypt8:
501	$movkey		($key),$rndkey0
502	shl		\$4,$rounds
503	$movkey		16($key),$rndkey1
504	xorps		$rndkey0,$inout0
505	xorps		$rndkey0,$inout1
506	pxor		$rndkey0,$inout2
507	pxor		$rndkey0,$inout3
508	pxor		$rndkey0,$inout4
509	lea		32($key,$rounds),$key
510	neg		%rax			# $rounds
511	aes${dir}	$rndkey1,$inout0
512	pxor		$rndkey0,$inout5
513	pxor		$rndkey0,$inout6
514	aes${dir}	$rndkey1,$inout1
515	pxor		$rndkey0,$inout7
516	$movkey		($key,%rax),$rndkey0
517	add		\$16,%rax
518	jmp		.L${dir}_loop8_inner
519.align	16
520.L${dir}_loop8:
521	aes${dir}	$rndkey1,$inout0
522	aes${dir}	$rndkey1,$inout1
523.L${dir}_loop8_inner:
524	aes${dir}	$rndkey1,$inout2
525	aes${dir}	$rndkey1,$inout3
526	aes${dir}	$rndkey1,$inout4
527	aes${dir}	$rndkey1,$inout5
528	aes${dir}	$rndkey1,$inout6
529	aes${dir}	$rndkey1,$inout7
530.L${dir}_loop8_enter:
531	$movkey		($key,%rax),$rndkey1
532	add		\$32,%rax
533	aes${dir}	$rndkey0,$inout0
534	aes${dir}	$rndkey0,$inout1
535	aes${dir}	$rndkey0,$inout2
536	aes${dir}	$rndkey0,$inout3
537	aes${dir}	$rndkey0,$inout4
538	aes${dir}	$rndkey0,$inout5
539	aes${dir}	$rndkey0,$inout6
540	aes${dir}	$rndkey0,$inout7
541	$movkey		-16($key,%rax),$rndkey0
542	jnz		.L${dir}_loop8
543
544	aes${dir}	$rndkey1,$inout0
545	aes${dir}	$rndkey1,$inout1
546	aes${dir}	$rndkey1,$inout2
547	aes${dir}	$rndkey1,$inout3
548	aes${dir}	$rndkey1,$inout4
549	aes${dir}	$rndkey1,$inout5
550	aes${dir}	$rndkey1,$inout6
551	aes${dir}	$rndkey1,$inout7
552	aes${dir}last	$rndkey0,$inout0
553	aes${dir}last	$rndkey0,$inout1
554	aes${dir}last	$rndkey0,$inout2
555	aes${dir}last	$rndkey0,$inout3
556	aes${dir}last	$rndkey0,$inout4
557	aes${dir}last	$rndkey0,$inout5
558	aes${dir}last	$rndkey0,$inout6
559	aes${dir}last	$rndkey0,$inout7
560	ret
561.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
562___
563}
# Emit the private interleaved helpers for every supported factor.  The
# encrypt flavour is produced only for the "aesni" prefix, the decrypt
# flavour always.  Emission order: enc then dec for 2x, followed by 3x,
# 4x, 6x and 8x.
foreach my $generate (\&aesni_generate2, \&aesni_generate3,
		      \&aesni_generate4, \&aesni_generate6,
		      \&aesni_generate8) {
	$generate->("enc") if ($PREFIX eq "aesni");
	$generate->("dec");
}
574
575if ($PREFIX eq "aesni") {
576########################################################################
577# void aesni_ecb_encrypt (const void *in, void *out,
578#			  size_t length, const AES_KEY *key,
579#			  int enc);
#
# Overview of the emitted routine:
#  - length is rounded down to a multiple of 16; if nothing is left the
#    routine returns at once.
#  - enc == 0 selects the decrypt path, anything else the encrypt path.
#  - Input is consumed 8 blocks at a time through _aesni_[en|de]crypt8.
#    A remainder of 1..7 blocks is dispatched by size: 1 uses the inline
#    one-block sequence, 2/3/4 use the matching N-block helper, 5 and 6
#    use the 6-block helper (the unused register is zeroed for 5), and 7
#    uses the 8-block helper with the eighth register zeroed.
#  - The decrypt path zeroes each data register after storing it; the
#    encrypt path does not.  Both round-key registers are zeroed on exit.
#  - On Win64 %xmm6-%xmm9 are saved in a 0x58-byte stack frame on entry.
# NOTE(review): "\@function,5" presumably makes x86_64-xlate.pl map the
# five arguments onto the Unix-order registers named by $inp/$out/$len/
# $key/%r8 - confirm against the translator.
580$code.=<<___;
581.globl	aesni_ecb_encrypt
582.type	aesni_ecb_encrypt,\@function,5
583.align	16
584aesni_ecb_encrypt:
585___
# Win64 only: reserve the frame and save the four %xmm registers that the
# routine overwrites.
586$code.=<<___ if ($win64);
587	lea	-0x58(%rsp),%rsp
588	movaps	%xmm6,(%rsp)		# offload $inout4..7
589	movaps	%xmm7,0x10(%rsp)
590	movaps	%xmm8,0x20(%rsp)
591	movaps	%xmm9,0x30(%rsp)
592.Lecb_enc_body:
593___
# Common entry checks (length rounding, key/rounds backup, direction test)
# followed by the encrypt path: the 8-block main loop, the 1..7-block tail
# dispatch and the 7-block case.  The heredoc ends at .Lecb_enc_one so the
# inline one-block sequence can be spliced in right after the label.
594$code.=<<___;
595	and	\$-16,$len		# if ($len<16)
596	jz	.Lecb_ret		# return
597
598	mov	240($key),$rounds	# key->rounds
599	$movkey	($key),$rndkey0
600	mov	$key,$key_		# backup $key
601	mov	$rounds,$rnds_		# backup $rounds
602	test	%r8d,%r8d		# 5th argument
603	jz	.Lecb_decrypt
604#--------------------------- ECB ENCRYPT ------------------------------#
605	cmp	\$0x80,$len		# if ($len<8*16)
606	jb	.Lecb_enc_tail		# short input
607
608	movdqu	($inp),$inout0		# load 8 input blocks
609	movdqu	0x10($inp),$inout1
610	movdqu	0x20($inp),$inout2
611	movdqu	0x30($inp),$inout3
612	movdqu	0x40($inp),$inout4
613	movdqu	0x50($inp),$inout5
614	movdqu	0x60($inp),$inout6
615	movdqu	0x70($inp),$inout7
616	lea	0x80($inp),$inp		# $inp+=8*16
617	sub	\$0x80,$len		# $len-=8*16 (can be zero)
618	jmp	.Lecb_enc_loop8_enter
619.align 16
620.Lecb_enc_loop8:
621	movups	$inout0,($out)		# store 8 output blocks
622	mov	$key_,$key		# restore $key
623	movdqu	($inp),$inout0		# load 8 input blocks
624	mov	$rnds_,$rounds		# restore $rounds
625	movups	$inout1,0x10($out)
626	movdqu	0x10($inp),$inout1
627	movups	$inout2,0x20($out)
628	movdqu	0x20($inp),$inout2
629	movups	$inout3,0x30($out)
630	movdqu	0x30($inp),$inout3
631	movups	$inout4,0x40($out)
632	movdqu	0x40($inp),$inout4
633	movups	$inout5,0x50($out)
634	movdqu	0x50($inp),$inout5
635	movups	$inout6,0x60($out)
636	movdqu	0x60($inp),$inout6
637	movups	$inout7,0x70($out)
638	lea	0x80($out),$out		# $out+=8*16
639	movdqu	0x70($inp),$inout7
640	lea	0x80($inp),$inp		# $inp+=8*16
641.Lecb_enc_loop8_enter:
642
643	call	_aesni_encrypt8
644
645	sub	\$0x80,$len
646	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
647
648	movups	$inout0,($out)		# store 8 output blocks
649	mov	$key_,$key		# restore $key
650	movups	$inout1,0x10($out)
651	mov	$rnds_,$rounds		# restore $rounds
652	movups	$inout2,0x20($out)
653	movups	$inout3,0x30($out)
654	movups	$inout4,0x40($out)
655	movups	$inout5,0x50($out)
656	movups	$inout6,0x60($out)
657	movups	$inout7,0x70($out)
658	lea	0x80($out),$out		# $out+=8*16
659	add	\$0x80,$len		# restore real remaining $len
660	jz	.Lecb_ret		# done if ($len==0)
661
662.Lecb_enc_tail:				# $len is less than 8*16
663	movups	($inp),$inout0
664	cmp	\$0x20,$len
665	jb	.Lecb_enc_one
666	movups	0x10($inp),$inout1
667	je	.Lecb_enc_two
668	movups	0x20($inp),$inout2
669	cmp	\$0x40,$len
670	jb	.Lecb_enc_three
671	movups	0x30($inp),$inout3
672	je	.Lecb_enc_four
673	movups	0x40($inp),$inout4
674	cmp	\$0x60,$len
675	jb	.Lecb_enc_five
676	movups	0x50($inp),$inout5
677	je	.Lecb_enc_six
678	movdqu	0x60($inp),$inout6
679	xorps	$inout7,$inout7
680	call	_aesni_encrypt8
681	movups	$inout0,($out)		# store 7 output blocks
682	movups	$inout1,0x10($out)
683	movups	$inout2,0x20($out)
684	movups	$inout3,0x30($out)
685	movups	$inout4,0x40($out)
686	movups	$inout5,0x50($out)
687	movups	$inout6,0x60($out)
688	jmp	.Lecb_ret
689.align	16
690.Lecb_enc_one:
691___
# Single remaining block: inline one-block encrypt on $inout0.
692	&aesni_generate1("enc",$key,$rounds);
# Remaining encrypt tail cases (1 to 6 blocks), then the decrypt path: the
# 8-block main loop, the tail dispatch and the 7-block case.  Decrypt
# stores are paired with a pxor that zeroes the register just written.
# The heredoc ends at .Lecb_dec_one so the inline one-block sequence can
# be spliced in right after the label.
693$code.=<<___;
694	movups	$inout0,($out)		# store one output block
695	jmp	.Lecb_ret
696.align	16
697.Lecb_enc_two:
698	call	_aesni_encrypt2
699	movups	$inout0,($out)		# store 2 output blocks
700	movups	$inout1,0x10($out)
701	jmp	.Lecb_ret
702.align	16
703.Lecb_enc_three:
704	call	_aesni_encrypt3
705	movups	$inout0,($out)		# store 3 output blocks
706	movups	$inout1,0x10($out)
707	movups	$inout2,0x20($out)
708	jmp	.Lecb_ret
709.align	16
710.Lecb_enc_four:
711	call	_aesni_encrypt4
712	movups	$inout0,($out)		# store 4 output blocks
713	movups	$inout1,0x10($out)
714	movups	$inout2,0x20($out)
715	movups	$inout3,0x30($out)
716	jmp	.Lecb_ret
717.align	16
718.Lecb_enc_five:
719	xorps	$inout5,$inout5
720	call	_aesni_encrypt6
721	movups	$inout0,($out)		# store 5 output blocks
722	movups	$inout1,0x10($out)
723	movups	$inout2,0x20($out)
724	movups	$inout3,0x30($out)
725	movups	$inout4,0x40($out)
726	jmp	.Lecb_ret
727.align	16
728.Lecb_enc_six:
729	call	_aesni_encrypt6
730	movups	$inout0,($out)		# store 6 output blocks
731	movups	$inout1,0x10($out)
732	movups	$inout2,0x20($out)
733	movups	$inout3,0x30($out)
734	movups	$inout4,0x40($out)
735	movups	$inout5,0x50($out)
736	jmp	.Lecb_ret
737#--------------------------- ECB DECRYPT ------------------------------#
738.align	16
739.Lecb_decrypt:
740	cmp	\$0x80,$len		# if ($len<8*16)
741	jb	.Lecb_dec_tail		# short input
742
743	movdqu	($inp),$inout0		# load 8 input blocks
744	movdqu	0x10($inp),$inout1
745	movdqu	0x20($inp),$inout2
746	movdqu	0x30($inp),$inout3
747	movdqu	0x40($inp),$inout4
748	movdqu	0x50($inp),$inout5
749	movdqu	0x60($inp),$inout6
750	movdqu	0x70($inp),$inout7
751	lea	0x80($inp),$inp		# $inp+=8*16
752	sub	\$0x80,$len		# $len-=8*16 (can be zero)
753	jmp	.Lecb_dec_loop8_enter
754.align 16
755.Lecb_dec_loop8:
756	movups	$inout0,($out)		# store 8 output blocks
757	mov	$key_,$key		# restore $key
758	movdqu	($inp),$inout0		# load 8 input blocks
759	mov	$rnds_,$rounds		# restore $rounds
760	movups	$inout1,0x10($out)
761	movdqu	0x10($inp),$inout1
762	movups	$inout2,0x20($out)
763	movdqu	0x20($inp),$inout2
764	movups	$inout3,0x30($out)
765	movdqu	0x30($inp),$inout3
766	movups	$inout4,0x40($out)
767	movdqu	0x40($inp),$inout4
768	movups	$inout5,0x50($out)
769	movdqu	0x50($inp),$inout5
770	movups	$inout6,0x60($out)
771	movdqu	0x60($inp),$inout6
772	movups	$inout7,0x70($out)
773	lea	0x80($out),$out		# $out+=8*16
774	movdqu	0x70($inp),$inout7
775	lea	0x80($inp),$inp		# $inp+=8*16
776.Lecb_dec_loop8_enter:
777
778	call	_aesni_decrypt8
779
780	$movkey	($key_),$rndkey0
781	sub	\$0x80,$len
782	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
783
784	movups	$inout0,($out)		# store 8 output blocks
785	 pxor	$inout0,$inout0		# clear register bank
786	mov	$key_,$key		# restore $key
787	movups	$inout1,0x10($out)
788	 pxor	$inout1,$inout1
789	mov	$rnds_,$rounds		# restore $rounds
790	movups	$inout2,0x20($out)
791	 pxor	$inout2,$inout2
792	movups	$inout3,0x30($out)
793	 pxor	$inout3,$inout3
794	movups	$inout4,0x40($out)
795	 pxor	$inout4,$inout4
796	movups	$inout5,0x50($out)
797	 pxor	$inout5,$inout5
798	movups	$inout6,0x60($out)
799	 pxor	$inout6,$inout6
800	movups	$inout7,0x70($out)
801	 pxor	$inout7,$inout7
802	lea	0x80($out),$out		# $out+=8*16
803	add	\$0x80,$len		# restore real remaining $len
804	jz	.Lecb_ret		# done if ($len==0)
805
806.Lecb_dec_tail:
807	movups	($inp),$inout0
808	cmp	\$0x20,$len
809	jb	.Lecb_dec_one
810	movups	0x10($inp),$inout1
811	je	.Lecb_dec_two
812	movups	0x20($inp),$inout2
813	cmp	\$0x40,$len
814	jb	.Lecb_dec_three
815	movups	0x30($inp),$inout3
816	je	.Lecb_dec_four
817	movups	0x40($inp),$inout4
818	cmp	\$0x60,$len
819	jb	.Lecb_dec_five
820	movups	0x50($inp),$inout5
821	je	.Lecb_dec_six
822	movups	0x60($inp),$inout6
823	$movkey	($key),$rndkey0
824	xorps	$inout7,$inout7
825	call	_aesni_decrypt8
826	movups	$inout0,($out)		# store 7 output blocks
827	 pxor	$inout0,$inout0		# clear register bank
828	movups	$inout1,0x10($out)
829	 pxor	$inout1,$inout1
830	movups	$inout2,0x20($out)
831	 pxor	$inout2,$inout2
832	movups	$inout3,0x30($out)
833	 pxor	$inout3,$inout3
834	movups	$inout4,0x40($out)
835	 pxor	$inout4,$inout4
836	movups	$inout5,0x50($out)
837	 pxor	$inout5,$inout5
838	movups	$inout6,0x60($out)
839	 pxor	$inout6,$inout6
840	 pxor	$inout7,$inout7
841	jmp	.Lecb_ret
842.align	16
843.Lecb_dec_one:
844___
# Single remaining block: inline one-block decrypt on $inout0.
845	&aesni_generate1("dec",$key,$rounds);
# Remaining decrypt tail cases (1 to 6 blocks); each store is followed by
# a pxor that zeroes the register just written.  The six-block case falls
# through into .Lecb_ret, the common exit, which zeroes both round-key
# registers.
846$code.=<<___;
847	movups	$inout0,($out)		# store one output block
848	 pxor	$inout0,$inout0		# clear register bank
849	jmp	.Lecb_ret
850.align	16
851.Lecb_dec_two:
852	call	_aesni_decrypt2
853	movups	$inout0,($out)		# store 2 output blocks
854	 pxor	$inout0,$inout0		# clear register bank
855	movups	$inout1,0x10($out)
856	 pxor	$inout1,$inout1
857	jmp	.Lecb_ret
858.align	16
859.Lecb_dec_three:
860	call	_aesni_decrypt3
861	movups	$inout0,($out)		# store 3 output blocks
862	 pxor	$inout0,$inout0		# clear register bank
863	movups	$inout1,0x10($out)
864	 pxor	$inout1,$inout1
865	movups	$inout2,0x20($out)
866	 pxor	$inout2,$inout2
867	jmp	.Lecb_ret
868.align	16
869.Lecb_dec_four:
870	call	_aesni_decrypt4
871	movups	$inout0,($out)		# store 4 output blocks
872	 pxor	$inout0,$inout0		# clear register bank
873	movups	$inout1,0x10($out)
874	 pxor	$inout1,$inout1
875	movups	$inout2,0x20($out)
876	 pxor	$inout2,$inout2
877	movups	$inout3,0x30($out)
878	 pxor	$inout3,$inout3
879	jmp	.Lecb_ret
880.align	16
881.Lecb_dec_five:
882	xorps	$inout5,$inout5
883	call	_aesni_decrypt6
884	movups	$inout0,($out)		# store 5 output blocks
885	 pxor	$inout0,$inout0		# clear register bank
886	movups	$inout1,0x10($out)
887	 pxor	$inout1,$inout1
888	movups	$inout2,0x20($out)
889	 pxor	$inout2,$inout2
890	movups	$inout3,0x30($out)
891	 pxor	$inout3,$inout3
892	movups	$inout4,0x40($out)
893	 pxor	$inout4,$inout4
894	 pxor	$inout5,$inout5
895	jmp	.Lecb_ret
896.align	16
897.Lecb_dec_six:
898	call	_aesni_decrypt6
899	movups	$inout0,($out)		# store 6 output blocks
900	 pxor	$inout0,$inout0		# clear register bank
901	movups	$inout1,0x10($out)
902	 pxor	$inout1,$inout1
903	movups	$inout2,0x20($out)
904	 pxor	$inout2,$inout2
905	movups	$inout3,0x30($out)
906	 pxor	$inout3,$inout3
907	movups	$inout4,0x40($out)
908	 pxor	$inout4,$inout4
909	movups	$inout5,0x50($out)
910	 pxor	$inout5,$inout5
911
912.Lecb_ret:
913	xorps	$rndkey0,$rndkey0	# %xmm0
914	pxor	$rndkey1,$rndkey1
915___
# Win64 only: restore %xmm6-%xmm9 from the stack frame and overwrite each
# save slot with %xmm0, which the code at .Lecb_ret has just zeroed, before
# releasing the frame.
916$code.=<<___ if ($win64);
917	movaps	(%rsp),%xmm6
918	movaps	%xmm0,(%rsp)		# clear stack
919	movaps	0x10(%rsp),%xmm7
920	movaps	%xmm0,0x10(%rsp)
921	movaps	0x20(%rsp),%xmm8
922	movaps	%xmm0,0x20(%rsp)
923	movaps	0x30(%rsp),%xmm9
924	movaps	%xmm0,0x30(%rsp)
925	lea	0x58(%rsp),%rsp
926.Lecb_enc_ret:
927___
# Return and close the symbol's size record; emitted for every flavour.
928$code.=<<___;
929	ret
930.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
931___
932
933{
934######################################################################
935# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
936#                         size_t blocks, const AES_KEY *key,
937#                         const char *ivec,char *cmac);
938#
939# Handles only complete blocks, operates on 64-bit counter and
940# does not update *ivec! Nor does it finalize CMAC value
941# (see engine/eng_aesni.c for details)
942#
# Generator layout: the bare block below emits both routines and scopes
# the lexicals they share ($cmac, $increment, $iv, $bswap_mask).
#
# Data flow, as emitted:
# - $iv holds *ivec after a pshufb with .Lbswap_mask (presumably a byte
#   reversal, going by the label name) and is advanced once per block
#   with paddq $increment (.Lincrement64); a re-shuffled copy is placed
#   in $inout0 as the next counter block.
# - $inout1 holds the running MAC read from *cmac; it is written back to
#   *cmac on exit.
# - encrypt: per block, the MAC absorbs the input block and is encrypted
#   in lock-step with the counter block; output = input ^ E(counter).
# - decrypt: the first counter block is encrypted up front; per block,
#   output = input ^ E(counter) is stored, then (unless it was the last
#   block) the MAC absorbs that output and is encrypted alongside the
#   next counter block. The final MAC update is emitted after
#   .Lccm64_dec_break.
# - Before returning, $rndkey0/1, $inout0/1, $in0 and $iv are zeroed; on
#   Win64 the %xmm6-%xmm9 spill slots are overwritten as well.
#
# NOTE(review): neither routine checks for blocks==0 before processing
# the first block; callers are presumably required to pass blocks>=1 --
# confirm against the callers.
943{
944my $cmac="%r9";	# 6th argument
945
946my $increment="%xmm9";
947my $iv="%xmm6";
948my $bswap_mask="%xmm7";
949
950$code.=<<___;
951.globl	aesni_ccm64_encrypt_blocks
952.type	aesni_ccm64_encrypt_blocks,\@function,6
953.align	16
954aesni_ccm64_encrypt_blocks:
955___
# Win64 only: spill %xmm6-%xmm9 before they are used; the matching
# epilogue further down reloads them.
956$code.=<<___ if ($win64);
957	lea	-0x58(%rsp),%rsp
958	movaps	%xmm6,(%rsp)		# $iv
959	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
960	movaps	%xmm8,0x20(%rsp)	# $in0
961	movaps	%xmm9,0x30(%rsp)	# $increment
962.Lccm64_enc_body:
963___
# Encrypt body: set-up, the per-block outer loop with its inner two-lane
# round loop, then the register wipe and MAC write-back.
964$code.=<<___;
965	mov	240($key),$rounds		# key->rounds
966	movdqu	($ivp),$iv
967	movdqa	.Lincrement64(%rip),$increment
968	movdqa	.Lbswap_mask(%rip),$bswap_mask
969
970	shl	\$4,$rounds
971	mov	\$16,$rnds_
972	lea	0($key),$key_
973	movdqu	($cmac),$inout1
974	movdqa	$iv,$inout0
975	lea	32($key,$rounds),$key		# end of key schedule
976	pshufb	$bswap_mask,$iv
977	sub	%rax,%r10			# twisted $rounds
978	jmp	.Lccm64_enc_outer
979.align	16
980.Lccm64_enc_outer:
981	$movkey	($key_),$rndkey0
982	mov	%r10,%rax
983	movups	($inp),$in0			# load inp
984
985	xorps	$rndkey0,$inout0		# counter
986	$movkey	16($key_),$rndkey1
987	xorps	$in0,$rndkey0
988	xorps	$rndkey0,$inout1		# cmac^=inp
989	$movkey	32($key_),$rndkey0
990
991.Lccm64_enc2_loop:
992	aesenc	$rndkey1,$inout0
993	aesenc	$rndkey1,$inout1
994	$movkey	($key,%rax),$rndkey1
995	add	\$32,%rax
996	aesenc	$rndkey0,$inout0
997	aesenc	$rndkey0,$inout1
998	$movkey	-16($key,%rax),$rndkey0
999	jnz	.Lccm64_enc2_loop
1000	aesenc	$rndkey1,$inout0
1001	aesenc	$rndkey1,$inout1
1002	paddq	$increment,$iv
1003	dec	$len				# $len-- ($len is in blocks)
1004	aesenclast	$rndkey0,$inout0
1005	aesenclast	$rndkey0,$inout1
1006
1007	lea	16($inp),$inp
1008	xorps	$inout0,$in0			# inp ^= E(iv)
1009	movdqa	$iv,$inout0
1010	movups	$in0,($out)			# save output
1011	pshufb	$bswap_mask,$inout0
1012	lea	16($out),$out			# $out+=16
1013	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1014
1015	 pxor	$rndkey0,$rndkey0		# clear register bank
1016	 pxor	$rndkey1,$rndkey1
1017	 pxor	$inout0,$inout0
1018	movups	$inout1,($cmac)			# store resulting mac
1019	 pxor	$inout1,$inout1
1020	 pxor	$in0,$in0
1021	 pxor	$iv,$iv
1022___
# Win64 only: reload %xmm6-%xmm9 and overwrite each spill slot with
# %xmm0, which the register wipe above has just cleared (assuming
# $rndkey0 is %xmm0 -- its definition is outside this block).
1023$code.=<<___ if ($win64);
1024	movaps	(%rsp),%xmm6
1025	movaps	%xmm0,(%rsp)			# clear stack
1026	movaps	0x10(%rsp),%xmm7
1027	movaps	%xmm0,0x10(%rsp)
1028	movaps	0x20(%rsp),%xmm8
1029	movaps	%xmm0,0x20(%rsp)
1030	movaps	0x30(%rsp),%xmm9
1031	movaps	%xmm0,0x30(%rsp)
1032	lea	0x58(%rsp),%rsp
1033.Lccm64_enc_ret:
1034___
1035$code.=<<___;
1036	ret
1037.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1038___
1039######################################################################
1040$code.=<<___;
1041.globl	aesni_ccm64_decrypt_blocks
1042.type	aesni_ccm64_decrypt_blocks,\@function,6
1043.align	16
1044aesni_ccm64_decrypt_blocks:
1045___
# Win64 only: same %xmm6-%xmm9 spill as in the encrypt routine.
1046$code.=<<___ if ($win64);
1047	lea	-0x58(%rsp),%rsp
1048	movaps	%xmm6,(%rsp)		# $iv
1049	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1050	movaps	%xmm8,0x20(%rsp)	# $in0
1051	movaps	%xmm9,0x30(%rsp)	# $increment
1052.Lccm64_dec_body:
1053___
1054$code.=<<___;
1055	mov	240($key),$rounds		# key->rounds
1056	movups	($ivp),$iv
1057	movdqu	($cmac),$inout1
1058	movdqa	.Lincrement64(%rip),$increment
1059	movdqa	.Lbswap_mask(%rip),$bswap_mask
1060
1061	movaps	$iv,$inout0
1062	mov	$rounds,$rnds_
1063	mov	$key,$key_
1064	pshufb	$bswap_mask,$iv
1065___
# Encrypt the first counter block on its own. aesni_generate1 is defined
# elsewhere in this file; with these arguments it presumably emits a
# single-block encryption of $inout0 -- confirm against its definition.
1066	&aesni_generate1("enc",$key,$rounds);
1067$code.=<<___;
1068	shl	\$4,$rnds_
1069	mov	\$16,$rounds
1070	movups	($inp),$in0			# load inp
1071	paddq	$increment,$iv
1072	lea	16($inp),$inp			# $inp+=16
1073	sub	%r10,%rax			# twisted $rounds
1074	lea	32($key_,$rnds_),$key		# end of key schedule
1075	mov	%rax,%r10
1076	jmp	.Lccm64_dec_outer
1077.align	16
1078.Lccm64_dec_outer:
1079	xorps	$inout0,$in0			# inp ^= E(iv)
1080	movdqa	$iv,$inout0
1081	movups	$in0,($out)			# save output
1082	lea	16($out),$out			# $out+=16
1083	pshufb	$bswap_mask,$inout0
1084
1085	sub	\$1,$len			# $len-- ($len is in blocks)
1086	jz	.Lccm64_dec_break		# if ($len==0) break
1087
1088	$movkey	($key_),$rndkey0
1089	mov	%r10,%rax
1090	$movkey	16($key_),$rndkey1
1091	xorps	$rndkey0,$in0
1092	xorps	$rndkey0,$inout0
1093	xorps	$in0,$inout1			# cmac^=out
1094	$movkey	32($key_),$rndkey0
1095	jmp	.Lccm64_dec2_loop
1096.align	16
1097.Lccm64_dec2_loop:
1098	aesenc	$rndkey1,$inout0
1099	aesenc	$rndkey1,$inout1
1100	$movkey	($key,%rax),$rndkey1
1101	add	\$32,%rax
1102	aesenc	$rndkey0,$inout0
1103	aesenc	$rndkey0,$inout1
1104	$movkey	-16($key,%rax),$rndkey0
1105	jnz	.Lccm64_dec2_loop
1106	movups	($inp),$in0			# load input
1107	paddq	$increment,$iv
1108	aesenc	$rndkey1,$inout0
1109	aesenc	$rndkey1,$inout1
1110	aesenclast	$rndkey0,$inout0
1111	aesenclast	$rndkey0,$inout1
1112	lea	16($inp),$inp			# $inp+=16
1113	jmp	.Lccm64_dec_outer
1114
1115.align	16
1116.Lccm64_dec_break:
1117	#xorps	$in0,$inout1			# cmac^=out
1118	mov	240($key_),$rounds
1119___
# Final MAC update for the last output block. The extra arguments
# ($inout1, $in0) presumably make the helper fold $in0 into $inout1 and
# encrypt $inout1 instead of $inout0, which would explain why the
# explicit xorps above is left commented out -- confirm against
# aesni_generate1.
1120	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1121$code.=<<___;
1122	 pxor	$rndkey0,$rndkey0		# clear register bank
1123	 pxor	$rndkey1,$rndkey1
1124	 pxor	$inout0,$inout0
1125	movups	$inout1,($cmac)			# store resulting mac
1126	 pxor	$inout1,$inout1
1127	 pxor	$in0,$in0
1128	 pxor	$iv,$iv
1129___
# Win64 only: reload %xmm6-%xmm9 and wipe the spill slots, as in the
# encrypt epilogue.
1130$code.=<<___ if ($win64);
1131	movaps	(%rsp),%xmm6
1132	movaps	%xmm0,(%rsp)			# clear stack
1133	movaps	0x10(%rsp),%xmm7
1134	movaps	%xmm0,0x10(%rsp)
1135	movaps	0x20(%rsp),%xmm8
1136	movaps	%xmm0,0x20(%rsp)
1137	movaps	0x30(%rsp),%xmm9
1138	movaps	%xmm0,0x30(%rsp)
1139	lea	0x58(%rsp),%rsp
1140.Lccm64_dec_ret:
1141___
1142$code.=<<___;
1143	ret
1144.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1145___
1146}
1147######################################################################
1148# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1149#                         size_t blocks, const AES_KEY *key,
1150#                         const char *ivec);
1151#
1152# Handles only complete blocks, operates on 32-bit counter and
1153# does not update *ivec! (see crypto/modes/ctr128.c for details)
1154#
1155# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1156# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1157# Keywords are full unroll and modulo-schedule counter calculations
1158# with zero-round key xor.
#
# Generator notes:
# - $in0..$in5 are %xmm10..%xmm15. $key0 and $ctr are the 32-bit ("d")
#   names of the $key_ and $ivp registers: once the IV block and its
#   last 32-bit word have been read, the $ivp register is reused to hold
#   the byte-swapped counter, and $key0 holds the last 32-bit word of
#   the round-0 key.
# - $frame_size is 0x80 bytes (eight 16-byte counter blocks, already
#   xor-ed with the round-0 key) plus 160 bytes on Win64 for the ten
#   spilled registers %xmm6-%xmm15.
# - blocks==1 is handled without a stack frame. Otherwise, inputs of 8 or
#   more blocks go through .Lctr32_loop8 (8 blocks per iteration) or
#   .Lctr32_loop6 (6 blocks per iteration, taken when OPENSSL_ia32cap_P
#   reports MOVBE without XSAVE); .Lctr32_tail handles the remaining
#   1..7 blocks (fewer than 4 via .Lctr32_loop3, exactly 4 via
#   .Lctr32_loop4, 5..7 via .Lenc_loop8_enter, which is defined outside
#   this block).
# - .Lctr32_done zeroes the xmm registers and the counter area on the
#   stack before the frame is released.
#
# NOTE(review): blocks==0 is not special-cased; it would take the bulk
# path into .Lctr32_loop3, which loads and stores the first block before
# it compares $len. Callers are presumably expected never to pass 0 --
# TODO confirm.
1159{
1160my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1161my ($key0,$ctr)=("${key_}d","${ivp}d");
1162my $frame_size = 0x80 + ($win64?160:0);
1163
1164$code.=<<___;
1165.globl	aesni_ctr32_encrypt_blocks
1166.type	aesni_ctr32_encrypt_blocks,\@function,5
1167.align	16
1168aesni_ctr32_encrypt_blocks:
1169	cmp	\$1,$len
1170	jne	.Lctr32_bulk
1171
1172	# handle single block without allocating stack frame,
1173	# useful when handling edges
1174	movups	($ivp),$inout0
1175	movups	($inp),$inout1
1176	mov	240($key),%edx			# key->rounds
1177___
# Single-block path: aesni_generate1 is defined elsewhere in this file
# and is assumed here to emit an in-place encryption of $inout0 with
# the round count in %edx -- TODO confirm. The code that follows xors
# the result with the input block held in $inout1 and stores it.
1178	&aesni_generate1("enc",$key,"%edx");
1179$code.=<<___;
1180	 pxor	$rndkey0,$rndkey0		# clear register bank
1181	 pxor	$rndkey1,$rndkey1
1182	xorps	$inout1,$inout0
1183	 pxor	$inout1,$inout1
1184	movups	$inout0,($out)
1185	 xorps	$inout0,$inout0
1186	jmp	.Lctr32_epilogue
1187
1188.align	16
1189.Lctr32_bulk:
1190	lea	(%rsp),%rax
1191	push	%rbp
1192	sub	\$$frame_size,%rsp
1193	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1194___
# Win64 only: spill %xmm6-%xmm15 just below the entry stack pointer,
# which is still held in %rax (the slot at -8(%rax) is the pushed %rbp).
1195$code.=<<___ if ($win64);
1196	movaps	%xmm6,-0xa8(%rax)		# offload everything
1197	movaps	%xmm7,-0x98(%rax)
1198	movaps	%xmm8,-0x88(%rax)
1199	movaps	%xmm9,-0x78(%rax)
1200	movaps	%xmm10,-0x68(%rax)
1201	movaps	%xmm11,-0x58(%rax)
1202	movaps	%xmm12,-0x48(%rax)
1203	movaps	%xmm13,-0x38(%rax)
1204	movaps	%xmm14,-0x28(%rax)
1205	movaps	%xmm15,-0x18(%rax)
1206.Lctr32_body:
1207___
# Bulk set-up (populate the eight counter slots, pick a loop flavour),
# the complete 6x loop, and the first two rounds of the 8x loop.
1208$code.=<<___;
1209	lea	-8(%rax),%rbp
1210
1211	# 8 16-byte words on top of stack are counter values
1212	# xor-ed with zero-round key
1213
1214	movdqu	($ivp),$inout0
1215	movdqu	($key),$rndkey0
1216	mov	12($ivp),$ctr			# counter LSB
1217	pxor	$rndkey0,$inout0
1218	mov	12($key),$key0			# 0-round key LSB
1219	movdqa	$inout0,0x00(%rsp)		# populate counter block
1220	bswap	$ctr
1221	movdqa	$inout0,$inout1
1222	movdqa	$inout0,$inout2
1223	movdqa	$inout0,$inout3
1224	movdqa	$inout0,0x40(%rsp)
1225	movdqa	$inout0,0x50(%rsp)
1226	movdqa	$inout0,0x60(%rsp)
1227	mov	%rdx,%r10			# about to borrow %rdx
1228	movdqa	$inout0,0x70(%rsp)
1229
1230	lea	1($ctr),%rax
1231	 lea	2($ctr),%rdx
1232	bswap	%eax
1233	 bswap	%edx
1234	xor	$key0,%eax
1235	 xor	$key0,%edx
1236	pinsrd	\$3,%eax,$inout1
1237	lea	3($ctr),%rax
1238	movdqa	$inout1,0x10(%rsp)
1239	 pinsrd	\$3,%edx,$inout2
1240	bswap	%eax
1241	 mov	%r10,%rdx			# restore %rdx
1242	 lea	4($ctr),%r10
1243	 movdqa	$inout2,0x20(%rsp)
1244	xor	$key0,%eax
1245	 bswap	%r10d
1246	pinsrd	\$3,%eax,$inout3
1247	 xor	$key0,%r10d
1248	movdqa	$inout3,0x30(%rsp)
1249	lea	5($ctr),%r9
1250	 mov	%r10d,0x40+12(%rsp)
1251	bswap	%r9d
1252	 lea	6($ctr),%r10
1253	mov	240($key),$rounds		# key->rounds
1254	xor	$key0,%r9d
1255	 bswap	%r10d
1256	mov	%r9d,0x50+12(%rsp)
1257	 xor	$key0,%r10d
1258	lea	7($ctr),%r9
1259	 mov	%r10d,0x60+12(%rsp)
1260	bswap	%r9d
1261	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d
1262	xor	$key0,%r9d
1263	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1264	mov	%r9d,0x70+12(%rsp)
1265
1266	$movkey	0x10($key),$rndkey1
1267
1268	movdqa	0x40(%rsp),$inout4
1269	movdqa	0x50(%rsp),$inout5
1270
1271	cmp	\$8,$len		# $len is in blocks
1272	jb	.Lctr32_tail		# short input if ($len<8)
1273
1274	sub	\$6,$len		# $len is biased by -6
1275	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1276	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1277
1278	lea	0x80($key),$key		# size optimization
1279	sub	\$2,$len		# $len is biased by -8
1280	jmp	.Lctr32_loop8
1281
1282.align	16
1283.Lctr32_6x:
1284	shl	\$4,$rounds
1285	mov	\$48,$rnds_
1286	bswap	$key0
1287	lea	32($key,$rounds),$key	# end of key schedule
1288	sub	%rax,%r10		# twisted $rounds
1289	jmp	.Lctr32_loop6
1290
1291.align	16
1292.Lctr32_loop6:
1293	 add	\$6,$ctr		# next counter value
1294	$movkey	-48($key,$rnds_),$rndkey0
1295	aesenc	$rndkey1,$inout0
1296	 mov	$ctr,%eax
1297	 xor	$key0,%eax
1298	aesenc	$rndkey1,$inout1
1299	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1300	 lea	1($ctr),%eax
1301	aesenc	$rndkey1,$inout2
1302	 xor	$key0,%eax
1303	 movbe	%eax,`0x10+12`(%rsp)
1304	aesenc	$rndkey1,$inout3
1305	 lea	2($ctr),%eax
1306	 xor	$key0,%eax
1307	aesenc	$rndkey1,$inout4
1308	 movbe	%eax,`0x20+12`(%rsp)
1309	 lea	3($ctr),%eax
1310	aesenc	$rndkey1,$inout5
1311	$movkey	-32($key,$rnds_),$rndkey1
1312	 xor	$key0,%eax
1313
1314	aesenc	$rndkey0,$inout0
1315	 movbe	%eax,`0x30+12`(%rsp)
1316	 lea	4($ctr),%eax
1317	aesenc	$rndkey0,$inout1
1318	 xor	$key0,%eax
1319	 movbe	%eax,`0x40+12`(%rsp)
1320	aesenc	$rndkey0,$inout2
1321	 lea	5($ctr),%eax
1322	 xor	$key0,%eax
1323	aesenc	$rndkey0,$inout3
1324	 movbe	%eax,`0x50+12`(%rsp)
1325	 mov	%r10,%rax		# mov	$rnds_,$rounds
1326	aesenc	$rndkey0,$inout4
1327	aesenc	$rndkey0,$inout5
1328	$movkey	-16($key,$rnds_),$rndkey0
1329
1330	call	.Lenc_loop6
1331
1332	movdqu	($inp),$inout6		# load 6 input blocks
1333	movdqu	0x10($inp),$inout7
1334	movdqu	0x20($inp),$in0
1335	movdqu	0x30($inp),$in1
1336	movdqu	0x40($inp),$in2
1337	movdqu	0x50($inp),$in3
1338	lea	0x60($inp),$inp		# $inp+=6*16
1339	$movkey	-64($key,$rnds_),$rndkey1
1340	pxor	$inout0,$inout6		# inp^=E(ctr)
1341	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1342	pxor	$inout1,$inout7
1343	movaps	0x10(%rsp),$inout1
1344	pxor	$inout2,$in0
1345	movaps	0x20(%rsp),$inout2
1346	pxor	$inout3,$in1
1347	movaps	0x30(%rsp),$inout3
1348	pxor	$inout4,$in2
1349	movaps	0x40(%rsp),$inout4
1350	pxor	$inout5,$in3
1351	movaps	0x50(%rsp),$inout5
1352	movdqu	$inout6,($out)		# store 6 output blocks
1353	movdqu	$inout7,0x10($out)
1354	movdqu	$in0,0x20($out)
1355	movdqu	$in1,0x30($out)
1356	movdqu	$in2,0x40($out)
1357	movdqu	$in3,0x50($out)
1358	lea	0x60($out),$out		# $out+=6*16
1359
1360	sub	\$6,$len
1361	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1362
1363	add	\$6,$len		# restore real remaining $len
1364	jz	.Lctr32_done		# done if ($len==0)
1365
1366	lea	-48($rnds_),$rounds
1367	lea	-80($key,$rnds_),$key	# restore $key
1368	neg	$rounds
1369	shr	\$4,$rounds		# restore $rounds
1370	jmp	.Lctr32_tail
1371
1372.align	32
1373.Lctr32_loop8:
1374	 add		\$8,$ctr		# next counter value
1375	movdqa		0x60(%rsp),$inout6
1376	aesenc		$rndkey1,$inout0
1377	 mov		$ctr,%r9d
1378	movdqa		0x70(%rsp),$inout7
1379	aesenc		$rndkey1,$inout1
1380	 bswap		%r9d
1381	$movkey		0x20-0x80($key),$rndkey0
1382	aesenc		$rndkey1,$inout2
1383	 xor		$key0,%r9d
1384	 nop
1385	aesenc		$rndkey1,$inout3
1386	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1387	 lea		1($ctr),%r9
1388	aesenc		$rndkey1,$inout4
1389	aesenc		$rndkey1,$inout5
1390	aesenc		$rndkey1,$inout6
1391	aesenc		$rndkey1,$inout7
1392	$movkey		0x30-0x80($key),$rndkey1
1393___
# Unrolled middle of the 8x loop, generated for $i = 2..7. Each pass
# emits one AES round over all eight blocks using $rndkeyx ($rndkey1 for
# odd $i, $rndkey0 for even $i), interleaved with finishing counter slot
# $i-1 (bswap, xor with $key0, store to the stack), starting counter
# $i (lea) and reloading $rndkeyx from offset 0x20+0x10*$i-0x80 of the
# biased $key.
1394for($i=2;$i<8;$i++) {
1395my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1396$code.=<<___;
1397	 bswap		%r9d
1398	aesenc		$rndkeyx,$inout0
1399	aesenc		$rndkeyx,$inout1
1400	 xor		$key0,%r9d
1401	 .byte		0x66,0x90
1402	aesenc		$rndkeyx,$inout2
1403	aesenc		$rndkeyx,$inout3
1404	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1405	 lea		$i($ctr),%r9
1406	aesenc		$rndkeyx,$inout4
1407	aesenc		$rndkeyx,$inout5
1408	aesenc		$rndkeyx,$inout6
1409	aesenc		$rndkeyx,$inout7
1410	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1411___
1412}
# Rest of the 8x loop. The flags set by "cmp \$11,$rounds" are consumed
# by the jb/je that follow: jb skips both extra round pairs, je skips
# only the second one. After .Lctr32_enc_done come the tail paths and
# the start of the register wipe at .Lctr32_done.
1413$code.=<<___;
1414	 bswap		%r9d
1415	aesenc		$rndkey0,$inout0
1416	aesenc		$rndkey0,$inout1
1417	aesenc		$rndkey0,$inout2
1418	 xor		$key0,%r9d
1419	 movdqu		0x00($inp),$in0		# start loading input
1420	aesenc		$rndkey0,$inout3
1421	 mov		%r9d,0x70+12(%rsp)
1422	 cmp		\$11,$rounds
1423	aesenc		$rndkey0,$inout4
1424	aesenc		$rndkey0,$inout5
1425	aesenc		$rndkey0,$inout6
1426	aesenc		$rndkey0,$inout7
1427	$movkey		0xa0-0x80($key),$rndkey0
1428
1429	jb		.Lctr32_enc_done
1430
1431	aesenc		$rndkey1,$inout0
1432	aesenc		$rndkey1,$inout1
1433	aesenc		$rndkey1,$inout2
1434	aesenc		$rndkey1,$inout3
1435	aesenc		$rndkey1,$inout4
1436	aesenc		$rndkey1,$inout5
1437	aesenc		$rndkey1,$inout6
1438	aesenc		$rndkey1,$inout7
1439	$movkey		0xb0-0x80($key),$rndkey1
1440
1441	aesenc		$rndkey0,$inout0
1442	aesenc		$rndkey0,$inout1
1443	aesenc		$rndkey0,$inout2
1444	aesenc		$rndkey0,$inout3
1445	aesenc		$rndkey0,$inout4
1446	aesenc		$rndkey0,$inout5
1447	aesenc		$rndkey0,$inout6
1448	aesenc		$rndkey0,$inout7
1449	$movkey		0xc0-0x80($key),$rndkey0
1450	je		.Lctr32_enc_done
1451
1452	aesenc		$rndkey1,$inout0
1453	aesenc		$rndkey1,$inout1
1454	aesenc		$rndkey1,$inout2
1455	aesenc		$rndkey1,$inout3
1456	aesenc		$rndkey1,$inout4
1457	aesenc		$rndkey1,$inout5
1458	aesenc		$rndkey1,$inout6
1459	aesenc		$rndkey1,$inout7
1460	$movkey		0xd0-0x80($key),$rndkey1
1461
1462	aesenc		$rndkey0,$inout0
1463	aesenc		$rndkey0,$inout1
1464	aesenc		$rndkey0,$inout2
1465	aesenc		$rndkey0,$inout3
1466	aesenc		$rndkey0,$inout4
1467	aesenc		$rndkey0,$inout5
1468	aesenc		$rndkey0,$inout6
1469	aesenc		$rndkey0,$inout7
1470	$movkey		0xe0-0x80($key),$rndkey0
1471	jmp		.Lctr32_enc_done
1472
1473.align	16
1474.Lctr32_enc_done:
1475	movdqu		0x10($inp),$in1
1476	pxor		$rndkey0,$in0		# input^=round[last]
1477	movdqu		0x20($inp),$in2
1478	pxor		$rndkey0,$in1
1479	movdqu		0x30($inp),$in3
1480	pxor		$rndkey0,$in2
1481	movdqu		0x40($inp),$in4
1482	pxor		$rndkey0,$in3
1483	movdqu		0x50($inp),$in5
1484	pxor		$rndkey0,$in4
1485	pxor		$rndkey0,$in5
1486	aesenc		$rndkey1,$inout0
1487	aesenc		$rndkey1,$inout1
1488	aesenc		$rndkey1,$inout2
1489	aesenc		$rndkey1,$inout3
1490	aesenc		$rndkey1,$inout4
1491	aesenc		$rndkey1,$inout5
1492	aesenc		$rndkey1,$inout6
1493	aesenc		$rndkey1,$inout7
1494	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1495	lea		0x80($inp),$inp		# $inp+=8*16
1496
1497	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1498	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1499	movdqu		0x70-0x80($inp),$in0
1500	aesenclast	$in1,$inout1
1501	pxor		$rndkey0,$in0
1502	movdqa		0x00(%rsp),$in1		# load next counter block
1503	aesenclast	$in2,$inout2
1504	aesenclast	$in3,$inout3
1505	movdqa		0x10(%rsp),$in2
1506	movdqa		0x20(%rsp),$in3
1507	aesenclast	$in4,$inout4
1508	aesenclast	$in5,$inout5
1509	movdqa		0x30(%rsp),$in4
1510	movdqa		0x40(%rsp),$in5
1511	aesenclast	$rndkey1,$inout6
1512	movdqa		0x50(%rsp),$rndkey0
1513	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1514	aesenclast	$in0,$inout7
1515
1516	movups		$inout0,($out)		# store 8 output blocks
1517	movdqa		$in1,$inout0
1518	movups		$inout1,0x10($out)
1519	movdqa		$in2,$inout1
1520	movups		$inout2,0x20($out)
1521	movdqa		$in3,$inout2
1522	movups		$inout3,0x30($out)
1523	movdqa		$in4,$inout3
1524	movups		$inout4,0x40($out)
1525	movdqa		$in5,$inout4
1526	movups		$inout5,0x50($out)
1527	movdqa		$rndkey0,$inout5
1528	movups		$inout6,0x60($out)
1529	movups		$inout7,0x70($out)
1530	lea		0x80($out),$out		# $out+=8*16
1531
1532	sub	\$8,$len
1533	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1534
1535	add	\$8,$len			# restore real remainig $len
1536	jz	.Lctr32_done			# done if ($len==0)
1537	lea	-0x80($key),$key
1538
1539.Lctr32_tail:
1540	# note that at this point $inout0..5 are populated with
1541	# counter values xor-ed with 0-round key
1542	lea	16($key),$key
1543	cmp	\$4,$len
1544	jb	.Lctr32_loop3
1545	je	.Lctr32_loop4
1546
1547	# if ($len>4) compute 7 E(counter)
1548	shl		\$4,$rounds
1549	movdqa		0x60(%rsp),$inout6
1550	pxor		$inout7,$inout7
1551
1552	$movkey		16($key),$rndkey0
1553	aesenc		$rndkey1,$inout0
1554	aesenc		$rndkey1,$inout1
1555	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1556	neg		%rax
1557	aesenc		$rndkey1,$inout2
1558	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1559	 movups		($inp),$in0
1560	aesenc		$rndkey1,$inout3
1561	aesenc		$rndkey1,$inout4
1562	 movups		0x10($inp),$in1		# pre-load input
1563	 movups		0x20($inp),$in2
1564	aesenc		$rndkey1,$inout5
1565	aesenc		$rndkey1,$inout6
1566
1567	call            .Lenc_loop8_enter
1568
1569	movdqu	0x30($inp),$in3
1570	pxor	$in0,$inout0
1571	movdqu	0x40($inp),$in0
1572	pxor	$in1,$inout1
1573	movdqu	$inout0,($out)			# store output
1574	pxor	$in2,$inout2
1575	movdqu	$inout1,0x10($out)
1576	pxor	$in3,$inout3
1577	movdqu	$inout2,0x20($out)
1578	pxor	$in0,$inout4
1579	movdqu	$inout3,0x30($out)
1580	movdqu	$inout4,0x40($out)
1581	cmp	\$6,$len
1582	jb	.Lctr32_done			# $len was 5, stop store
1583
1584	movups	0x50($inp),$in1
1585	xorps	$in1,$inout5
1586	movups	$inout5,0x50($out)
1587	je	.Lctr32_done			# $len was 6, stop store
1588
1589	movups	0x60($inp),$in2
1590	xorps	$in2,$inout6
1591	movups	$inout6,0x60($out)
1592	jmp	.Lctr32_done			# $len was 7, stop store
1593
1594.align	32
1595.Lctr32_loop4:
1596	aesenc		$rndkey1,$inout0
1597	lea		16($key),$key
1598	dec		$rounds
1599	aesenc		$rndkey1,$inout1
1600	aesenc		$rndkey1,$inout2
1601	aesenc		$rndkey1,$inout3
1602	$movkey		($key),$rndkey1
1603	jnz		.Lctr32_loop4
1604	aesenclast	$rndkey1,$inout0
1605	aesenclast	$rndkey1,$inout1
1606	 movups		($inp),$in0		# load input
1607	 movups		0x10($inp),$in1
1608	aesenclast	$rndkey1,$inout2
1609	aesenclast	$rndkey1,$inout3
1610	 movups		0x20($inp),$in2
1611	 movups		0x30($inp),$in3
1612
1613	xorps	$in0,$inout0
1614	movups	$inout0,($out)			# store output
1615	xorps	$in1,$inout1
1616	movups	$inout1,0x10($out)
1617	pxor	$in2,$inout2
1618	movdqu	$inout2,0x20($out)
1619	pxor	$in3,$inout3
1620	movdqu	$inout3,0x30($out)
1621	jmp	.Lctr32_done			# $len was 4, stop store
1622
1623.align	32
1624.Lctr32_loop3:
1625	aesenc		$rndkey1,$inout0
1626	lea		16($key),$key
1627	dec		$rounds
1628	aesenc		$rndkey1,$inout1
1629	aesenc		$rndkey1,$inout2
1630	$movkey		($key),$rndkey1
1631	jnz		.Lctr32_loop3
1632	aesenclast	$rndkey1,$inout0
1633	aesenclast	$rndkey1,$inout1
1634	aesenclast	$rndkey1,$inout2
1635
1636	movups	($inp),$in0			# load input
1637	xorps	$in0,$inout0
1638	movups	$inout0,($out)			# store output
1639	cmp	\$2,$len
1640	jb	.Lctr32_done			# $len was 1, stop store
1641
1642	movups	0x10($inp),$in1
1643	xorps	$in1,$inout1
1644	movups	$inout1,0x10($out)
1645	je	.Lctr32_done			# $len was 2, stop store
1646
1647	movups	0x20($inp),$in2
1648	xorps	$in2,$inout2
1649	movups	$inout2,0x20($out)		# $len was 3, stop store
1650
1651.Lctr32_done:
1652	xorps	%xmm0,%xmm0			# clear regiser bank
1653	xor	$key0,$key0
1654	pxor	%xmm1,%xmm1
1655	pxor	%xmm2,%xmm2
1656	pxor	%xmm3,%xmm3
1657	pxor	%xmm4,%xmm4
1658	pxor	%xmm5,%xmm5
1659___
# Non-Win64: %xmm6-%xmm15 are simply zeroed, interleaved with wiping the
# eight counter slots on the stack.
1660$code.=<<___ if (!$win64);
1661	pxor	%xmm6,%xmm6
1662	pxor	%xmm7,%xmm7
1663	movaps	%xmm0,0x00(%rsp)		# clear stack
1664	pxor	%xmm8,%xmm8
1665	movaps	%xmm0,0x10(%rsp)
1666	pxor	%xmm9,%xmm9
1667	movaps	%xmm0,0x20(%rsp)
1668	pxor	%xmm10,%xmm10
1669	movaps	%xmm0,0x30(%rsp)
1670	pxor	%xmm11,%xmm11
1671	movaps	%xmm0,0x40(%rsp)
1672	pxor	%xmm12,%xmm12
1673	movaps	%xmm0,0x50(%rsp)
1674	pxor	%xmm13,%xmm13
1675	movaps	%xmm0,0x60(%rsp)
1676	pxor	%xmm14,%xmm14
1677	movaps	%xmm0,0x70(%rsp)
1678	pxor	%xmm15,%xmm15
1679___
# Win64: reload %xmm6-%xmm15 from the spill area (now addressed through
# %rbp, which is the entry %rsp minus 8), overwrite each spill slot with
# the zeroed %xmm0, then wipe the eight counter slots.
1680$code.=<<___ if ($win64);
1681	movaps	-0xa0(%rbp),%xmm6
1682	movaps	%xmm0,-0xa0(%rbp)		# clear stack
1683	movaps	-0x90(%rbp),%xmm7
1684	movaps	%xmm0,-0x90(%rbp)
1685	movaps	-0x80(%rbp),%xmm8
1686	movaps	%xmm0,-0x80(%rbp)
1687	movaps	-0x70(%rbp),%xmm9
1688	movaps	%xmm0,-0x70(%rbp)
1689	movaps	-0x60(%rbp),%xmm10
1690	movaps	%xmm0,-0x60(%rbp)
1691	movaps	-0x50(%rbp),%xmm11
1692	movaps	%xmm0,-0x50(%rbp)
1693	movaps	-0x40(%rbp),%xmm12
1694	movaps	%xmm0,-0x40(%rbp)
1695	movaps	-0x30(%rbp),%xmm13
1696	movaps	%xmm0,-0x30(%rbp)
1697	movaps	-0x20(%rbp),%xmm14
1698	movaps	%xmm0,-0x20(%rbp)
1699	movaps	-0x10(%rbp),%xmm15
1700	movaps	%xmm0,-0x10(%rbp)
1701	movaps	%xmm0,0x00(%rsp)
1702	movaps	%xmm0,0x10(%rsp)
1703	movaps	%xmm0,0x20(%rsp)
1704	movaps	%xmm0,0x30(%rsp)
1705	movaps	%xmm0,0x40(%rsp)
1706	movaps	%xmm0,0x50(%rsp)
1707	movaps	%xmm0,0x60(%rsp)
1708	movaps	%xmm0,0x70(%rsp)
1709___
# Release the frame: %rbp points at the slot holding the pushed %rbp, so
# restoring %rsp from it and popping undoes the prologue. The
# single-block path joins at .Lctr32_epilogue, having built no frame.
1710$code.=<<___;
1711	lea	(%rbp),%rsp
1712	pop	%rbp
1713.Lctr32_epilogue:
1714	ret
1715.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1716___
1717}
1718
1719######################################################################
1720# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1721#	const AES_KEY *key1, const AES_KEY *key2,
1722#	const unsigned char iv[16]);
1723#
1724{
1725my @tweak=map("%xmm$_",(10..15));
1726my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1727my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1728my $frame_size = 0x70 + ($win64?160:0);
1729
1730$code.=<<___;
1731.globl	aesni_xts_encrypt
1732.type	aesni_xts_encrypt,\@function,6
1733.align	16
1734aesni_xts_encrypt:
1735	lea	(%rsp),%rax
1736	push	%rbp
1737	sub	\$$frame_size,%rsp
1738	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1739___
1740$code.=<<___ if ($win64);
1741	movaps	%xmm6,-0xa8(%rax)		# offload everything
1742	movaps	%xmm7,-0x98(%rax)
1743	movaps	%xmm8,-0x88(%rax)
1744	movaps	%xmm9,-0x78(%rax)
1745	movaps	%xmm10,-0x68(%rax)
1746	movaps	%xmm11,-0x58(%rax)
1747	movaps	%xmm12,-0x48(%rax)
1748	movaps	%xmm13,-0x38(%rax)
1749	movaps	%xmm14,-0x28(%rax)
1750	movaps	%xmm15,-0x18(%rax)
1751.Lxts_enc_body:
1752___
1753$code.=<<___;
1754	lea	-8(%rax),%rbp
1755	movups	($ivp),$inout0			# load clear-text tweak
1756	mov	240(%r8),$rounds		# key2->rounds
1757	mov	240($key),$rnds_		# key1->rounds
1758___
1759	# generate the tweak
1760	&aesni_generate1("enc",$key2,$rounds,$inout0);
1761$code.=<<___;
1762	$movkey	($key),$rndkey0			# zero round key
1763	mov	$key,$key_			# backup $key
1764	mov	$rnds_,$rounds			# backup $rounds
1765	shl	\$4,$rnds_
1766	mov	$len,$len_			# backup $len
1767	and	\$-16,$len
1768
1769	$movkey	16($key,$rnds_),$rndkey1	# last round key
1770
1771	movdqa	.Lxts_magic(%rip),$twmask
1772	movdqa	$inout0,@tweak[5]
1773	pshufd	\$0x5f,$inout0,$twres
1774	pxor	$rndkey0,$rndkey1
1775___
1776    # alternative tweak calculation algorithm is based on suggestions
1777    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1778    # and should help in the future...
1779    for ($i=0;$i<4;$i++) {
1780    $code.=<<___;
1781	movdqa	$twres,$twtmp
1782	paddd	$twres,$twres
1783	movdqa	@tweak[5],@tweak[$i]
1784	psrad	\$31,$twtmp			# broadcast upper bits
1785	paddq	@tweak[5],@tweak[5]
1786	pand	$twmask,$twtmp
1787	pxor	$rndkey0,@tweak[$i]
1788	pxor	$twtmp,@tweak[5]
1789___
1790    }
1791$code.=<<___;
1792	movdqa	@tweak[5],@tweak[4]
1793	psrad	\$31,$twres
1794	paddq	@tweak[5],@tweak[5]
1795	pand	$twmask,$twres
1796	pxor	$rndkey0,@tweak[4]
1797	pxor	$twres,@tweak[5]
1798	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1799
1800	sub	\$16*6,$len
1801	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1802
1803	mov	\$16+96,$rounds
1804	lea	32($key_,$rnds_),$key		# end of key schedule
1805	sub	%r10,%rax			# twisted $rounds
1806	$movkey	16($key_),$rndkey1
1807	mov	%rax,%r10			# backup twisted $rounds
1808	lea	.Lxts_magic(%rip),%r8
1809	jmp	.Lxts_enc_grandloop
1810
1811.align	32
1812.Lxts_enc_grandloop:
1813	movdqu	`16*0`($inp),$inout0		# load input
1814	movdqa	$rndkey0,$twmask
1815	movdqu	`16*1`($inp),$inout1
1816	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1817	movdqu	`16*2`($inp),$inout2
1818	pxor	@tweak[1],$inout1
1819	 aesenc		$rndkey1,$inout0
1820	movdqu	`16*3`($inp),$inout3
1821	pxor	@tweak[2],$inout2
1822	 aesenc		$rndkey1,$inout1
1823	movdqu	`16*4`($inp),$inout4
1824	pxor	@tweak[3],$inout3
1825	 aesenc		$rndkey1,$inout2
1826	movdqu	`16*5`($inp),$inout5
1827	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1828	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1829	pxor	@tweak[4],$inout4
1830	 aesenc		$rndkey1,$inout3
1831	$movkey	32($key_),$rndkey0
1832	lea	`16*6`($inp),$inp
1833	pxor	$twmask,$inout5
1834
1835	 pxor	$twres,@tweak[0]		# calclulate tweaks^round[last]
1836	aesenc		$rndkey1,$inout4
1837	 pxor	$twres,@tweak[1]
1838	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1839	aesenc		$rndkey1,$inout5
1840	$movkey		48($key_),$rndkey1
1841	 pxor	$twres,@tweak[2]
1842
1843	aesenc		$rndkey0,$inout0
1844	 pxor	$twres,@tweak[3]
1845	 movdqa	@tweak[1],`16*1`(%rsp)
1846	aesenc		$rndkey0,$inout1
1847	 pxor	$twres,@tweak[4]
1848	 movdqa	@tweak[2],`16*2`(%rsp)
1849	aesenc		$rndkey0,$inout2
1850	aesenc		$rndkey0,$inout3
1851	 pxor	$twres,$twmask
1852	 movdqa	@tweak[4],`16*4`(%rsp)
1853	aesenc		$rndkey0,$inout4
1854	aesenc		$rndkey0,$inout5
1855	$movkey		64($key_),$rndkey0
1856	 movdqa	$twmask,`16*5`(%rsp)
1857	pshufd	\$0x5f,@tweak[5],$twres
1858	jmp	.Lxts_enc_loop6
1859.align	32
1860.Lxts_enc_loop6:
1861	aesenc		$rndkey1,$inout0
1862	aesenc		$rndkey1,$inout1
1863	aesenc		$rndkey1,$inout2
1864	aesenc		$rndkey1,$inout3
1865	aesenc		$rndkey1,$inout4
1866	aesenc		$rndkey1,$inout5
1867	$movkey		-64($key,%rax),$rndkey1
1868	add		\$32,%rax
1869
1870	aesenc		$rndkey0,$inout0
1871	aesenc		$rndkey0,$inout1
1872	aesenc		$rndkey0,$inout2
1873	aesenc		$rndkey0,$inout3
1874	aesenc		$rndkey0,$inout4
1875	aesenc		$rndkey0,$inout5
1876	$movkey		-80($key,%rax),$rndkey0
1877	jnz		.Lxts_enc_loop6
1878
1879	movdqa	(%r8),$twmask			# start calculating next tweak
1880	movdqa	$twres,$twtmp
1881	paddd	$twres,$twres
1882	 aesenc		$rndkey1,$inout0
1883	paddq	@tweak[5],@tweak[5]
1884	psrad	\$31,$twtmp
1885	 aesenc		$rndkey1,$inout1
1886	pand	$twmask,$twtmp
1887	$movkey	($key_),@tweak[0]		# load round[0]
1888	 aesenc		$rndkey1,$inout2
1889	 aesenc		$rndkey1,$inout3
1890	 aesenc		$rndkey1,$inout4
1891	pxor	$twtmp,@tweak[5]
1892	movaps	@tweak[0],@tweak[1]		# copy round[0]
1893	 aesenc		$rndkey1,$inout5
1894	 $movkey	-64($key),$rndkey1
1895
1896	movdqa	$twres,$twtmp
1897	 aesenc		$rndkey0,$inout0
1898	paddd	$twres,$twres
1899	pxor	@tweak[5],@tweak[0]
1900	 aesenc		$rndkey0,$inout1
1901	psrad	\$31,$twtmp
1902	paddq	@tweak[5],@tweak[5]
1903	 aesenc		$rndkey0,$inout2
1904	 aesenc		$rndkey0,$inout3
1905	pand	$twmask,$twtmp
1906	movaps	@tweak[1],@tweak[2]
1907	 aesenc		$rndkey0,$inout4
1908	pxor	$twtmp,@tweak[5]
1909	movdqa	$twres,$twtmp
1910	 aesenc		$rndkey0,$inout5
1911	 $movkey	-48($key),$rndkey0
1912
1913	paddd	$twres,$twres
1914	 aesenc		$rndkey1,$inout0
1915	pxor	@tweak[5],@tweak[1]
1916	psrad	\$31,$twtmp
1917	 aesenc		$rndkey1,$inout1
1918	paddq	@tweak[5],@tweak[5]
1919	pand	$twmask,$twtmp
1920	 aesenc		$rndkey1,$inout2
1921	 aesenc		$rndkey1,$inout3
1922	 movdqa	@tweak[3],`16*3`(%rsp)
1923	pxor	$twtmp,@tweak[5]
1924	 aesenc		$rndkey1,$inout4
1925	movaps	@tweak[2],@tweak[3]
1926	movdqa	$twres,$twtmp
1927	 aesenc		$rndkey1,$inout5
1928	 $movkey	-32($key),$rndkey1
1929
1930	paddd	$twres,$twres
1931	 aesenc		$rndkey0,$inout0
1932	pxor	@tweak[5],@tweak[2]
1933	psrad	\$31,$twtmp
1934	 aesenc		$rndkey0,$inout1
1935	paddq	@tweak[5],@tweak[5]
1936	pand	$twmask,$twtmp
1937	 aesenc		$rndkey0,$inout2
1938	 aesenc		$rndkey0,$inout3
1939	 aesenc		$rndkey0,$inout4
1940	pxor	$twtmp,@tweak[5]
1941	movaps	@tweak[3],@tweak[4]
1942	 aesenc		$rndkey0,$inout5
1943
1944	movdqa	$twres,$rndkey0
1945	paddd	$twres,$twres
1946	 aesenc		$rndkey1,$inout0
1947	pxor	@tweak[5],@tweak[3]
1948	psrad	\$31,$rndkey0
1949	 aesenc		$rndkey1,$inout1
1950	paddq	@tweak[5],@tweak[5]
1951	pand	$twmask,$rndkey0
1952	 aesenc		$rndkey1,$inout2
1953	 aesenc		$rndkey1,$inout3
1954	pxor	$rndkey0,@tweak[5]
1955	$movkey		($key_),$rndkey0
1956	 aesenc		$rndkey1,$inout4
1957	 aesenc		$rndkey1,$inout5
1958	$movkey		16($key_),$rndkey1
1959
1960	pxor	@tweak[5],@tweak[4]
1961	 aesenclast	`16*0`(%rsp),$inout0
1962	psrad	\$31,$twres
1963	paddq	@tweak[5],@tweak[5]
1964	 aesenclast	`16*1`(%rsp),$inout1
1965	 aesenclast	`16*2`(%rsp),$inout2
1966	pand	$twmask,$twres
1967	mov	%r10,%rax			# restore $rounds
1968	 aesenclast	`16*3`(%rsp),$inout3
1969	 aesenclast	`16*4`(%rsp),$inout4
1970	 aesenclast	`16*5`(%rsp),$inout5
1971	pxor	$twres,@tweak[5]
1972
1973	lea	`16*6`($out),$out		# $out+=6*16
1974	movups	$inout0,`-16*6`($out)		# store 6 output blocks
1975	movups	$inout1,`-16*5`($out)
1976	movups	$inout2,`-16*4`($out)
1977	movups	$inout3,`-16*3`($out)
1978	movups	$inout4,`-16*2`($out)
1979	movups	$inout5,`-16*1`($out)
1980	sub	\$16*6,$len
1981	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
1982
1983	mov	\$16+96,$rounds
1984	sub	$rnds_,$rounds
1985	mov	$key_,$key			# restore $key
1986	shr	\$4,$rounds			# restore original value
1987
1988.Lxts_enc_short:
1989	# at the point @tweak[0..5] are populated with tweak values
1990	mov	$rounds,$rnds_			# backup $rounds
1991	pxor	$rndkey0,@tweak[0]
1992	add	\$16*6,$len			# restore real remaining $len
1993	jz	.Lxts_enc_done			# done if ($len==0)
1994
1995	pxor	$rndkey0,@tweak[1]
1996	cmp	\$0x20,$len
1997	jb	.Lxts_enc_one			# $len is 1*16
1998	pxor	$rndkey0,@tweak[2]
1999	je	.Lxts_enc_two			# $len is 2*16
2000
2001	pxor	$rndkey0,@tweak[3]
2002	cmp	\$0x40,$len
2003	jb	.Lxts_enc_three			# $len is 3*16
2004	pxor	$rndkey0,@tweak[4]
2005	je	.Lxts_enc_four			# $len is 4*16
2006
2007	movdqu	($inp),$inout0			# $len is 5*16
2008	movdqu	16*1($inp),$inout1
2009	movdqu	16*2($inp),$inout2
2010	pxor	@tweak[0],$inout0
2011	movdqu	16*3($inp),$inout3
2012	pxor	@tweak[1],$inout1
2013	movdqu	16*4($inp),$inout4
2014	lea	16*5($inp),$inp			# $inp+=5*16
2015	pxor	@tweak[2],$inout2
2016	pxor	@tweak[3],$inout3
2017	pxor	@tweak[4],$inout4
2018	pxor	$inout5,$inout5
2019
2020	call	_aesni_encrypt6
2021
2022	xorps	@tweak[0],$inout0
2023	movdqa	@tweak[5],@tweak[0]
2024	xorps	@tweak[1],$inout1
2025	xorps	@tweak[2],$inout2
2026	movdqu	$inout0,($out)			# store 5 output blocks
2027	xorps	@tweak[3],$inout3
2028	movdqu	$inout1,16*1($out)
2029	xorps	@tweak[4],$inout4
2030	movdqu	$inout2,16*2($out)
2031	movdqu	$inout3,16*3($out)
2032	movdqu	$inout4,16*4($out)
2033	lea	16*5($out),$out			# $out+=5*16
2034	jmp	.Lxts_enc_done
2035
2036.align	16
2037.Lxts_enc_one:
2038	movups	($inp),$inout0
2039	lea	16*1($inp),$inp			# inp+=1*16
2040	xorps	@tweak[0],$inout0
2041___
2042	&aesni_generate1("enc",$key,$rounds);
2043$code.=<<___;
2044	xorps	@tweak[0],$inout0
2045	movdqa	@tweak[1],@tweak[0]
2046	movups	$inout0,($out)			# store one output block
2047	lea	16*1($out),$out			# $out+=1*16
2048	jmp	.Lxts_enc_done
2049
2050.align	16
2051.Lxts_enc_two:
2052	movups	($inp),$inout0
2053	movups	16($inp),$inout1
2054	lea	32($inp),$inp			# $inp+=2*16
2055	xorps	@tweak[0],$inout0
2056	xorps	@tweak[1],$inout1
2057
2058	call	_aesni_encrypt2
2059
2060	xorps	@tweak[0],$inout0
2061	movdqa	@tweak[2],@tweak[0]
2062	xorps	@tweak[1],$inout1
2063	movups	$inout0,($out)			# store 2 output blocks
2064	movups	$inout1,16*1($out)
2065	lea	16*2($out),$out			# $out+=2*16
2066	jmp	.Lxts_enc_done
2067
2068.align	16
2069.Lxts_enc_three:
2070	movups	($inp),$inout0
2071	movups	16*1($inp),$inout1
2072	movups	16*2($inp),$inout2
2073	lea	16*3($inp),$inp			# $inp+=3*16
2074	xorps	@tweak[0],$inout0
2075	xorps	@tweak[1],$inout1
2076	xorps	@tweak[2],$inout2
2077
2078	call	_aesni_encrypt3
2079
2080	xorps	@tweak[0],$inout0
2081	movdqa	@tweak[3],@tweak[0]
2082	xorps	@tweak[1],$inout1
2083	xorps	@tweak[2],$inout2
2084	movups	$inout0,($out)			# store 3 output blocks
2085	movups	$inout1,16*1($out)
2086	movups	$inout2,16*2($out)
2087	lea	16*3($out),$out			# $out+=3*16
2088	jmp	.Lxts_enc_done
2089
2090.align	16
2091.Lxts_enc_four:
2092	movups	($inp),$inout0
2093	movups	16*1($inp),$inout1
2094	movups	16*2($inp),$inout2
2095	xorps	@tweak[0],$inout0
2096	movups	16*3($inp),$inout3
2097	lea	16*4($inp),$inp			# $inp+=4*16
2098	xorps	@tweak[1],$inout1
2099	xorps	@tweak[2],$inout2
2100	xorps	@tweak[3],$inout3
2101
2102	call	_aesni_encrypt4
2103
2104	pxor	@tweak[0],$inout0
2105	movdqa	@tweak[4],@tweak[0]
2106	pxor	@tweak[1],$inout1
2107	pxor	@tweak[2],$inout2
2108	movdqu	$inout0,($out)			# store 4 output blocks
2109	pxor	@tweak[3],$inout3
2110	movdqu	$inout1,16*1($out)
2111	movdqu	$inout2,16*2($out)
2112	movdqu	$inout3,16*3($out)
2113	lea	16*4($out),$out			# $out+=4*16
2114	jmp	.Lxts_enc_done
2115
2116.align	16
2117.Lxts_enc_done:
2118	and	\$15,$len_			# see if $len%16 is 0
2119	jz	.Lxts_enc_ret
2120	mov	$len_,$len
2121
2122.Lxts_enc_steal:
2123	movzb	($inp),%eax			# borrow $rounds ...
2124	movzb	-16($out),%ecx			# ... and $key
2125	lea	1($inp),$inp
2126	mov	%al,-16($out)
2127	mov	%cl,0($out)
2128	lea	1($out),$out
2129	sub	\$1,$len
2130	jnz	.Lxts_enc_steal
2131
2132	sub	$len_,$out			# rewind $out
2133	mov	$key_,$key			# restore $key
2134	mov	$rnds_,$rounds			# restore $rounds
2135
2136	movups	-16($out),$inout0
2137	xorps	@tweak[0],$inout0
2138___
2139	&aesni_generate1("enc",$key,$rounds);
2140$code.=<<___;
2141	xorps	@tweak[0],$inout0
2142	movups	$inout0,-16($out)
2143
2144.Lxts_enc_ret:
2145	xorps	%xmm0,%xmm0			# clear register bank
2146	pxor	%xmm1,%xmm1
2147	pxor	%xmm2,%xmm2
2148	pxor	%xmm3,%xmm3
2149	pxor	%xmm4,%xmm4
2150	pxor	%xmm5,%xmm5
2151___
2152$code.=<<___ if (!$win64);
2153	pxor	%xmm6,%xmm6
2154	pxor	%xmm7,%xmm7
2155	movaps	%xmm0,0x00(%rsp)		# clear stack
2156	pxor	%xmm8,%xmm8
2157	movaps	%xmm0,0x10(%rsp)
2158	pxor	%xmm9,%xmm9
2159	movaps	%xmm0,0x20(%rsp)
2160	pxor	%xmm10,%xmm10
2161	movaps	%xmm0,0x30(%rsp)
2162	pxor	%xmm11,%xmm11
2163	movaps	%xmm0,0x40(%rsp)
2164	pxor	%xmm12,%xmm12
2165	movaps	%xmm0,0x50(%rsp)
2166	pxor	%xmm13,%xmm13
2167	movaps	%xmm0,0x60(%rsp)
2168	pxor	%xmm14,%xmm14
2169	pxor	%xmm15,%xmm15
2170___
2171$code.=<<___ if ($win64);
2172	movaps	-0xa0(%rbp),%xmm6
2173	movaps	%xmm0,-0xa0(%rbp)		# clear stack
2174	movaps	-0x90(%rbp),%xmm7
2175	movaps	%xmm0,-0x90(%rbp)
2176	movaps	-0x80(%rbp),%xmm8
2177	movaps	%xmm0,-0x80(%rbp)
2178	movaps	-0x70(%rbp),%xmm9
2179	movaps	%xmm0,-0x70(%rbp)
2180	movaps	-0x60(%rbp),%xmm10
2181	movaps	%xmm0,-0x60(%rbp)
2182	movaps	-0x50(%rbp),%xmm11
2183	movaps	%xmm0,-0x50(%rbp)
2184	movaps	-0x40(%rbp),%xmm12
2185	movaps	%xmm0,-0x40(%rbp)
2186	movaps	-0x30(%rbp),%xmm13
2187	movaps	%xmm0,-0x30(%rbp)
2188	movaps	-0x20(%rbp),%xmm14
2189	movaps	%xmm0,-0x20(%rbp)
2190	movaps	-0x10(%rbp),%xmm15
2191	movaps	%xmm0,-0x10(%rbp)
2192	movaps	%xmm0,0x00(%rsp)
2193	movaps	%xmm0,0x10(%rsp)
2194	movaps	%xmm0,0x20(%rsp)
2195	movaps	%xmm0,0x30(%rsp)
2196	movaps	%xmm0,0x40(%rsp)
2197	movaps	%xmm0,0x50(%rsp)
2198	movaps	%xmm0,0x60(%rsp)
2199___
2200$code.=<<___;
2201	lea	(%rbp),%rsp
2202	pop	%rbp
2203.Lxts_enc_epilogue:
2204	ret
2205.size	aesni_xts_encrypt,.-aesni_xts_encrypt
2206___
2207
# aesni_xts_decrypt (emitted as a 6-argument function): entry sequence.
# %rax captures the incoming %rsp, %rbp is pushed, and $frame_size bytes are
# reserved before %rsp is rounded down to a 16-byte boundary (the frame is
# later accessed with movaps/movdqa).
2208$code.=<<___;
2209.globl	aesni_xts_decrypt
2210.type	aesni_xts_decrypt,\@function,6
2211.align	16
2212aesni_xts_decrypt:
2213	lea	(%rsp),%rax
2214	push	%rbp
2215	sub	\$$frame_size,%rsp
2216	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2217___
# Win64 only: park %xmm6-%xmm15 just below the pushed %rbp (offsets are taken
# from %rax, the stack pointer on entry).  .Lxts_dec_ret reloads them from
# the same slots, addressed there as -0xa0..-0x10(%rbp).
2218$code.=<<___ if ($win64);
2219	movaps	%xmm6,-0xa8(%rax)		# offload everything
2220	movaps	%xmm7,-0x98(%rax)
2221	movaps	%xmm8,-0x88(%rax)
2222	movaps	%xmm9,-0x78(%rax)
2223	movaps	%xmm10,-0x68(%rax)
2224	movaps	%xmm11,-0x58(%rax)
2225	movaps	%xmm12,-0x48(%rax)
2226	movaps	%xmm13,-0x38(%rax)
2227	movaps	%xmm14,-0x28(%rax)
2228	movaps	%xmm15,-0x18(%rax)
2229.Lxts_dec_body:
2230___
# %rbp = entry %rsp - 8, i.e. the slot holding the pushed %rbp; the epilogue
# recovers %rsp from it with "lea (%rbp),%rsp; pop %rbp".  Then fetch the
# clear-text tweak from ($ivp) and the round counts stored at offset 240 of
# both key schedules.
2231$code.=<<___;
2232	lea	-8(%rax),%rbp
2233	movups	($ivp),$inout0			# load clear-text tweak
2234	mov	240($key2),$rounds		# key2->rounds
2235	mov	240($key),$rnds_		# key1->rounds
2236___
# The tweak is encrypted ("enc") with the $key2 schedule.
# NOTE(review): aesni_generate1 is defined outside this section; its 4th
# argument presumably names the register that is transformed in place.
2237	# generate the tweak
2238	&aesni_generate1("enc",$key2,$rounds,$inout0);
# Length and key bookkeeping for aesni_xts_decrypt.
#  - When $len is not a multiple of 16, 16 is subtracted so that one whole
#    block is held back for the ciphertext-stealing tail; $len_ keeps that
#    adjusted length (partial bytes included) and $len is then truncated to
#    whole blocks.
#  - $rndkey0 = round[0]; $rndkey1 = round[last]^round[0].  $rnds_ is scaled
#    to a byte offset (rounds*16) after the unscaled count has been copied
#    to $rounds.
#  - @tweak[5] carries the running tweak; $twres is a dword shuffle (0x5f)
#    of it whose sign bits are later broadcast to build the carry mask.
2239$code.=<<___;
2240	xor	%eax,%eax			# if ($len%16) len-=16;
2241	test	\$15,$len
2242	setnz	%al
2243	shl	\$4,%rax
2244	sub	%rax,$len
2245
2246	$movkey	($key),$rndkey0			# zero round key
2247	mov	$key,$key_			# backup $key
2248	mov	$rnds_,$rounds			# backup $rounds
2249	shl	\$4,$rnds_
2250	mov	$len,$len_			# backup $len
2251	and	\$-16,$len
2252
2253	$movkey	16($key,$rnds_),$rndkey1	# last round key
2254
2255	movdqa	.Lxts_magic(%rip),$twmask
2256	movdqa	$inout0,@tweak[5]
2257	pshufd	\$0x5f,$inout0,$twres
2258	pxor	$rndkey0,$rndkey1
2259___
# Emit the code for tweaks 0..3.  Each pass stores the running tweak,
# pre-xored with round[0], in @tweak[$i] and then advances @tweak[5]: paddq
# doubles both 64-bit halves and the $twmask-filtered sign broadcast in
# $twtmp supplies the carry/residue.
# NOTE(review): $twmask comes from .Lxts_magic, which is defined outside this
# section; presumably this is the GF(2^128) multiply-by-x of XTS -- confirm.
2260    for ($i=0;$i<4;$i++) {
2261    $code.=<<___;
2262	movdqa	$twres,$twtmp
2263	paddd	$twres,$twres
2264	movdqa	@tweak[5],@tweak[$i]
2265	psrad	\$31,$twtmp			# broadcast upper bits
2266	paddq	@tweak[5],@tweak[5]
2267	pand	$twmask,$twtmp
2268	pxor	$rndkey0,@tweak[$i]
2269	pxor	$twtmp,@tweak[5]
2270___
2271    }
# Main body of aesni_xts_decrypt, emitted as a single heredoc:
#  - completes the tweak set (@tweak[4], next running tweak in @tweak[5]) and
#    saves round[0]^round[last] at 0x60(%rsp);
#  - fewer than six whole blocks left -> .Lxts_dec_short;
#  - .Lxts_dec_grandloop handles six blocks per pass.  Inputs are xored with
#    tweak^round[0]; the tweak^round[last] values are parked at
#    0x00..0x50(%rsp) and used as the aesdeclast operands, which folds the
#    output tweak xor into the final round.  The six tweaks for the next
#    pass are computed in between the aesdec rounds (the one-space extra
#    indent appears to mark the second of the two interleaved streams).
#    %rax is the "twisted" round counter (16+96 minus the scaled round
#    count) used as an index off $key, which is set to the end of the key
#    schedule; .Lxts_dec_loop6 steps it by 32 until it reaches zero, and
#    %r10 keeps a copy for re-arming it on every pass;
#  - once the loop is left, $rounds and $key get their original values back;
#  - .Lxts_dec_short xors $rndkey0 into the tweaks it is going to use (they
#    are held pre-xored with round[0] up to this point) and dispatches on
#    the 0..5 whole blocks that remain.  The five-block case is handled
#    inline via _aesni_decrypt6; if partial-block bytes follow, it prepares
#    @tweak[0] (the current sixth tweak) and @tweak[1] (the tweak after it)
#    and jumps to .Lxts_dec_done2, otherwise it goes to .Lxts_dec_ret;
#  - .Lxts_dec_one opens the single-block case; its decryption is emitted by
#    the aesni_generate1 call that follows the heredoc.
2272$code.=<<___;
2273	movdqa	@tweak[5],@tweak[4]
2274	psrad	\$31,$twres
2275	paddq	@tweak[5],@tweak[5]
2276	pand	$twmask,$twres
2277	pxor	$rndkey0,@tweak[4]
2278	pxor	$twres,@tweak[5]
2279	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2280
2281	sub	\$16*6,$len
2282	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2283
2284	mov	\$16+96,$rounds
2285	lea	32($key_,$rnds_),$key		# end of key schedule
2286	sub	%r10,%rax			# twisted $rounds
2287	$movkey	16($key_),$rndkey1
2288	mov	%rax,%r10			# backup twisted $rounds
2289	lea	.Lxts_magic(%rip),%r8
2290	jmp	.Lxts_dec_grandloop
2291
2292.align	32
2293.Lxts_dec_grandloop:
2294	movdqu	`16*0`($inp),$inout0		# load input
2295	movdqa	$rndkey0,$twmask
2296	movdqu	`16*1`($inp),$inout1
2297	pxor	@tweak[0],$inout0		# intput^=tweak^round[0]
2298	movdqu	`16*2`($inp),$inout2
2299	pxor	@tweak[1],$inout1
2300	 aesdec		$rndkey1,$inout0
2301	movdqu	`16*3`($inp),$inout3
2302	pxor	@tweak[2],$inout2
2303	 aesdec		$rndkey1,$inout1
2304	movdqu	`16*4`($inp),$inout4
2305	pxor	@tweak[3],$inout3
2306	 aesdec		$rndkey1,$inout2
2307	movdqu	`16*5`($inp),$inout5
2308	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2309	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2310	pxor	@tweak[4],$inout4
2311	 aesdec		$rndkey1,$inout3
2312	$movkey	32($key_),$rndkey0
2313	lea	`16*6`($inp),$inp
2314	pxor	$twmask,$inout5
2315
2316	 pxor	$twres,@tweak[0]		# calclulate tweaks^round[last]
2317	aesdec		$rndkey1,$inout4
2318	 pxor	$twres,@tweak[1]
2319	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
2320	aesdec		$rndkey1,$inout5
2321	$movkey		48($key_),$rndkey1
2322	 pxor	$twres,@tweak[2]
2323
2324	aesdec		$rndkey0,$inout0
2325	 pxor	$twres,@tweak[3]
2326	 movdqa	@tweak[1],`16*1`(%rsp)
2327	aesdec		$rndkey0,$inout1
2328	 pxor	$twres,@tweak[4]
2329	 movdqa	@tweak[2],`16*2`(%rsp)
2330	aesdec		$rndkey0,$inout2
2331	aesdec		$rndkey0,$inout3
2332	 pxor	$twres,$twmask
2333	 movdqa	@tweak[4],`16*4`(%rsp)
2334	aesdec		$rndkey0,$inout4
2335	aesdec		$rndkey0,$inout5
2336	$movkey		64($key_),$rndkey0
2337	 movdqa	$twmask,`16*5`(%rsp)
2338	pshufd	\$0x5f,@tweak[5],$twres
2339	jmp	.Lxts_dec_loop6
2340.align	32
2341.Lxts_dec_loop6:
2342	aesdec		$rndkey1,$inout0
2343	aesdec		$rndkey1,$inout1
2344	aesdec		$rndkey1,$inout2
2345	aesdec		$rndkey1,$inout3
2346	aesdec		$rndkey1,$inout4
2347	aesdec		$rndkey1,$inout5
2348	$movkey		-64($key,%rax),$rndkey1
2349	add		\$32,%rax
2350
2351	aesdec		$rndkey0,$inout0
2352	aesdec		$rndkey0,$inout1
2353	aesdec		$rndkey0,$inout2
2354	aesdec		$rndkey0,$inout3
2355	aesdec		$rndkey0,$inout4
2356	aesdec		$rndkey0,$inout5
2357	$movkey		-80($key,%rax),$rndkey0
2358	jnz		.Lxts_dec_loop6
2359
2360	movdqa	(%r8),$twmask			# start calculating next tweak
2361	movdqa	$twres,$twtmp
2362	paddd	$twres,$twres
2363	 aesdec		$rndkey1,$inout0
2364	paddq	@tweak[5],@tweak[5]
2365	psrad	\$31,$twtmp
2366	 aesdec		$rndkey1,$inout1
2367	pand	$twmask,$twtmp
2368	$movkey	($key_),@tweak[0]		# load round[0]
2369	 aesdec		$rndkey1,$inout2
2370	 aesdec		$rndkey1,$inout3
2371	 aesdec		$rndkey1,$inout4
2372	pxor	$twtmp,@tweak[5]
2373	movaps	@tweak[0],@tweak[1]		# copy round[0]
2374	 aesdec		$rndkey1,$inout5
2375	 $movkey	-64($key),$rndkey1
2376
2377	movdqa	$twres,$twtmp
2378	 aesdec		$rndkey0,$inout0
2379	paddd	$twres,$twres
2380	pxor	@tweak[5],@tweak[0]
2381	 aesdec		$rndkey0,$inout1
2382	psrad	\$31,$twtmp
2383	paddq	@tweak[5],@tweak[5]
2384	 aesdec		$rndkey0,$inout2
2385	 aesdec		$rndkey0,$inout3
2386	pand	$twmask,$twtmp
2387	movaps	@tweak[1],@tweak[2]
2388	 aesdec		$rndkey0,$inout4
2389	pxor	$twtmp,@tweak[5]
2390	movdqa	$twres,$twtmp
2391	 aesdec		$rndkey0,$inout5
2392	 $movkey	-48($key),$rndkey0
2393
2394	paddd	$twres,$twres
2395	 aesdec		$rndkey1,$inout0
2396	pxor	@tweak[5],@tweak[1]
2397	psrad	\$31,$twtmp
2398	 aesdec		$rndkey1,$inout1
2399	paddq	@tweak[5],@tweak[5]
2400	pand	$twmask,$twtmp
2401	 aesdec		$rndkey1,$inout2
2402	 aesdec		$rndkey1,$inout3
2403	 movdqa	@tweak[3],`16*3`(%rsp)
2404	pxor	$twtmp,@tweak[5]
2405	 aesdec		$rndkey1,$inout4
2406	movaps	@tweak[2],@tweak[3]
2407	movdqa	$twres,$twtmp
2408	 aesdec		$rndkey1,$inout5
2409	 $movkey	-32($key),$rndkey1
2410
2411	paddd	$twres,$twres
2412	 aesdec		$rndkey0,$inout0
2413	pxor	@tweak[5],@tweak[2]
2414	psrad	\$31,$twtmp
2415	 aesdec		$rndkey0,$inout1
2416	paddq	@tweak[5],@tweak[5]
2417	pand	$twmask,$twtmp
2418	 aesdec		$rndkey0,$inout2
2419	 aesdec		$rndkey0,$inout3
2420	 aesdec		$rndkey0,$inout4
2421	pxor	$twtmp,@tweak[5]
2422	movaps	@tweak[3],@tweak[4]
2423	 aesdec		$rndkey0,$inout5
2424
2425	movdqa	$twres,$rndkey0
2426	paddd	$twres,$twres
2427	 aesdec		$rndkey1,$inout0
2428	pxor	@tweak[5],@tweak[3]
2429	psrad	\$31,$rndkey0
2430	 aesdec		$rndkey1,$inout1
2431	paddq	@tweak[5],@tweak[5]
2432	pand	$twmask,$rndkey0
2433	 aesdec		$rndkey1,$inout2
2434	 aesdec		$rndkey1,$inout3
2435	pxor	$rndkey0,@tweak[5]
2436	$movkey		($key_),$rndkey0
2437	 aesdec		$rndkey1,$inout4
2438	 aesdec		$rndkey1,$inout5
2439	$movkey		16($key_),$rndkey1
2440
2441	pxor	@tweak[5],@tweak[4]
2442	 aesdeclast	`16*0`(%rsp),$inout0
2443	psrad	\$31,$twres
2444	paddq	@tweak[5],@tweak[5]
2445	 aesdeclast	`16*1`(%rsp),$inout1
2446	 aesdeclast	`16*2`(%rsp),$inout2
2447	pand	$twmask,$twres
2448	mov	%r10,%rax			# restore $rounds
2449	 aesdeclast	`16*3`(%rsp),$inout3
2450	 aesdeclast	`16*4`(%rsp),$inout4
2451	 aesdeclast	`16*5`(%rsp),$inout5
2452	pxor	$twres,@tweak[5]
2453
2454	lea	`16*6`($out),$out		# $out+=6*16
2455	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2456	movups	$inout1,`-16*5`($out)
2457	movups	$inout2,`-16*4`($out)
2458	movups	$inout3,`-16*3`($out)
2459	movups	$inout4,`-16*2`($out)
2460	movups	$inout5,`-16*1`($out)
2461	sub	\$16*6,$len
2462	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2463
2464	mov	\$16+96,$rounds
2465	sub	$rnds_,$rounds
2466	mov	$key_,$key			# restore $key
2467	shr	\$4,$rounds			# restore original value
2468
2469.Lxts_dec_short:
2470	# at the point @tweak[0..5] are populated with tweak values
2471	mov	$rounds,$rnds_			# backup $rounds
2472	pxor	$rndkey0,@tweak[0]
2473	pxor	$rndkey0,@tweak[1]
2474	add	\$16*6,$len			# restore real remaining $len
2475	jz	.Lxts_dec_done			# done if ($len==0)
2476
2477	pxor	$rndkey0,@tweak[2]
2478	cmp	\$0x20,$len
2479	jb	.Lxts_dec_one			# $len is 1*16
2480	pxor	$rndkey0,@tweak[3]
2481	je	.Lxts_dec_two			# $len is 2*16
2482
2483	pxor	$rndkey0,@tweak[4]
2484	cmp	\$0x40,$len
2485	jb	.Lxts_dec_three			# $len is 3*16
2486	je	.Lxts_dec_four			# $len is 4*16
2487
2488	movdqu	($inp),$inout0			# $len is 5*16
2489	movdqu	16*1($inp),$inout1
2490	movdqu	16*2($inp),$inout2
2491	pxor	@tweak[0],$inout0
2492	movdqu	16*3($inp),$inout3
2493	pxor	@tweak[1],$inout1
2494	movdqu	16*4($inp),$inout4
2495	lea	16*5($inp),$inp			# $inp+=5*16
2496	pxor	@tweak[2],$inout2
2497	pxor	@tweak[3],$inout3
2498	pxor	@tweak[4],$inout4
2499
2500	call	_aesni_decrypt6
2501
2502	xorps	@tweak[0],$inout0
2503	xorps	@tweak[1],$inout1
2504	xorps	@tweak[2],$inout2
2505	movdqu	$inout0,($out)			# store 5 output blocks
2506	xorps	@tweak[3],$inout3
2507	movdqu	$inout1,16*1($out)
2508	xorps	@tweak[4],$inout4
2509	movdqu	$inout2,16*2($out)
2510	 pxor		$twtmp,$twtmp
2511	movdqu	$inout3,16*3($out)
2512	 pcmpgtd	@tweak[5],$twtmp
2513	movdqu	$inout4,16*4($out)
2514	lea	16*5($out),$out			# $out+=5*16
2515	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2516	and	\$15,$len_
2517	jz	.Lxts_dec_ret
2518
2519	movdqa	@tweak[5],@tweak[0]
2520	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2521	pand	$twmask,@tweak[1]		# isolate carry and residue
2522	pxor	@tweak[5],@tweak[1]
2523	jmp	.Lxts_dec_done2
2524
2525.align	16
2526.Lxts_dec_one:
2527	movups	($inp),$inout0
2528	lea	16*1($inp),$inp			# $inp+=1*16
2529	xorps	@tweak[0],$inout0
2530___
# Single-block decryption for .Lxts_dec_one; the block was xored with
# @tweak[0] just above and is xored with it again right after.
2531	&aesni_generate1("dec",$key,$rounds);
# Completion of the one-block case, followed by the two-, three- and
# four-block cases (_aesni_decrypt2/3/4).  Every case xors the tweaks in
# before and after the cipher code, stores the plaintext, and moves the next
# two unused tweaks down into @tweak[0] and @tweak[1] for the tail.
# .Lxts_dec_done leaves through .Lxts_dec_ret when $len_ has no partial-block
# bytes.  Otherwise .Lxts_dec_done2 puts the partial byte count in $len,
# reloads $key/$rounds, and loads the block at ($inp), whitening it with
# @tweak[1] (the later of the two tweaks); the aesni_generate1 call after
# the heredoc emits its decryption.
2532$code.=<<___;
2533	xorps	@tweak[0],$inout0
2534	movdqa	@tweak[1],@tweak[0]
2535	movups	$inout0,($out)			# store one output block
2536	movdqa	@tweak[2],@tweak[1]
2537	lea	16*1($out),$out			# $out+=1*16
2538	jmp	.Lxts_dec_done
2539
2540.align	16
2541.Lxts_dec_two:
2542	movups	($inp),$inout0
2543	movups	16($inp),$inout1
2544	lea	32($inp),$inp			# $inp+=2*16
2545	xorps	@tweak[0],$inout0
2546	xorps	@tweak[1],$inout1
2547
2548	call	_aesni_decrypt2
2549
2550	xorps	@tweak[0],$inout0
2551	movdqa	@tweak[2],@tweak[0]
2552	xorps	@tweak[1],$inout1
2553	movdqa	@tweak[3],@tweak[1]
2554	movups	$inout0,($out)			# store 2 output blocks
2555	movups	$inout1,16*1($out)
2556	lea	16*2($out),$out			# $out+=2*16
2557	jmp	.Lxts_dec_done
2558
2559.align	16
2560.Lxts_dec_three:
2561	movups	($inp),$inout0
2562	movups	16*1($inp),$inout1
2563	movups	16*2($inp),$inout2
2564	lea	16*3($inp),$inp			# $inp+=3*16
2565	xorps	@tweak[0],$inout0
2566	xorps	@tweak[1],$inout1
2567	xorps	@tweak[2],$inout2
2568
2569	call	_aesni_decrypt3
2570
2571	xorps	@tweak[0],$inout0
2572	movdqa	@tweak[3],@tweak[0]
2573	xorps	@tweak[1],$inout1
2574	movdqa	@tweak[4],@tweak[1]
2575	xorps	@tweak[2],$inout2
2576	movups	$inout0,($out)			# store 3 output blocks
2577	movups	$inout1,16*1($out)
2578	movups	$inout2,16*2($out)
2579	lea	16*3($out),$out			# $out+=3*16
2580	jmp	.Lxts_dec_done
2581
2582.align	16
2583.Lxts_dec_four:
2584	movups	($inp),$inout0
2585	movups	16*1($inp),$inout1
2586	movups	16*2($inp),$inout2
2587	xorps	@tweak[0],$inout0
2588	movups	16*3($inp),$inout3
2589	lea	16*4($inp),$inp			# $inp+=4*16
2590	xorps	@tweak[1],$inout1
2591	xorps	@tweak[2],$inout2
2592	xorps	@tweak[3],$inout3
2593
2594	call	_aesni_decrypt4
2595
2596	pxor	@tweak[0],$inout0
2597	movdqa	@tweak[4],@tweak[0]
2598	pxor	@tweak[1],$inout1
2599	movdqa	@tweak[5],@tweak[1]
2600	pxor	@tweak[2],$inout2
2601	movdqu	$inout0,($out)			# store 4 output blocks
2602	pxor	@tweak[3],$inout3
2603	movdqu	$inout1,16*1($out)
2604	movdqu	$inout2,16*2($out)
2605	movdqu	$inout3,16*3($out)
2606	lea	16*4($out),$out			# $out+=4*16
2607	jmp	.Lxts_dec_done
2608
2609.align	16
2610.Lxts_dec_done:
2611	and	\$15,$len_			# see if $len%16 is 0
2612	jz	.Lxts_dec_ret
2613.Lxts_dec_done2:
2614	mov	$len_,$len
2615	mov	$key_,$key			# restore $key
2616	mov	$rnds_,$rounds			# restore $rounds
2617
2618	movups	($inp),$inout0
2619	xorps	@tweak[1],$inout0
2620___
# Decrypt the block loaded from ($inp), which was whitened with @tweak[1].
2621	&aesni_generate1("dec",$key,$rounds);
# Ciphertext stealing.  The block just decrypted with @tweak[1] is written
# to ($out).  .Lxts_dec_steal then runs once per partial byte ($len holds
# $len_ & 15 here): the input byte at 16($inp) replaces the byte at ($out),
# and the byte it displaces is written to 16($out).  %eax and %ecx serve as
# scratch ("borrow $rounds ... and $key"), which is why both are reloaded
# from their backups afterwards.  Finally $out is rewound by $len_ and the
# patched block at ($out) is loaded and whitened with @tweak[0]; the
# aesni_generate1 call after the heredoc emits its decryption.
2622$code.=<<___;
2623	xorps	@tweak[1],$inout0
2624	movups	$inout0,($out)
2625
2626.Lxts_dec_steal:
2627	movzb	16($inp),%eax			# borrow $rounds ...
2628	movzb	($out),%ecx			# ... and $key
2629	lea	1($inp),$inp
2630	mov	%al,($out)
2631	mov	%cl,16($out)
2632	lea	1($out),$out
2633	sub	\$1,$len
2634	jnz	.Lxts_dec_steal
2635
2636	sub	$len_,$out			# rewind $out
2637	mov	$key_,$key			# restore $key
2638	mov	$rnds_,$rounds			# restore $rounds
2639
2640	movups	($out),$inout0
2641	xorps	@tweak[0],$inout0
2642___
# Decrypt the reassembled block, which was whitened with @tweak[0].
2643	&aesni_generate1("dec",$key,$rounds);
# Final store of the stolen block and the common exit of aesni_xts_decrypt.
# .Lxts_dec_ret first zeroes %xmm0-%xmm5 on every platform; %xmm0 then
# serves as the zero source for wiping the stack.
2644$code.=<<___;
2645	xorps	@tweak[0],$inout0
2646	movups	$inout0,($out)
2647
2648.Lxts_dec_ret:
2649	xorps	%xmm0,%xmm0			# clear register bank
2650	pxor	%xmm1,%xmm1
2651	pxor	%xmm2,%xmm2
2652	pxor	%xmm3,%xmm3
2653	pxor	%xmm4,%xmm4
2654	pxor	%xmm5,%xmm5
2655___
# Non-Win64: zero %xmm6-%xmm15 as well and overwrite the 0x00..0x60(%rsp)
# scratch area (tweak copies and round[0]^round[last]) with zeroes.
2656$code.=<<___ if (!$win64);
2657	pxor	%xmm6,%xmm6
2658	pxor	%xmm7,%xmm7
2659	movaps	%xmm0,0x00(%rsp)		# clear stack
2660	pxor	%xmm8,%xmm8
2661	movaps	%xmm0,0x10(%rsp)
2662	pxor	%xmm9,%xmm9
2663	movaps	%xmm0,0x20(%rsp)
2664	pxor	%xmm10,%xmm10
2665	movaps	%xmm0,0x30(%rsp)
2666	pxor	%xmm11,%xmm11
2667	movaps	%xmm0,0x40(%rsp)
2668	pxor	%xmm12,%xmm12
2669	movaps	%xmm0,0x50(%rsp)
2670	pxor	%xmm13,%xmm13
2671	movaps	%xmm0,0x60(%rsp)
2672	pxor	%xmm14,%xmm14
2673	pxor	%xmm15,%xmm15
2674___
# Win64: reload %xmm6-%xmm15 from the slots filled at function entry,
# overwriting each slot with zero right after it is read, then wipe the
# 0x00..0x60(%rsp) scratch area.
2675$code.=<<___ if ($win64);
2676	movaps	-0xa0(%rbp),%xmm6
2677	movaps	%xmm0,-0xa0(%rbp)		# clear stack
2678	movaps	-0x90(%rbp),%xmm7
2679	movaps	%xmm0,-0x90(%rbp)
2680	movaps	-0x80(%rbp),%xmm8
2681	movaps	%xmm0,-0x80(%rbp)
2682	movaps	-0x70(%rbp),%xmm9
2683	movaps	%xmm0,-0x70(%rbp)
2684	movaps	-0x60(%rbp),%xmm10
2685	movaps	%xmm0,-0x60(%rbp)
2686	movaps	-0x50(%rbp),%xmm11
2687	movaps	%xmm0,-0x50(%rbp)
2688	movaps	-0x40(%rbp),%xmm12
2689	movaps	%xmm0,-0x40(%rbp)
2690	movaps	-0x30(%rbp),%xmm13
2691	movaps	%xmm0,-0x30(%rbp)
2692	movaps	-0x20(%rbp),%xmm14
2693	movaps	%xmm0,-0x20(%rbp)
2694	movaps	-0x10(%rbp),%xmm15
2695	movaps	%xmm0,-0x10(%rbp)
2696	movaps	%xmm0,0x00(%rsp)
2697	movaps	%xmm0,0x10(%rsp)
2698	movaps	%xmm0,0x20(%rsp)
2699	movaps	%xmm0,0x30(%rsp)
2700	movaps	%xmm0,0x40(%rsp)
2701	movaps	%xmm0,0x50(%rsp)
2702	movaps	%xmm0,0x60(%rsp)
2703___
# Epilogue: %rbp points at the saved %rbp slot, so this undoes both the
# frame allocation and the alignment adjustment before returning.
2704$code.=<<___;
2705	lea	(%rbp),%rsp
2706	pop	%rbp
2707.Lxts_dec_epilogue:
2708	ret
2709.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2710___
2711} }}
2712
2713########################################################################
2714# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2715#			    size_t length, const AES_KEY *key,
2716#			    unsigned char *ivp,const int enc);
2717{
# Locals for ${PREFIX}_cbc_encrypt:
#   $frame_size     - 0x10 bytes of scratch at (%rsp) (used by
#                     .Lcbc_dec_tail_partial) plus, on Win64, 0xa0 bytes in
#                     which %xmm6-%xmm15 are saved at 0x10..0xa0(%rsp);
#   $iv, $in0..$in4 - %xmm10..%xmm15;
#   $inp_           - second input pointer of the 8-block decrypt loop; it
#                     reuses the register named by $key_.
2718my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
2719my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
2720my $inp_=$key_;
2721
# Entry.  A zero $len returns immediately.  Otherwise the round count is
# read from offset 240 of the key schedule, $key is backed up in $key_, and
# the 6th argument (%r9d) selects the direction: zero means decrypt.
# CBC encrypt: $inout0 starts out as the IV.  Fewer than 16 bytes go
# straight to .Lcbc_enc_tail; otherwise $len is biased by -16 and every loop
# pass loads one input block into $inout1.
2722$code.=<<___;
2723.globl	${PREFIX}_cbc_encrypt
2724.type	${PREFIX}_cbc_encrypt,\@function,6
2725.align	16
2726${PREFIX}_cbc_encrypt:
2727	test	$len,$len		# check length
2728	jz	.Lcbc_ret
2729
2730	mov	240($key),$rnds_	# key->rounds
2731	mov	$key,$key_		# backup $key
2732	test	%r9d,%r9d		# 6th argument
2733	jz	.Lcbc_decrypt
2734#--------------------------- CBC ENCRYPT ------------------------------#
2735	movups	($ivp),$inout0		# load iv as initial state
2736	mov	$rnds_,$rounds
2737	cmp	\$16,$len
2738	jb	.Lcbc_enc_tail
2739	sub	\$16,$len
2740	jmp	.Lcbc_enc_loop
2741.align	16
2742.Lcbc_enc_loop:
2743	movups	($inp),$inout1		# load input
2744	lea	16($inp),$inp
2745	#xorps	$inout1,$inout0
2746___
# One-block encryption of the chaining state in $inout0.
# NOTE(review): the explicit xorps of $inout1 into $inout0 is commented out
# above, so aesni_generate1 (defined outside this section) presumably folds
# the input xor in through its 5th argument -- confirm against its definition.
2747	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
# Remainder of the CBC encrypt loop, the encrypt tail, and the start of the
# decrypt path.
#  - $rounds/$key are reloaded from their backups after the generated cipher
#    code, the ciphertext block is stored, and the loop repeats while
#    $len-=16 does not borrow.  When no partial bytes remain, the last
#    ciphertext block is written back to ($ivp) and the working registers
#    are cleared.
#  - .Lcbc_enc_tail copies the $len leftover bytes to the output with
#    rep movsb (hand-encoded as .long), zero-fills the rest of the 16-byte
#    block with rep stosb, points both $inp and $out at that padded block,
#    and re-enters the loop with $len=0 -- the biased value meaning "one
#    block left", which is what the "len=16" remark refers to.
#  - .Lcbc_decrypt: any length other than exactly 16 goes to
#    .Lcbc_decrypt_bulk.  The single-block case loads input and IV and keeps
#    a copy of the ciphertext in $inout2 as the future IV; no stack frame is
#    set up on this path.
2748$code.=<<___;
2749	mov	$rnds_,$rounds		# restore $rounds
2750	mov	$key_,$key		# restore $key
2751	movups	$inout0,0($out)		# store output
2752	lea	16($out),$out
2753	sub	\$16,$len
2754	jnc	.Lcbc_enc_loop
2755	add	\$16,$len
2756	jnz	.Lcbc_enc_tail
2757	 pxor	$rndkey0,$rndkey0	# clear register bank
2758	 pxor	$rndkey1,$rndkey1
2759	movups	$inout0,($ivp)
2760	 pxor	$inout0,$inout0
2761	 pxor	$inout1,$inout1
2762	jmp	.Lcbc_ret
2763
2764.Lcbc_enc_tail:
2765	mov	$len,%rcx	# zaps $key
2766	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
2767	.long	0x9066A4F3	# rep movsb
2768	mov	\$16,%ecx	# zero tail
2769	sub	$len,%rcx
2770	xor	%eax,%eax
2771	.long	0x9066AAF3	# rep stosb
2772	lea	-16(%rdi),%rdi	# rewind $out by 1 block
2773	mov	$rnds_,$rounds	# restore $rounds
2774	mov	%rdi,%rsi	# $inp and $out are the same
2775	mov	$key_,$key	# restore $key
2776	xor	$len,$len	# len=16
2777	jmp	.Lcbc_enc_loop	# one more spin
2778#--------------------------- CBC DECRYPT ------------------------------#
2779.align	16
2780.Lcbc_decrypt:
2781	cmp	\$16,$len
2782	jne	.Lcbc_decrypt_bulk
2783
2784	# handle single block without allocating stack frame,
2785	# useful in ciphertext stealing mode
2786	movdqu	($inp),$inout0		# load input
2787	movdqu	($ivp),$inout1		# load iv
2788	movdqa	$inout0,$inout2		# future iv
2789___
# Single-block decryption; note that $rnds_ is passed as the round counter
# here rather than $rounds.
2790	&aesni_generate1("dec",$key,$rnds_);
# End of the single-block decrypt: the saved ciphertext becomes the new IV
# at ($ivp), the decrypted block is xored with the old IV and stored, and
# the registers involved are cleared before leaving through .Lcbc_ret.
# .Lcbc_decrypt_bulk then builds the stack frame for the multi-block path:
# %rax captures the incoming %rsp, %rbp is pushed, $frame_size bytes are
# reserved and %rsp is rounded down to a 16-byte boundary.
2791$code.=<<___;
2792	 pxor	$rndkey0,$rndkey0	# clear register bank
2793	 pxor	$rndkey1,$rndkey1
2794	movdqu	$inout2,($ivp)		# store iv
2795	xorps	$inout1,$inout0		# ^=iv
2796	 pxor	$inout1,$inout1
2797	movups	$inout0,($out)		# store output
2798	 pxor	$inout0,$inout0
2799	jmp	.Lcbc_ret
2800.align	16
2801.Lcbc_decrypt_bulk:
2802	lea	(%rsp),%rax
2803	push	%rbp
2804	sub	\$$frame_size,%rsp
2805	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2806___
# Win64 only: save %xmm6-%xmm15 at 0x10..0xa0(%rsp), above the 0x10-byte
# scratch slot; .Lcbc_dec_ret reloads them from the same offsets.
2807$code.=<<___ if ($win64);
2808	movaps	%xmm6,0x10(%rsp)
2809	movaps	%xmm7,0x20(%rsp)
2810	movaps	%xmm8,0x30(%rsp)
2811	movaps	%xmm9,0x40(%rsp)
2812	movaps	%xmm10,0x50(%rsp)
2813	movaps	%xmm11,0x60(%rsp)
2814	movaps	%xmm12,0x70(%rsp)
2815	movaps	%xmm13,0x80(%rsp)
2816	movaps	%xmm14,0x90(%rsp)
2817	movaps	%xmm15,0xa0(%rsp)
2818.Lcbc_decrypt_body:
2819___
# Bulk CBC decrypt prologue and the head of the 8-block loop.
#  - %rbp = entry %rsp - 8 (the slot of the pushed %rbp), kept for the
#    epilogue.
#  - At most 0x50 bytes -> .Lcbc_dec_tail.
#  - Otherwise six blocks are loaded; the first five ciphertext blocks are
#    copied to $in0..$in4 because they are needed again as the xor inputs of
#    the blocks that follow them.
#  - At most 0x70 bytes -> .Lcbc_dec_six_or_seven.
#  - The capability word at OPENSSL_ia32cap_P+4 is checked for MOVBE without
#    XSAVE, which selects the 6-block loop; every other CPU takes the
#    8-block loop with $key advanced by 0x70 (so round keys are addressed as
#    N-0x70($key)) and $len biased by -7*16.
#  - .Lcbc_dec_loop8 stores the eighth result of the previous pass.
#    .Lcbc_dec_loop8_enter loads blocks 6 and 7, applies round[0] and the
#    first aesdec round, and sets $inp_ to $inp+0x80 when "cmp 0x70,$len"
#    produces no borrow, else to $inp; .Lcbc_dec_done preloads the next
#    pass's input through $inp_.
2820$code.=<<___;
2821	lea	-8(%rax),%rbp
2822	movups	($ivp),$iv
2823	mov	$rnds_,$rounds
2824	cmp	\$0x50,$len
2825	jbe	.Lcbc_dec_tail
2826
2827	$movkey	($key),$rndkey0
2828	movdqu	0x00($inp),$inout0	# load input
2829	movdqu	0x10($inp),$inout1
2830	movdqa	$inout0,$in0
2831	movdqu	0x20($inp),$inout2
2832	movdqa	$inout1,$in1
2833	movdqu	0x30($inp),$inout3
2834	movdqa	$inout2,$in2
2835	movdqu	0x40($inp),$inout4
2836	movdqa	$inout3,$in3
2837	movdqu	0x50($inp),$inout5
2838	movdqa	$inout4,$in4
2839	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
2840	cmp	\$0x70,$len
2841	jbe	.Lcbc_dec_six_or_seven
2842
2843	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
2844	sub	\$0x50,$len		# $len is biased by -5*16
2845	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
2846	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
2847	sub	\$0x20,$len		# $len is biased by -7*16
2848	lea	0x70($key),$key		# size optimization
2849	jmp	.Lcbc_dec_loop8_enter
2850.align	16
2851.Lcbc_dec_loop8:
2852	movups	$inout7,($out)
2853	lea	0x10($out),$out
2854.Lcbc_dec_loop8_enter:
2855	movdqu		0x60($inp),$inout6
2856	pxor		$rndkey0,$inout0
2857	movdqu		0x70($inp),$inout7
2858	pxor		$rndkey0,$inout1
2859	$movkey		0x10-0x70($key),$rndkey1
2860	pxor		$rndkey0,$inout2
2861	xor		$inp_,$inp_
2862	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
2863	pxor		$rndkey0,$inout3
2864	pxor		$rndkey0,$inout4
2865	pxor		$rndkey0,$inout5
2866	pxor		$rndkey0,$inout6
2867
2868	aesdec		$rndkey1,$inout0
2869	pxor		$rndkey0,$inout7
2870	$movkey		0x20-0x70($key),$rndkey0
2871	aesdec		$rndkey1,$inout1
2872	aesdec		$rndkey1,$inout2
2873	aesdec		$rndkey1,$inout3
2874	aesdec		$rndkey1,$inout4
2875	aesdec		$rndkey1,$inout5
2876	aesdec		$rndkey1,$inout6
2877	setnc		${inp_}b
2878	shl		\$7,$inp_
2879	aesdec		$rndkey1,$inout7
2880	add		$inp,$inp_
2881	$movkey		0x30-0x70($key),$rndkey1
2882___
# Emit the following aesdec rounds for all eight blocks.  Odd passes use
# $rndkey0, even passes $rndkey1, and each pass reloads the register it has
# just used with the round key needed two rounds later.
# The flags of "cmp 11,$rounds" (emitted before pass 7) are consumed by the
# jb after pass 7 and the je after pass 9, so key schedules with $rounds
# below / equal to 11 leave for .Lcbc_dec_done early while larger ones run
# all passes and take the final jmp.  NOTE(review): this relies on the
# instructions emitted in between leaving EFLAGS untouched, and $rounds
# below/equal/above 11 presumably corresponds to 128/192/256-bit keys --
# confirm.
# NOTE(review): the nop emitted after passes 1-5, 8 and 10 has no visible
# functional role; presumably instruction padding/alignment -- confirm.
2883for($i=1;$i<12;$i++) {
2884my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
2885$code.=<<___	if ($i==7);
2886	cmp		\$11,$rounds
2887___
2888$code.=<<___;
2889	aesdec		$rndkeyx,$inout0
2890	aesdec		$rndkeyx,$inout1
2891	aesdec		$rndkeyx,$inout2
2892	aesdec		$rndkeyx,$inout3
2893	aesdec		$rndkeyx,$inout4
2894	aesdec		$rndkeyx,$inout5
2895	aesdec		$rndkeyx,$inout6
2896	aesdec		$rndkeyx,$inout7
2897	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
2898___
2899$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
2900	nop
2901___
2902$code.=<<___	if ($i==7);
2903	jb		.Lcbc_dec_done
2904___
2905$code.=<<___	if ($i==9);
2906	je		.Lcbc_dec_done
2907___
2908$code.=<<___	if ($i==11);
2909	jmp		.Lcbc_dec_done
2910___
2911}
# Rest of the bulk CBC decrypt paths, emitted as one heredoc.  On every path
# the last plaintext block is not stored directly but left in $inout0 for
# .Lcbc_dec_tail_collected, and $iv ends up holding the last ciphertext
# block (the next IV).
#  - .Lcbc_dec_done: last aesdec round with $rndkey1.  $rndkey0 (which
#    appears to hold the final round key at this point) is xored into $iv
#    and the saved ciphertext copies, so each aesdeclast applies the final
#    round key and the CBC chaining xor at once.  The ciphertext at
#    0x70($inp) becomes the next $iv, the next six input blocks are
#    preloaded through $inp_, round[0] is reloaded, and seven results are
#    stored; the eighth ($inout7) is stored at the top of the next pass.
#  - Loop exit: $key is moved back by 0x70 and the $len bias removed.  With
#    nothing left, $inout7 goes to the collected-tail code via $inout0;
#    otherwise it is stored and the remainder is handled by .Lcbc_dec_tail
#    (at most 0x50 bytes) or .Lcbc_dec_six_or_seven.
#  - .Lcbc_dec_six_or_seven / .Lcbc_dec_seven: one call of _aesni_decrypt6 or
#    _aesni_decrypt8 (eighth lane zeroed); the ciphertext blocks needed for
#    chaining that are not held in $in0..$in4 are kept in $inout6 or
#    re-read from the input.
#  - .Lcbc_dec_loop6: six blocks per pass via _aesni_decrypt6, with $key and
#    $rounds reloaded from their backups after every call; the sixth result
#    is stored at the top of the next pass or at loop exit.
#  - .Lcbc_dec_tail: dispatches on one to five remaining blocks by repeated
#    "sub 0x10,$len; jbe", saving each ciphertext block for chaining; the
#    five-block case is handled inline with _aesni_decrypt6.
#  - .Lcbc_dec_one: opens the single-block case; its decryption is emitted
#    by the aesni_generate1 call that follows the heredoc.
2912$code.=<<___;
2913.align	16
2914.Lcbc_dec_done:
2915	aesdec		$rndkey1,$inout0
2916	aesdec		$rndkey1,$inout1
2917	pxor		$rndkey0,$iv
2918	pxor		$rndkey0,$in0
2919	aesdec		$rndkey1,$inout2
2920	aesdec		$rndkey1,$inout3
2921	pxor		$rndkey0,$in1
2922	pxor		$rndkey0,$in2
2923	aesdec		$rndkey1,$inout4
2924	aesdec		$rndkey1,$inout5
2925	pxor		$rndkey0,$in3
2926	pxor		$rndkey0,$in4
2927	aesdec		$rndkey1,$inout6
2928	aesdec		$rndkey1,$inout7
2929	movdqu		0x50($inp),$rndkey1
2930
2931	aesdeclast	$iv,$inout0
2932	movdqu		0x60($inp),$iv		# borrow $iv
2933	pxor		$rndkey0,$rndkey1
2934	aesdeclast	$in0,$inout1
2935	pxor		$rndkey0,$iv
2936	movdqu		0x70($inp),$rndkey0	# next IV
2937	aesdeclast	$in1,$inout2
2938	lea		0x80($inp),$inp
2939	movdqu		0x00($inp_),$in0
2940	aesdeclast	$in2,$inout3
2941	aesdeclast	$in3,$inout4
2942	movdqu		0x10($inp_),$in1
2943	movdqu		0x20($inp_),$in2
2944	aesdeclast	$in4,$inout5
2945	aesdeclast	$rndkey1,$inout6
2946	movdqu		0x30($inp_),$in3
2947	movdqu		0x40($inp_),$in4
2948	aesdeclast	$iv,$inout7
2949	movdqa		$rndkey0,$iv		# return $iv
2950	movdqu		0x50($inp_),$rndkey1
2951	$movkey		-0x70($key),$rndkey0
2952
2953	movups		$inout0,($out)		# store output
2954	movdqa		$in0,$inout0
2955	movups		$inout1,0x10($out)
2956	movdqa		$in1,$inout1
2957	movups		$inout2,0x20($out)
2958	movdqa		$in2,$inout2
2959	movups		$inout3,0x30($out)
2960	movdqa		$in3,$inout3
2961	movups		$inout4,0x40($out)
2962	movdqa		$in4,$inout4
2963	movups		$inout5,0x50($out)
2964	movdqa		$rndkey1,$inout5
2965	movups		$inout6,0x60($out)
2966	lea		0x70($out),$out
2967
2968	sub	\$0x80,$len
2969	ja	.Lcbc_dec_loop8
2970
2971	movaps	$inout7,$inout0
2972	lea	-0x70($key),$key
2973	add	\$0x70,$len
2974	jle	.Lcbc_dec_clear_tail_collected
2975	movups	$inout7,($out)
2976	lea	0x10($out),$out
2977	cmp	\$0x50,$len
2978	jbe	.Lcbc_dec_tail
2979
2980	movaps	$in0,$inout0
2981.Lcbc_dec_six_or_seven:
2982	cmp	\$0x60,$len
2983	ja	.Lcbc_dec_seven
2984
2985	movaps	$inout5,$inout6
2986	call	_aesni_decrypt6
2987	pxor	$iv,$inout0		# ^= IV
2988	movaps	$inout6,$iv
2989	pxor	$in0,$inout1
2990	movdqu	$inout0,($out)
2991	pxor	$in1,$inout2
2992	movdqu	$inout1,0x10($out)
2993	 pxor	$inout1,$inout1		# clear register bank
2994	pxor	$in2,$inout3
2995	movdqu	$inout2,0x20($out)
2996	 pxor	$inout2,$inout2
2997	pxor	$in3,$inout4
2998	movdqu	$inout3,0x30($out)
2999	 pxor	$inout3,$inout3
3000	pxor	$in4,$inout5
3001	movdqu	$inout4,0x40($out)
3002	 pxor	$inout4,$inout4
3003	lea	0x50($out),$out
3004	movdqa	$inout5,$inout0
3005	 pxor	$inout5,$inout5
3006	jmp	.Lcbc_dec_tail_collected
3007
3008.align	16
3009.Lcbc_dec_seven:
3010	movups	0x60($inp),$inout6
3011	xorps	$inout7,$inout7
3012	call	_aesni_decrypt8
3013	movups	0x50($inp),$inout7
3014	pxor	$iv,$inout0		# ^= IV
3015	movups	0x60($inp),$iv
3016	pxor	$in0,$inout1
3017	movdqu	$inout0,($out)
3018	pxor	$in1,$inout2
3019	movdqu	$inout1,0x10($out)
3020	 pxor	$inout1,$inout1		# clear register bank
3021	pxor	$in2,$inout3
3022	movdqu	$inout2,0x20($out)
3023	 pxor	$inout2,$inout2
3024	pxor	$in3,$inout4
3025	movdqu	$inout3,0x30($out)
3026	 pxor	$inout3,$inout3
3027	pxor	$in4,$inout5
3028	movdqu	$inout4,0x40($out)
3029	 pxor	$inout4,$inout4
3030	pxor	$inout7,$inout6
3031	movdqu	$inout5,0x50($out)
3032	 pxor	$inout5,$inout5
3033	lea	0x60($out),$out
3034	movdqa	$inout6,$inout0
3035	 pxor	$inout6,$inout6
3036	 pxor	$inout7,$inout7
3037	jmp	.Lcbc_dec_tail_collected
3038
3039.align	16
3040.Lcbc_dec_loop6:
3041	movups	$inout5,($out)
3042	lea	0x10($out),$out
3043	movdqu	0x00($inp),$inout0	# load input
3044	movdqu	0x10($inp),$inout1
3045	movdqa	$inout0,$in0
3046	movdqu	0x20($inp),$inout2
3047	movdqa	$inout1,$in1
3048	movdqu	0x30($inp),$inout3
3049	movdqa	$inout2,$in2
3050	movdqu	0x40($inp),$inout4
3051	movdqa	$inout3,$in3
3052	movdqu	0x50($inp),$inout5
3053	movdqa	$inout4,$in4
3054.Lcbc_dec_loop6_enter:
3055	lea	0x60($inp),$inp
3056	movdqa	$inout5,$inout6
3057
3058	call	_aesni_decrypt6
3059
3060	pxor	$iv,$inout0		# ^= IV
3061	movdqa	$inout6,$iv
3062	pxor	$in0,$inout1
3063	movdqu	$inout0,($out)
3064	pxor	$in1,$inout2
3065	movdqu	$inout1,0x10($out)
3066	pxor	$in2,$inout3
3067	movdqu	$inout2,0x20($out)
3068	pxor	$in3,$inout4
3069	mov	$key_,$key
3070	movdqu	$inout3,0x30($out)
3071	pxor	$in4,$inout5
3072	mov	$rnds_,$rounds
3073	movdqu	$inout4,0x40($out)
3074	lea	0x50($out),$out
3075	sub	\$0x60,$len
3076	ja	.Lcbc_dec_loop6
3077
3078	movdqa	$inout5,$inout0
3079	add	\$0x50,$len
3080	jle	.Lcbc_dec_clear_tail_collected
3081	movups	$inout5,($out)
3082	lea	0x10($out),$out
3083
3084.Lcbc_dec_tail:
3085	movups	($inp),$inout0
3086	sub	\$0x10,$len
3087	jbe	.Lcbc_dec_one		# $len is 1*16 or less
3088
3089	movups	0x10($inp),$inout1
3090	movaps	$inout0,$in0
3091	sub	\$0x10,$len
3092	jbe	.Lcbc_dec_two		# $len is 2*16 or less
3093
3094	movups	0x20($inp),$inout2
3095	movaps	$inout1,$in1
3096	sub	\$0x10,$len
3097	jbe	.Lcbc_dec_three		# $len is 3*16 or less
3098
3099	movups	0x30($inp),$inout3
3100	movaps	$inout2,$in2
3101	sub	\$0x10,$len
3102	jbe	.Lcbc_dec_four		# $len is 4*16 or less
3103
3104	movups	0x40($inp),$inout4	# $len is 5*16 or less
3105	movaps	$inout3,$in3
3106	movaps	$inout4,$in4
3107	xorps	$inout5,$inout5
3108	call	_aesni_decrypt6
3109	pxor	$iv,$inout0
3110	movaps	$in4,$iv
3111	pxor	$in0,$inout1
3112	movdqu	$inout0,($out)
3113	pxor	$in1,$inout2
3114	movdqu	$inout1,0x10($out)
3115	 pxor	$inout1,$inout1		# clear register bank
3116	pxor	$in2,$inout3
3117	movdqu	$inout2,0x20($out)
3118	 pxor	$inout2,$inout2
3119	pxor	$in3,$inout4
3120	movdqu	$inout3,0x30($out)
3121	 pxor	$inout3,$inout3
3122	lea	0x40($out),$out
3123	movdqa	$inout4,$inout0
3124	 pxor	$inout4,$inout4
3125	 pxor	$inout5,$inout5
3126	sub	\$0x10,$len
3127	jmp	.Lcbc_dec_tail_collected
3128
3129.align	16
3130.Lcbc_dec_one:
3131	movaps	$inout0,$in0
3132___
# Single-block decryption for .Lcbc_dec_one; the ciphertext copy taken into
# $in0 just above becomes the next IV afterwards.
3133	&aesni_generate1("dec",$key,$rounds);
# Completion of the one-block case, then the two-, three- and four-block
# tail cases (_aesni_decrypt2/3/4).  Each case xors the first result with
# $iv and the others with the preceding ciphertext block, stores all but
# the last result, moves the last one to $inout0, and sets $iv to the last
# ciphertext block before jumping to .Lcbc_dec_tail_collected.
# .Lcbc_dec_clear_tail_collected is the entry used by the bulk loops; it
# first clears $inout1..$inout3.
3134$code.=<<___;
3135	xorps	$iv,$inout0
3136	movaps	$in0,$iv
3137	jmp	.Lcbc_dec_tail_collected
3138.align	16
3139.Lcbc_dec_two:
3140	movaps	$inout1,$in1
3141	call	_aesni_decrypt2
3142	pxor	$iv,$inout0
3143	movaps	$in1,$iv
3144	pxor	$in0,$inout1
3145	movdqu	$inout0,($out)
3146	movdqa	$inout1,$inout0
3147	 pxor	$inout1,$inout1		# clear register bank
3148	lea	0x10($out),$out
3149	jmp	.Lcbc_dec_tail_collected
3150.align	16
3151.Lcbc_dec_three:
3152	movaps	$inout2,$in2
3153	call	_aesni_decrypt3
3154	pxor	$iv,$inout0
3155	movaps	$in2,$iv
3156	pxor	$in0,$inout1
3157	movdqu	$inout0,($out)
3158	pxor	$in1,$inout2
3159	movdqu	$inout1,0x10($out)
3160	 pxor	$inout1,$inout1		# clear register bank
3161	movdqa	$inout2,$inout0
3162	 pxor	$inout2,$inout2
3163	lea	0x20($out),$out
3164	jmp	.Lcbc_dec_tail_collected
3165.align	16
3166.Lcbc_dec_four:
3167	movaps	$inout3,$in3
3168	call	_aesni_decrypt4
3169	pxor	$iv,$inout0
3170	movaps	$in3,$iv
3171	pxor	$in0,$inout1
3172	movdqu	$inout0,($out)
3173	pxor	$in1,$inout2
3174	movdqu	$inout1,0x10($out)
3175	 pxor	$inout1,$inout1		# clear register bank
3176	pxor	$in2,$inout3
3177	movdqu	$inout2,0x20($out)
3178	 pxor	$inout2,$inout2
3179	movdqa	$inout3,$inout0
3180	 pxor	$inout3,$inout3
3181	lea	0x30($out),$out
3182	jmp	.Lcbc_dec_tail_collected
3183
3184.align	16
3185.Lcbc_dec_clear_tail_collected:
3186	pxor	$inout1,$inout1		# clear register bank
3187	pxor	$inout2,$inout2
3188	pxor	$inout3,$inout3
3189___
# $inout4..$inout7 (%xmm6..9) are cleared here only off Win64; on Win64
# those registers are reloaded from the save area at .Lcbc_dec_ret.
3190$code.=<<___ if (!$win64);
3191	pxor	$inout4,$inout4		# %xmm6..9
3192	pxor	$inout5,$inout5
3193	pxor	$inout6,$inout6
3194	pxor	$inout7,$inout7
3195___
# .Lcbc_dec_tail_collected writes the next IV to ($ivp) and stores the last
# plaintext block from $inout0: as a whole block when ($len & 15) is zero,
# otherwise via .Lcbc_dec_tail_partial, which spills the block to the
# scratch slot at (%rsp), copies 16 - ($len & 15) bytes of it to $out with
# rep movsb (hand-encoded as .long), and then overwrites the scratch slot
# with the already-zeroed $inout0.
# .Lcbc_dec_ret zeroes $rndkey0/$rndkey1; %xmm0 then serves as the zero
# source for wiping the Win64 save area.
3196$code.=<<___;
3197.Lcbc_dec_tail_collected:
3198	movups	$iv,($ivp)
3199	and	\$15,$len
3200	jnz	.Lcbc_dec_tail_partial
3201	movups	$inout0,($out)
3202	pxor	$inout0,$inout0
3203	jmp	.Lcbc_dec_ret
3204.align	16
3205.Lcbc_dec_tail_partial:
3206	movaps	$inout0,(%rsp)
3207	pxor	$inout0,$inout0
3208	mov	\$16,%rcx
3209	mov	$out,%rdi
3210	sub	$len,%rcx
3211	lea	(%rsp),%rsi
3212	.long	0x9066A4F3		# rep movsb
3213	movdqa	$inout0,(%rsp)
3214
3215.Lcbc_dec_ret:
3216	xorps	$rndkey0,$rndkey0	# %xmm0
3217	pxor	$rndkey1,$rndkey1
3218___
# Win64: reload %xmm6-%xmm15 from 0x10..0xa0(%rsp), overwriting each slot
# with zero right after it is read.
3219$code.=<<___ if ($win64);
3220	movaps	0x10(%rsp),%xmm6
3221	movaps	%xmm0,0x10(%rsp)	# clear stack
3222	movaps	0x20(%rsp),%xmm7
3223	movaps	%xmm0,0x20(%rsp)
3224	movaps	0x30(%rsp),%xmm8
3225	movaps	%xmm0,0x30(%rsp)
3226	movaps	0x40(%rsp),%xmm9
3227	movaps	%xmm0,0x40(%rsp)
3228	movaps	0x50(%rsp),%xmm10
3229	movaps	%xmm0,0x50(%rsp)
3230	movaps	0x60(%rsp),%xmm11
3231	movaps	%xmm0,0x60(%rsp)
3232	movaps	0x70(%rsp),%xmm12
3233	movaps	%xmm0,0x70(%rsp)
3234	movaps	0x80(%rsp),%xmm13
3235	movaps	%xmm0,0x80(%rsp)
3236	movaps	0x90(%rsp),%xmm14
3237	movaps	%xmm0,0x90(%rsp)
3238	movaps	0xa0(%rsp),%xmm15
3239	movaps	%xmm0,0xa0(%rsp)
3240___
# Epilogue of the bulk decrypt path: %rbp points at the saved %rbp slot, so
# the frame and the alignment adjustment are both undone here.  .Lcbc_ret is
# the shared return for the paths that never built a frame (zero length,
# encrypt, single-block decrypt).
3241$code.=<<___;
3242	lea	(%rbp),%rsp
3243	pop	%rbp
3244.Lcbc_ret:
3245	ret
3246.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
3247___
3248}
3249# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
3250#				int bits, AES_KEY *key)
3251#
3252# input:	$inp	user-supplied key
3253#		$bits	$inp length in bits
3254#		$key	pointer to key schedule
3255# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
3256#		*$key	key schedule
3257#
3258{ my ($inp,$bits,$key) = @_4args;
3259  $bits =~ s/%r/%e/;
3260
# Emit ${PREFIX}_set_decrypt_key.
#
# The routine calls __aesni_set_encrypt_key to build the encryption
# schedule and, if that returned 0, converts the schedule in place into
# a decryption schedule:
#   - $bits (rounds-1 on return from the callee) is scaled by 16 so that
#     16($key,$bits) addresses the last round key;
#   - the first and last round keys are exchanged as-is;
#   - the remaining keys are exchanged pairwise from both ends, each one
#     passed through aesimc, and the key left in the middle is run
#     through aesimc where it stands;
#   - %xmm0 and %xmm1, which held key material, are zeroed before return.
# A non-zero result from __aesni_set_encrypt_key is returned unchanged.
#
# The "sub rsp,8" prologue is spelled as raw bytes; presumably this pins
# it to the 4-byte encoding that .LSEH_info_key describes -- confirm
# before replacing it with a mnemonic.
$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	$movkey	%xmm0,($inp)
	pxor	%xmm0,%xmm0
.Ldec_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___
3303
# This is based on submission by
#
#	Huang Ying <ying.huang@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized with respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C);
#			-1 is returned when $inp or $key is NULL, -2 when
#			$bits is not 128, 192 or 256
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule, with rounds-1 stored at offset 240
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
# Each key size has two implementations.  The default one is built on
# aeskeygenassist plus the .Lkey_expansion_* helpers.  The "_alt" one
# (pshufb + aesenclast with the .Lkey_rotate*/.Lkey_rcon* constants) is
# selected when the capability word at OPENSSL_ia32cap_P+4, masked with
# bits 28 and 11, equals bit 28 alone (AVX set, XOP clear).
# %xmm0-5 are zeroed on every exit path through .Lenc_key_ret.
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	mov	\$-1,%rax
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	and	OPENSSL_ia32cap_P+4(%rip),%r10d
	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d			# AVX, bit no XOP
	je	.L10rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call		.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call		.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L10rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	mov	\$8,%r10d
	movdqa	.Lkey_rcon1(%rip),%xmm4
	movdqa	%xmm0,%xmm2
	movdqu	%xmm0,($key)
	jmp	.Loop_key128

.align	16
.Loop_key128:
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4
	lea		16(%rax),%rax

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,-16(%rax)
	movdqa		%xmm0,%xmm2

	dec	%r10d
	jnz	.Loop_key128

	movdqa		.Lkey_rcon1b(%rip),%xmm4

	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,(%rax)

	movdqa		%xmm0,%xmm2
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,16(%rax)

	mov	$bits,96(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L12rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call		.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call		.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240(%rdx)
	xor	%rax, %rax
	jmp	.Lenc_key_ret

.align	16
.L12rounds_alt:
	movdqa	.Lkey_rotate192(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$8,%r10d
	movdqu	%xmm0,($key)
	jmp	.Loop_key192

.align	16
.Loop_key192:
	movq		%xmm2,0(%rax)
	movdqa		%xmm2,%xmm1
	pshufb		%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2
	pslld		\$1, %xmm4
	lea		24(%rax),%rax

	movdqa		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm3,%xmm0

	pshufd		\$0xff,%xmm0,%xmm3
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3

	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm2
	movdqu		%xmm0,-16(%rax)

	dec	%r10d
	jnz	.Loop_key192

	mov	$bits,32(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaning half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L14rounds_alt

	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call		.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call		.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx)
	xor	%rax,%rax
	jmp	.Lenc_key_ret

.align	16
.L14rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$7,%r10d
	movdqu	%xmm0,0($key)
	movdqa	%xmm2,%xmm1
	movdqu	%xmm2,16($key)
	jmp	.Loop_key256

.align	16
.Loop_key256:
	pshufb		%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2

	movdqa		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm3,%xmm0
	pslld		\$1,%xmm4

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,(%rax)

	dec	%r10d
	jz	.Ldone_key256

	pshufd		\$0xff,%xmm0,%xmm2
	pxor		%xmm3,%xmm3
	aesenclast	%xmm3,%xmm2

	movdqa		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm3,%xmm1

	pxor		%xmm1,%xmm2
	movdqu		%xmm2,16(%rax)
	lea		32(%rax),%rax
	movdqa		%xmm2,%xmm1

	jmp	.Loop_key256

.Ldone_key256:
	mov	$bits,16(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax
.Lenc_key_ret:
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	add	\$8,%rsp
	ret
.LSEH_end_set_encrypt_key:

.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align 16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
3702}
3703
# Constant pool for the generated module, aligned to 64 bytes, followed
# by the module identification string.
#
#  .Lkey_rotate, .Lkey_rotate192
#	pshufb masks loaded by the "_alt" key-schedule paths.  Every dword
#	of the mask selects source bytes 13,14,15,12 (respectively 5,6,7,4),
#	i.e. one source dword rotated by one byte and replicated into all
#	four lanes.
#  .Lkey_rcon1, .Lkey_rcon1b
#	vectors holding 1 and 0x1b in every dword; the same paths use them
#	as the aesenclast operand and shift .Lkey_rcon1 left by one bit per
#	iteration.
#  .Lbswap_mask
#	byte indices 15 down to 0.
#  .Lincrement32, .Lincrement64, .Lincrement1, .Lxts_magic
#	referenced by code outside this section -- presumably the counter
#	and XTS routines; check their users before changing any value.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lxts_magic:
	.long	0x87,0,1,0
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
	.long	0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
	.long	1,1,1,1
.Lkey_rcon1b:
	.long	0x1b,0x1b,0x1b,0x1b

.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
3728
# Win64 structured-exception-handling support.  Emitted only for Win64
# targets: three language-specific handlers plus the .pdata/.xdata tables
# that attach them to the generated functions.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Registers carrying the four handler arguments.  Only $context and $disp
# are referenced by the code below; the variables are lexical because
# nothing outside this block uses them.
my ($rec,$frame,$context,$disp) = ("%rcx","%rdx","%r8","%r9");

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
# Handlers used only by the full "aesni" module.
#
# ecb_ccm64_se_handler: HandlerData[] holds the RVAs of the body and
# return labels of the guarded function.  When context->Rip lies between
# them, 8 quadwords (4 %xmm registers) are copied from the top of the
# stack into context->Xmm6.. and the stack pointer is advanced by 0x58.
#
# ctr_xts_se_handler: same range test, but the save area is located at
# -0xa0 from context->Rbp and holds 20 quadwords (10 %xmm registers);
# it then joins .Lcommon_rbp_tail, which lives in cbc_se_handler.
$code.=<<___ if ($PREFIX eq "aesni");
.type	ecb_ccm64_se_handler,\@abi-omnipotent
.align	16
ecb_ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler

.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue lable
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rax	# pull context->Rbp
	lea	-0xa0(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_rbp_tail
.size	ctr_xts_se_handler,.-ctr_xts_se_handler
___
# cbc_se_handler is emitted for every $PREFIX and also hosts the tails
# shared by all handlers:
#   .Lcommon_rbp_tail  reloads the saved %rbp from the frame addressed by
#		       context->Rbp and treats the slot above it as the
#		       stack pointer;
#   .Lcommon_seh_tail  restores Rsi/Rdi from 8(%rax)/16(%rax), stores %rax
#		       as context->Rsp, copies the CONTEXT (154 quadwords)
#		       to disp->ContextRecord, calls RtlVirtualUnwind and
#		       returns ExceptionContinueSearch.
# Unlike the handlers above it compares context->Rip against fixed labels
# of the CBC routine instead of HandlerData[].  The heredoc ends by
# opening the .pdata section that the following statements fill in.
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt_bulk(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_cbc_rax

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_rbp_tail:
	mov	160($context),%rax	# pull context->Rbp
	mov	(%rax),%rbp		# restore saved %rbp
	lea	8(%rax),%rax		# adjust stack pointer
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail

.Lrestore_cbc_rax:
	mov	120($context),%rax

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
___
# .pdata entries: one (begin, end, unwind-info) RVA triple per function.
$code.=<<___ if ($PREFIX eq "aesni");
	.rva	.LSEH_begin_aesni_ecb_encrypt
	.rva	.LSEH_end_aesni_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_info_ccm64_enc

	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_info_ccm64_dec

	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32

	.rva	.LSEH_begin_aesni_xts_encrypt
	.rva	.LSEH_end_aesni_xts_encrypt
	.rva	.LSEH_info_xts_enc

	.rva	.LSEH_begin_aesni_xts_decrypt
	.rva	.LSEH_end_aesni_xts_decrypt
	.rva	.LSEH_info_xts_dec
___
# Entries present for every $PREFIX; both key-setup routines share
# .LSEH_info_key.  The heredoc ends by opening the .xdata section.
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
# .xdata records: each names a handler and, for the HandlerData-driven
# handlers, the body/return label pair the handler compares Rip against.
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
.LSEH_info_ccm64_enc:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
.LSEH_info_ccm64_dec:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
.LSEH_info_xts_enc:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.LSEH_info_xts_dec:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
# .LSEH_info_key carries no handler; it only describes the "sub rsp,8"
# prologue shared by the two key-setup routines.
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}
3986
# rex(\@opcode, $dst, $src)
#
# Append a REX prefix byte to the opcode array when either register
# number needs one: bit 0x04 (REX.R) is set for $dst >= 8 and bit 0x01
# (REX.B) for $src >= 8.  Nothing is appended when both registers are
# below 8.  The array is modified in place through the reference.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @{$opcode},$rex|0x40	if($rex);
    return;
}
3996
# aesni($line)
#
# Hand-assemble one AES-NI instruction so the output does not depend on
# assembler support for the extension.  $line is the instruction text
# (mnemonic and operands, AT&T operand order).  Three forms are encoded:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} OFF(%rsp),%xmmD
#
# Returns a ".byte" directive with the encoded bytes in decimal.  Text
# that matches none of the forms, or that uses a mnemonic missing from
# the opcode table, is returned unchanged so the assembler gets to see
# it (and complain) rather than having the instruction silently vanish.
#
# The memory form is always encoded with an 8-bit displacement, so an
# offset above 0x7f cannot be represented; that case dies instead of
# emitting a truncated displacement.
sub aesni {
  my ($line)=@_;
  my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: leave the instruction for the assembler
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: leave the instruction for the assembler
	return $line if (!defined($opcodelet{$mnemonic}));
	# "(%rsp)" without an offset is displacement 0
	my $disp = length($off) ? ($off=~/^0/?oct($off):$off) : 0;
	die "aesni: offset $off(%rsp) does not fit in disp8: $line\n"
							if ($disp>0x7f);
	push @opcode,0x44 if ($dst>=8);			# REX.R
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;		# ModR/M, SIB
	push @opcode,$disp&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
4036
# movbe($disp8)
#
# Return the ".byte" encoding of "movbe %eax,$disp8(%rsp)".  The argument
# is appended verbatim as the 8-bit displacement, so it must already be
# something the assembler accepts as a byte value.
sub movbe {
	my ($disp8)=@_;
	return ".byte	0x0f,0x38,0xf1,0x44,0x24,".$disp8;
}
4040
# Post-process the accumulated assembly text and write it out:
#   1. evaluate every back-quoted expression at generation time;
#   2. hand each AES-NI instruction (from the mnemonic up to the last
#      %xmm operand on the line; anything after it is dropped) to aesni();
#   3. encode "movbe %eax,N(%rsp)" with decimal N through movbe().
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Output is buffered, so a failed write may only be reported by close();
# dying here keeps a truncated assembly file from looking like success.
close STDOUT or die "error closing STDOUT: $!";
4048close STDOUT;
4049