1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for the Intel AES-NI extension. In
18# the OpenSSL context it's used with the Intel engine, but it can also be
19# used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see
20# below for details].
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency, asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with a 128-bit key. And given their throughput, asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being an
28# asymptotic limit, it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that the otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes at points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
63# "straightforward" CCM implementation with CTR and CBC-MAC performed
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# The CTR curve doesn't follow this pattern and is the slowest-changing
72# one, with the "256-byte" result being 87% of "8-KB." This is because the
73# overhead in CTR mode is the most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and the 256-bit one 10/14. Well, in the CBC encrypt case the
82# differences are a tad smaller, because the above-mentioned penalty
83# biases all results by the same constant value. In a similar way
84# function call overhead affects small-block performance, as well as
85# OFB and CFB results. Differences are not large; the most common
86# coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0 and
87# 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
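#
# The expected coefficients above follow directly from the round counts
# (10/12/14 for 128-/192-/256-bit keys). A throwaway sketch of that
# arithmetic, illustrative only and never called by this generator (the
# 10/11.7 and 10/13.4 figures quoted above are measured values, not
# something this helper derives):
#
sub aesni_model_round_scaling {
	my $cpb128 = shift;		# measured 128-bit cycles per byte
	return (12/10*$cpb128,		# ideal 192-bit figure
		14/10*$cpb128);		# ideal 256-bit figure
}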
88
89# January 2011
90#
91# While the Westmere processor features 6-cycle latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
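#
# For reference, the limit arithmetic spelled out as a throwaway helper,
# illustrative only and never called by this generator: the CBC-encrypt
# asymptote is simply instruction latency times the number of rounds,
# divided by the 16 bytes of a block.
#
sub aesni_model_cbc_limit {
	my ($latency,$nrounds) = @_;	# e.g. (6,10) Westmere, (8,10) Sandy Bridge
	return $latency*$nrounds/16;	# 6*10/16=3.75, 8*10/16=5.00 cycles per byte
}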
116#
117# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In that case the asymptotic limit for such modes
120# can be obtained by dividing the above-mentioned numbers by the AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that the optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increasing the interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying the
128# interleave factor affect performance? Here is a table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. The processor overlaps not only the
146# additional instructions with AES ones, but even AES instructions
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions still claim a disproportionately small amount
149# of additional cycles, but in the 8x case the number of instructions must
150# be a tad too high for the out-of-order logic to cope with, and the AES
151# unit remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153# utilizes 6x interleave because of limited register bank capacity.
154#
155# Higher interleave factors do have a negative impact on Westmere
156# performance. While for ECB mode it's negligible (~1.5%), other
157# parallelizable modes perform ~5% worse, which is outweighed by a ~25%
158# improvement on Sandy Bridge. To balance the regression on Westmere,
159# CTR mode was implemented with a 6x aesenc interleave factor.
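#
# The same arithmetic extended to parallelizable modes, again as a
# throwaway helper (illustrative only, never called): the interleaved
# limit is the single-stream limit divided by the interleave factor, and
# the "as if" factor in the table above is the single-stream limit
# divided by the measured result.
#
sub aesni_model_interleave {
	my ($latency,$nrounds,$factor,$measured) = @_;
	my $single = $latency*$nrounds/16;	# 8*10/16=5.00 on Sandy Bridge
	return ($single/$factor,	# theoretical limit: 5.00/3=1.67, 5.00/6=0.83, 5.00/8=0.625
		$single/$measured);	# "as if" factor: 5.00/1.05~4.7x, 5.00/0.86~5.8x, 5.00/0.84~6.0x
}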
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176#		CBC en-/decrypt	CTR	XTS	ECB	OCB
177# Westmere	3.77/1.25	1.25	1.25	1.26
178# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
179# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
180# Skylake	2.62/0.63	0.63	0.63	0.63
181# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
182# Knights L	2.54/0.77	0.78	0.85	-	1.50
183# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
184# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
185# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
186#
187# (*)	Atom Silvermont ECB result is suboptimal because of penalties
188#	incurred by operations on %xmm8-15. As ECB is not considered
189#	critical, nothing was done to mitigate the problem.
190
191$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
192			# generates drop-in replacement for
193			# crypto/aes/asm/aes-x86_64.pl:-)
194
195# $output is the last argument if it looks like a file (it has an extension)
196# $flavour is the first argument if it doesn't look like a file
197$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
198$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
199
200$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
201
202$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
203( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
204( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
205die "can't locate x86_64-xlate.pl";
206
207open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
208    or die "can't call $xlate: $!";
209*STDOUT=*OUT;
210
211$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
212@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
213		("%rdi","%rsi","%rdx","%rcx");	# Unix order
214
215$code=".text\n";
216$code.=".extern	OPENSSL_ia32cap_P\n";
217
218$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
219# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
220$inp="%rdi";
221$out="%rsi";
222$len="%rdx";
223$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
224$ivp="%r8";	# cbc, ctr, ...
225
226$rnds_="%r10d";	# backup copy for $rounds
227$key_="%r11";	# backup copy for $key
228
229# %xmm register layout
230$rndkey0="%xmm0";	$rndkey1="%xmm1";
231$inout0="%xmm2";	$inout1="%xmm3";
232$inout2="%xmm4";	$inout3="%xmm5";
233$inout4="%xmm6";	$inout5="%xmm7";
234$inout6="%xmm8";	$inout7="%xmm9";
235
236$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
237$in0="%xmm8";		$iv="%xmm9";
238
239# Inline version of internal aesni_[en|de]crypt1.
240#
241# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
242# cycles which take care of loop variables...
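#
# For clarity, the round structure that folded loop expands to at run
# time, modelled as a throwaway helper (illustrative only, never called;
# @rk stands for the round keys in schedule order):
#
sub aesni_model_1block_rounds {
	my ($p,@rk) = @_;			# $p is "enc" or "dec"
	my @ops = ("xorps\t$rk[0]");		# 0-round whitening
	push @ops, "aes${p}\t$rk[$_]" for (1..$#rk-1);	# what the folded loop unrolls to
	push @ops, "aes${p}last\t$rk[-1]";	# final round
	return @ops;				# one entry per AES operation applied to the block
}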
243{ my $sn;
244sub aesni_generate1 {
245my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
246++$sn;
247$code.=<<___;
248	$movkey	($key),$rndkey0
249	$movkey	16($key),$rndkey1
250___
251$code.=<<___ if (defined($ivec));
252	xorps	$rndkey0,$ivec
253	lea	32($key),$key
254	xorps	$ivec,$inout
255___
256$code.=<<___ if (!defined($ivec));
257	lea	32($key),$key
258	xorps	$rndkey0,$inout
259___
260$code.=<<___;
261.Loop_${p}1_$sn:
262	aes${p}	$rndkey1,$inout
263	dec	$rounds
264	$movkey	($key),$rndkey1
265	lea	16($key),$key
266	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
267	aes${p}last	$rndkey1,$inout
268___
269}}
270# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
271#
272{ my ($inp,$out,$key) = @_4args;
273
274$code.=<<___;
275.globl	${PREFIX}_encrypt
276.type	${PREFIX}_encrypt,\@abi-omnipotent
277.align	16
278${PREFIX}_encrypt:
279.cfi_startproc
280	endbranch
281	movups	($inp),$inout0		# load input
282	mov	240($key),$rounds	# key->rounds
283___
284	&aesni_generate1("enc",$key,$rounds);
285$code.=<<___;
286	 pxor	$rndkey0,$rndkey0	# clear register bank
287	 pxor	$rndkey1,$rndkey1
288	movups	$inout0,($out)		# output
289	 pxor	$inout0,$inout0
290	ret
291.cfi_endproc
292.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
293
294.globl	${PREFIX}_decrypt
295.type	${PREFIX}_decrypt,\@abi-omnipotent
296.align	16
297${PREFIX}_decrypt:
298.cfi_startproc
299	endbranch
300	movups	($inp),$inout0		# load input
301	mov	240($key),$rounds	# key->rounds
302___
303	&aesni_generate1("dec",$key,$rounds);
304$code.=<<___;
305	 pxor	$rndkey0,$rndkey0	# clear register bank
306	 pxor	$rndkey1,$rndkey1
307	movups	$inout0,($out)		# output
308	 pxor	$inout0,$inout0
309	ret
310.cfi_endproc
311.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
312___
313}
314
315# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
316# factor. Why were 3x subroutines originally used in loops? Even though
317# aes[enc|dec] latency was originally 6, it could be scheduled only
318# every *2nd* cycle. Thus 3x interleave was the one providing optimal
319# utilization, i.e. the subroutine's throughput is virtually the same
320# as that of a non-interleaved subroutine [for up to 3 input blocks].
321# This is why it originally made no sense to implement a 2x subroutine.
322# But times change and it became appropriate to spend an extra 192 bytes
323# on a 2x subroutine on account of Atom Silvermont. For processors that
324# can schedule aes[enc|dec] every cycle the optimal interleave factor
325# equals the corresponding instruction latency. 8x is optimal for
326# * Bridge and "super-optimal" for other Intel CPUs...
327
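# All _aesni_[en|de]cryptN subroutines below share one "twisted $rounds"
# addressing idiom: the rounds value is scaled by 16, $key is advanced
# past the end of the key schedule, and a negative index in %rax is
# walked up to zero, pulling two round keys per iteration. A throwaway
# model of the byte offsets it touches (illustrative only, never called;
# note that the rounds value kept at 240($key) is one less than the
# textbook 10/12/14 - see e.g. the cmp \$11 check in the CTR code below -
# which is what makes the index land exactly on zero):
sub aesni_model_key_offsets {
	my $rounds = shift;			# value from 240($key)
	$rounds % 2 or die "expects the odd on-disk rounds count";
	my $end = 32 + 16*$rounds;		# lea 32($key,$rounds),$key
	my $idx = 16 - 16*$rounds;		# neg %rax; add \$16,%rax
	my @loads = (0,16,32);			# $rndkey0, $rndkey1, $rndkey0
	while (1) {
		push @loads, $end + $idx;	# $movkey ($key,%rax),$rndkey1
		$idx += 32;			# add \$32,%rax
		push @loads, $end + $idx - 16;	# $movkey -16($key,%rax),$rndkey0
		last if $idx == 0;		# jnz
	}
	return @loads;				# 0,16,...,16*($rounds+1): every round key once
}
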
328sub aesni_generate2 {
329my $dir=shift;
330# As already mentioned it takes in $key and $rounds, which are *not*
331# preserved. $inout[0-1] is cipher/clear text...
332$code.=<<___;
333.type	_aesni_${dir}rypt2,\@abi-omnipotent
334.align	16
335_aesni_${dir}rypt2:
336.cfi_startproc
337	$movkey	($key),$rndkey0
338	shl	\$4,$rounds
339	$movkey	16($key),$rndkey1
340	xorps	$rndkey0,$inout0
341	xorps	$rndkey0,$inout1
342	$movkey	32($key),$rndkey0
343	lea	32($key,$rounds),$key
344	neg	%rax				# $rounds
345	add	\$16,%rax
346
347.L${dir}_loop2:
348	aes${dir}	$rndkey1,$inout0
349	aes${dir}	$rndkey1,$inout1
350	$movkey		($key,%rax),$rndkey1
351	add		\$32,%rax
352	aes${dir}	$rndkey0,$inout0
353	aes${dir}	$rndkey0,$inout1
354	$movkey		-16($key,%rax),$rndkey0
355	jnz		.L${dir}_loop2
356
357	aes${dir}	$rndkey1,$inout0
358	aes${dir}	$rndkey1,$inout1
359	aes${dir}last	$rndkey0,$inout0
360	aes${dir}last	$rndkey0,$inout1
361	ret
362.cfi_endproc
363.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
364___
365}
366sub aesni_generate3 {
367my $dir=shift;
368# As already mentioned it takes in $key and $rounds, which are *not*
369# preserved. $inout[0-2] is cipher/clear text...
370$code.=<<___;
371.type	_aesni_${dir}rypt3,\@abi-omnipotent
372.align	16
373_aesni_${dir}rypt3:
374.cfi_startproc
375	$movkey	($key),$rndkey0
376	shl	\$4,$rounds
377	$movkey	16($key),$rndkey1
378	xorps	$rndkey0,$inout0
379	xorps	$rndkey0,$inout1
380	xorps	$rndkey0,$inout2
381	$movkey	32($key),$rndkey0
382	lea	32($key,$rounds),$key
383	neg	%rax				# $rounds
384	add	\$16,%rax
385
386.L${dir}_loop3:
387	aes${dir}	$rndkey1,$inout0
388	aes${dir}	$rndkey1,$inout1
389	aes${dir}	$rndkey1,$inout2
390	$movkey		($key,%rax),$rndkey1
391	add		\$32,%rax
392	aes${dir}	$rndkey0,$inout0
393	aes${dir}	$rndkey0,$inout1
394	aes${dir}	$rndkey0,$inout2
395	$movkey		-16($key,%rax),$rndkey0
396	jnz		.L${dir}_loop3
397
398	aes${dir}	$rndkey1,$inout0
399	aes${dir}	$rndkey1,$inout1
400	aes${dir}	$rndkey1,$inout2
401	aes${dir}last	$rndkey0,$inout0
402	aes${dir}last	$rndkey0,$inout1
403	aes${dir}last	$rndkey0,$inout2
404	ret
405.cfi_endproc
406.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
407___
408}
409# 4x interleave is implemented to improve small-block performance,
410# most notably [and naturally] 4-block performance by ~30%. One can
411# argue that 5x should have been implemented as well, but the
412# improvement would be <20%, so it's not worth it...
413sub aesni_generate4 {
414my $dir=shift;
415# As already mentioned it takes in $key and $rounds, which are *not*
416# preserved. $inout[0-3] is cipher/clear text...
417$code.=<<___;
418.type	_aesni_${dir}rypt4,\@abi-omnipotent
419.align	16
420_aesni_${dir}rypt4:
421.cfi_startproc
422	$movkey	($key),$rndkey0
423	shl	\$4,$rounds
424	$movkey	16($key),$rndkey1
425	xorps	$rndkey0,$inout0
426	xorps	$rndkey0,$inout1
427	xorps	$rndkey0,$inout2
428	xorps	$rndkey0,$inout3
429	$movkey	32($key),$rndkey0
430	lea	32($key,$rounds),$key
431	neg	%rax				# $rounds
432	.byte	0x0f,0x1f,0x00
433	add	\$16,%rax
434
435.L${dir}_loop4:
436	aes${dir}	$rndkey1,$inout0
437	aes${dir}	$rndkey1,$inout1
438	aes${dir}	$rndkey1,$inout2
439	aes${dir}	$rndkey1,$inout3
440	$movkey		($key,%rax),$rndkey1
441	add		\$32,%rax
442	aes${dir}	$rndkey0,$inout0
443	aes${dir}	$rndkey0,$inout1
444	aes${dir}	$rndkey0,$inout2
445	aes${dir}	$rndkey0,$inout3
446	$movkey		-16($key,%rax),$rndkey0
447	jnz		.L${dir}_loop4
448
449	aes${dir}	$rndkey1,$inout0
450	aes${dir}	$rndkey1,$inout1
451	aes${dir}	$rndkey1,$inout2
452	aes${dir}	$rndkey1,$inout3
453	aes${dir}last	$rndkey0,$inout0
454	aes${dir}last	$rndkey0,$inout1
455	aes${dir}last	$rndkey0,$inout2
456	aes${dir}last	$rndkey0,$inout3
457	ret
458.cfi_endproc
459.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
460___
461}
462sub aesni_generate6 {
463my $dir=shift;
464# As already mentioned it takes in $key and $rounds, which are *not*
465# preserved. $inout[0-5] is cipher/clear text...
466$code.=<<___;
467.type	_aesni_${dir}rypt6,\@abi-omnipotent
468.align	16
469_aesni_${dir}rypt6:
470.cfi_startproc
471	$movkey		($key),$rndkey0
472	shl		\$4,$rounds
473	$movkey		16($key),$rndkey1
474	xorps		$rndkey0,$inout0
475	pxor		$rndkey0,$inout1
476	pxor		$rndkey0,$inout2
477	aes${dir}	$rndkey1,$inout0
478	lea		32($key,$rounds),$key
479	neg		%rax			# $rounds
480	aes${dir}	$rndkey1,$inout1
481	pxor		$rndkey0,$inout3
482	pxor		$rndkey0,$inout4
483	aes${dir}	$rndkey1,$inout2
484	pxor		$rndkey0,$inout5
485	$movkey		($key,%rax),$rndkey0
486	add		\$16,%rax
487	jmp		.L${dir}_loop6_enter
488.align	16
489.L${dir}_loop6:
490	aes${dir}	$rndkey1,$inout0
491	aes${dir}	$rndkey1,$inout1
492	aes${dir}	$rndkey1,$inout2
493.L${dir}_loop6_enter:
494	aes${dir}	$rndkey1,$inout3
495	aes${dir}	$rndkey1,$inout4
496	aes${dir}	$rndkey1,$inout5
497	$movkey		($key,%rax),$rndkey1
498	add		\$32,%rax
499	aes${dir}	$rndkey0,$inout0
500	aes${dir}	$rndkey0,$inout1
501	aes${dir}	$rndkey0,$inout2
502	aes${dir}	$rndkey0,$inout3
503	aes${dir}	$rndkey0,$inout4
504	aes${dir}	$rndkey0,$inout5
505	$movkey		-16($key,%rax),$rndkey0
506	jnz		.L${dir}_loop6
507
508	aes${dir}	$rndkey1,$inout0
509	aes${dir}	$rndkey1,$inout1
510	aes${dir}	$rndkey1,$inout2
511	aes${dir}	$rndkey1,$inout3
512	aes${dir}	$rndkey1,$inout4
513	aes${dir}	$rndkey1,$inout5
514	aes${dir}last	$rndkey0,$inout0
515	aes${dir}last	$rndkey0,$inout1
516	aes${dir}last	$rndkey0,$inout2
517	aes${dir}last	$rndkey0,$inout3
518	aes${dir}last	$rndkey0,$inout4
519	aes${dir}last	$rndkey0,$inout5
520	ret
521.cfi_endproc
522.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
523___
524}
525sub aesni_generate8 {
526my $dir=shift;
527# As already mentioned it takes in $key and $rounds, which are *not*
528# preserved. $inout[0-7] is cipher/clear text...
529$code.=<<___;
530.type	_aesni_${dir}rypt8,\@abi-omnipotent
531.align	16
532_aesni_${dir}rypt8:
533.cfi_startproc
534	$movkey		($key),$rndkey0
535	shl		\$4,$rounds
536	$movkey		16($key),$rndkey1
537	xorps		$rndkey0,$inout0
538	xorps		$rndkey0,$inout1
539	pxor		$rndkey0,$inout2
540	pxor		$rndkey0,$inout3
541	pxor		$rndkey0,$inout4
542	lea		32($key,$rounds),$key
543	neg		%rax			# $rounds
544	aes${dir}	$rndkey1,$inout0
545	pxor		$rndkey0,$inout5
546	pxor		$rndkey0,$inout6
547	aes${dir}	$rndkey1,$inout1
548	pxor		$rndkey0,$inout7
549	$movkey		($key,%rax),$rndkey0
550	add		\$16,%rax
551	jmp		.L${dir}_loop8_inner
552.align	16
553.L${dir}_loop8:
554	aes${dir}	$rndkey1,$inout0
555	aes${dir}	$rndkey1,$inout1
556.L${dir}_loop8_inner:
557	aes${dir}	$rndkey1,$inout2
558	aes${dir}	$rndkey1,$inout3
559	aes${dir}	$rndkey1,$inout4
560	aes${dir}	$rndkey1,$inout5
561	aes${dir}	$rndkey1,$inout6
562	aes${dir}	$rndkey1,$inout7
563.L${dir}_loop8_enter:
564	$movkey		($key,%rax),$rndkey1
565	add		\$32,%rax
566	aes${dir}	$rndkey0,$inout0
567	aes${dir}	$rndkey0,$inout1
568	aes${dir}	$rndkey0,$inout2
569	aes${dir}	$rndkey0,$inout3
570	aes${dir}	$rndkey0,$inout4
571	aes${dir}	$rndkey0,$inout5
572	aes${dir}	$rndkey0,$inout6
573	aes${dir}	$rndkey0,$inout7
574	$movkey		-16($key,%rax),$rndkey0
575	jnz		.L${dir}_loop8
576
577	aes${dir}	$rndkey1,$inout0
578	aes${dir}	$rndkey1,$inout1
579	aes${dir}	$rndkey1,$inout2
580	aes${dir}	$rndkey1,$inout3
581	aes${dir}	$rndkey1,$inout4
582	aes${dir}	$rndkey1,$inout5
583	aes${dir}	$rndkey1,$inout6
584	aes${dir}	$rndkey1,$inout7
585	aes${dir}last	$rndkey0,$inout0
586	aes${dir}last	$rndkey0,$inout1
587	aes${dir}last	$rndkey0,$inout2
588	aes${dir}last	$rndkey0,$inout3
589	aes${dir}last	$rndkey0,$inout4
590	aes${dir}last	$rndkey0,$inout5
591	aes${dir}last	$rndkey0,$inout6
592	aes${dir}last	$rndkey0,$inout7
593	ret
594.cfi_endproc
595.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
596___
597}
598&aesni_generate2("enc") if ($PREFIX eq "aesni");
599&aesni_generate2("dec");
600&aesni_generate3("enc") if ($PREFIX eq "aesni");
601&aesni_generate3("dec");
602&aesni_generate4("enc") if ($PREFIX eq "aesni");
603&aesni_generate4("dec");
604&aesni_generate6("enc") if ($PREFIX eq "aesni");
605&aesni_generate6("dec");
606&aesni_generate8("enc") if ($PREFIX eq "aesni");
607&aesni_generate8("dec");
608
609if ($PREFIX eq "aesni") {
610########################################################################
611# void aesni_ecb_encrypt (const void *in, void *out,
612#			  size_t length, const AES_KEY *key,
613#			  int enc);
614$code.=<<___;
615.globl	aesni_ecb_encrypt
616.type	aesni_ecb_encrypt,\@function,5
617.align	16
618aesni_ecb_encrypt:
619.cfi_startproc
620	endbranch
621___
622$code.=<<___ if ($win64);
623	lea	-0x58(%rsp),%rsp
624	movaps	%xmm6,(%rsp)		# offload $inout4..7
625	movaps	%xmm7,0x10(%rsp)
626	movaps	%xmm8,0x20(%rsp)
627	movaps	%xmm9,0x30(%rsp)
628.Lecb_enc_body:
629___
630$code.=<<___;
631	and	\$-16,$len		# if ($len<16)
632	jz	.Lecb_ret		# return
633
634	mov	240($key),$rounds	# key->rounds
635	$movkey	($key),$rndkey0
636	mov	$key,$key_		# backup $key
637	mov	$rounds,$rnds_		# backup $rounds
638	test	%r8d,%r8d		# 5th argument
639	jz	.Lecb_decrypt
640#--------------------------- ECB ENCRYPT ------------------------------#
641	cmp	\$0x80,$len		# if ($len<8*16)
642	jb	.Lecb_enc_tail		# short input
643
644	movdqu	($inp),$inout0		# load 8 input blocks
645	movdqu	0x10($inp),$inout1
646	movdqu	0x20($inp),$inout2
647	movdqu	0x30($inp),$inout3
648	movdqu	0x40($inp),$inout4
649	movdqu	0x50($inp),$inout5
650	movdqu	0x60($inp),$inout6
651	movdqu	0x70($inp),$inout7
652	lea	0x80($inp),$inp		# $inp+=8*16
653	sub	\$0x80,$len		# $len-=8*16 (can be zero)
654	jmp	.Lecb_enc_loop8_enter
655.align 16
656.Lecb_enc_loop8:
657	movups	$inout0,($out)		# store 8 output blocks
658	mov	$key_,$key		# restore $key
659	movdqu	($inp),$inout0		# load 8 input blocks
660	mov	$rnds_,$rounds		# restore $rounds
661	movups	$inout1,0x10($out)
662	movdqu	0x10($inp),$inout1
663	movups	$inout2,0x20($out)
664	movdqu	0x20($inp),$inout2
665	movups	$inout3,0x30($out)
666	movdqu	0x30($inp),$inout3
667	movups	$inout4,0x40($out)
668	movdqu	0x40($inp),$inout4
669	movups	$inout5,0x50($out)
670	movdqu	0x50($inp),$inout5
671	movups	$inout6,0x60($out)
672	movdqu	0x60($inp),$inout6
673	movups	$inout7,0x70($out)
674	lea	0x80($out),$out		# $out+=8*16
675	movdqu	0x70($inp),$inout7
676	lea	0x80($inp),$inp		# $inp+=8*16
677.Lecb_enc_loop8_enter:
678
679	call	_aesni_encrypt8
680
681	sub	\$0x80,$len
682	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
683
684	movups	$inout0,($out)		# store 8 output blocks
685	mov	$key_,$key		# restore $key
686	movups	$inout1,0x10($out)
687	mov	$rnds_,$rounds		# restore $rounds
688	movups	$inout2,0x20($out)
689	movups	$inout3,0x30($out)
690	movups	$inout4,0x40($out)
691	movups	$inout5,0x50($out)
692	movups	$inout6,0x60($out)
693	movups	$inout7,0x70($out)
694	lea	0x80($out),$out		# $out+=8*16
695	add	\$0x80,$len		# restore real remaining $len
696	jz	.Lecb_ret		# done if ($len==0)
697
698.Lecb_enc_tail:				# $len is less than 8*16
699	movups	($inp),$inout0
700	cmp	\$0x20,$len
701	jb	.Lecb_enc_one
702	movups	0x10($inp),$inout1
703	je	.Lecb_enc_two
704	movups	0x20($inp),$inout2
705	cmp	\$0x40,$len
706	jb	.Lecb_enc_three
707	movups	0x30($inp),$inout3
708	je	.Lecb_enc_four
709	movups	0x40($inp),$inout4
710	cmp	\$0x60,$len
711	jb	.Lecb_enc_five
712	movups	0x50($inp),$inout5
713	je	.Lecb_enc_six
714	movdqu	0x60($inp),$inout6
715	xorps	$inout7,$inout7
716	call	_aesni_encrypt8
717	movups	$inout0,($out)		# store 7 output blocks
718	movups	$inout1,0x10($out)
719	movups	$inout2,0x20($out)
720	movups	$inout3,0x30($out)
721	movups	$inout4,0x40($out)
722	movups	$inout5,0x50($out)
723	movups	$inout6,0x60($out)
724	jmp	.Lecb_ret
725.align	16
726.Lecb_enc_one:
727___
728	&aesni_generate1("enc",$key,$rounds);
729$code.=<<___;
730	movups	$inout0,($out)		# store one output block
731	jmp	.Lecb_ret
732.align	16
733.Lecb_enc_two:
734	call	_aesni_encrypt2
735	movups	$inout0,($out)		# store 2 output blocks
736	movups	$inout1,0x10($out)
737	jmp	.Lecb_ret
738.align	16
739.Lecb_enc_three:
740	call	_aesni_encrypt3
741	movups	$inout0,($out)		# store 3 output blocks
742	movups	$inout1,0x10($out)
743	movups	$inout2,0x20($out)
744	jmp	.Lecb_ret
745.align	16
746.Lecb_enc_four:
747	call	_aesni_encrypt4
748	movups	$inout0,($out)		# store 4 output blocks
749	movups	$inout1,0x10($out)
750	movups	$inout2,0x20($out)
751	movups	$inout3,0x30($out)
752	jmp	.Lecb_ret
753.align	16
754.Lecb_enc_five:
755	xorps	$inout5,$inout5
756	call	_aesni_encrypt6
757	movups	$inout0,($out)		# store 5 output blocks
758	movups	$inout1,0x10($out)
759	movups	$inout2,0x20($out)
760	movups	$inout3,0x30($out)
761	movups	$inout4,0x40($out)
762	jmp	.Lecb_ret
763.align	16
764.Lecb_enc_six:
765	call	_aesni_encrypt6
766	movups	$inout0,($out)		# store 6 output blocks
767	movups	$inout1,0x10($out)
768	movups	$inout2,0x20($out)
769	movups	$inout3,0x30($out)
770	movups	$inout4,0x40($out)
771	movups	$inout5,0x50($out)
772	jmp	.Lecb_ret
773#--------------------------- ECB DECRYPT ------------------------------#
774.align	16
775.Lecb_decrypt:
776	cmp	\$0x80,$len		# if ($len<8*16)
777	jb	.Lecb_dec_tail		# short input
778
779	movdqu	($inp),$inout0		# load 8 input blocks
780	movdqu	0x10($inp),$inout1
781	movdqu	0x20($inp),$inout2
782	movdqu	0x30($inp),$inout3
783	movdqu	0x40($inp),$inout4
784	movdqu	0x50($inp),$inout5
785	movdqu	0x60($inp),$inout6
786	movdqu	0x70($inp),$inout7
787	lea	0x80($inp),$inp		# $inp+=8*16
788	sub	\$0x80,$len		# $len-=8*16 (can be zero)
789	jmp	.Lecb_dec_loop8_enter
790.align 16
791.Lecb_dec_loop8:
792	movups	$inout0,($out)		# store 8 output blocks
793	mov	$key_,$key		# restore $key
794	movdqu	($inp),$inout0		# load 8 input blocks
795	mov	$rnds_,$rounds		# restore $rounds
796	movups	$inout1,0x10($out)
797	movdqu	0x10($inp),$inout1
798	movups	$inout2,0x20($out)
799	movdqu	0x20($inp),$inout2
800	movups	$inout3,0x30($out)
801	movdqu	0x30($inp),$inout3
802	movups	$inout4,0x40($out)
803	movdqu	0x40($inp),$inout4
804	movups	$inout5,0x50($out)
805	movdqu	0x50($inp),$inout5
806	movups	$inout6,0x60($out)
807	movdqu	0x60($inp),$inout6
808	movups	$inout7,0x70($out)
809	lea	0x80($out),$out		# $out+=8*16
810	movdqu	0x70($inp),$inout7
811	lea	0x80($inp),$inp		# $inp+=8*16
812.Lecb_dec_loop8_enter:
813
814	call	_aesni_decrypt8
815
816	$movkey	($key_),$rndkey0
817	sub	\$0x80,$len
818	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
819
820	movups	$inout0,($out)		# store 8 output blocks
821	 pxor	$inout0,$inout0		# clear register bank
822	mov	$key_,$key		# restore $key
823	movups	$inout1,0x10($out)
824	 pxor	$inout1,$inout1
825	mov	$rnds_,$rounds		# restore $rounds
826	movups	$inout2,0x20($out)
827	 pxor	$inout2,$inout2
828	movups	$inout3,0x30($out)
829	 pxor	$inout3,$inout3
830	movups	$inout4,0x40($out)
831	 pxor	$inout4,$inout4
832	movups	$inout5,0x50($out)
833	 pxor	$inout5,$inout5
834	movups	$inout6,0x60($out)
835	 pxor	$inout6,$inout6
836	movups	$inout7,0x70($out)
837	 pxor	$inout7,$inout7
838	lea	0x80($out),$out		# $out+=8*16
839	add	\$0x80,$len		# restore real remaining $len
840	jz	.Lecb_ret		# done if ($len==0)
841
842.Lecb_dec_tail:
843	movups	($inp),$inout0
844	cmp	\$0x20,$len
845	jb	.Lecb_dec_one
846	movups	0x10($inp),$inout1
847	je	.Lecb_dec_two
848	movups	0x20($inp),$inout2
849	cmp	\$0x40,$len
850	jb	.Lecb_dec_three
851	movups	0x30($inp),$inout3
852	je	.Lecb_dec_four
853	movups	0x40($inp),$inout4
854	cmp	\$0x60,$len
855	jb	.Lecb_dec_five
856	movups	0x50($inp),$inout5
857	je	.Lecb_dec_six
858	movups	0x60($inp),$inout6
859	$movkey	($key),$rndkey0
860	xorps	$inout7,$inout7
861	call	_aesni_decrypt8
862	movups	$inout0,($out)		# store 7 output blocks
863	 pxor	$inout0,$inout0		# clear register bank
864	movups	$inout1,0x10($out)
865	 pxor	$inout1,$inout1
866	movups	$inout2,0x20($out)
867	 pxor	$inout2,$inout2
868	movups	$inout3,0x30($out)
869	 pxor	$inout3,$inout3
870	movups	$inout4,0x40($out)
871	 pxor	$inout4,$inout4
872	movups	$inout5,0x50($out)
873	 pxor	$inout5,$inout5
874	movups	$inout6,0x60($out)
875	 pxor	$inout6,$inout6
876	 pxor	$inout7,$inout7
877	jmp	.Lecb_ret
878.align	16
879.Lecb_dec_one:
880___
881	&aesni_generate1("dec",$key,$rounds);
882$code.=<<___;
883	movups	$inout0,($out)		# store one output block
884	 pxor	$inout0,$inout0		# clear register bank
885	jmp	.Lecb_ret
886.align	16
887.Lecb_dec_two:
888	call	_aesni_decrypt2
889	movups	$inout0,($out)		# store 2 output blocks
890	 pxor	$inout0,$inout0		# clear register bank
891	movups	$inout1,0x10($out)
892	 pxor	$inout1,$inout1
893	jmp	.Lecb_ret
894.align	16
895.Lecb_dec_three:
896	call	_aesni_decrypt3
897	movups	$inout0,($out)		# store 3 output blocks
898	 pxor	$inout0,$inout0		# clear register bank
899	movups	$inout1,0x10($out)
900	 pxor	$inout1,$inout1
901	movups	$inout2,0x20($out)
902	 pxor	$inout2,$inout2
903	jmp	.Lecb_ret
904.align	16
905.Lecb_dec_four:
906	call	_aesni_decrypt4
907	movups	$inout0,($out)		# store 4 output blocks
908	 pxor	$inout0,$inout0		# clear register bank
909	movups	$inout1,0x10($out)
910	 pxor	$inout1,$inout1
911	movups	$inout2,0x20($out)
912	 pxor	$inout2,$inout2
913	movups	$inout3,0x30($out)
914	 pxor	$inout3,$inout3
915	jmp	.Lecb_ret
916.align	16
917.Lecb_dec_five:
918	xorps	$inout5,$inout5
919	call	_aesni_decrypt6
920	movups	$inout0,($out)		# store 5 output blocks
921	 pxor	$inout0,$inout0		# clear register bank
922	movups	$inout1,0x10($out)
923	 pxor	$inout1,$inout1
924	movups	$inout2,0x20($out)
925	 pxor	$inout2,$inout2
926	movups	$inout3,0x30($out)
927	 pxor	$inout3,$inout3
928	movups	$inout4,0x40($out)
929	 pxor	$inout4,$inout4
930	 pxor	$inout5,$inout5
931	jmp	.Lecb_ret
932.align	16
933.Lecb_dec_six:
934	call	_aesni_decrypt6
935	movups	$inout0,($out)		# store 6 output blocks
936	 pxor	$inout0,$inout0		# clear register bank
937	movups	$inout1,0x10($out)
938	 pxor	$inout1,$inout1
939	movups	$inout2,0x20($out)
940	 pxor	$inout2,$inout2
941	movups	$inout3,0x30($out)
942	 pxor	$inout3,$inout3
943	movups	$inout4,0x40($out)
944	 pxor	$inout4,$inout4
945	movups	$inout5,0x50($out)
946	 pxor	$inout5,$inout5
947
948.Lecb_ret:
949	xorps	$rndkey0,$rndkey0	# %xmm0
950	pxor	$rndkey1,$rndkey1
951___
952$code.=<<___ if ($win64);
953	movaps	(%rsp),%xmm6
954	movaps	%xmm0,(%rsp)		# clear stack
955	movaps	0x10(%rsp),%xmm7
956	movaps	%xmm0,0x10(%rsp)
957	movaps	0x20(%rsp),%xmm8
958	movaps	%xmm0,0x20(%rsp)
959	movaps	0x30(%rsp),%xmm9
960	movaps	%xmm0,0x30(%rsp)
961	lea	0x58(%rsp),%rsp
962.Lecb_enc_ret:
963___
964$code.=<<___;
965	ret
966.cfi_endproc
967.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
968___
969
970{
971######################################################################
972# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
973#                         size_t blocks, const AES_KEY *key,
974#                         const char *ivec,char *cmac);
975#
976# Handles only complete blocks, operates on 64-bit counter and
977# does not update *ivec! Nor does it finalize CMAC value
978# (see engine/eng_aesni.c for details)
979#
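# Per-block view of what the two routines below compute, as a throwaway
# model (illustrative only, never called; $E stands for one AES block
# encryption under $key, all values are 16-byte strings, and the 64-bit
# counter stepping is left to the caller):
sub ccm64_model_block {
	my ($E,$dir,$ctr,$cmac,$in) = @_;
	my $pad = $E->($ctr);			# CTR leg
	my $out = $in ^ $pad;			# ciphertext (enc) or plaintext (dec)
	my $pt  = $dir eq "enc" ? $in : $out;	# CBC-MAC always runs over the plaintext
	$cmac   = $E->($cmac ^ $pt);		# CBC-MAC leg, interleaved with the CTR leg in the real code
	return ($out,$cmac);
}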
980{
981my $cmac="%r9";	# 6th argument
982
983my $increment="%xmm9";
984my $iv="%xmm6";
985my $bswap_mask="%xmm7";
986
987$code.=<<___;
988.globl	aesni_ccm64_encrypt_blocks
989.type	aesni_ccm64_encrypt_blocks,\@function,6
990.align	16
991aesni_ccm64_encrypt_blocks:
992.cfi_startproc
993	endbranch
994___
995$code.=<<___ if ($win64);
996	lea	-0x58(%rsp),%rsp
997	movaps	%xmm6,(%rsp)		# $iv
998	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
999	movaps	%xmm8,0x20(%rsp)	# $in0
1000	movaps	%xmm9,0x30(%rsp)	# $increment
1001.Lccm64_enc_body:
1002___
1003$code.=<<___;
1004	mov	240($key),$rounds		# key->rounds
1005	movdqu	($ivp),$iv
1006	movdqa	.Lincrement64(%rip),$increment
1007	movdqa	.Lbswap_mask(%rip),$bswap_mask
1008
1009	shl	\$4,$rounds
1010	mov	\$16,$rnds_
1011	lea	0($key),$key_
1012	movdqu	($cmac),$inout1
1013	movdqa	$iv,$inout0
1014	lea	32($key,$rounds),$key		# end of key schedule
1015	pshufb	$bswap_mask,$iv
1016	sub	%rax,%r10			# twisted $rounds
1017	jmp	.Lccm64_enc_outer
1018.align	16
1019.Lccm64_enc_outer:
1020	$movkey	($key_),$rndkey0
1021	mov	%r10,%rax
1022	movups	($inp),$in0			# load inp
1023
1024	xorps	$rndkey0,$inout0		# counter
1025	$movkey	16($key_),$rndkey1
1026	xorps	$in0,$rndkey0
1027	xorps	$rndkey0,$inout1		# cmac^=inp
1028	$movkey	32($key_),$rndkey0
1029
1030.Lccm64_enc2_loop:
1031	aesenc	$rndkey1,$inout0
1032	aesenc	$rndkey1,$inout1
1033	$movkey	($key,%rax),$rndkey1
1034	add	\$32,%rax
1035	aesenc	$rndkey0,$inout0
1036	aesenc	$rndkey0,$inout1
1037	$movkey	-16($key,%rax),$rndkey0
1038	jnz	.Lccm64_enc2_loop
1039	aesenc	$rndkey1,$inout0
1040	aesenc	$rndkey1,$inout1
1041	paddq	$increment,$iv
1042	dec	$len				# $len-- ($len is in blocks)
1043	aesenclast	$rndkey0,$inout0
1044	aesenclast	$rndkey0,$inout1
1045
1046	lea	16($inp),$inp
1047	xorps	$inout0,$in0			# inp ^= E(iv)
1048	movdqa	$iv,$inout0
1049	movups	$in0,($out)			# save output
1050	pshufb	$bswap_mask,$inout0
1051	lea	16($out),$out			# $out+=16
1052	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1053
1054	 pxor	$rndkey0,$rndkey0		# clear register bank
1055	 pxor	$rndkey1,$rndkey1
1056	 pxor	$inout0,$inout0
1057	movups	$inout1,($cmac)			# store resulting mac
1058	 pxor	$inout1,$inout1
1059	 pxor	$in0,$in0
1060	 pxor	$iv,$iv
1061___
1062$code.=<<___ if ($win64);
1063	movaps	(%rsp),%xmm6
1064	movaps	%xmm0,(%rsp)			# clear stack
1065	movaps	0x10(%rsp),%xmm7
1066	movaps	%xmm0,0x10(%rsp)
1067	movaps	0x20(%rsp),%xmm8
1068	movaps	%xmm0,0x20(%rsp)
1069	movaps	0x30(%rsp),%xmm9
1070	movaps	%xmm0,0x30(%rsp)
1071	lea	0x58(%rsp),%rsp
1072.Lccm64_enc_ret:
1073___
1074$code.=<<___;
1075	ret
1076.cfi_endproc
1077.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1078___
1079######################################################################
1080$code.=<<___;
1081.globl	aesni_ccm64_decrypt_blocks
1082.type	aesni_ccm64_decrypt_blocks,\@function,6
1083.align	16
1084aesni_ccm64_decrypt_blocks:
1085.cfi_startproc
1086	endbranch
1087___
1088$code.=<<___ if ($win64);
1089	lea	-0x58(%rsp),%rsp
1090	movaps	%xmm6,(%rsp)		# $iv
1091	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1092	movaps	%xmm8,0x20(%rsp)	# $in8
1093	movaps	%xmm9,0x30(%rsp)	# $increment
1094.Lccm64_dec_body:
1095___
1096$code.=<<___;
1097	mov	240($key),$rounds		# key->rounds
1098	movups	($ivp),$iv
1099	movdqu	($cmac),$inout1
1100	movdqa	.Lincrement64(%rip),$increment
1101	movdqa	.Lbswap_mask(%rip),$bswap_mask
1102
1103	movaps	$iv,$inout0
1104	mov	$rounds,$rnds_
1105	mov	$key,$key_
1106	pshufb	$bswap_mask,$iv
1107___
1108	&aesni_generate1("enc",$key,$rounds);
1109$code.=<<___;
1110	shl	\$4,$rnds_
1111	mov	\$16,$rounds
1112	movups	($inp),$in0			# load inp
1113	paddq	$increment,$iv
1114	lea	16($inp),$inp			# $inp+=16
1115	sub	%r10,%rax			# twisted $rounds
1116	lea	32($key_,$rnds_),$key		# end of key schedule
1117	mov	%rax,%r10
1118	jmp	.Lccm64_dec_outer
1119.align	16
1120.Lccm64_dec_outer:
1121	xorps	$inout0,$in0			# inp ^= E(iv)
1122	movdqa	$iv,$inout0
1123	movups	$in0,($out)			# save output
1124	lea	16($out),$out			# $out+=16
1125	pshufb	$bswap_mask,$inout0
1126
1127	sub	\$1,$len			# $len-- ($len is in blocks)
1128	jz	.Lccm64_dec_break		# if ($len==0) break
1129
1130	$movkey	($key_),$rndkey0
1131	mov	%r10,%rax
1132	$movkey	16($key_),$rndkey1
1133	xorps	$rndkey0,$in0
1134	xorps	$rndkey0,$inout0
1135	xorps	$in0,$inout1			# cmac^=out
1136	$movkey	32($key_),$rndkey0
1137	jmp	.Lccm64_dec2_loop
1138.align	16
1139.Lccm64_dec2_loop:
1140	aesenc	$rndkey1,$inout0
1141	aesenc	$rndkey1,$inout1
1142	$movkey	($key,%rax),$rndkey1
1143	add	\$32,%rax
1144	aesenc	$rndkey0,$inout0
1145	aesenc	$rndkey0,$inout1
1146	$movkey	-16($key,%rax),$rndkey0
1147	jnz	.Lccm64_dec2_loop
1148	movups	($inp),$in0			# load input
1149	paddq	$increment,$iv
1150	aesenc	$rndkey1,$inout0
1151	aesenc	$rndkey1,$inout1
1152	aesenclast	$rndkey0,$inout0
1153	aesenclast	$rndkey0,$inout1
1154	lea	16($inp),$inp			# $inp+=16
1155	jmp	.Lccm64_dec_outer
1156
1157.align	16
1158.Lccm64_dec_break:
1159	#xorps	$in0,$inout1			# cmac^=out
1160	mov	240($key_),$rounds
1161___
1162	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1163$code.=<<___;
1164	 pxor	$rndkey0,$rndkey0		# clear register bank
1165	 pxor	$rndkey1,$rndkey1
1166	 pxor	$inout0,$inout0
1167	movups	$inout1,($cmac)			# store resulting mac
1168	 pxor	$inout1,$inout1
1169	 pxor	$in0,$in0
1170	 pxor	$iv,$iv
1171___
1172$code.=<<___ if ($win64);
1173	movaps	(%rsp),%xmm6
1174	movaps	%xmm0,(%rsp)			# clear stack
1175	movaps	0x10(%rsp),%xmm7
1176	movaps	%xmm0,0x10(%rsp)
1177	movaps	0x20(%rsp),%xmm8
1178	movaps	%xmm0,0x20(%rsp)
1179	movaps	0x30(%rsp),%xmm9
1180	movaps	%xmm0,0x30(%rsp)
1181	lea	0x58(%rsp),%rsp
1182.Lccm64_dec_ret:
1183___
1184$code.=<<___;
1185	ret
1186.cfi_endproc
1187.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1188___
1189}
1190######################################################################
1191# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1192#                         size_t blocks, const AES_KEY *key,
1193#                         const char *ivec);
1194#
1195# Handles only complete blocks, operates on 32-bit counter and
1196# does not update *ivec! (see crypto/modes/ctr128.c for details)
1197#
1198# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1199# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1200# Keywords are full unroll and modulo-schedule counter calculations
1201# with zero-round key xor.
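#
# A throwaway model of the counter-block preparation done on the stack
# below (illustrative only, never called; blocks and keys are 16-byte
# strings): every block is the IV xor-ed with the 0-round key, and only
# the last dword differs, carrying the big-endian counter pre-xor-ed
# with the matching key dword, so round 1 can start right away.
sub ctr32_model_blocks {
	my ($iv,$rndkey0,$n) = @_;		# IV, 0-round key, block count
	my $base = $iv ^ $rndkey0;		# zero-round xor done once up front
	my $ctr  = unpack("N",substr($iv,12,4));	# 32-bit big-endian counter
	my $k0d  = substr($rndkey0,12,4);
	my @blocks;
	for my $i (0..$n-1) {
		my $blk = $base;
		substr($blk,12,4) = pack("N",($ctr+$i)&0xffffffff) ^ $k0d;
		push @blocks, $blk;
	}
	return @blocks;
}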
1202{
1203my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1204my ($key0,$ctr)=("%ebp","${ivp}d");
1205my $frame_size = 0x80 + ($win64?160:0);
1206
1207$code.=<<___;
1208.globl	aesni_ctr32_encrypt_blocks
1209.type	aesni_ctr32_encrypt_blocks,\@function,5
1210.align	16
1211aesni_ctr32_encrypt_blocks:
1212.cfi_startproc
1213	endbranch
1214	cmp	\$1,$len
1215	jne	.Lctr32_bulk
1216
1217	# handle single block without allocating stack frame,
1218	# useful when handling edges
1219	movups	($ivp),$inout0
1220	movups	($inp),$inout1
1221	mov	240($key),%edx			# key->rounds
1222___
1223	&aesni_generate1("enc",$key,"%edx");
1224$code.=<<___;
1225	 pxor	$rndkey0,$rndkey0		# clear register bank
1226	 pxor	$rndkey1,$rndkey1
1227	xorps	$inout1,$inout0
1228	 pxor	$inout1,$inout1
1229	movups	$inout0,($out)
1230	 xorps	$inout0,$inout0
1231	jmp	.Lctr32_epilogue
1232
1233.align	16
1234.Lctr32_bulk:
1235	lea	(%rsp),$key_			# use $key_ as frame pointer
1236.cfi_def_cfa_register	$key_
1237	push	%rbp
1238.cfi_push	%rbp
1239	sub	\$$frame_size,%rsp
1240	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1241___
1242$code.=<<___ if ($win64);
1243	movaps	%xmm6,-0xa8($key_)		# offload everything
1244	movaps	%xmm7,-0x98($key_)
1245	movaps	%xmm8,-0x88($key_)
1246	movaps	%xmm9,-0x78($key_)
1247	movaps	%xmm10,-0x68($key_)
1248	movaps	%xmm11,-0x58($key_)
1249	movaps	%xmm12,-0x48($key_)
1250	movaps	%xmm13,-0x38($key_)
1251	movaps	%xmm14,-0x28($key_)
1252	movaps	%xmm15,-0x18($key_)
1253.Lctr32_body:
1254___
1255$code.=<<___;
1256
1257	# 8 16-byte words on top of stack are counter values
1258	# xor-ed with zero-round key
1259
1260	movdqu	($ivp),$inout0
1261	movdqu	($key),$rndkey0
1262	mov	12($ivp),$ctr			# counter LSB
1263	pxor	$rndkey0,$inout0
1264	mov	12($key),$key0			# 0-round key LSB
1265	movdqa	$inout0,0x00(%rsp)		# populate counter block
1266	bswap	$ctr
1267	movdqa	$inout0,$inout1
1268	movdqa	$inout0,$inout2
1269	movdqa	$inout0,$inout3
1270	movdqa	$inout0,0x40(%rsp)
1271	movdqa	$inout0,0x50(%rsp)
1272	movdqa	$inout0,0x60(%rsp)
1273	mov	%rdx,%r10			# about to borrow %rdx
1274	movdqa	$inout0,0x70(%rsp)
1275
1276	lea	1($ctr),%rax
1277	 lea	2($ctr),%rdx
1278	bswap	%eax
1279	 bswap	%edx
1280	xor	$key0,%eax
1281	 xor	$key0,%edx
1282	pinsrd	\$3,%eax,$inout1
1283	lea	3($ctr),%rax
1284	movdqa	$inout1,0x10(%rsp)
1285	 pinsrd	\$3,%edx,$inout2
1286	bswap	%eax
1287	 mov	%r10,%rdx			# restore %rdx
1288	 lea	4($ctr),%r10
1289	 movdqa	$inout2,0x20(%rsp)
1290	xor	$key0,%eax
1291	 bswap	%r10d
1292	pinsrd	\$3,%eax,$inout3
1293	 xor	$key0,%r10d
1294	movdqa	$inout3,0x30(%rsp)
1295	lea	5($ctr),%r9
1296	 mov	%r10d,0x40+12(%rsp)
1297	bswap	%r9d
1298	 lea	6($ctr),%r10
1299	mov	240($key),$rounds		# key->rounds
1300	xor	$key0,%r9d
1301	 bswap	%r10d
1302	mov	%r9d,0x50+12(%rsp)
1303	 xor	$key0,%r10d
1304	lea	7($ctr),%r9
1305	 mov	%r10d,0x60+12(%rsp)
1306	bswap	%r9d
1307	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d
1308	xor	$key0,%r9d
1309	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1310	mov	%r9d,0x70+12(%rsp)
1311
1312	$movkey	0x10($key),$rndkey1
1313
1314	movdqa	0x40(%rsp),$inout4
1315	movdqa	0x50(%rsp),$inout5
1316
1317	cmp	\$8,$len		# $len is in blocks
1318	jb	.Lctr32_tail		# short input if ($len<8)
1319
1320	sub	\$6,$len		# $len is biased by -6
1321	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1322	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1323
1324	lea	0x80($key),$key		# size optimization
1325	sub	\$2,$len		# $len is biased by -8
1326	jmp	.Lctr32_loop8
1327
1328.align	16
1329.Lctr32_6x:
1330	shl	\$4,$rounds
1331	mov	\$48,$rnds_
1332	bswap	$key0
1333	lea	32($key,$rounds),$key	# end of key schedule
1334	sub	%rax,%r10		# twisted $rounds
1335	jmp	.Lctr32_loop6
1336
1337.align	16
1338.Lctr32_loop6:
1339	 add	\$6,$ctr		# next counter value
1340	$movkey	-48($key,$rnds_),$rndkey0
1341	aesenc	$rndkey1,$inout0
1342	 mov	$ctr,%eax
1343	 xor	$key0,%eax
1344	aesenc	$rndkey1,$inout1
1345	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1346	 lea	1($ctr),%eax
1347	aesenc	$rndkey1,$inout2
1348	 xor	$key0,%eax
1349	 movbe	%eax,`0x10+12`(%rsp)
1350	aesenc	$rndkey1,$inout3
1351	 lea	2($ctr),%eax
1352	 xor	$key0,%eax
1353	aesenc	$rndkey1,$inout4
1354	 movbe	%eax,`0x20+12`(%rsp)
1355	 lea	3($ctr),%eax
1356	aesenc	$rndkey1,$inout5
1357	$movkey	-32($key,$rnds_),$rndkey1
1358	 xor	$key0,%eax
1359
1360	aesenc	$rndkey0,$inout0
1361	 movbe	%eax,`0x30+12`(%rsp)
1362	 lea	4($ctr),%eax
1363	aesenc	$rndkey0,$inout1
1364	 xor	$key0,%eax
1365	 movbe	%eax,`0x40+12`(%rsp)
1366	aesenc	$rndkey0,$inout2
1367	 lea	5($ctr),%eax
1368	 xor	$key0,%eax
1369	aesenc	$rndkey0,$inout3
1370	 movbe	%eax,`0x50+12`(%rsp)
1371	 mov	%r10,%rax		# mov	$rnds_,$rounds
1372	aesenc	$rndkey0,$inout4
1373	aesenc	$rndkey0,$inout5
1374	$movkey	-16($key,$rnds_),$rndkey0
1375
1376	call	.Lenc_loop6
1377
1378	movdqu	($inp),$inout6		# load 6 input blocks
1379	movdqu	0x10($inp),$inout7
1380	movdqu	0x20($inp),$in0
1381	movdqu	0x30($inp),$in1
1382	movdqu	0x40($inp),$in2
1383	movdqu	0x50($inp),$in3
1384	lea	0x60($inp),$inp		# $inp+=6*16
1385	$movkey	-64($key,$rnds_),$rndkey1
1386	pxor	$inout0,$inout6		# inp^=E(ctr)
1387	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1388	pxor	$inout1,$inout7
1389	movaps	0x10(%rsp),$inout1
1390	pxor	$inout2,$in0
1391	movaps	0x20(%rsp),$inout2
1392	pxor	$inout3,$in1
1393	movaps	0x30(%rsp),$inout3
1394	pxor	$inout4,$in2
1395	movaps	0x40(%rsp),$inout4
1396	pxor	$inout5,$in3
1397	movaps	0x50(%rsp),$inout5
1398	movdqu	$inout6,($out)		# store 6 output blocks
1399	movdqu	$inout7,0x10($out)
1400	movdqu	$in0,0x20($out)
1401	movdqu	$in1,0x30($out)
1402	movdqu	$in2,0x40($out)
1403	movdqu	$in3,0x50($out)
1404	lea	0x60($out),$out		# $out+=6*16
1405
1406	sub	\$6,$len
1407	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1408
1409	add	\$6,$len		# restore real remaining $len
1410	jz	.Lctr32_done		# done if ($len==0)
1411
1412	lea	-48($rnds_),$rounds
1413	lea	-80($key,$rnds_),$key	# restore $key
1414	neg	$rounds
1415	shr	\$4,$rounds		# restore $rounds
1416	jmp	.Lctr32_tail
1417
1418.align	32
1419.Lctr32_loop8:
1420	 add		\$8,$ctr		# next counter value
1421	movdqa		0x60(%rsp),$inout6
1422	aesenc		$rndkey1,$inout0
1423	 mov		$ctr,%r9d
1424	movdqa		0x70(%rsp),$inout7
1425	aesenc		$rndkey1,$inout1
1426	 bswap		%r9d
1427	$movkey		0x20-0x80($key),$rndkey0
1428	aesenc		$rndkey1,$inout2
1429	 xor		$key0,%r9d
1430	 nop
1431	aesenc		$rndkey1,$inout3
1432	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1433	 lea		1($ctr),%r9
1434	aesenc		$rndkey1,$inout4
1435	aesenc		$rndkey1,$inout5
1436	aesenc		$rndkey1,$inout6
1437	aesenc		$rndkey1,$inout7
1438	$movkey		0x30-0x80($key),$rndkey1
1439___
1440for($i=2;$i<8;$i++) {
1441my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1442$code.=<<___;
1443	 bswap		%r9d
1444	aesenc		$rndkeyx,$inout0
1445	aesenc		$rndkeyx,$inout1
1446	 xor		$key0,%r9d
1447	 .byte		0x66,0x90
1448	aesenc		$rndkeyx,$inout2
1449	aesenc		$rndkeyx,$inout3
1450	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1451	 lea		$i($ctr),%r9
1452	aesenc		$rndkeyx,$inout4
1453	aesenc		$rndkeyx,$inout5
1454	aesenc		$rndkeyx,$inout6
1455	aesenc		$rndkeyx,$inout7
1456	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1457___
1458}
1459$code.=<<___;
1460	 bswap		%r9d
1461	aesenc		$rndkey0,$inout0
1462	aesenc		$rndkey0,$inout1
1463	aesenc		$rndkey0,$inout2
1464	 xor		$key0,%r9d
1465	 movdqu		0x00($inp),$in0		# start loading input
1466	aesenc		$rndkey0,$inout3
1467	 mov		%r9d,0x70+12(%rsp)
1468	 cmp		\$11,$rounds
1469	aesenc		$rndkey0,$inout4
1470	aesenc		$rndkey0,$inout5
1471	aesenc		$rndkey0,$inout6
1472	aesenc		$rndkey0,$inout7
1473	$movkey		0xa0-0x80($key),$rndkey0
1474
1475	jb		.Lctr32_enc_done
1476
1477	aesenc		$rndkey1,$inout0
1478	aesenc		$rndkey1,$inout1
1479	aesenc		$rndkey1,$inout2
1480	aesenc		$rndkey1,$inout3
1481	aesenc		$rndkey1,$inout4
1482	aesenc		$rndkey1,$inout5
1483	aesenc		$rndkey1,$inout6
1484	aesenc		$rndkey1,$inout7
1485	$movkey		0xb0-0x80($key),$rndkey1
1486
1487	aesenc		$rndkey0,$inout0
1488	aesenc		$rndkey0,$inout1
1489	aesenc		$rndkey0,$inout2
1490	aesenc		$rndkey0,$inout3
1491	aesenc		$rndkey0,$inout4
1492	aesenc		$rndkey0,$inout5
1493	aesenc		$rndkey0,$inout6
1494	aesenc		$rndkey0,$inout7
1495	$movkey		0xc0-0x80($key),$rndkey0
1496	je		.Lctr32_enc_done
1497
1498	aesenc		$rndkey1,$inout0
1499	aesenc		$rndkey1,$inout1
1500	aesenc		$rndkey1,$inout2
1501	aesenc		$rndkey1,$inout3
1502	aesenc		$rndkey1,$inout4
1503	aesenc		$rndkey1,$inout5
1504	aesenc		$rndkey1,$inout6
1505	aesenc		$rndkey1,$inout7
1506	$movkey		0xd0-0x80($key),$rndkey1
1507
1508	aesenc		$rndkey0,$inout0
1509	aesenc		$rndkey0,$inout1
1510	aesenc		$rndkey0,$inout2
1511	aesenc		$rndkey0,$inout3
1512	aesenc		$rndkey0,$inout4
1513	aesenc		$rndkey0,$inout5
1514	aesenc		$rndkey0,$inout6
1515	aesenc		$rndkey0,$inout7
1516	$movkey		0xe0-0x80($key),$rndkey0
1517	jmp		.Lctr32_enc_done
1518
1519.align	16
1520.Lctr32_enc_done:
1521	movdqu		0x10($inp),$in1
1522	pxor		$rndkey0,$in0		# input^=round[last]
1523	movdqu		0x20($inp),$in2
1524	pxor		$rndkey0,$in1
1525	movdqu		0x30($inp),$in3
1526	pxor		$rndkey0,$in2
1527	movdqu		0x40($inp),$in4
1528	pxor		$rndkey0,$in3
1529	movdqu		0x50($inp),$in5
1530	pxor		$rndkey0,$in4
1531	pxor		$rndkey0,$in5
1532	aesenc		$rndkey1,$inout0
1533	aesenc		$rndkey1,$inout1
1534	aesenc		$rndkey1,$inout2
1535	aesenc		$rndkey1,$inout3
1536	aesenc		$rndkey1,$inout4
1537	aesenc		$rndkey1,$inout5
1538	aesenc		$rndkey1,$inout6
1539	aesenc		$rndkey1,$inout7
1540	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1541	lea		0x80($inp),$inp		# $inp+=8*16
1542
1543	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1544	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1545	movdqu		0x70-0x80($inp),$in0
1546	aesenclast	$in1,$inout1
1547	pxor		$rndkey0,$in0
1548	movdqa		0x00(%rsp),$in1		# load next counter block
1549	aesenclast	$in2,$inout2
1550	aesenclast	$in3,$inout3
1551	movdqa		0x10(%rsp),$in2
1552	movdqa		0x20(%rsp),$in3
1553	aesenclast	$in4,$inout4
1554	aesenclast	$in5,$inout5
1555	movdqa		0x30(%rsp),$in4
1556	movdqa		0x40(%rsp),$in5
1557	aesenclast	$rndkey1,$inout6
1558	movdqa		0x50(%rsp),$rndkey0
1559	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1560	aesenclast	$in0,$inout7
1561
1562	movups		$inout0,($out)		# store 8 output blocks
1563	movdqa		$in1,$inout0
1564	movups		$inout1,0x10($out)
1565	movdqa		$in2,$inout1
1566	movups		$inout2,0x20($out)
1567	movdqa		$in3,$inout2
1568	movups		$inout3,0x30($out)
1569	movdqa		$in4,$inout3
1570	movups		$inout4,0x40($out)
1571	movdqa		$in5,$inout4
1572	movups		$inout5,0x50($out)
1573	movdqa		$rndkey0,$inout5
1574	movups		$inout6,0x60($out)
1575	movups		$inout7,0x70($out)
1576	lea		0x80($out),$out		# $out+=8*16
1577
1578	sub	\$8,$len
1579	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1580
1581	add	\$8,$len			# restore real remaining $len
1582	jz	.Lctr32_done			# done if ($len==0)
1583	lea	-0x80($key),$key
1584
1585.Lctr32_tail:
1586	# note that at this point $inout0..5 are populated with
1587	# counter values xor-ed with 0-round key
1588	lea	16($key),$key
1589	cmp	\$4,$len
1590	jb	.Lctr32_loop3
1591	je	.Lctr32_loop4
1592
1593	# if ($len>4) compute 7 E(counter)
1594	shl		\$4,$rounds
1595	movdqa		0x60(%rsp),$inout6
1596	pxor		$inout7,$inout7
1597
1598	$movkey		16($key),$rndkey0
1599	aesenc		$rndkey1,$inout0
1600	aesenc		$rndkey1,$inout1
1601	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1602	neg		%rax
1603	aesenc		$rndkey1,$inout2
1604	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1605	 movups		($inp),$in0
1606	aesenc		$rndkey1,$inout3
1607	aesenc		$rndkey1,$inout4
1608	 movups		0x10($inp),$in1		# pre-load input
1609	 movups		0x20($inp),$in2
1610	aesenc		$rndkey1,$inout5
1611	aesenc		$rndkey1,$inout6
1612
1613	call            .Lenc_loop8_enter
1614
1615	movdqu	0x30($inp),$in3
1616	pxor	$in0,$inout0
1617	movdqu	0x40($inp),$in0
1618	pxor	$in1,$inout1
1619	movdqu	$inout0,($out)			# store output
1620	pxor	$in2,$inout2
1621	movdqu	$inout1,0x10($out)
1622	pxor	$in3,$inout3
1623	movdqu	$inout2,0x20($out)
1624	pxor	$in0,$inout4
1625	movdqu	$inout3,0x30($out)
1626	movdqu	$inout4,0x40($out)
1627	cmp	\$6,$len
1628	jb	.Lctr32_done			# $len was 5, stop store
1629
1630	movups	0x50($inp),$in1
1631	xorps	$in1,$inout5
1632	movups	$inout5,0x50($out)
1633	je	.Lctr32_done			# $len was 6, stop store
1634
1635	movups	0x60($inp),$in2
1636	xorps	$in2,$inout6
1637	movups	$inout6,0x60($out)
1638	jmp	.Lctr32_done			# $len was 7, stop store
1639
1640.align	32
1641.Lctr32_loop4:
1642	aesenc		$rndkey1,$inout0
1643	lea		16($key),$key
1644	dec		$rounds
1645	aesenc		$rndkey1,$inout1
1646	aesenc		$rndkey1,$inout2
1647	aesenc		$rndkey1,$inout3
1648	$movkey		($key),$rndkey1
1649	jnz		.Lctr32_loop4
1650	aesenclast	$rndkey1,$inout0
1651	aesenclast	$rndkey1,$inout1
1652	 movups		($inp),$in0		# load input
1653	 movups		0x10($inp),$in1
1654	aesenclast	$rndkey1,$inout2
1655	aesenclast	$rndkey1,$inout3
1656	 movups		0x20($inp),$in2
1657	 movups		0x30($inp),$in3
1658
1659	xorps	$in0,$inout0
1660	movups	$inout0,($out)			# store output
1661	xorps	$in1,$inout1
1662	movups	$inout1,0x10($out)
1663	pxor	$in2,$inout2
1664	movdqu	$inout2,0x20($out)
1665	pxor	$in3,$inout3
1666	movdqu	$inout3,0x30($out)
1667	jmp	.Lctr32_done			# $len was 4, stop store
1668
1669.align	32
1670.Lctr32_loop3:
1671	aesenc		$rndkey1,$inout0
1672	lea		16($key),$key
1673	dec		$rounds
1674	aesenc		$rndkey1,$inout1
1675	aesenc		$rndkey1,$inout2
1676	$movkey		($key),$rndkey1
1677	jnz		.Lctr32_loop3
1678	aesenclast	$rndkey1,$inout0
1679	aesenclast	$rndkey1,$inout1
1680	aesenclast	$rndkey1,$inout2
1681
1682	movups	($inp),$in0			# load input
1683	xorps	$in0,$inout0
1684	movups	$inout0,($out)			# store output
1685	cmp	\$2,$len
1686	jb	.Lctr32_done			# $len was 1, stop store
1687
1688	movups	0x10($inp),$in1
1689	xorps	$in1,$inout1
1690	movups	$inout1,0x10($out)
1691	je	.Lctr32_done			# $len was 2, stop store
1692
1693	movups	0x20($inp),$in2
1694	xorps	$in2,$inout2
1695	movups	$inout2,0x20($out)		# $len was 3, stop store
1696
1697.Lctr32_done:
1698	xorps	%xmm0,%xmm0			# clear register bank
1699	xor	$key0,$key0
1700	pxor	%xmm1,%xmm1
1701	pxor	%xmm2,%xmm2
1702	pxor	%xmm3,%xmm3
1703	pxor	%xmm4,%xmm4
1704	pxor	%xmm5,%xmm5
1705___
1706$code.=<<___ if (!$win64);
1707	pxor	%xmm6,%xmm6
1708	pxor	%xmm7,%xmm7
1709	movaps	%xmm0,0x00(%rsp)		# clear stack
1710	pxor	%xmm8,%xmm8
1711	movaps	%xmm0,0x10(%rsp)
1712	pxor	%xmm9,%xmm9
1713	movaps	%xmm0,0x20(%rsp)
1714	pxor	%xmm10,%xmm10
1715	movaps	%xmm0,0x30(%rsp)
1716	pxor	%xmm11,%xmm11
1717	movaps	%xmm0,0x40(%rsp)
1718	pxor	%xmm12,%xmm12
1719	movaps	%xmm0,0x50(%rsp)
1720	pxor	%xmm13,%xmm13
1721	movaps	%xmm0,0x60(%rsp)
1722	pxor	%xmm14,%xmm14
1723	movaps	%xmm0,0x70(%rsp)
1724	pxor	%xmm15,%xmm15
1725___
1726$code.=<<___ if ($win64);
1727	movaps	-0xa8($key_),%xmm6
1728	movaps	%xmm0,-0xa8($key_)		# clear stack
1729	movaps	-0x98($key_),%xmm7
1730	movaps	%xmm0,-0x98($key_)
1731	movaps	-0x88($key_),%xmm8
1732	movaps	%xmm0,-0x88($key_)
1733	movaps	-0x78($key_),%xmm9
1734	movaps	%xmm0,-0x78($key_)
1735	movaps	-0x68($key_),%xmm10
1736	movaps	%xmm0,-0x68($key_)
1737	movaps	-0x58($key_),%xmm11
1738	movaps	%xmm0,-0x58($key_)
1739	movaps	-0x48($key_),%xmm12
1740	movaps	%xmm0,-0x48($key_)
1741	movaps	-0x38($key_),%xmm13
1742	movaps	%xmm0,-0x38($key_)
1743	movaps	-0x28($key_),%xmm14
1744	movaps	%xmm0,-0x28($key_)
1745	movaps	-0x18($key_),%xmm15
1746	movaps	%xmm0,-0x18($key_)
1747	movaps	%xmm0,0x00(%rsp)
1748	movaps	%xmm0,0x10(%rsp)
1749	movaps	%xmm0,0x20(%rsp)
1750	movaps	%xmm0,0x30(%rsp)
1751	movaps	%xmm0,0x40(%rsp)
1752	movaps	%xmm0,0x50(%rsp)
1753	movaps	%xmm0,0x60(%rsp)
1754	movaps	%xmm0,0x70(%rsp)
1755___
1756$code.=<<___;
1757	mov	-8($key_),%rbp
1758.cfi_restore	%rbp
1759	lea	($key_),%rsp
1760.cfi_def_cfa_register	%rsp
1761.Lctr32_epilogue:
1762	ret
1763.cfi_endproc
1764.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1765___
1766}
1767
1768######################################################################
1769# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1770#	const AES_KEY *key1, const AES_KEY *key2
1771#	const unsigned char iv[16]);
1772#
1773{
1774my @tweak=map("%xmm$_",(10..15));
1775my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1776my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1777my $frame_size = 0x70 + ($win64?160:0);
1778my $key_ = "%rbp";	# override so that we can use %r11 as FP
1779
1780$code.=<<___;
1781.globl	aesni_xts_encrypt
1782.type	aesni_xts_encrypt,\@function,6
1783.align	16
1784aesni_xts_encrypt:
1785.cfi_startproc
1786	endbranch
1787	lea	(%rsp),%r11			# frame pointer
1788.cfi_def_cfa_register	%r11
1789	push	%rbp
1790.cfi_push	%rbp
1791	sub	\$$frame_size,%rsp
1792	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1793___
1794$code.=<<___ if ($win64);
1795	movaps	%xmm6,-0xa8(%r11)		# offload everything
1796	movaps	%xmm7,-0x98(%r11)
1797	movaps	%xmm8,-0x88(%r11)
1798	movaps	%xmm9,-0x78(%r11)
1799	movaps	%xmm10,-0x68(%r11)
1800	movaps	%xmm11,-0x58(%r11)
1801	movaps	%xmm12,-0x48(%r11)
1802	movaps	%xmm13,-0x38(%r11)
1803	movaps	%xmm14,-0x28(%r11)
1804	movaps	%xmm15,-0x18(%r11)
1805.Lxts_enc_body:
1806___
1807$code.=<<___;
1808	movups	($ivp),$inout0			# load clear-text tweak
1809	mov	240(%r8),$rounds		# key2->rounds
1810	mov	240($key),$rnds_		# key1->rounds
1811___
1812	# generate the tweak
1813	&aesni_generate1("enc",$key2,$rounds,$inout0);
1814$code.=<<___;
1815	$movkey	($key),$rndkey0			# zero round key
1816	mov	$key,$key_			# backup $key
1817	mov	$rnds_,$rounds			# backup $rounds
1818	shl	\$4,$rnds_
1819	mov	$len,$len_			# backup $len
1820	and	\$-16,$len
1821
1822	$movkey	16($key,$rnds_),$rndkey1	# last round key
1823
1824	movdqa	.Lxts_magic(%rip),$twmask
1825	movdqa	$inout0,@tweak[5]
1826	pshufd	\$0x5f,$inout0,$twres
1827	pxor	$rndkey0,$rndkey1
1828___
1829    # The alternative tweak calculation algorithm below is based on
1830    # suggestions by Shay Gueron: psrad doesn't conflict with the AES-NI
1831    # instructions and should help in the future...
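    # As a scalar sketch (not part of the module), each tweak update below
    # multiplies the 128-bit tweak by x in GF(2^128) modulo
    # x^128 + x^7 + x^2 + x + 1, i.e. the standard XTS doubling step:
    #
    #	/* tweak kept as two little-endian 64-bit halves lo, hi */
    #	uint64_t carry = hi >> 63;		/* bit shifted out at the top */
    #	hi = (hi << 1) | (lo >> 63);
    #	lo = (lo << 1) ^ (0x87 & (0 - carry));	/* conditional reduction */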
1832    for ($i=0;$i<4;$i++) {
1833    $code.=<<___;
1834	movdqa	$twres,$twtmp
1835	paddd	$twres,$twres
1836	movdqa	@tweak[5],@tweak[$i]
1837	psrad	\$31,$twtmp			# broadcast upper bits
1838	paddq	@tweak[5],@tweak[5]
1839	pand	$twmask,$twtmp
1840	pxor	$rndkey0,@tweak[$i]
1841	pxor	$twtmp,@tweak[5]
1842___
1843    }
1844$code.=<<___;
1845	movdqa	@tweak[5],@tweak[4]
1846	psrad	\$31,$twres
1847	paddq	@tweak[5],@tweak[5]
1848	pand	$twmask,$twres
1849	pxor	$rndkey0,@tweak[4]
1850	pxor	$twres,@tweak[5]
1851	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
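	# XTS computes C = AES(P ^ tweak) ^ tweak.  In the grand loop below the
	# initial whitening is done by XORing the loaded input with
	# tweak^round[0], and the trailing tweak XOR comes for free by feeding
	# tweak^round[last] (parked on the stack) to aesenclast; the value
	# saved just above, round[0]^round[last], converts one form into the
	# other.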
1852
1853	sub	\$16*6,$len
1854	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1855
1856	mov	\$16+96,$rounds
1857	lea	32($key_,$rnds_),$key		# end of key schedule
1858	sub	%r10,%rax			# twisted $rounds
1859	$movkey	16($key_),$rndkey1
1860	mov	%rax,%r10			# backup twisted $rounds
1861	lea	.Lxts_magic(%rip),%r8
1862	jmp	.Lxts_enc_grandloop
1863
1864.align	32
1865.Lxts_enc_grandloop:
1866	movdqu	`16*0`($inp),$inout0		# load input
1867	movdqa	$rndkey0,$twmask
1868	movdqu	`16*1`($inp),$inout1
1869	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1870	movdqu	`16*2`($inp),$inout2
1871	pxor	@tweak[1],$inout1
1872	 aesenc		$rndkey1,$inout0
1873	movdqu	`16*3`($inp),$inout3
1874	pxor	@tweak[2],$inout2
1875	 aesenc		$rndkey1,$inout1
1876	movdqu	`16*4`($inp),$inout4
1877	pxor	@tweak[3],$inout3
1878	 aesenc		$rndkey1,$inout2
1879	movdqu	`16*5`($inp),$inout5
1880	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1881	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1882	pxor	@tweak[4],$inout4
1883	 aesenc		$rndkey1,$inout3
1884	$movkey	32($key_),$rndkey0
1885	lea	`16*6`($inp),$inp
1886	pxor	$twmask,$inout5
1887
1888	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1889	aesenc		$rndkey1,$inout4
1890	 pxor	$twres,@tweak[1]
1891	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1892	aesenc		$rndkey1,$inout5
1893	$movkey		48($key_),$rndkey1
1894	 pxor	$twres,@tweak[2]
1895
1896	aesenc		$rndkey0,$inout0
1897	 pxor	$twres,@tweak[3]
1898	 movdqa	@tweak[1],`16*1`(%rsp)
1899	aesenc		$rndkey0,$inout1
1900	 pxor	$twres,@tweak[4]
1901	 movdqa	@tweak[2],`16*2`(%rsp)
1902	aesenc		$rndkey0,$inout2
1903	aesenc		$rndkey0,$inout3
1904	 pxor	$twres,$twmask
1905	 movdqa	@tweak[4],`16*4`(%rsp)
1906	aesenc		$rndkey0,$inout4
1907	aesenc		$rndkey0,$inout5
1908	$movkey		64($key_),$rndkey0
1909	 movdqa	$twmask,`16*5`(%rsp)
1910	pshufd	\$0x5f,@tweak[5],$twres
1911	jmp	.Lxts_enc_loop6
1912.align	32
1913.Lxts_enc_loop6:
1914	aesenc		$rndkey1,$inout0
1915	aesenc		$rndkey1,$inout1
1916	aesenc		$rndkey1,$inout2
1917	aesenc		$rndkey1,$inout3
1918	aesenc		$rndkey1,$inout4
1919	aesenc		$rndkey1,$inout5
1920	$movkey		-64($key,%rax),$rndkey1
1921	add		\$32,%rax
1922
1923	aesenc		$rndkey0,$inout0
1924	aesenc		$rndkey0,$inout1
1925	aesenc		$rndkey0,$inout2
1926	aesenc		$rndkey0,$inout3
1927	aesenc		$rndkey0,$inout4
1928	aesenc		$rndkey0,$inout5
1929	$movkey		-80($key,%rax),$rndkey0
1930	jnz		.Lxts_enc_loop6
1931
1932	movdqa	(%r8),$twmask			# start calculating next tweak
1933	movdqa	$twres,$twtmp
1934	paddd	$twres,$twres
1935	 aesenc		$rndkey1,$inout0
1936	paddq	@tweak[5],@tweak[5]
1937	psrad	\$31,$twtmp
1938	 aesenc		$rndkey1,$inout1
1939	pand	$twmask,$twtmp
1940	$movkey	($key_),@tweak[0]		# load round[0]
1941	 aesenc		$rndkey1,$inout2
1942	 aesenc		$rndkey1,$inout3
1943	 aesenc		$rndkey1,$inout4
1944	pxor	$twtmp,@tweak[5]
1945	movaps	@tweak[0],@tweak[1]		# copy round[0]
1946	 aesenc		$rndkey1,$inout5
1947	 $movkey	-64($key),$rndkey1
1948
1949	movdqa	$twres,$twtmp
1950	 aesenc		$rndkey0,$inout0
1951	paddd	$twres,$twres
1952	pxor	@tweak[5],@tweak[0]
1953	 aesenc		$rndkey0,$inout1
1954	psrad	\$31,$twtmp
1955	paddq	@tweak[5],@tweak[5]
1956	 aesenc		$rndkey0,$inout2
1957	 aesenc		$rndkey0,$inout3
1958	pand	$twmask,$twtmp
1959	movaps	@tweak[1],@tweak[2]
1960	 aesenc		$rndkey0,$inout4
1961	pxor	$twtmp,@tweak[5]
1962	movdqa	$twres,$twtmp
1963	 aesenc		$rndkey0,$inout5
1964	 $movkey	-48($key),$rndkey0
1965
1966	paddd	$twres,$twres
1967	 aesenc		$rndkey1,$inout0
1968	pxor	@tweak[5],@tweak[1]
1969	psrad	\$31,$twtmp
1970	 aesenc		$rndkey1,$inout1
1971	paddq	@tweak[5],@tweak[5]
1972	pand	$twmask,$twtmp
1973	 aesenc		$rndkey1,$inout2
1974	 aesenc		$rndkey1,$inout3
1975	 movdqa	@tweak[3],`16*3`(%rsp)
1976	pxor	$twtmp,@tweak[5]
1977	 aesenc		$rndkey1,$inout4
1978	movaps	@tweak[2],@tweak[3]
1979	movdqa	$twres,$twtmp
1980	 aesenc		$rndkey1,$inout5
1981	 $movkey	-32($key),$rndkey1
1982
1983	paddd	$twres,$twres
1984	 aesenc		$rndkey0,$inout0
1985	pxor	@tweak[5],@tweak[2]
1986	psrad	\$31,$twtmp
1987	 aesenc		$rndkey0,$inout1
1988	paddq	@tweak[5],@tweak[5]
1989	pand	$twmask,$twtmp
1990	 aesenc		$rndkey0,$inout2
1991	 aesenc		$rndkey0,$inout3
1992	 aesenc		$rndkey0,$inout4
1993	pxor	$twtmp,@tweak[5]
1994	movaps	@tweak[3],@tweak[4]
1995	 aesenc		$rndkey0,$inout5
1996
1997	movdqa	$twres,$rndkey0
1998	paddd	$twres,$twres
1999	 aesenc		$rndkey1,$inout0
2000	pxor	@tweak[5],@tweak[3]
2001	psrad	\$31,$rndkey0
2002	 aesenc		$rndkey1,$inout1
2003	paddq	@tweak[5],@tweak[5]
2004	pand	$twmask,$rndkey0
2005	 aesenc		$rndkey1,$inout2
2006	 aesenc		$rndkey1,$inout3
2007	pxor	$rndkey0,@tweak[5]
2008	$movkey		($key_),$rndkey0
2009	 aesenc		$rndkey1,$inout4
2010	 aesenc		$rndkey1,$inout5
2011	$movkey		16($key_),$rndkey1
2012
2013	pxor	@tweak[5],@tweak[4]
2014	 aesenclast	`16*0`(%rsp),$inout0
2015	psrad	\$31,$twres
2016	paddq	@tweak[5],@tweak[5]
2017	 aesenclast	`16*1`(%rsp),$inout1
2018	 aesenclast	`16*2`(%rsp),$inout2
2019	pand	$twmask,$twres
2020	mov	%r10,%rax			# restore $rounds
2021	 aesenclast	`16*3`(%rsp),$inout3
2022	 aesenclast	`16*4`(%rsp),$inout4
2023	 aesenclast	`16*5`(%rsp),$inout5
2024	pxor	$twres,@tweak[5]
2025
2026	lea	`16*6`($out),$out		# $out+=6*16
2027	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2028	movups	$inout1,`-16*5`($out)
2029	movups	$inout2,`-16*4`($out)
2030	movups	$inout3,`-16*3`($out)
2031	movups	$inout4,`-16*2`($out)
2032	movups	$inout5,`-16*1`($out)
2033	sub	\$16*6,$len
2034	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
2035
2036	mov	\$16+96,$rounds
2037	sub	$rnds_,$rounds
2038	mov	$key_,$key			# restore $key
2039	shr	\$4,$rounds			# restore original value
2040
2041.Lxts_enc_short:
2042	# at this point @tweak[0..5] are populated with tweak values
2043	mov	$rounds,$rnds_			# backup $rounds
2044	pxor	$rndkey0,@tweak[0]
2045	add	\$16*6,$len			# restore real remaining $len
2046	jz	.Lxts_enc_done			# done if ($len==0)
2047
2048	pxor	$rndkey0,@tweak[1]
2049	cmp	\$0x20,$len
2050	jb	.Lxts_enc_one			# $len is 1*16
2051	pxor	$rndkey0,@tweak[2]
2052	je	.Lxts_enc_two			# $len is 2*16
2053
2054	pxor	$rndkey0,@tweak[3]
2055	cmp	\$0x40,$len
2056	jb	.Lxts_enc_three			# $len is 3*16
2057	pxor	$rndkey0,@tweak[4]
2058	je	.Lxts_enc_four			# $len is 4*16
2059
2060	movdqu	($inp),$inout0			# $len is 5*16
2061	movdqu	16*1($inp),$inout1
2062	movdqu	16*2($inp),$inout2
2063	pxor	@tweak[0],$inout0
2064	movdqu	16*3($inp),$inout3
2065	pxor	@tweak[1],$inout1
2066	movdqu	16*4($inp),$inout4
2067	lea	16*5($inp),$inp			# $inp+=5*16
2068	pxor	@tweak[2],$inout2
2069	pxor	@tweak[3],$inout3
2070	pxor	@tweak[4],$inout4
2071	pxor	$inout5,$inout5
2072
2073	call	_aesni_encrypt6
2074
2075	xorps	@tweak[0],$inout0
2076	movdqa	@tweak[5],@tweak[0]
2077	xorps	@tweak[1],$inout1
2078	xorps	@tweak[2],$inout2
2079	movdqu	$inout0,($out)			# store 5 output blocks
2080	xorps	@tweak[3],$inout3
2081	movdqu	$inout1,16*1($out)
2082	xorps	@tweak[4],$inout4
2083	movdqu	$inout2,16*2($out)
2084	movdqu	$inout3,16*3($out)
2085	movdqu	$inout4,16*4($out)
2086	lea	16*5($out),$out			# $out+=5*16
2087	jmp	.Lxts_enc_done
2088
2089.align	16
2090.Lxts_enc_one:
2091	movups	($inp),$inout0
2092	lea	16*1($inp),$inp			# inp+=1*16
2093	xorps	@tweak[0],$inout0
2094___
2095	&aesni_generate1("enc",$key,$rounds);
2096$code.=<<___;
2097	xorps	@tweak[0],$inout0
2098	movdqa	@tweak[1],@tweak[0]
2099	movups	$inout0,($out)			# store one output block
2100	lea	16*1($out),$out			# $out+=1*16
2101	jmp	.Lxts_enc_done
2102
2103.align	16
2104.Lxts_enc_two:
2105	movups	($inp),$inout0
2106	movups	16($inp),$inout1
2107	lea	32($inp),$inp			# $inp+=2*16
2108	xorps	@tweak[0],$inout0
2109	xorps	@tweak[1],$inout1
2110
2111	call	_aesni_encrypt2
2112
2113	xorps	@tweak[0],$inout0
2114	movdqa	@tweak[2],@tweak[0]
2115	xorps	@tweak[1],$inout1
2116	movups	$inout0,($out)			# store 2 output blocks
2117	movups	$inout1,16*1($out)
2118	lea	16*2($out),$out			# $out+=2*16
2119	jmp	.Lxts_enc_done
2120
2121.align	16
2122.Lxts_enc_three:
2123	movups	($inp),$inout0
2124	movups	16*1($inp),$inout1
2125	movups	16*2($inp),$inout2
2126	lea	16*3($inp),$inp			# $inp+=3*16
2127	xorps	@tweak[0],$inout0
2128	xorps	@tweak[1],$inout1
2129	xorps	@tweak[2],$inout2
2130
2131	call	_aesni_encrypt3
2132
2133	xorps	@tweak[0],$inout0
2134	movdqa	@tweak[3],@tweak[0]
2135	xorps	@tweak[1],$inout1
2136	xorps	@tweak[2],$inout2
2137	movups	$inout0,($out)			# store 3 output blocks
2138	movups	$inout1,16*1($out)
2139	movups	$inout2,16*2($out)
2140	lea	16*3($out),$out			# $out+=3*16
2141	jmp	.Lxts_enc_done
2142
2143.align	16
2144.Lxts_enc_four:
2145	movups	($inp),$inout0
2146	movups	16*1($inp),$inout1
2147	movups	16*2($inp),$inout2
2148	xorps	@tweak[0],$inout0
2149	movups	16*3($inp),$inout3
2150	lea	16*4($inp),$inp			# $inp+=4*16
2151	xorps	@tweak[1],$inout1
2152	xorps	@tweak[2],$inout2
2153	xorps	@tweak[3],$inout3
2154
2155	call	_aesni_encrypt4
2156
2157	pxor	@tweak[0],$inout0
2158	movdqa	@tweak[4],@tweak[0]
2159	pxor	@tweak[1],$inout1
2160	pxor	@tweak[2],$inout2
2161	movdqu	$inout0,($out)			# store 4 output blocks
2162	pxor	@tweak[3],$inout3
2163	movdqu	$inout1,16*1($out)
2164	movdqu	$inout2,16*2($out)
2165	movdqu	$inout3,16*3($out)
2166	lea	16*4($out),$out			# $out+=4*16
2167	jmp	.Lxts_enc_done
2168
2169.align	16
2170.Lxts_enc_done:
2171	and	\$15,$len_			# see if $len%16 is 0
2172	jz	.Lxts_enc_ret
2173	mov	$len_,$len
2174
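	# Ciphertext stealing for the $len%16 tail bytes, as a C-like sketch
	# (prev, q, inp and encrypt_xts_block are illustrative names only; the
	# loop below does the byte swap and the code after it re-encrypts the
	# merged block with the next tweak):
	#
	#	unsigned char *prev = out - 16;	/* last full ciphertext block */
	#	for (i = 0; i < q; i++) {
	#		unsigned char p = inp[i], c = prev[i];
	#		prev[i]    = p;		/* plaintext tail goes into C[m-1] */
	#		prev[16+i] = c;		/* displaced bytes become final C[m] */
	#	}
	#	encrypt_xts_block(prev, next_tweak);	/* rewritten in place */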
2175.Lxts_enc_steal:
2176	movzb	($inp),%eax			# borrow $rounds ...
2177	movzb	-16($out),%ecx			# ... and $key
2178	lea	1($inp),$inp
2179	mov	%al,-16($out)
2180	mov	%cl,0($out)
2181	lea	1($out),$out
2182	sub	\$1,$len
2183	jnz	.Lxts_enc_steal
2184
2185	sub	$len_,$out			# rewind $out
2186	mov	$key_,$key			# restore $key
2187	mov	$rnds_,$rounds			# restore $rounds
2188
2189	movups	-16($out),$inout0
2190	xorps	@tweak[0],$inout0
2191___
2192	&aesni_generate1("enc",$key,$rounds);
2193$code.=<<___;
2194	xorps	@tweak[0],$inout0
2195	movups	$inout0,-16($out)
2196
2197.Lxts_enc_ret:
2198	xorps	%xmm0,%xmm0			# clear register bank
2199	pxor	%xmm1,%xmm1
2200	pxor	%xmm2,%xmm2
2201	pxor	%xmm3,%xmm3
2202	pxor	%xmm4,%xmm4
2203	pxor	%xmm5,%xmm5
2204___
2205$code.=<<___ if (!$win64);
2206	pxor	%xmm6,%xmm6
2207	pxor	%xmm7,%xmm7
2208	movaps	%xmm0,0x00(%rsp)		# clear stack
2209	pxor	%xmm8,%xmm8
2210	movaps	%xmm0,0x10(%rsp)
2211	pxor	%xmm9,%xmm9
2212	movaps	%xmm0,0x20(%rsp)
2213	pxor	%xmm10,%xmm10
2214	movaps	%xmm0,0x30(%rsp)
2215	pxor	%xmm11,%xmm11
2216	movaps	%xmm0,0x40(%rsp)
2217	pxor	%xmm12,%xmm12
2218	movaps	%xmm0,0x50(%rsp)
2219	pxor	%xmm13,%xmm13
2220	movaps	%xmm0,0x60(%rsp)
2221	pxor	%xmm14,%xmm14
2222	pxor	%xmm15,%xmm15
2223___
2224$code.=<<___ if ($win64);
2225	movaps	-0xa8(%r11),%xmm6
2226	movaps	%xmm0,-0xa8(%r11)		# clear stack
2227	movaps	-0x98(%r11),%xmm7
2228	movaps	%xmm0,-0x98(%r11)
2229	movaps	-0x88(%r11),%xmm8
2230	movaps	%xmm0,-0x88(%r11)
2231	movaps	-0x78(%r11),%xmm9
2232	movaps	%xmm0,-0x78(%r11)
2233	movaps	-0x68(%r11),%xmm10
2234	movaps	%xmm0,-0x68(%r11)
2235	movaps	-0x58(%r11),%xmm11
2236	movaps	%xmm0,-0x58(%r11)
2237	movaps	-0x48(%r11),%xmm12
2238	movaps	%xmm0,-0x48(%r11)
2239	movaps	-0x38(%r11),%xmm13
2240	movaps	%xmm0,-0x38(%r11)
2241	movaps	-0x28(%r11),%xmm14
2242	movaps	%xmm0,-0x28(%r11)
2243	movaps	-0x18(%r11),%xmm15
2244	movaps	%xmm0,-0x18(%r11)
2245	movaps	%xmm0,0x00(%rsp)
2246	movaps	%xmm0,0x10(%rsp)
2247	movaps	%xmm0,0x20(%rsp)
2248	movaps	%xmm0,0x30(%rsp)
2249	movaps	%xmm0,0x40(%rsp)
2250	movaps	%xmm0,0x50(%rsp)
2251	movaps	%xmm0,0x60(%rsp)
2252___
2253$code.=<<___;
2254	mov	-8(%r11),%rbp
2255.cfi_restore	%rbp
2256	lea	(%r11),%rsp
2257.cfi_def_cfa_register	%rsp
2258.Lxts_enc_epilogue:
2259	ret
2260.cfi_endproc
2261.size	aesni_xts_encrypt,.-aesni_xts_encrypt
2262___
2263
2264$code.=<<___;
2265.globl	aesni_xts_decrypt
2266.type	aesni_xts_decrypt,\@function,6
2267.align	16
2268aesni_xts_decrypt:
2269.cfi_startproc
2270	endbranch
2271	lea	(%rsp),%r11			# frame pointer
2272.cfi_def_cfa_register	%r11
2273	push	%rbp
2274.cfi_push	%rbp
2275	sub	\$$frame_size,%rsp
2276	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2277___
2278$code.=<<___ if ($win64);
2279	movaps	%xmm6,-0xa8(%r11)		# offload everything
2280	movaps	%xmm7,-0x98(%r11)
2281	movaps	%xmm8,-0x88(%r11)
2282	movaps	%xmm9,-0x78(%r11)
2283	movaps	%xmm10,-0x68(%r11)
2284	movaps	%xmm11,-0x58(%r11)
2285	movaps	%xmm12,-0x48(%r11)
2286	movaps	%xmm13,-0x38(%r11)
2287	movaps	%xmm14,-0x28(%r11)
2288	movaps	%xmm15,-0x18(%r11)
2289.Lxts_dec_body:
2290___
2291$code.=<<___;
2292	movups	($ivp),$inout0			# load clear-text tweak
2293	mov	240($key2),$rounds		# key2->rounds
2294	mov	240($key),$rnds_		# key1->rounds
2295___
2296	# generate the tweak
2297	&aesni_generate1("enc",$key2,$rounds,$inout0);
2298$code.=<<___;
2299	xor	%eax,%eax			# if ($len%16) len-=16;
2300	test	\$15,$len
2301	setnz	%al
2302	shl	\$4,%rax
2303	sub	%rax,$len
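	# When there is a partial tail, XTS decryption swaps the tweak order of
	# the last two blocks: the last full ciphertext block is decrypted with
	# the tweak that follows it, and the reassembled final block with the
	# one before it.  That block is therefore held back from the bulk loops
	# and handled with the tail at .Lxts_dec_done2/.Lxts_dec_steal.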
2304
2305	$movkey	($key),$rndkey0			# zero round key
2306	mov	$key,$key_			# backup $key
2307	mov	$rnds_,$rounds			# backup $rounds
2308	shl	\$4,$rnds_
2309	mov	$len,$len_			# backup $len
2310	and	\$-16,$len
2311
2312	$movkey	16($key,$rnds_),$rndkey1	# last round key
2313
2314	movdqa	.Lxts_magic(%rip),$twmask
2315	movdqa	$inout0,@tweak[5]
2316	pshufd	\$0x5f,$inout0,$twres
2317	pxor	$rndkey0,$rndkey1
2318___
2319    for ($i=0;$i<4;$i++) {
2320    $code.=<<___;
2321	movdqa	$twres,$twtmp
2322	paddd	$twres,$twres
2323	movdqa	@tweak[5],@tweak[$i]
2324	psrad	\$31,$twtmp			# broadcast upper bits
2325	paddq	@tweak[5],@tweak[5]
2326	pand	$twmask,$twtmp
2327	pxor	$rndkey0,@tweak[$i]
2328	pxor	$twtmp,@tweak[5]
2329___
2330    }
2331$code.=<<___;
2332	movdqa	@tweak[5],@tweak[4]
2333	psrad	\$31,$twres
2334	paddq	@tweak[5],@tweak[5]
2335	pand	$twmask,$twres
2336	pxor	$rndkey0,@tweak[4]
2337	pxor	$twres,@tweak[5]
2338	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2339
2340	sub	\$16*6,$len
2341	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2342
2343	mov	\$16+96,$rounds
2344	lea	32($key_,$rnds_),$key		# end of key schedule
2345	sub	%r10,%rax			# twisted $rounds
2346	$movkey	16($key_),$rndkey1
2347	mov	%rax,%r10			# backup twisted $rounds
2348	lea	.Lxts_magic(%rip),%r8
2349	jmp	.Lxts_dec_grandloop
2350
2351.align	32
2352.Lxts_dec_grandloop:
2353	movdqu	`16*0`($inp),$inout0		# load input
2354	movdqa	$rndkey0,$twmask
2355	movdqu	`16*1`($inp),$inout1
2356	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
2357	movdqu	`16*2`($inp),$inout2
2358	pxor	@tweak[1],$inout1
2359	 aesdec		$rndkey1,$inout0
2360	movdqu	`16*3`($inp),$inout3
2361	pxor	@tweak[2],$inout2
2362	 aesdec		$rndkey1,$inout1
2363	movdqu	`16*4`($inp),$inout4
2364	pxor	@tweak[3],$inout3
2365	 aesdec		$rndkey1,$inout2
2366	movdqu	`16*5`($inp),$inout5
2367	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2368	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2369	pxor	@tweak[4],$inout4
2370	 aesdec		$rndkey1,$inout3
2371	$movkey	32($key_),$rndkey0
2372	lea	`16*6`($inp),$inp
2373	pxor	$twmask,$inout5
2374
2375	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2376	aesdec		$rndkey1,$inout4
2377	 pxor	$twres,@tweak[1]
2378	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
2379	aesdec		$rndkey1,$inout5
2380	$movkey		48($key_),$rndkey1
2381	 pxor	$twres,@tweak[2]
2382
2383	aesdec		$rndkey0,$inout0
2384	 pxor	$twres,@tweak[3]
2385	 movdqa	@tweak[1],`16*1`(%rsp)
2386	aesdec		$rndkey0,$inout1
2387	 pxor	$twres,@tweak[4]
2388	 movdqa	@tweak[2],`16*2`(%rsp)
2389	aesdec		$rndkey0,$inout2
2390	aesdec		$rndkey0,$inout3
2391	 pxor	$twres,$twmask
2392	 movdqa	@tweak[4],`16*4`(%rsp)
2393	aesdec		$rndkey0,$inout4
2394	aesdec		$rndkey0,$inout5
2395	$movkey		64($key_),$rndkey0
2396	 movdqa	$twmask,`16*5`(%rsp)
2397	pshufd	\$0x5f,@tweak[5],$twres
2398	jmp	.Lxts_dec_loop6
2399.align	32
2400.Lxts_dec_loop6:
2401	aesdec		$rndkey1,$inout0
2402	aesdec		$rndkey1,$inout1
2403	aesdec		$rndkey1,$inout2
2404	aesdec		$rndkey1,$inout3
2405	aesdec		$rndkey1,$inout4
2406	aesdec		$rndkey1,$inout5
2407	$movkey		-64($key,%rax),$rndkey1
2408	add		\$32,%rax
2409
2410	aesdec		$rndkey0,$inout0
2411	aesdec		$rndkey0,$inout1
2412	aesdec		$rndkey0,$inout2
2413	aesdec		$rndkey0,$inout3
2414	aesdec		$rndkey0,$inout4
2415	aesdec		$rndkey0,$inout5
2416	$movkey		-80($key,%rax),$rndkey0
2417	jnz		.Lxts_dec_loop6
2418
2419	movdqa	(%r8),$twmask			# start calculating next tweak
2420	movdqa	$twres,$twtmp
2421	paddd	$twres,$twres
2422	 aesdec		$rndkey1,$inout0
2423	paddq	@tweak[5],@tweak[5]
2424	psrad	\$31,$twtmp
2425	 aesdec		$rndkey1,$inout1
2426	pand	$twmask,$twtmp
2427	$movkey	($key_),@tweak[0]		# load round[0]
2428	 aesdec		$rndkey1,$inout2
2429	 aesdec		$rndkey1,$inout3
2430	 aesdec		$rndkey1,$inout4
2431	pxor	$twtmp,@tweak[5]
2432	movaps	@tweak[0],@tweak[1]		# copy round[0]
2433	 aesdec		$rndkey1,$inout5
2434	 $movkey	-64($key),$rndkey1
2435
2436	movdqa	$twres,$twtmp
2437	 aesdec		$rndkey0,$inout0
2438	paddd	$twres,$twres
2439	pxor	@tweak[5],@tweak[0]
2440	 aesdec		$rndkey0,$inout1
2441	psrad	\$31,$twtmp
2442	paddq	@tweak[5],@tweak[5]
2443	 aesdec		$rndkey0,$inout2
2444	 aesdec		$rndkey0,$inout3
2445	pand	$twmask,$twtmp
2446	movaps	@tweak[1],@tweak[2]
2447	 aesdec		$rndkey0,$inout4
2448	pxor	$twtmp,@tweak[5]
2449	movdqa	$twres,$twtmp
2450	 aesdec		$rndkey0,$inout5
2451	 $movkey	-48($key),$rndkey0
2452
2453	paddd	$twres,$twres
2454	 aesdec		$rndkey1,$inout0
2455	pxor	@tweak[5],@tweak[1]
2456	psrad	\$31,$twtmp
2457	 aesdec		$rndkey1,$inout1
2458	paddq	@tweak[5],@tweak[5]
2459	pand	$twmask,$twtmp
2460	 aesdec		$rndkey1,$inout2
2461	 aesdec		$rndkey1,$inout3
2462	 movdqa	@tweak[3],`16*3`(%rsp)
2463	pxor	$twtmp,@tweak[5]
2464	 aesdec		$rndkey1,$inout4
2465	movaps	@tweak[2],@tweak[3]
2466	movdqa	$twres,$twtmp
2467	 aesdec		$rndkey1,$inout5
2468	 $movkey	-32($key),$rndkey1
2469
2470	paddd	$twres,$twres
2471	 aesdec		$rndkey0,$inout0
2472	pxor	@tweak[5],@tweak[2]
2473	psrad	\$31,$twtmp
2474	 aesdec		$rndkey0,$inout1
2475	paddq	@tweak[5],@tweak[5]
2476	pand	$twmask,$twtmp
2477	 aesdec		$rndkey0,$inout2
2478	 aesdec		$rndkey0,$inout3
2479	 aesdec		$rndkey0,$inout4
2480	pxor	$twtmp,@tweak[5]
2481	movaps	@tweak[3],@tweak[4]
2482	 aesdec		$rndkey0,$inout5
2483
2484	movdqa	$twres,$rndkey0
2485	paddd	$twres,$twres
2486	 aesdec		$rndkey1,$inout0
2487	pxor	@tweak[5],@tweak[3]
2488	psrad	\$31,$rndkey0
2489	 aesdec		$rndkey1,$inout1
2490	paddq	@tweak[5],@tweak[5]
2491	pand	$twmask,$rndkey0
2492	 aesdec		$rndkey1,$inout2
2493	 aesdec		$rndkey1,$inout3
2494	pxor	$rndkey0,@tweak[5]
2495	$movkey		($key_),$rndkey0
2496	 aesdec		$rndkey1,$inout4
2497	 aesdec		$rndkey1,$inout5
2498	$movkey		16($key_),$rndkey1
2499
2500	pxor	@tweak[5],@tweak[4]
2501	 aesdeclast	`16*0`(%rsp),$inout0
2502	psrad	\$31,$twres
2503	paddq	@tweak[5],@tweak[5]
2504	 aesdeclast	`16*1`(%rsp),$inout1
2505	 aesdeclast	`16*2`(%rsp),$inout2
2506	pand	$twmask,$twres
2507	mov	%r10,%rax			# restore $rounds
2508	 aesdeclast	`16*3`(%rsp),$inout3
2509	 aesdeclast	`16*4`(%rsp),$inout4
2510	 aesdeclast	`16*5`(%rsp),$inout5
2511	pxor	$twres,@tweak[5]
2512
2513	lea	`16*6`($out),$out		# $out+=6*16
2514	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2515	movups	$inout1,`-16*5`($out)
2516	movups	$inout2,`-16*4`($out)
2517	movups	$inout3,`-16*3`($out)
2518	movups	$inout4,`-16*2`($out)
2519	movups	$inout5,`-16*1`($out)
2520	sub	\$16*6,$len
2521	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2522
2523	mov	\$16+96,$rounds
2524	sub	$rnds_,$rounds
2525	mov	$key_,$key			# restore $key
2526	shr	\$4,$rounds			# restore original value
2527
2528.Lxts_dec_short:
2529	# at this point @tweak[0..5] are populated with tweak values
2530	mov	$rounds,$rnds_			# backup $rounds
2531	pxor	$rndkey0,@tweak[0]
2532	pxor	$rndkey0,@tweak[1]
2533	add	\$16*6,$len			# restore real remaining $len
2534	jz	.Lxts_dec_done			# done if ($len==0)
2535
2536	pxor	$rndkey0,@tweak[2]
2537	cmp	\$0x20,$len
2538	jb	.Lxts_dec_one			# $len is 1*16
2539	pxor	$rndkey0,@tweak[3]
2540	je	.Lxts_dec_two			# $len is 2*16
2541
2542	pxor	$rndkey0,@tweak[4]
2543	cmp	\$0x40,$len
2544	jb	.Lxts_dec_three			# $len is 3*16
2545	je	.Lxts_dec_four			# $len is 4*16
2546
2547	movdqu	($inp),$inout0			# $len is 5*16
2548	movdqu	16*1($inp),$inout1
2549	movdqu	16*2($inp),$inout2
2550	pxor	@tweak[0],$inout0
2551	movdqu	16*3($inp),$inout3
2552	pxor	@tweak[1],$inout1
2553	movdqu	16*4($inp),$inout4
2554	lea	16*5($inp),$inp			# $inp+=5*16
2555	pxor	@tweak[2],$inout2
2556	pxor	@tweak[3],$inout3
2557	pxor	@tweak[4],$inout4
2558
2559	call	_aesni_decrypt6
2560
2561	xorps	@tweak[0],$inout0
2562	xorps	@tweak[1],$inout1
2563	xorps	@tweak[2],$inout2
2564	movdqu	$inout0,($out)			# store 5 output blocks
2565	xorps	@tweak[3],$inout3
2566	movdqu	$inout1,16*1($out)
2567	xorps	@tweak[4],$inout4
2568	movdqu	$inout2,16*2($out)
2569	 pxor		$twtmp,$twtmp
2570	movdqu	$inout3,16*3($out)
2571	 pcmpgtd	@tweak[5],$twtmp
2572	movdqu	$inout4,16*4($out)
2573	lea	16*5($out),$out			# $out+=5*16
2574	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2575	and	\$15,$len_
2576	jz	.Lxts_dec_ret
2577
2578	movdqa	@tweak[5],@tweak[0]
2579	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2580	pand	$twmask,@tweak[1]		# isolate carry and residue
2581	pxor	@tweak[5],@tweak[1]
2582	jmp	.Lxts_dec_done2
2583
2584.align	16
2585.Lxts_dec_one:
2586	movups	($inp),$inout0
2587	lea	16*1($inp),$inp			# $inp+=1*16
2588	xorps	@tweak[0],$inout0
2589___
2590	&aesni_generate1("dec",$key,$rounds);
2591$code.=<<___;
2592	xorps	@tweak[0],$inout0
2593	movdqa	@tweak[1],@tweak[0]
2594	movups	$inout0,($out)			# store one output block
2595	movdqa	@tweak[2],@tweak[1]
2596	lea	16*1($out),$out			# $out+=1*16
2597	jmp	.Lxts_dec_done
2598
2599.align	16
2600.Lxts_dec_two:
2601	movups	($inp),$inout0
2602	movups	16($inp),$inout1
2603	lea	32($inp),$inp			# $inp+=2*16
2604	xorps	@tweak[0],$inout0
2605	xorps	@tweak[1],$inout1
2606
2607	call	_aesni_decrypt2
2608
2609	xorps	@tweak[0],$inout0
2610	movdqa	@tweak[2],@tweak[0]
2611	xorps	@tweak[1],$inout1
2612	movdqa	@tweak[3],@tweak[1]
2613	movups	$inout0,($out)			# store 2 output blocks
2614	movups	$inout1,16*1($out)
2615	lea	16*2($out),$out			# $out+=2*16
2616	jmp	.Lxts_dec_done
2617
2618.align	16
2619.Lxts_dec_three:
2620	movups	($inp),$inout0
2621	movups	16*1($inp),$inout1
2622	movups	16*2($inp),$inout2
2623	lea	16*3($inp),$inp			# $inp+=3*16
2624	xorps	@tweak[0],$inout0
2625	xorps	@tweak[1],$inout1
2626	xorps	@tweak[2],$inout2
2627
2628	call	_aesni_decrypt3
2629
2630	xorps	@tweak[0],$inout0
2631	movdqa	@tweak[3],@tweak[0]
2632	xorps	@tweak[1],$inout1
2633	movdqa	@tweak[4],@tweak[1]
2634	xorps	@tweak[2],$inout2
2635	movups	$inout0,($out)			# store 3 output blocks
2636	movups	$inout1,16*1($out)
2637	movups	$inout2,16*2($out)
2638	lea	16*3($out),$out			# $out+=3*16
2639	jmp	.Lxts_dec_done
2640
2641.align	16
2642.Lxts_dec_four:
2643	movups	($inp),$inout0
2644	movups	16*1($inp),$inout1
2645	movups	16*2($inp),$inout2
2646	xorps	@tweak[0],$inout0
2647	movups	16*3($inp),$inout3
2648	lea	16*4($inp),$inp			# $inp+=4*16
2649	xorps	@tweak[1],$inout1
2650	xorps	@tweak[2],$inout2
2651	xorps	@tweak[3],$inout3
2652
2653	call	_aesni_decrypt4
2654
2655	pxor	@tweak[0],$inout0
2656	movdqa	@tweak[4],@tweak[0]
2657	pxor	@tweak[1],$inout1
2658	movdqa	@tweak[5],@tweak[1]
2659	pxor	@tweak[2],$inout2
2660	movdqu	$inout0,($out)			# store 4 output blocks
2661	pxor	@tweak[3],$inout3
2662	movdqu	$inout1,16*1($out)
2663	movdqu	$inout2,16*2($out)
2664	movdqu	$inout3,16*3($out)
2665	lea	16*4($out),$out			# $out+=4*16
2666	jmp	.Lxts_dec_done
2667
2668.align	16
2669.Lxts_dec_done:
2670	and	\$15,$len_			# see if $len%16 is 0
2671	jz	.Lxts_dec_ret
2672.Lxts_dec_done2:
2673	mov	$len_,$len
2674	mov	$key_,$key			# restore $key
2675	mov	$rnds_,$rounds			# restore $rounds
2676
2677	movups	($inp),$inout0
2678	xorps	@tweak[1],$inout0
2679___
2680	&aesni_generate1("dec",$key,$rounds);
2681$code.=<<___;
2682	xorps	@tweak[1],$inout0
2683	movups	$inout0,($out)
2684
2685.Lxts_dec_steal:
2686	movzb	16($inp),%eax			# borrow $rounds ...
2687	movzb	($out),%ecx			# ... and $key
2688	lea	1($inp),$inp
2689	mov	%al,($out)
2690	mov	%cl,16($out)
2691	lea	1($out),$out
2692	sub	\$1,$len
2693	jnz	.Lxts_dec_steal
2694
2695	sub	$len_,$out			# rewind $out
2696	mov	$key_,$key			# restore $key
2697	mov	$rnds_,$rounds			# restore $rounds
2698
2699	movups	($out),$inout0
2700	xorps	@tweak[0],$inout0
2701___
2702	&aesni_generate1("dec",$key,$rounds);
2703$code.=<<___;
2704	xorps	@tweak[0],$inout0
2705	movups	$inout0,($out)
2706
2707.Lxts_dec_ret:
2708	xorps	%xmm0,%xmm0			# clear register bank
2709	pxor	%xmm1,%xmm1
2710	pxor	%xmm2,%xmm2
2711	pxor	%xmm3,%xmm3
2712	pxor	%xmm4,%xmm4
2713	pxor	%xmm5,%xmm5
2714___
2715$code.=<<___ if (!$win64);
2716	pxor	%xmm6,%xmm6
2717	pxor	%xmm7,%xmm7
2718	movaps	%xmm0,0x00(%rsp)		# clear stack
2719	pxor	%xmm8,%xmm8
2720	movaps	%xmm0,0x10(%rsp)
2721	pxor	%xmm9,%xmm9
2722	movaps	%xmm0,0x20(%rsp)
2723	pxor	%xmm10,%xmm10
2724	movaps	%xmm0,0x30(%rsp)
2725	pxor	%xmm11,%xmm11
2726	movaps	%xmm0,0x40(%rsp)
2727	pxor	%xmm12,%xmm12
2728	movaps	%xmm0,0x50(%rsp)
2729	pxor	%xmm13,%xmm13
2730	movaps	%xmm0,0x60(%rsp)
2731	pxor	%xmm14,%xmm14
2732	pxor	%xmm15,%xmm15
2733___
2734$code.=<<___ if ($win64);
2735	movaps	-0xa8(%r11),%xmm6
2736	movaps	%xmm0,-0xa8(%r11)		# clear stack
2737	movaps	-0x98(%r11),%xmm7
2738	movaps	%xmm0,-0x98(%r11)
2739	movaps	-0x88(%r11),%xmm8
2740	movaps	%xmm0,-0x88(%r11)
2741	movaps	-0x78(%r11),%xmm9
2742	movaps	%xmm0,-0x78(%r11)
2743	movaps	-0x68(%r11),%xmm10
2744	movaps	%xmm0,-0x68(%r11)
2745	movaps	-0x58(%r11),%xmm11
2746	movaps	%xmm0,-0x58(%r11)
2747	movaps	-0x48(%r11),%xmm12
2748	movaps	%xmm0,-0x48(%r11)
2749	movaps	-0x38(%r11),%xmm13
2750	movaps	%xmm0,-0x38(%r11)
2751	movaps	-0x28(%r11),%xmm14
2752	movaps	%xmm0,-0x28(%r11)
2753	movaps	-0x18(%r11),%xmm15
2754	movaps	%xmm0,-0x18(%r11)
2755	movaps	%xmm0,0x00(%rsp)
2756	movaps	%xmm0,0x10(%rsp)
2757	movaps	%xmm0,0x20(%rsp)
2758	movaps	%xmm0,0x30(%rsp)
2759	movaps	%xmm0,0x40(%rsp)
2760	movaps	%xmm0,0x50(%rsp)
2761	movaps	%xmm0,0x60(%rsp)
2762___
2763$code.=<<___;
2764	mov	-8(%r11),%rbp
2765.cfi_restore	%rbp
2766	lea	(%r11),%rsp
2767.cfi_def_cfa_register	%rsp
2768.Lxts_dec_epilogue:
2769	ret
2770.cfi_endproc
2771.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2772___
2773}
2774
2775######################################################################
2776# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2777#	const AES_KEY *key, unsigned int start_block_num,
2778#	unsigned char offset_i[16], const unsigned char L_[][16],
2779#	unsigned char checksum[16]);
2780#
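# Schematically, per 16-byte block i (starting at start_block_num; ntz() is
# the number of trailing zero bits, computed with bsf below, and L_[] is the
# caller-supplied table from the prototype above):
#
#	offset_i  = offset_{i-1} ^ L_[ntz(i)]
#	C_i       = offset_i ^ E_K(P_i ^ offset_i)
#	checksum ^= P_i			(decryption sums the recovered P_i)
#
# As in the XTS code, round[0] and round[last] are folded into the offsets so
# the per-round work below is plain aesenc/aesenclast.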
2781{
2782my @offset=map("%xmm$_",(10..15));
2783my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2784my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
2785my ($L_p,$checksum_p) = ("%rbx","%rbp");
2786my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
2787my $seventh_arg = $win64 ? 56 : 8;
2788my $blocks = $len;
2789
2790$code.=<<___;
2791.globl	aesni_ocb_encrypt
2792.type	aesni_ocb_encrypt,\@function,6
2793.align	32
2794aesni_ocb_encrypt:
2795.cfi_startproc
2796	endbranch
2797	lea	(%rsp),%rax
2798	push	%rbx
2799.cfi_push	%rbx
2800	push	%rbp
2801.cfi_push	%rbp
2802	push	%r12
2803.cfi_push	%r12
2804	push	%r13
2805.cfi_push	%r13
2806	push	%r14
2807.cfi_push	%r14
2808___
2809$code.=<<___ if ($win64);
2810	lea	-0xa0(%rsp),%rsp
2811	movaps	%xmm6,0x00(%rsp)		# offload everything
2812	movaps	%xmm7,0x10(%rsp)
2813	movaps	%xmm8,0x20(%rsp)
2814	movaps	%xmm9,0x30(%rsp)
2815	movaps	%xmm10,0x40(%rsp)
2816	movaps	%xmm11,0x50(%rsp)
2817	movaps	%xmm12,0x60(%rsp)
2818	movaps	%xmm13,0x70(%rsp)
2819	movaps	%xmm14,0x80(%rsp)
2820	movaps	%xmm15,0x90(%rsp)
2821.Locb_enc_body:
2822___
2823$code.=<<___;
2824	mov	$seventh_arg(%rax),$L_p		# 7th argument
2825	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
2826
2827	mov	240($key),$rnds_
2828	mov	$key,$key_
2829	shl	\$4,$rnds_
2830	$movkey	($key),$rndkey0l		# round[0]
2831	$movkey	16($key,$rnds_),$rndkey1	# round[last]
2832
2833	movdqu	($offset_p),@offset[5]		# load last offset_i
2834	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
2835	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2836
2837	mov	\$16+32,$rounds
2838	lea	32($key_,$rnds_),$key
2839	$movkey	16($key_),$rndkey1		# round[1]
2840	sub	%r10,%rax			# twisted $rounds
2841	mov	%rax,%r10			# backup twisted $rounds
2842
2843	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
2844	movdqu	($checksum_p),$checksum		# load checksum
2845
2846	test	\$1,$block_num			# is first block number odd?
2847	jnz	.Locb_enc_odd
2848
2849	bsf	$block_num,$i1
2850	add	\$1,$block_num
2851	shl	\$4,$i1
2852	movdqu	($L_p,$i1),$inout5		# borrow
2853	movdqu	($inp),$inout0
2854	lea	16($inp),$inp
2855
2856	call	__ocb_encrypt1
2857
2858	movdqa	$inout5,@offset[5]
2859	movups	$inout0,($out)
2860	lea	16($out),$out
2861	sub	\$1,$blocks
2862	jz	.Locb_enc_done
2863
2864.Locb_enc_odd:
2865	lea	1($block_num),$i1		# even-numbered blocks
2866	lea	3($block_num),$i3
2867	lea	5($block_num),$i5
2868	lea	6($block_num),$block_num
2869	bsf	$i1,$i1				# ntz(block)
2870	bsf	$i3,$i3
2871	bsf	$i5,$i5
2872	shl	\$4,$i1				# ntz(block) -> table offset
2873	shl	\$4,$i3
2874	shl	\$4,$i5
2875
2876	sub	\$6,$blocks
2877	jc	.Locb_enc_short
2878	jmp	.Locb_enc_grandloop
2879
2880.align	32
2881.Locb_enc_grandloop:
2882	movdqu	`16*0`($inp),$inout0		# load input
2883	movdqu	`16*1`($inp),$inout1
2884	movdqu	`16*2`($inp),$inout2
2885	movdqu	`16*3`($inp),$inout3
2886	movdqu	`16*4`($inp),$inout4
2887	movdqu	`16*5`($inp),$inout5
2888	lea	`16*6`($inp),$inp
2889
2890	call	__ocb_encrypt6
2891
2892	movups	$inout0,`16*0`($out)		# store output
2893	movups	$inout1,`16*1`($out)
2894	movups	$inout2,`16*2`($out)
2895	movups	$inout3,`16*3`($out)
2896	movups	$inout4,`16*4`($out)
2897	movups	$inout5,`16*5`($out)
2898	lea	`16*6`($out),$out
2899	sub	\$6,$blocks
2900	jnc	.Locb_enc_grandloop
2901
2902.Locb_enc_short:
2903	add	\$6,$blocks
2904	jz	.Locb_enc_done
2905
2906	movdqu	`16*0`($inp),$inout0
2907	cmp	\$2,$blocks
2908	jb	.Locb_enc_one
2909	movdqu	`16*1`($inp),$inout1
2910	je	.Locb_enc_two
2911
2912	movdqu	`16*2`($inp),$inout2
2913	cmp	\$4,$blocks
2914	jb	.Locb_enc_three
2915	movdqu	`16*3`($inp),$inout3
2916	je	.Locb_enc_four
2917
2918	movdqu	`16*4`($inp),$inout4
2919	pxor	$inout5,$inout5
2920
2921	call	__ocb_encrypt6
2922
2923	movdqa	@offset[4],@offset[5]
2924	movups	$inout0,`16*0`($out)
2925	movups	$inout1,`16*1`($out)
2926	movups	$inout2,`16*2`($out)
2927	movups	$inout3,`16*3`($out)
2928	movups	$inout4,`16*4`($out)
2929
2930	jmp	.Locb_enc_done
2931
2932.align	16
2933.Locb_enc_one:
2934	movdqa	@offset[0],$inout5		# borrow
2935
2936	call	__ocb_encrypt1
2937
2938	movdqa	$inout5,@offset[5]
2939	movups	$inout0,`16*0`($out)
2940	jmp	.Locb_enc_done
2941
2942.align	16
2943.Locb_enc_two:
2944	pxor	$inout2,$inout2
2945	pxor	$inout3,$inout3
2946
2947	call	__ocb_encrypt4
2948
2949	movdqa	@offset[1],@offset[5]
2950	movups	$inout0,`16*0`($out)
2951	movups	$inout1,`16*1`($out)
2952
2953	jmp	.Locb_enc_done
2954
2955.align	16
2956.Locb_enc_three:
2957	pxor	$inout3,$inout3
2958
2959	call	__ocb_encrypt4
2960
2961	movdqa	@offset[2],@offset[5]
2962	movups	$inout0,`16*0`($out)
2963	movups	$inout1,`16*1`($out)
2964	movups	$inout2,`16*2`($out)
2965
2966	jmp	.Locb_enc_done
2967
2968.align	16
2969.Locb_enc_four:
2970	call	__ocb_encrypt4
2971
2972	movdqa	@offset[3],@offset[5]
2973	movups	$inout0,`16*0`($out)
2974	movups	$inout1,`16*1`($out)
2975	movups	$inout2,`16*2`($out)
2976	movups	$inout3,`16*3`($out)
2977
2978.Locb_enc_done:
2979	pxor	$rndkey0,@offset[5]		# "remove" round[last]
2980	movdqu	$checksum,($checksum_p)		# store checksum
2981	movdqu	@offset[5],($offset_p)		# store last offset_i
2982
2983	xorps	%xmm0,%xmm0			# clear register bank
2984	pxor	%xmm1,%xmm1
2985	pxor	%xmm2,%xmm2
2986	pxor	%xmm3,%xmm3
2987	pxor	%xmm4,%xmm4
2988	pxor	%xmm5,%xmm5
2989___
2990$code.=<<___ if (!$win64);
2991	pxor	%xmm6,%xmm6
2992	pxor	%xmm7,%xmm7
2993	pxor	%xmm8,%xmm8
2994	pxor	%xmm9,%xmm9
2995	pxor	%xmm10,%xmm10
2996	pxor	%xmm11,%xmm11
2997	pxor	%xmm12,%xmm12
2998	pxor	%xmm13,%xmm13
2999	pxor	%xmm14,%xmm14
3000	pxor	%xmm15,%xmm15
3001	lea	0x28(%rsp),%rax
3002.cfi_def_cfa	%rax,8
3003___
3004$code.=<<___ if ($win64);
3005	movaps	0x00(%rsp),%xmm6
3006	movaps	%xmm0,0x00(%rsp)		# clear stack
3007	movaps	0x10(%rsp),%xmm7
3008	movaps	%xmm0,0x10(%rsp)
3009	movaps	0x20(%rsp),%xmm8
3010	movaps	%xmm0,0x20(%rsp)
3011	movaps	0x30(%rsp),%xmm9
3012	movaps	%xmm0,0x30(%rsp)
3013	movaps	0x40(%rsp),%xmm10
3014	movaps	%xmm0,0x40(%rsp)
3015	movaps	0x50(%rsp),%xmm11
3016	movaps	%xmm0,0x50(%rsp)
3017	movaps	0x60(%rsp),%xmm12
3018	movaps	%xmm0,0x60(%rsp)
3019	movaps	0x70(%rsp),%xmm13
3020	movaps	%xmm0,0x70(%rsp)
3021	movaps	0x80(%rsp),%xmm14
3022	movaps	%xmm0,0x80(%rsp)
3023	movaps	0x90(%rsp),%xmm15
3024	movaps	%xmm0,0x90(%rsp)
3025	lea	0xa0+0x28(%rsp),%rax
3026.Locb_enc_pop:
3027___
3028$code.=<<___;
3029	mov	-40(%rax),%r14
3030.cfi_restore	%r14
3031	mov	-32(%rax),%r13
3032.cfi_restore	%r13
3033	mov	-24(%rax),%r12
3034.cfi_restore	%r12
3035	mov	-16(%rax),%rbp
3036.cfi_restore	%rbp
3037	mov	-8(%rax),%rbx
3038.cfi_restore	%rbx
3039	lea	(%rax),%rsp
3040.cfi_def_cfa_register	%rsp
3041.Locb_enc_epilogue:
3042	ret
3043.cfi_endproc
3044.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
3045
3046.type	__ocb_encrypt6,\@abi-omnipotent
3047.align	32
3048__ocb_encrypt6:
3049.cfi_startproc
3050	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3051	 movdqu		($L_p,$i1),@offset[1]
3052	 movdqa		@offset[0],@offset[2]
3053	 movdqu		($L_p,$i3),@offset[3]
3054	 movdqa		@offset[0],@offset[4]
3055	 pxor		@offset[5],@offset[0]
3056	 movdqu		($L_p,$i5),@offset[5]
3057	 pxor		@offset[0],@offset[1]
3058	pxor		$inout0,$checksum	# accumulate checksum
3059	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3060	 pxor		@offset[1],@offset[2]
3061	pxor		$inout1,$checksum
3062	pxor		@offset[1],$inout1
3063	 pxor		@offset[2],@offset[3]
3064	pxor		$inout2,$checksum
3065	pxor		@offset[2],$inout2
3066	 pxor		@offset[3],@offset[4]
3067	pxor		$inout3,$checksum
3068	pxor		@offset[3],$inout3
3069	 pxor		@offset[4],@offset[5]
3070	pxor		$inout4,$checksum
3071	pxor		@offset[4],$inout4
3072	pxor		$inout5,$checksum
3073	pxor		@offset[5],$inout5
3074	$movkey		32($key_),$rndkey0
3075
3076	lea		1($block_num),$i1	# even-numbered blocks
3077	lea		3($block_num),$i3
3078	lea		5($block_num),$i5
3079	add		\$6,$block_num
3080	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3081	bsf		$i1,$i1			# ntz(block)
3082	bsf		$i3,$i3
3083	bsf		$i5,$i5
3084
3085	aesenc		$rndkey1,$inout0
3086	aesenc		$rndkey1,$inout1
3087	aesenc		$rndkey1,$inout2
3088	aesenc		$rndkey1,$inout3
3089	 pxor		$rndkey0l,@offset[1]
3090	 pxor		$rndkey0l,@offset[2]
3091	aesenc		$rndkey1,$inout4
3092	 pxor		$rndkey0l,@offset[3]
3093	 pxor		$rndkey0l,@offset[4]
3094	aesenc		$rndkey1,$inout5
3095	$movkey		48($key_),$rndkey1
3096	 pxor		$rndkey0l,@offset[5]
3097
3098	aesenc		$rndkey0,$inout0
3099	aesenc		$rndkey0,$inout1
3100	aesenc		$rndkey0,$inout2
3101	aesenc		$rndkey0,$inout3
3102	aesenc		$rndkey0,$inout4
3103	aesenc		$rndkey0,$inout5
3104	$movkey		64($key_),$rndkey0
3105	shl		\$4,$i1			# ntz(block) -> table offset
3106	shl		\$4,$i3
3107	jmp		.Locb_enc_loop6
3108
3109.align	32
3110.Locb_enc_loop6:
3111	aesenc		$rndkey1,$inout0
3112	aesenc		$rndkey1,$inout1
3113	aesenc		$rndkey1,$inout2
3114	aesenc		$rndkey1,$inout3
3115	aesenc		$rndkey1,$inout4
3116	aesenc		$rndkey1,$inout5
3117	$movkey		($key,%rax),$rndkey1
3118	add		\$32,%rax
3119
3120	aesenc		$rndkey0,$inout0
3121	aesenc		$rndkey0,$inout1
3122	aesenc		$rndkey0,$inout2
3123	aesenc		$rndkey0,$inout3
3124	aesenc		$rndkey0,$inout4
3125	aesenc		$rndkey0,$inout5
3126	$movkey		-16($key,%rax),$rndkey0
3127	jnz		.Locb_enc_loop6
3128
3129	aesenc		$rndkey1,$inout0
3130	aesenc		$rndkey1,$inout1
3131	aesenc		$rndkey1,$inout2
3132	aesenc		$rndkey1,$inout3
3133	aesenc		$rndkey1,$inout4
3134	aesenc		$rndkey1,$inout5
3135	$movkey		16($key_),$rndkey1
3136	shl		\$4,$i5
3137
3138	aesenclast	@offset[0],$inout0
3139	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3140	mov		%r10,%rax		# restore twisted rounds
3141	aesenclast	@offset[1],$inout1
3142	aesenclast	@offset[2],$inout2
3143	aesenclast	@offset[3],$inout3
3144	aesenclast	@offset[4],$inout4
3145	aesenclast	@offset[5],$inout5
3146	ret
3147.cfi_endproc
3148.size	__ocb_encrypt6,.-__ocb_encrypt6
3149
3150.type	__ocb_encrypt4,\@abi-omnipotent
3151.align	32
3152__ocb_encrypt4:
3153.cfi_startproc
3154	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3155	 movdqu		($L_p,$i1),@offset[1]
3156	 movdqa		@offset[0],@offset[2]
3157	 movdqu		($L_p,$i3),@offset[3]
3158	 pxor		@offset[5],@offset[0]
3159	 pxor		@offset[0],@offset[1]
3160	pxor		$inout0,$checksum	# accumulate checksum
3161	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3162	 pxor		@offset[1],@offset[2]
3163	pxor		$inout1,$checksum
3164	pxor		@offset[1],$inout1
3165	 pxor		@offset[2],@offset[3]
3166	pxor		$inout2,$checksum
3167	pxor		@offset[2],$inout2
3168	pxor		$inout3,$checksum
3169	pxor		@offset[3],$inout3
3170	$movkey		32($key_),$rndkey0
3171
3172	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3173	 pxor		$rndkey0l,@offset[1]
3174	 pxor		$rndkey0l,@offset[2]
3175	 pxor		$rndkey0l,@offset[3]
3176
3177	aesenc		$rndkey1,$inout0
3178	aesenc		$rndkey1,$inout1
3179	aesenc		$rndkey1,$inout2
3180	aesenc		$rndkey1,$inout3
3181	$movkey		48($key_),$rndkey1
3182
3183	aesenc		$rndkey0,$inout0
3184	aesenc		$rndkey0,$inout1
3185	aesenc		$rndkey0,$inout2
3186	aesenc		$rndkey0,$inout3
3187	$movkey		64($key_),$rndkey0
3188	jmp		.Locb_enc_loop4
3189
3190.align	32
3191.Locb_enc_loop4:
3192	aesenc		$rndkey1,$inout0
3193	aesenc		$rndkey1,$inout1
3194	aesenc		$rndkey1,$inout2
3195	aesenc		$rndkey1,$inout3
3196	$movkey		($key,%rax),$rndkey1
3197	add		\$32,%rax
3198
3199	aesenc		$rndkey0,$inout0
3200	aesenc		$rndkey0,$inout1
3201	aesenc		$rndkey0,$inout2
3202	aesenc		$rndkey0,$inout3
3203	$movkey		-16($key,%rax),$rndkey0
3204	jnz		.Locb_enc_loop4
3205
3206	aesenc		$rndkey1,$inout0
3207	aesenc		$rndkey1,$inout1
3208	aesenc		$rndkey1,$inout2
3209	aesenc		$rndkey1,$inout3
3210	$movkey		16($key_),$rndkey1
3211	mov		%r10,%rax		# restore twisted rounds
3212
3213	aesenclast	@offset[0],$inout0
3214	aesenclast	@offset[1],$inout1
3215	aesenclast	@offset[2],$inout2
3216	aesenclast	@offset[3],$inout3
3217	ret
3218.cfi_endproc
3219.size	__ocb_encrypt4,.-__ocb_encrypt4
3220
3221.type	__ocb_encrypt1,\@abi-omnipotent
3222.align	32
3223__ocb_encrypt1:
3224.cfi_startproc
3225	 pxor		@offset[5],$inout5	# offset_i
3226	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3227	pxor		$inout0,$checksum	# accumulate checksum
3228	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3229	$movkey		32($key_),$rndkey0
3230
3231	aesenc		$rndkey1,$inout0
3232	$movkey		48($key_),$rndkey1
3233	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3234
3235	aesenc		$rndkey0,$inout0
3236	$movkey		64($key_),$rndkey0
3237	jmp		.Locb_enc_loop1
3238
3239.align	32
3240.Locb_enc_loop1:
3241	aesenc		$rndkey1,$inout0
3242	$movkey		($key,%rax),$rndkey1
3243	add		\$32,%rax
3244
3245	aesenc		$rndkey0,$inout0
3246	$movkey		-16($key,%rax),$rndkey0
3247	jnz		.Locb_enc_loop1
3248
3249	aesenc		$rndkey1,$inout0
3250	$movkey		16($key_),$rndkey1	# redundant in tail
3251	mov		%r10,%rax		# restore twisted rounds
3252
3253	aesenclast	$inout5,$inout0
3254	ret
3255.cfi_endproc
3256.size	__ocb_encrypt1,.-__ocb_encrypt1
3257
3258.globl	aesni_ocb_decrypt
3259.type	aesni_ocb_decrypt,\@function,6
3260.align	32
3261aesni_ocb_decrypt:
3262.cfi_startproc
3263	endbranch
3264	lea	(%rsp),%rax
3265	push	%rbx
3266.cfi_push	%rbx
3267	push	%rbp
3268.cfi_push	%rbp
3269	push	%r12
3270.cfi_push	%r12
3271	push	%r13
3272.cfi_push	%r13
3273	push	%r14
3274.cfi_push	%r14
3275___
3276$code.=<<___ if ($win64);
3277	lea	-0xa0(%rsp),%rsp
3278	movaps	%xmm6,0x00(%rsp)		# offload everything
3279	movaps	%xmm7,0x10(%rsp)
3280	movaps	%xmm8,0x20(%rsp)
3281	movaps	%xmm9,0x30(%rsp)
3282	movaps	%xmm10,0x40(%rsp)
3283	movaps	%xmm11,0x50(%rsp)
3284	movaps	%xmm12,0x60(%rsp)
3285	movaps	%xmm13,0x70(%rsp)
3286	movaps	%xmm14,0x80(%rsp)
3287	movaps	%xmm15,0x90(%rsp)
3288.Locb_dec_body:
3289___
3290$code.=<<___;
3291	mov	$seventh_arg(%rax),$L_p		# 7th argument
3292	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
3293
3294	mov	240($key),$rnds_
3295	mov	$key,$key_
3296	shl	\$4,$rnds_
3297	$movkey	($key),$rndkey0l		# round[0]
3298	$movkey	16($key,$rnds_),$rndkey1	# round[last]
3299
3300	movdqu	($offset_p),@offset[5]		# load last offset_i
3301	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
3302	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
3303
3304	mov	\$16+32,$rounds
3305	lea	32($key_,$rnds_),$key
3306	$movkey	16($key_),$rndkey1		# round[1]
3307	sub	%r10,%rax			# twisted $rounds
3308	mov	%rax,%r10			# backup twisted $rounds
3309
3310	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
3311	movdqu	($checksum_p),$checksum		# load checksum
3312
3313	test	\$1,$block_num			# is first block number odd?
3314	jnz	.Locb_dec_odd
3315
3316	bsf	$block_num,$i1
3317	add	\$1,$block_num
3318	shl	\$4,$i1
3319	movdqu	($L_p,$i1),$inout5		# borrow
3320	movdqu	($inp),$inout0
3321	lea	16($inp),$inp
3322
3323	call	__ocb_decrypt1
3324
3325	movdqa	$inout5,@offset[5]
3326	movups	$inout0,($out)
3327	xorps	$inout0,$checksum		# accumulate checksum
3328	lea	16($out),$out
3329	sub	\$1,$blocks
3330	jz	.Locb_dec_done
3331
3332.Locb_dec_odd:
3333	lea	1($block_num),$i1		# even-numbered blocks
3334	lea	3($block_num),$i3
3335	lea	5($block_num),$i5
3336	lea	6($block_num),$block_num
3337	bsf	$i1,$i1				# ntz(block)
3338	bsf	$i3,$i3
3339	bsf	$i5,$i5
3340	shl	\$4,$i1				# ntz(block) -> table offset
3341	shl	\$4,$i3
3342	shl	\$4,$i5
3343
3344	sub	\$6,$blocks
3345	jc	.Locb_dec_short
3346	jmp	.Locb_dec_grandloop
3347
3348.align	32
3349.Locb_dec_grandloop:
3350	movdqu	`16*0`($inp),$inout0		# load input
3351	movdqu	`16*1`($inp),$inout1
3352	movdqu	`16*2`($inp),$inout2
3353	movdqu	`16*3`($inp),$inout3
3354	movdqu	`16*4`($inp),$inout4
3355	movdqu	`16*5`($inp),$inout5
3356	lea	`16*6`($inp),$inp
3357
3358	call	__ocb_decrypt6
3359
3360	movups	$inout0,`16*0`($out)		# store output
3361	pxor	$inout0,$checksum		# accumulate checksum
3362	movups	$inout1,`16*1`($out)
3363	pxor	$inout1,$checksum
3364	movups	$inout2,`16*2`($out)
3365	pxor	$inout2,$checksum
3366	movups	$inout3,`16*3`($out)
3367	pxor	$inout3,$checksum
3368	movups	$inout4,`16*4`($out)
3369	pxor	$inout4,$checksum
3370	movups	$inout5,`16*5`($out)
3371	pxor	$inout5,$checksum
3372	lea	`16*6`($out),$out
3373	sub	\$6,$blocks
3374	jnc	.Locb_dec_grandloop
3375
3376.Locb_dec_short:
3377	add	\$6,$blocks
3378	jz	.Locb_dec_done
3379
3380	movdqu	`16*0`($inp),$inout0
3381	cmp	\$2,$blocks
3382	jb	.Locb_dec_one
3383	movdqu	`16*1`($inp),$inout1
3384	je	.Locb_dec_two
3385
3386	movdqu	`16*2`($inp),$inout2
3387	cmp	\$4,$blocks
3388	jb	.Locb_dec_three
3389	movdqu	`16*3`($inp),$inout3
3390	je	.Locb_dec_four
3391
3392	movdqu	`16*4`($inp),$inout4
3393	pxor	$inout5,$inout5
3394
3395	call	__ocb_decrypt6
3396
3397	movdqa	@offset[4],@offset[5]
3398	movups	$inout0,`16*0`($out)		# store output
3399	pxor	$inout0,$checksum		# accumulate checksum
3400	movups	$inout1,`16*1`($out)
3401	pxor	$inout1,$checksum
3402	movups	$inout2,`16*2`($out)
3403	pxor	$inout2,$checksum
3404	movups	$inout3,`16*3`($out)
3405	pxor	$inout3,$checksum
3406	movups	$inout4,`16*4`($out)
3407	pxor	$inout4,$checksum
3408
3409	jmp	.Locb_dec_done
3410
3411.align	16
3412.Locb_dec_one:
3413	movdqa	@offset[0],$inout5		# borrow
3414
3415	call	__ocb_decrypt1
3416
3417	movdqa	$inout5,@offset[5]
3418	movups	$inout0,`16*0`($out)		# store output
3419	xorps	$inout0,$checksum		# accumulate checksum
3420	jmp	.Locb_dec_done
3421
3422.align	16
3423.Locb_dec_two:
3424	pxor	$inout2,$inout2
3425	pxor	$inout3,$inout3
3426
3427	call	__ocb_decrypt4
3428
3429	movdqa	@offset[1],@offset[5]
3430	movups	$inout0,`16*0`($out)		# store output
3431	xorps	$inout0,$checksum		# accumulate checksum
3432	movups	$inout1,`16*1`($out)
3433	xorps	$inout1,$checksum
3434
3435	jmp	.Locb_dec_done
3436
3437.align	16
3438.Locb_dec_three:
3439	pxor	$inout3,$inout3
3440
3441	call	__ocb_decrypt4
3442
3443	movdqa	@offset[2],@offset[5]
3444	movups	$inout0,`16*0`($out)		# store output
3445	xorps	$inout0,$checksum		# accumulate checksum
3446	movups	$inout1,`16*1`($out)
3447	xorps	$inout1,$checksum
3448	movups	$inout2,`16*2`($out)
3449	xorps	$inout2,$checksum
3450
3451	jmp	.Locb_dec_done
3452
3453.align	16
3454.Locb_dec_four:
3455	call	__ocb_decrypt4
3456
3457	movdqa	@offset[3],@offset[5]
3458	movups	$inout0,`16*0`($out)		# store output
3459	pxor	$inout0,$checksum		# accumulate checksum
3460	movups	$inout1,`16*1`($out)
3461	pxor	$inout1,$checksum
3462	movups	$inout2,`16*2`($out)
3463	pxor	$inout2,$checksum
3464	movups	$inout3,`16*3`($out)
3465	pxor	$inout3,$checksum
3466
3467.Locb_dec_done:
3468	pxor	$rndkey0,@offset[5]		# "remove" round[last]
3469	movdqu	$checksum,($checksum_p)		# store checksum
3470	movdqu	@offset[5],($offset_p)		# store last offset_i
3471
3472	xorps	%xmm0,%xmm0			# clear register bank
3473	pxor	%xmm1,%xmm1
3474	pxor	%xmm2,%xmm2
3475	pxor	%xmm3,%xmm3
3476	pxor	%xmm4,%xmm4
3477	pxor	%xmm5,%xmm5
3478___
3479$code.=<<___ if (!$win64);
3480	pxor	%xmm6,%xmm6
3481	pxor	%xmm7,%xmm7
3482	pxor	%xmm8,%xmm8
3483	pxor	%xmm9,%xmm9
3484	pxor	%xmm10,%xmm10
3485	pxor	%xmm11,%xmm11
3486	pxor	%xmm12,%xmm12
3487	pxor	%xmm13,%xmm13
3488	pxor	%xmm14,%xmm14
3489	pxor	%xmm15,%xmm15
3490	lea	0x28(%rsp),%rax
3491.cfi_def_cfa	%rax,8
3492___
3493$code.=<<___ if ($win64);
3494	movaps	0x00(%rsp),%xmm6
3495	movaps	%xmm0,0x00(%rsp)		# clear stack
3496	movaps	0x10(%rsp),%xmm7
3497	movaps	%xmm0,0x10(%rsp)
3498	movaps	0x20(%rsp),%xmm8
3499	movaps	%xmm0,0x20(%rsp)
3500	movaps	0x30(%rsp),%xmm9
3501	movaps	%xmm0,0x30(%rsp)
3502	movaps	0x40(%rsp),%xmm10
3503	movaps	%xmm0,0x40(%rsp)
3504	movaps	0x50(%rsp),%xmm11
3505	movaps	%xmm0,0x50(%rsp)
3506	movaps	0x60(%rsp),%xmm12
3507	movaps	%xmm0,0x60(%rsp)
3508	movaps	0x70(%rsp),%xmm13
3509	movaps	%xmm0,0x70(%rsp)
3510	movaps	0x80(%rsp),%xmm14
3511	movaps	%xmm0,0x80(%rsp)
3512	movaps	0x90(%rsp),%xmm15
3513	movaps	%xmm0,0x90(%rsp)
3514	lea	0xa0+0x28(%rsp),%rax
3515.Locb_dec_pop:
3516___
3517$code.=<<___;
3518	mov	-40(%rax),%r14
3519.cfi_restore	%r14
3520	mov	-32(%rax),%r13
3521.cfi_restore	%r13
3522	mov	-24(%rax),%r12
3523.cfi_restore	%r12
3524	mov	-16(%rax),%rbp
3525.cfi_restore	%rbp
3526	mov	-8(%rax),%rbx
3527.cfi_restore	%rbx
3528	lea	(%rax),%rsp
3529.cfi_def_cfa_register	%rsp
3530.Locb_dec_epilogue:
3531	ret
3532.cfi_endproc
3533.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
3534
3535.type	__ocb_decrypt6,\@abi-omnipotent
3536.align	32
3537__ocb_decrypt6:
3538.cfi_startproc
3539	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3540	 movdqu		($L_p,$i1),@offset[1]
3541	 movdqa		@offset[0],@offset[2]
3542	 movdqu		($L_p,$i3),@offset[3]
3543	 movdqa		@offset[0],@offset[4]
3544	 pxor		@offset[5],@offset[0]
3545	 movdqu		($L_p,$i5),@offset[5]
3546	 pxor		@offset[0],@offset[1]
3547	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3548	 pxor		@offset[1],@offset[2]
3549	pxor		@offset[1],$inout1
3550	 pxor		@offset[2],@offset[3]
3551	pxor		@offset[2],$inout2
3552	 pxor		@offset[3],@offset[4]
3553	pxor		@offset[3],$inout3
3554	 pxor		@offset[4],@offset[5]
3555	pxor		@offset[4],$inout4
3556	pxor		@offset[5],$inout5
3557	$movkey		32($key_),$rndkey0
3558
3559	lea		1($block_num),$i1	# even-numbered blocks
3560	lea		3($block_num),$i3
3561	lea		5($block_num),$i5
3562	add		\$6,$block_num
3563	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3564	bsf		$i1,$i1			# ntz(block)
3565	bsf		$i3,$i3
3566	bsf		$i5,$i5
3567
3568	aesdec		$rndkey1,$inout0
3569	aesdec		$rndkey1,$inout1
3570	aesdec		$rndkey1,$inout2
3571	aesdec		$rndkey1,$inout3
3572	 pxor		$rndkey0l,@offset[1]
3573	 pxor		$rndkey0l,@offset[2]
3574	aesdec		$rndkey1,$inout4
3575	 pxor		$rndkey0l,@offset[3]
3576	 pxor		$rndkey0l,@offset[4]
3577	aesdec		$rndkey1,$inout5
3578	$movkey		48($key_),$rndkey1
3579	 pxor		$rndkey0l,@offset[5]
3580
3581	aesdec		$rndkey0,$inout0
3582	aesdec		$rndkey0,$inout1
3583	aesdec		$rndkey0,$inout2
3584	aesdec		$rndkey0,$inout3
3585	aesdec		$rndkey0,$inout4
3586	aesdec		$rndkey0,$inout5
3587	$movkey		64($key_),$rndkey0
3588	shl		\$4,$i1			# ntz(block) -> table offset
3589	shl		\$4,$i3
3590	jmp		.Locb_dec_loop6
3591
3592.align	32
3593.Locb_dec_loop6:
3594	aesdec		$rndkey1,$inout0
3595	aesdec		$rndkey1,$inout1
3596	aesdec		$rndkey1,$inout2
3597	aesdec		$rndkey1,$inout3
3598	aesdec		$rndkey1,$inout4
3599	aesdec		$rndkey1,$inout5
3600	$movkey		($key,%rax),$rndkey1
3601	add		\$32,%rax
3602
3603	aesdec		$rndkey0,$inout0
3604	aesdec		$rndkey0,$inout1
3605	aesdec		$rndkey0,$inout2
3606	aesdec		$rndkey0,$inout3
3607	aesdec		$rndkey0,$inout4
3608	aesdec		$rndkey0,$inout5
3609	$movkey		-16($key,%rax),$rndkey0
3610	jnz		.Locb_dec_loop6
3611
3612	aesdec		$rndkey1,$inout0
3613	aesdec		$rndkey1,$inout1
3614	aesdec		$rndkey1,$inout2
3615	aesdec		$rndkey1,$inout3
3616	aesdec		$rndkey1,$inout4
3617	aesdec		$rndkey1,$inout5
3618	$movkey		16($key_),$rndkey1
3619	shl		\$4,$i5
3620
3621	aesdeclast	@offset[0],$inout0
3622	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3623	mov		%r10,%rax		# restore twisted rounds
3624	aesdeclast	@offset[1],$inout1
3625	aesdeclast	@offset[2],$inout2
3626	aesdeclast	@offset[3],$inout3
3627	aesdeclast	@offset[4],$inout4
3628	aesdeclast	@offset[5],$inout5
3629	ret
3630.cfi_endproc
3631.size	__ocb_decrypt6,.-__ocb_decrypt6
3632
3633.type	__ocb_decrypt4,\@abi-omnipotent
3634.align	32
3635__ocb_decrypt4:
3636.cfi_startproc
3637	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3638	 movdqu		($L_p,$i1),@offset[1]
3639	 movdqa		@offset[0],@offset[2]
3640	 movdqu		($L_p,$i3),@offset[3]
3641	 pxor		@offset[5],@offset[0]
3642	 pxor		@offset[0],@offset[1]
3643	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3644	 pxor		@offset[1],@offset[2]
3645	pxor		@offset[1],$inout1
3646	 pxor		@offset[2],@offset[3]
3647	pxor		@offset[2],$inout2
3648	pxor		@offset[3],$inout3
3649	$movkey		32($key_),$rndkey0
3650
3651	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3652	 pxor		$rndkey0l,@offset[1]
3653	 pxor		$rndkey0l,@offset[2]
3654	 pxor		$rndkey0l,@offset[3]
3655
3656	aesdec		$rndkey1,$inout0
3657	aesdec		$rndkey1,$inout1
3658	aesdec		$rndkey1,$inout2
3659	aesdec		$rndkey1,$inout3
3660	$movkey		48($key_),$rndkey1
3661
3662	aesdec		$rndkey0,$inout0
3663	aesdec		$rndkey0,$inout1
3664	aesdec		$rndkey0,$inout2
3665	aesdec		$rndkey0,$inout3
3666	$movkey		64($key_),$rndkey0
3667	jmp		.Locb_dec_loop4
3668
3669.align	32
3670.Locb_dec_loop4:
3671	aesdec		$rndkey1,$inout0
3672	aesdec		$rndkey1,$inout1
3673	aesdec		$rndkey1,$inout2
3674	aesdec		$rndkey1,$inout3
3675	$movkey		($key,%rax),$rndkey1
3676	add		\$32,%rax
3677
3678	aesdec		$rndkey0,$inout0
3679	aesdec		$rndkey0,$inout1
3680	aesdec		$rndkey0,$inout2
3681	aesdec		$rndkey0,$inout3
3682	$movkey		-16($key,%rax),$rndkey0
3683	jnz		.Locb_dec_loop4
3684
3685	aesdec		$rndkey1,$inout0
3686	aesdec		$rndkey1,$inout1
3687	aesdec		$rndkey1,$inout2
3688	aesdec		$rndkey1,$inout3
3689	$movkey		16($key_),$rndkey1
3690	mov		%r10,%rax		# restore twisted rounds
3691
3692	aesdeclast	@offset[0],$inout0
3693	aesdeclast	@offset[1],$inout1
3694	aesdeclast	@offset[2],$inout2
3695	aesdeclast	@offset[3],$inout3
3696	ret
3697.cfi_endproc
3698.size	__ocb_decrypt4,.-__ocb_decrypt4
3699
3700.type	__ocb_decrypt1,\@abi-omnipotent
3701.align	32
3702__ocb_decrypt1:
3703.cfi_startproc
3704	 pxor		@offset[5],$inout5	# offset_i
3705	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3706	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3707	$movkey		32($key_),$rndkey0
3708
3709	aesdec		$rndkey1,$inout0
3710	$movkey		48($key_),$rndkey1
3711	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3712
3713	aesdec		$rndkey0,$inout0
3714	$movkey		64($key_),$rndkey0
3715	jmp		.Locb_dec_loop1
3716
3717.align	32
3718.Locb_dec_loop1:
3719	aesdec		$rndkey1,$inout0
3720	$movkey		($key,%rax),$rndkey1
3721	add		\$32,%rax
3722
3723	aesdec		$rndkey0,$inout0
3724	$movkey		-16($key,%rax),$rndkey0
3725	jnz		.Locb_dec_loop1
3726
3727	aesdec		$rndkey1,$inout0
3728	$movkey		16($key_),$rndkey1	# redundant in tail
3729	mov		%r10,%rax		# restore twisted rounds
3730
3731	aesdeclast	$inout5,$inout0
3732	ret
3733.cfi_endproc
3734.size	__ocb_decrypt1,.-__ocb_decrypt1
3735___
3736} }}
3737
3738########################################################################
3739# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3740#			    size_t length, const AES_KEY *key,
3741#			    unsigned char *ivp,const int enc);
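# The chaining implemented below, as a C-like sketch over 16-byte blocks
# (in-place operation, which the decrypt path supports by keeping copies of
# the ciphertext, is ignored here):
#
#	if (enc)		/* serial: each block feeds the next */
#		for (i = 0; i < blocks; i++)
#			iv = out[i] = AES_encrypt(in[i] ^ iv, key);
#	else			/* parallelizable: done 6-8 blocks at a time */
#		for (i = 0; i < blocks; i++)
#			out[i] = AES_decrypt(in[i], key) ^ (i ? in[i-1] : iv);
#
# The routine takes the length in bytes and writes the final chaining value
# back through ivp.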
3742{
3743my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
3744my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3745
3746$code.=<<___;
3747.globl	${PREFIX}_cbc_encrypt
3748.type	${PREFIX}_cbc_encrypt,\@function,6
3749.align	16
3750${PREFIX}_cbc_encrypt:
3751.cfi_startproc
3752	endbranch
3753	test	$len,$len		# check length
3754	jz	.Lcbc_ret
3755
3756	mov	240($key),$rnds_	# key->rounds
3757	mov	$key,$key_		# backup $key
3758	test	%r9d,%r9d		# 6th argument
3759	jz	.Lcbc_decrypt
3760#--------------------------- CBC ENCRYPT ------------------------------#
3761	movups	($ivp),$inout0		# load iv as initial state
3762	mov	$rnds_,$rounds
3763	cmp	\$16,$len
3764	jb	.Lcbc_enc_tail
3765	sub	\$16,$len
3766	jmp	.Lcbc_enc_loop
3767.align	16
3768.Lcbc_enc_loop:
3769	movups	($inp),$inout1		# load input
3770	lea	16($inp),$inp
3771	#xorps	$inout1,$inout0
3772___
3773	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3774$code.=<<___;
3775	mov	$rnds_,$rounds		# restore $rounds
3776	mov	$key_,$key		# restore $key
3777	movups	$inout0,0($out)		# store output
3778	lea	16($out),$out
3779	sub	\$16,$len
3780	jnc	.Lcbc_enc_loop
3781	add	\$16,$len
3782	jnz	.Lcbc_enc_tail
3783	 pxor	$rndkey0,$rndkey0	# clear register bank
3784	 pxor	$rndkey1,$rndkey1
3785	movups	$inout0,($ivp)
3786	 pxor	$inout0,$inout0
3787	 pxor	$inout1,$inout1
3788	jmp	.Lcbc_ret
3789
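	# Partial final block: copy the remaining $len<16 input bytes to $out,
	# zero-pad them to a full block there, then run the loop once more with
	# $inp == $out so the padded block is encrypted in place.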
3790.Lcbc_enc_tail:
3791	mov	$len,%rcx	# zaps $key
3792	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
3793	.long	0x9066A4F3	# rep movsb
3794	mov	\$16,%ecx	# zero tail
3795	sub	$len,%rcx
3796	xor	%eax,%eax
3797	.long	0x9066AAF3	# rep stosb
3798	lea	-16(%rdi),%rdi	# rewind $out by 1 block
3799	mov	$rnds_,$rounds	# restore $rounds
3800	mov	%rdi,%rsi	# $inp and $out are the same
3801	mov	$key_,$key	# restore $key
3802	xor	$len,$len	# in effect len=16: one last full block
3803	jmp	.Lcbc_enc_loop	# one more spin
3804#--------------------------- CBC DECRYPT ------------------------------#
3805.align	16
3806.Lcbc_decrypt:
3807	cmp	\$16,$len
3808	jne	.Lcbc_decrypt_bulk
3809
3810	# handle single block without allocating stack frame,
3811	# useful in ciphertext stealing mode
3812	movdqu	($inp),$inout0		# load input
3813	movdqu	($ivp),$inout1		# load iv
3814	movdqa	$inout0,$inout2		# future iv
3815___
3816	&aesni_generate1("dec",$key,$rnds_);
3817$code.=<<___;
3818	 pxor	$rndkey0,$rndkey0	# clear register bank
3819	 pxor	$rndkey1,$rndkey1
3820	movdqu	$inout2,($ivp)		# store iv
3821	xorps	$inout1,$inout0		# ^=iv
3822	 pxor	$inout1,$inout1
3823	movups	$inout0,($out)		# store output
3824	 pxor	$inout0,$inout0
3825	jmp	.Lcbc_ret
3826.align	16
3827.Lcbc_decrypt_bulk:
3828	lea	(%rsp),%r11		# frame pointer
3829.cfi_def_cfa_register	%r11
3830	push	%rbp
3831.cfi_push	%rbp
3832	sub	\$$frame_size,%rsp
3833	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
3834___
3835$code.=<<___ if ($win64);
3836	movaps	%xmm6,0x10(%rsp)
3837	movaps	%xmm7,0x20(%rsp)
3838	movaps	%xmm8,0x30(%rsp)
3839	movaps	%xmm9,0x40(%rsp)
3840	movaps	%xmm10,0x50(%rsp)
3841	movaps	%xmm11,0x60(%rsp)
3842	movaps	%xmm12,0x70(%rsp)
3843	movaps	%xmm13,0x80(%rsp)
3844	movaps	%xmm14,0x90(%rsp)
3845	movaps	%xmm15,0xa0(%rsp)
3846.Lcbc_decrypt_body:
3847___
3848
3849my $inp_=$key_="%rbp";			# reassign $key_
3850
3851$code.=<<___;
3852	mov	$key,$key_		# [re-]backup $key [after reassignment]
3853	movups	($ivp),$iv
3854	mov	$rnds_,$rounds
3855	cmp	\$0x50,$len
3856	jbe	.Lcbc_dec_tail
3857
3858	$movkey	($key),$rndkey0
3859	movdqu	0x00($inp),$inout0	# load input
3860	movdqu	0x10($inp),$inout1
3861	movdqa	$inout0,$in0
3862	movdqu	0x20($inp),$inout2
3863	movdqa	$inout1,$in1
3864	movdqu	0x30($inp),$inout3
3865	movdqa	$inout2,$in2
3866	movdqu	0x40($inp),$inout4
3867	movdqa	$inout3,$in3
3868	movdqu	0x50($inp),$inout5
3869	movdqa	$inout4,$in4
3870	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
3871	cmp	\$0x70,$len
3872	jbe	.Lcbc_dec_six_or_seven
3873
3874	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
3875	sub	\$0x50,$len		# $len is biased by -5*16
3876	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
3877	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
3878	sub	\$0x20,$len		# $len is biased by -7*16
3879	lea	0x70($key),$key		# size optimization
3880	jmp	.Lcbc_dec_loop8_enter
3881.align	16
3882.Lcbc_dec_loop8:
3883	movups	$inout7,($out)
3884	lea	0x10($out),$out
3885.Lcbc_dec_loop8_enter:
3886	movdqu		0x60($inp),$inout6
3887	pxor		$rndkey0,$inout0
3888	movdqu		0x70($inp),$inout7
3889	pxor		$rndkey0,$inout1
3890	$movkey		0x10-0x70($key),$rndkey1
3891	pxor		$rndkey0,$inout2
3892	mov		\$-1,$inp_
3893	cmp		\$0x70,$len	# are there at least 0x60 bytes ahead?
3894	pxor		$rndkey0,$inout3
3895	pxor		$rndkey0,$inout4
3896	pxor		$rndkey0,$inout5
3897	pxor		$rndkey0,$inout6
3898
3899	aesdec		$rndkey1,$inout0
3900	pxor		$rndkey0,$inout7
3901	$movkey		0x20-0x70($key),$rndkey0
3902	aesdec		$rndkey1,$inout1
3903	aesdec		$rndkey1,$inout2
3904	aesdec		$rndkey1,$inout3
3905	aesdec		$rndkey1,$inout4
3906	aesdec		$rndkey1,$inout5
3907	aesdec		$rndkey1,$inout6
3908	adc		\$0,$inp_
3909	and		\$128,$inp_
3910	aesdec		$rndkey1,$inout7
3911	add		$inp,$inp_
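	# Branchless select (mov -1 / adc 0 / and 128 above): the prefetch
	# pointer now equals the input pointer plus 0x80 when at least eight
	# more blocks follow, and the unadvanced input pointer otherwise, so
	# the loads from it below never read past the supplied buffer.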
3912	$movkey		0x30-0x70($key),$rndkey1
3913___
3914for($i=1;$i<12;$i++) {
3915my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3916$code.=<<___	if ($i==7);
3917	cmp		\$11,$rounds
3918___
3919$code.=<<___;
3920	aesdec		$rndkeyx,$inout0
3921	aesdec		$rndkeyx,$inout1
3922	aesdec		$rndkeyx,$inout2
3923	aesdec		$rndkeyx,$inout3
3924	aesdec		$rndkeyx,$inout4
3925	aesdec		$rndkeyx,$inout5
3926	aesdec		$rndkeyx,$inout6
3927	aesdec		$rndkeyx,$inout7
3928	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
3929___
3930$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
3931	nop
3932___
3933$code.=<<___	if ($i==7);
3934	jb		.Lcbc_dec_done
3935___
3936$code.=<<___	if ($i==9);
3937	je		.Lcbc_dec_done
3938___
3939$code.=<<___	if ($i==11);
3940	jmp		.Lcbc_dec_done
3941___
3942}
3943$code.=<<___;
3944.align	16
3945.Lcbc_dec_done:
3946	aesdec		$rndkey1,$inout0
3947	aesdec		$rndkey1,$inout1
3948	pxor		$rndkey0,$iv
3949	pxor		$rndkey0,$in0
3950	aesdec		$rndkey1,$inout2
3951	aesdec		$rndkey1,$inout3
3952	pxor		$rndkey0,$in1
3953	pxor		$rndkey0,$in2
3954	aesdec		$rndkey1,$inout4
3955	aesdec		$rndkey1,$inout5
3956	pxor		$rndkey0,$in3
3957	pxor		$rndkey0,$in4
3958	aesdec		$rndkey1,$inout6
3959	aesdec		$rndkey1,$inout7
3960	movdqu		0x50($inp),$rndkey1
3961
3962	aesdeclast	$iv,$inout0
3963	movdqu		0x60($inp),$iv		# borrow $iv
3964	pxor		$rndkey0,$rndkey1
3965	aesdeclast	$in0,$inout1
3966	pxor		$rndkey0,$iv
3967	movdqu		0x70($inp),$rndkey0	# next IV
3968	aesdeclast	$in1,$inout2
3969	lea		0x80($inp),$inp
3970	movdqu		0x00($inp_),$in0
3971	aesdeclast	$in2,$inout3
3972	aesdeclast	$in3,$inout4
3973	movdqu		0x10($inp_),$in1
3974	movdqu		0x20($inp_),$in2
3975	aesdeclast	$in4,$inout5
3976	aesdeclast	$rndkey1,$inout6
3977	movdqu		0x30($inp_),$in3
3978	movdqu		0x40($inp_),$in4
3979	aesdeclast	$iv,$inout7
3980	movdqa		$rndkey0,$iv		# return $iv
3981	movdqu		0x50($inp_),$rndkey1
3982	$movkey		-0x70($key),$rndkey0
3983
3984	movups		$inout0,($out)		# store output
3985	movdqa		$in0,$inout0
3986	movups		$inout1,0x10($out)
3987	movdqa		$in1,$inout1
3988	movups		$inout2,0x20($out)
3989	movdqa		$in2,$inout2
3990	movups		$inout3,0x30($out)
3991	movdqa		$in3,$inout3
3992	movups		$inout4,0x40($out)
3993	movdqa		$in4,$inout4
3994	movups		$inout5,0x50($out)
3995	movdqa		$rndkey1,$inout5
3996	movups		$inout6,0x60($out)
3997	lea		0x70($out),$out
3998
3999	sub	\$0x80,$len
4000	ja	.Lcbc_dec_loop8
4001
4002	movaps	$inout7,$inout0
4003	lea	-0x70($key),$key
4004	add	\$0x70,$len
4005	jle	.Lcbc_dec_clear_tail_collected
4006	movups	$inout7,($out)
4007	lea	0x10($out),$out
4008	cmp	\$0x50,$len
4009	jbe	.Lcbc_dec_tail
4010
4011	movaps	$in0,$inout0
4012.Lcbc_dec_six_or_seven:
4013	cmp	\$0x60,$len
4014	ja	.Lcbc_dec_seven
4015
4016	movaps	$inout5,$inout6
4017	call	_aesni_decrypt6
4018	pxor	$iv,$inout0		# ^= IV
4019	movaps	$inout6,$iv
4020	pxor	$in0,$inout1
4021	movdqu	$inout0,($out)
4022	pxor	$in1,$inout2
4023	movdqu	$inout1,0x10($out)
4024	 pxor	$inout1,$inout1		# clear register bank
4025	pxor	$in2,$inout3
4026	movdqu	$inout2,0x20($out)
4027	 pxor	$inout2,$inout2
4028	pxor	$in3,$inout4
4029	movdqu	$inout3,0x30($out)
4030	 pxor	$inout3,$inout3
4031	pxor	$in4,$inout5
4032	movdqu	$inout4,0x40($out)
4033	 pxor	$inout4,$inout4
4034	lea	0x50($out),$out
4035	movdqa	$inout5,$inout0
4036	 pxor	$inout5,$inout5
4037	jmp	.Lcbc_dec_tail_collected
4038
4039.align	16
4040.Lcbc_dec_seven:
4041	movups	0x60($inp),$inout6
4042	xorps	$inout7,$inout7
4043	call	_aesni_decrypt8
4044	movups	0x50($inp),$inout7
4045	pxor	$iv,$inout0		# ^= IV
4046	movups	0x60($inp),$iv
4047	pxor	$in0,$inout1
4048	movdqu	$inout0,($out)
4049	pxor	$in1,$inout2
4050	movdqu	$inout1,0x10($out)
4051	 pxor	$inout1,$inout1		# clear register bank
4052	pxor	$in2,$inout3
4053	movdqu	$inout2,0x20($out)
4054	 pxor	$inout2,$inout2
4055	pxor	$in3,$inout4
4056	movdqu	$inout3,0x30($out)
4057	 pxor	$inout3,$inout3
4058	pxor	$in4,$inout5
4059	movdqu	$inout4,0x40($out)
4060	 pxor	$inout4,$inout4
4061	pxor	$inout7,$inout6
4062	movdqu	$inout5,0x50($out)
4063	 pxor	$inout5,$inout5
4064	lea	0x60($out),$out
4065	movdqa	$inout6,$inout0
4066	 pxor	$inout6,$inout6
4067	 pxor	$inout7,$inout7
4068	jmp	.Lcbc_dec_tail_collected
4069
4070.align	16
4071.Lcbc_dec_loop6:
4072	movups	$inout5,($out)
4073	lea	0x10($out),$out
4074	movdqu	0x00($inp),$inout0	# load input
4075	movdqu	0x10($inp),$inout1
4076	movdqa	$inout0,$in0
4077	movdqu	0x20($inp),$inout2
4078	movdqa	$inout1,$in1
4079	movdqu	0x30($inp),$inout3
4080	movdqa	$inout2,$in2
4081	movdqu	0x40($inp),$inout4
4082	movdqa	$inout3,$in3
4083	movdqu	0x50($inp),$inout5
4084	movdqa	$inout4,$in4
4085.Lcbc_dec_loop6_enter:
4086	lea	0x60($inp),$inp
4087	movdqa	$inout5,$inout6
4088
4089	call	_aesni_decrypt6
4090
4091	pxor	$iv,$inout0		# ^= IV
4092	movdqa	$inout6,$iv
4093	pxor	$in0,$inout1
4094	movdqu	$inout0,($out)
4095	pxor	$in1,$inout2
4096	movdqu	$inout1,0x10($out)
4097	pxor	$in2,$inout3
4098	movdqu	$inout2,0x20($out)
4099	pxor	$in3,$inout4
4100	mov	$key_,$key
4101	movdqu	$inout3,0x30($out)
4102	pxor	$in4,$inout5
4103	mov	$rnds_,$rounds
4104	movdqu	$inout4,0x40($out)
4105	lea	0x50($out),$out
4106	sub	\$0x60,$len
4107	ja	.Lcbc_dec_loop6
4108
4109	movdqa	$inout5,$inout0
4110	add	\$0x50,$len
4111	jle	.Lcbc_dec_clear_tail_collected
4112	movups	$inout5,($out)
4113	lea	0x10($out),$out
4114
4115.Lcbc_dec_tail:
4116	movups	($inp),$inout0
4117	sub	\$0x10,$len
4118	jbe	.Lcbc_dec_one		# $len is 1*16 or less
4119
4120	movups	0x10($inp),$inout1
4121	movaps	$inout0,$in0
4122	sub	\$0x10,$len
4123	jbe	.Lcbc_dec_two		# $len is 2*16 or less
4124
4125	movups	0x20($inp),$inout2
4126	movaps	$inout1,$in1
4127	sub	\$0x10,$len
4128	jbe	.Lcbc_dec_three		# $len is 3*16 or less
4129
4130	movups	0x30($inp),$inout3
4131	movaps	$inout2,$in2
4132	sub	\$0x10,$len
4133	jbe	.Lcbc_dec_four		# $len is 4*16 or less
4134
4135	movups	0x40($inp),$inout4	# $len is 5*16 or less
4136	movaps	$inout3,$in3
4137	movaps	$inout4,$in4
4138	xorps	$inout5,$inout5
4139	call	_aesni_decrypt6
4140	pxor	$iv,$inout0
4141	movaps	$in4,$iv
4142	pxor	$in0,$inout1
4143	movdqu	$inout0,($out)
4144	pxor	$in1,$inout2
4145	movdqu	$inout1,0x10($out)
4146	 pxor	$inout1,$inout1		# clear register bank
4147	pxor	$in2,$inout3
4148	movdqu	$inout2,0x20($out)
4149	 pxor	$inout2,$inout2
4150	pxor	$in3,$inout4
4151	movdqu	$inout3,0x30($out)
4152	 pxor	$inout3,$inout3
4153	lea	0x40($out),$out
4154	movdqa	$inout4,$inout0
4155	 pxor	$inout4,$inout4
4156	 pxor	$inout5,$inout5
4157	sub	\$0x10,$len
4158	jmp	.Lcbc_dec_tail_collected
4159
4160.align	16
4161.Lcbc_dec_one:
4162	movaps	$inout0,$in0
4163___
4164	&aesni_generate1("dec",$key,$rounds);
4165$code.=<<___;
4166	xorps	$iv,$inout0
4167	movaps	$in0,$iv
4168	jmp	.Lcbc_dec_tail_collected
4169.align	16
4170.Lcbc_dec_two:
4171	movaps	$inout1,$in1
4172	call	_aesni_decrypt2
4173	pxor	$iv,$inout0
4174	movaps	$in1,$iv
4175	pxor	$in0,$inout1
4176	movdqu	$inout0,($out)
4177	movdqa	$inout1,$inout0
4178	 pxor	$inout1,$inout1		# clear register bank
4179	lea	0x10($out),$out
4180	jmp	.Lcbc_dec_tail_collected
4181.align	16
4182.Lcbc_dec_three:
4183	movaps	$inout2,$in2
4184	call	_aesni_decrypt3
4185	pxor	$iv,$inout0
4186	movaps	$in2,$iv
4187	pxor	$in0,$inout1
4188	movdqu	$inout0,($out)
4189	pxor	$in1,$inout2
4190	movdqu	$inout1,0x10($out)
4191	 pxor	$inout1,$inout1		# clear register bank
4192	movdqa	$inout2,$inout0
4193	 pxor	$inout2,$inout2
4194	lea	0x20($out),$out
4195	jmp	.Lcbc_dec_tail_collected
4196.align	16
4197.Lcbc_dec_four:
4198	movaps	$inout3,$in3
4199	call	_aesni_decrypt4
4200	pxor	$iv,$inout0
4201	movaps	$in3,$iv
4202	pxor	$in0,$inout1
4203	movdqu	$inout0,($out)
4204	pxor	$in1,$inout2
4205	movdqu	$inout1,0x10($out)
4206	 pxor	$inout1,$inout1		# clear register bank
4207	pxor	$in2,$inout3
4208	movdqu	$inout2,0x20($out)
4209	 pxor	$inout2,$inout2
4210	movdqa	$inout3,$inout0
4211	 pxor	$inout3,$inout3
4212	lea	0x30($out),$out
4213	jmp	.Lcbc_dec_tail_collected
4214
4215.align	16
4216.Lcbc_dec_clear_tail_collected:
4217	pxor	$inout1,$inout1		# clear register bank
4218	pxor	$inout2,$inout2
4219	pxor	$inout3,$inout3
4220___
4221$code.=<<___ if (!$win64);
4222	pxor	$inout4,$inout4		# %xmm6..9
4223	pxor	$inout5,$inout5
4224	pxor	$inout6,$inout6
4225	pxor	$inout7,$inout7
4226___
4227$code.=<<___;
4228.Lcbc_dec_tail_collected:
4229	movups	$iv,($ivp)
4230	and	\$15,$len
4231	jnz	.Lcbc_dec_tail_partial
4232	movups	$inout0,($out)
4233	pxor	$inout0,$inout0
4234	jmp	.Lcbc_dec_ret
4235.align	16
4236.Lcbc_dec_tail_partial:
4237	movaps	$inout0,(%rsp)
4238	pxor	$inout0,$inout0
4239	mov	\$16,%rcx
4240	mov	$out,%rdi
4241	sub	$len,%rcx
4242	lea	(%rsp),%rsi
4243	.long	0x9066A4F3		# rep movsb
4244	movdqa	$inout0,(%rsp)
4245
4246.Lcbc_dec_ret:
4247	xorps	$rndkey0,$rndkey0	# %xmm0
4248	pxor	$rndkey1,$rndkey1
4249___
4250$code.=<<___ if ($win64);
4251	movaps	0x10(%rsp),%xmm6
4252	movaps	%xmm0,0x10(%rsp)	# clear stack
4253	movaps	0x20(%rsp),%xmm7
4254	movaps	%xmm0,0x20(%rsp)
4255	movaps	0x30(%rsp),%xmm8
4256	movaps	%xmm0,0x30(%rsp)
4257	movaps	0x40(%rsp),%xmm9
4258	movaps	%xmm0,0x40(%rsp)
4259	movaps	0x50(%rsp),%xmm10
4260	movaps	%xmm0,0x50(%rsp)
4261	movaps	0x60(%rsp),%xmm11
4262	movaps	%xmm0,0x60(%rsp)
4263	movaps	0x70(%rsp),%xmm12
4264	movaps	%xmm0,0x70(%rsp)
4265	movaps	0x80(%rsp),%xmm13
4266	movaps	%xmm0,0x80(%rsp)
4267	movaps	0x90(%rsp),%xmm14
4268	movaps	%xmm0,0x90(%rsp)
4269	movaps	0xa0(%rsp),%xmm15
4270	movaps	%xmm0,0xa0(%rsp)
4271___
4272$code.=<<___;
4273	mov	-8(%r11),%rbp
4274.cfi_restore	%rbp
4275	lea	(%r11),%rsp
4276.cfi_def_cfa_register	%rsp
4277.Lcbc_ret:
4278	ret
4279.cfi_endproc
4280.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4281___
4282}
4283# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4284#				int bits, AES_KEY *key)
4285#
4286# input:	$inp	user-supplied key
4287#		$bits	$inp length in bits
4288#		$key	pointer to key schedule
4289# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C)
4290#		*$key	key schedule
4291#
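# The schedule is inverted in place: __aesni_set_encrypt_key builds the
# encryption schedule first, the round keys are then swapped end for
# end, and aesimc (InvMixColumns) is applied to every round key except
# the first and last, yielding the "equivalent inverse cipher" schedule
# that aesdec expects.  A hypothetical C-level sketch of the decryption
# path, mirroring the encryption example further up (ct, pt, len and iv
# are placeholders):
#
#	AES_KEY dks;
#	if (aesni_set_decrypt_key(user_key, 128, &dks) == 0)
#		aesni_cbc_encrypt(ct, pt, len, &dks, iv, 0);	/* enc=0 */
#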
4292{ my ($inp,$bits,$key) = @_4args;
4293  $bits =~ s/%r/%e/;
4294
4295$code.=<<___;
4296.globl	${PREFIX}_set_decrypt_key
4297.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
4298.align	16
4299${PREFIX}_set_decrypt_key:
4300.cfi_startproc
4301	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4302.cfi_adjust_cfa_offset	8
4303	call	__aesni_set_encrypt_key
4304	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
4305	test	%eax,%eax
4306	jnz	.Ldec_key_ret
4307	lea	16($key,$bits),$inp	# points at the end of key schedule
4308
4309	$movkey	($key),%xmm0		# just swap
4310	$movkey	($inp),%xmm1
4311	$movkey	%xmm0,($inp)
4312	$movkey	%xmm1,($key)
4313	lea	16($key),$key
4314	lea	-16($inp),$inp
4315
4316.Ldec_key_inverse:
4317	$movkey	($key),%xmm0		# swap and inverse
4318	$movkey	($inp),%xmm1
4319	aesimc	%xmm0,%xmm0
4320	aesimc	%xmm1,%xmm1
4321	lea	16($key),$key
4322	lea	-16($inp),$inp
4323	$movkey	%xmm0,16($inp)
4324	$movkey	%xmm1,-16($key)
4325	cmp	$key,$inp
4326	ja	.Ldec_key_inverse
4327
4328	$movkey	($key),%xmm0		# inverse middle
4329	aesimc	%xmm0,%xmm0
4330	pxor	%xmm1,%xmm1
4331	$movkey	%xmm0,($inp)
4332	pxor	%xmm0,%xmm0
4333.Ldec_key_ret:
4334	add	\$8,%rsp
4335.cfi_adjust_cfa_offset	-8
4336	ret
4337.cfi_endproc
4338.LSEH_end_set_decrypt_key:
4339.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4340___
4341
4342# This is based on a submission from Intel by
4343#	Huang Ying
4344#	Vinodh Gopal
4345#	Kahraman Akdemir
4346#
4347# Aggressively optimized with respect to aeskeygenassist's critical path
4348# and confined to %xmm0-5 to meet the Win64 ABI requirement.
4349#
4350# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4351#				int bits, AES_KEY * const key);
4352#
4353# input:	$inp	user-supplied key
4354#		$bits	$inp length in bits
4355#		$key	pointer to key schedule
4356# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C)
4357#		$bits	rounds-1 (used in aesni_set_decrypt_key)
4358#		*$key	key schedule
4359#		$key	pointer to key schedule (used in
4360#			aesni_set_decrypt_key)
4361#
4362# The subroutine is frame-less, which means that only volatile registers
4363# are used. Note that it's declared "abi-omnipotent", which means that
4364# the set of volatile registers is smaller on Windows.
4365#
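# Dispatch note: the OPENSSL_ia32cap_P word is probed for the AVX and
# XOP bits, and the "AVX, but no XOP" pattern routes 128/192/256-bit
# setup to the .L*rounds_alt paths below, which expand the schedule with
# pshufb+aesenclast instead of aeskeygenassist.  In every path the round
# count ends up at byte offset 240 of the AES_KEY, where the block
# routines read it back as key->rounds.
#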
4366$code.=<<___;
4367.globl	${PREFIX}_set_encrypt_key
4368.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
4369.align	16
4370${PREFIX}_set_encrypt_key:
4371__aesni_set_encrypt_key:
4372.cfi_startproc
4373	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4374.cfi_adjust_cfa_offset	8
4375	mov	\$-1,%rax
4376	test	$inp,$inp
4377	jz	.Lenc_key_ret
4378	test	$key,$key
4379	jz	.Lenc_key_ret
4380
4381	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
4382	movups	($inp),%xmm0		# pull first 128 bits of *userKey
4383	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
4384	and	OPENSSL_ia32cap_P+4(%rip),%r10d
4385	lea	16($key),%rax		# %rax is used as modifiable copy of $key
4386	cmp	\$256,$bits
4387	je	.L14rounds
4388	cmp	\$192,$bits
4389	je	.L12rounds
4390	cmp	\$128,$bits
4391	jne	.Lbad_keybits
4392
4393.L10rounds:
4394	mov	\$9,$bits			# 10 rounds for 128-bit key
4395	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4396	je	.L10rounds_alt
4397
4398	$movkey	%xmm0,($key)			# round 0
4399	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
4400	call		.Lkey_expansion_128_cold
4401	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
4402	call		.Lkey_expansion_128
4403	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
4404	call		.Lkey_expansion_128
4405	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
4406	call		.Lkey_expansion_128
4407	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
4408	call		.Lkey_expansion_128
4409	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
4410	call		.Lkey_expansion_128
4411	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
4412	call		.Lkey_expansion_128
4413	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
4414	call		.Lkey_expansion_128
4415	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
4416	call		.Lkey_expansion_128
4417	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
4418	call		.Lkey_expansion_128
4419	$movkey	%xmm0,(%rax)
4420	mov	$bits,80(%rax)	# 240(%rdx)
4421	xor	%eax,%eax
4422	jmp	.Lenc_key_ret
4423
4424.align	16
4425.L10rounds_alt:
4426	movdqa	.Lkey_rotate(%rip),%xmm5
4427	mov	\$8,%r10d
4428	movdqa	.Lkey_rcon1(%rip),%xmm4
4429	movdqa	%xmm0,%xmm2
4430	movdqu	%xmm0,($key)
4431	jmp	.Loop_key128
4432
4433.align	16
4434.Loop_key128:
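	# Expansion without aeskeygenassist: pshufb broadcasts RotWord of the
	# previous round key's last word into all four lanes, aesenclast then
	# applies SubBytes and XORs in the round constant held in %xmm4
	# (ShiftRows is a no-op on a broadcast value), and the pslldq/pxor
	# steps below form the running XOR of the previous round key's words,
	# so the final pxor yields the next round key.  The 192- and 256-bit
	# loops further down reuse the same idea.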
4435	pshufb		%xmm5,%xmm0
4436	aesenclast	%xmm4,%xmm0
4437	pslld		\$1,%xmm4
4438	lea		16(%rax),%rax
4439
4440	movdqa		%xmm2,%xmm3
4441	pslldq		\$4,%xmm2
4442	pxor		%xmm2,%xmm3
4443	pslldq		\$4,%xmm2
4444	pxor		%xmm2,%xmm3
4445	pslldq		\$4,%xmm2
4446	pxor		%xmm3,%xmm2
4447
4448	pxor		%xmm2,%xmm0
4449	movdqu		%xmm0,-16(%rax)
4450	movdqa		%xmm0,%xmm2
4451
4452	dec	%r10d
4453	jnz	.Loop_key128
4454
4455	movdqa		.Lkey_rcon1b(%rip),%xmm4
4456
4457	pshufb		%xmm5,%xmm0
4458	aesenclast	%xmm4,%xmm0
4459	pslld		\$1,%xmm4
4460
4461	movdqa		%xmm2,%xmm3
4462	pslldq		\$4,%xmm2
4463	pxor		%xmm2,%xmm3
4464	pslldq		\$4,%xmm2
4465	pxor		%xmm2,%xmm3
4466	pslldq		\$4,%xmm2
4467	pxor		%xmm3,%xmm2
4468
4469	pxor		%xmm2,%xmm0
4470	movdqu		%xmm0,(%rax)
4471
4472	movdqa		%xmm0,%xmm2
4473	pshufb		%xmm5,%xmm0
4474	aesenclast	%xmm4,%xmm0
4475
4476	movdqa		%xmm2,%xmm3
4477	pslldq		\$4,%xmm2
4478	pxor		%xmm2,%xmm3
4479	pslldq		\$4,%xmm2
4480	pxor		%xmm2,%xmm3
4481	pslldq		\$4,%xmm2
4482	pxor		%xmm3,%xmm2
4483
4484	pxor		%xmm2,%xmm0
4485	movdqu		%xmm0,16(%rax)
4486
4487	mov	$bits,96(%rax)	# 240($key)
4488	xor	%eax,%eax
4489	jmp	.Lenc_key_ret
4490
4491.align	16
4492.L12rounds:
4493	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
4494	mov	\$11,$bits			# 12 rounds for 192
4495	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4496	je	.L12rounds_alt
4497
4498	$movkey	%xmm0,($key)			# round 0
4499	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
4500	call		.Lkey_expansion_192a_cold
4501	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
4502	call		.Lkey_expansion_192b
4503	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
4504	call		.Lkey_expansion_192a
4505	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
4506	call		.Lkey_expansion_192b
4507	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
4508	call		.Lkey_expansion_192a
4509	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
4510	call		.Lkey_expansion_192b
4511	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
4512	call		.Lkey_expansion_192a
4513	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
4514	call		.Lkey_expansion_192b
4515	$movkey	%xmm0,(%rax)
4516	mov	$bits,48(%rax)	# 240(%rdx)
4517	xor	%rax,%rax
4518	jmp	.Lenc_key_ret
4519
4520.align	16
4521.L12rounds_alt:
4522	movdqa	.Lkey_rotate192(%rip),%xmm5
4523	movdqa	.Lkey_rcon1(%rip),%xmm4
4524	mov	\$8,%r10d
4525	movdqu	%xmm0,($key)
4526	jmp	.Loop_key192
4527
4528.align	16
4529.Loop_key192:
4530	movq		%xmm2,0(%rax)
4531	movdqa		%xmm2,%xmm1
4532	pshufb		%xmm5,%xmm2
4533	aesenclast	%xmm4,%xmm2
4534	pslld		\$1, %xmm4
4535	lea		24(%rax),%rax
4536
4537	movdqa		%xmm0,%xmm3
4538	pslldq		\$4,%xmm0
4539	pxor		%xmm0,%xmm3
4540	pslldq		\$4,%xmm0
4541	pxor		%xmm0,%xmm3
4542	pslldq		\$4,%xmm0
4543	pxor		%xmm3,%xmm0
4544
4545	pshufd		\$0xff,%xmm0,%xmm3
4546	pxor		%xmm1,%xmm3
4547	pslldq		\$4,%xmm1
4548	pxor		%xmm1,%xmm3
4549
4550	pxor		%xmm2,%xmm0
4551	pxor		%xmm3,%xmm2
4552	movdqu		%xmm0,-16(%rax)
4553
4554	dec	%r10d
4555	jnz	.Loop_key192
4556
4557	mov	$bits,32(%rax)	# 240($key)
4558	xor	%eax,%eax
4559	jmp	.Lenc_key_ret
4560
4561.align	16
4562.L14rounds:
4563	movups	16($inp),%xmm2			# remaining half of *userKey
4564	mov	\$13,$bits			# 14 rounds for 256
4565	lea	16(%rax),%rax
4566	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4567	je	.L14rounds_alt
4568
4569	$movkey	%xmm0,($key)			# round 0
4570	$movkey	%xmm2,16($key)			# round 1
4571	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
4572	call		.Lkey_expansion_256a_cold
4573	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
4574	call		.Lkey_expansion_256b
4575	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
4576	call		.Lkey_expansion_256a
4577	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
4578	call		.Lkey_expansion_256b
4579	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
4580	call		.Lkey_expansion_256a
4581	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
4582	call		.Lkey_expansion_256b
4583	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
4584	call		.Lkey_expansion_256a
4585	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
4586	call		.Lkey_expansion_256b
4587	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
4588	call		.Lkey_expansion_256a
4589	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
4590	call		.Lkey_expansion_256b
4591	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
4592	call		.Lkey_expansion_256a
4593	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
4594	call		.Lkey_expansion_256b
4595	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
4596	call		.Lkey_expansion_256a
4597	$movkey	%xmm0,(%rax)
4598	mov	$bits,16(%rax)	# 240(%rdx)
4599	xor	%rax,%rax
4600	jmp	.Lenc_key_ret
4601
4602.align	16
4603.L14rounds_alt:
4604	movdqa	.Lkey_rotate(%rip),%xmm5
4605	movdqa	.Lkey_rcon1(%rip),%xmm4
4606	mov	\$7,%r10d
4607	movdqu	%xmm0,0($key)
4608	movdqa	%xmm2,%xmm1
4609	movdqu	%xmm2,16($key)
4610	jmp	.Loop_key256
4611
4612.align	16
4613.Loop_key256:
4614	pshufb		%xmm5,%xmm2
4615	aesenclast	%xmm4,%xmm2
4616
4617	movdqa		%xmm0,%xmm3
4618	pslldq		\$4,%xmm0
4619	pxor		%xmm0,%xmm3
4620	pslldq		\$4,%xmm0
4621	pxor		%xmm0,%xmm3
4622	pslldq		\$4,%xmm0
4623	pxor		%xmm3,%xmm0
4624	pslld		\$1,%xmm4
4625
4626	pxor		%xmm2,%xmm0
4627	movdqu		%xmm0,(%rax)
4628
4629	dec	%r10d
4630	jz	.Ldone_key256
4631
4632	pshufd		\$0xff,%xmm0,%xmm2
4633	pxor		%xmm3,%xmm3
4634	aesenclast	%xmm3,%xmm2
4635
4636	movdqa		%xmm1,%xmm3
4637	pslldq		\$4,%xmm1
4638	pxor		%xmm1,%xmm3
4639	pslldq		\$4,%xmm1
4640	pxor		%xmm1,%xmm3
4641	pslldq		\$4,%xmm1
4642	pxor		%xmm3,%xmm1
4643
4644	pxor		%xmm1,%xmm2
4645	movdqu		%xmm2,16(%rax)
4646	lea		32(%rax),%rax
4647	movdqa		%xmm2,%xmm1
4648
4649	jmp	.Loop_key256
4650
4651.Ldone_key256:
4652	mov	$bits,16(%rax)	# 240($key)
4653	xor	%eax,%eax
4654	jmp	.Lenc_key_ret
4655
4656.align	16
4657.Lbad_keybits:
4658	mov	\$-2,%rax
4659.Lenc_key_ret:
4660	pxor	%xmm0,%xmm0
4661	pxor	%xmm1,%xmm1
4662	pxor	%xmm2,%xmm2
4663	pxor	%xmm3,%xmm3
4664	pxor	%xmm4,%xmm4
4665	pxor	%xmm5,%xmm5
4666	add	\$8,%rsp
4667.cfi_adjust_cfa_offset	-8
4668	ret
4669.LSEH_end_set_encrypt_key:
4670
4671.align	16
4672.Lkey_expansion_128:
4673	$movkey	%xmm0,(%rax)
4674	lea	16(%rax),%rax
4675.Lkey_expansion_128_cold:
4676	shufps	\$0b00010000,%xmm0,%xmm4
4677	xorps	%xmm4, %xmm0
4678	shufps	\$0b10001100,%xmm0,%xmm4
4679	xorps	%xmm4, %xmm0
4680	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4681	xorps	%xmm1,%xmm0
4682	ret
4683
4684.align 16
4685.Lkey_expansion_192a:
4686	$movkey	%xmm0,(%rax)
4687	lea	16(%rax),%rax
4688.Lkey_expansion_192a_cold:
4689	movaps	%xmm2, %xmm5
4690.Lkey_expansion_192b_warm:
4691	shufps	\$0b00010000,%xmm0,%xmm4
4692	movdqa	%xmm2,%xmm3
4693	xorps	%xmm4,%xmm0
4694	shufps	\$0b10001100,%xmm0,%xmm4
4695	pslldq	\$4,%xmm3
4696	xorps	%xmm4,%xmm0
4697	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
4698	pxor	%xmm3,%xmm2
4699	pxor	%xmm1,%xmm0
4700	pshufd	\$0b11111111,%xmm0,%xmm3
4701	pxor	%xmm3,%xmm2
4702	ret
4703
4704.align 16
4705.Lkey_expansion_192b:
4706	movaps	%xmm0,%xmm3
4707	shufps	\$0b01000100,%xmm0,%xmm5
4708	$movkey	%xmm5,(%rax)
4709	shufps	\$0b01001110,%xmm2,%xmm3
4710	$movkey	%xmm3,16(%rax)
4711	lea	32(%rax),%rax
4712	jmp	.Lkey_expansion_192b_warm
4713
4714.align	16
4715.Lkey_expansion_256a:
4716	$movkey	%xmm2,(%rax)
4717	lea	16(%rax),%rax
4718.Lkey_expansion_256a_cold:
4719	shufps	\$0b00010000,%xmm0,%xmm4
4720	xorps	%xmm4,%xmm0
4721	shufps	\$0b10001100,%xmm0,%xmm4
4722	xorps	%xmm4,%xmm0
4723	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4724	xorps	%xmm1,%xmm0
4725	ret
4726
4727.align 16
4728.Lkey_expansion_256b:
4729	$movkey	%xmm0,(%rax)
4730	lea	16(%rax),%rax
4731
4732	shufps	\$0b00010000,%xmm2,%xmm4
4733	xorps	%xmm4,%xmm2
4734	shufps	\$0b10001100,%xmm2,%xmm4
4735	xorps	%xmm4,%xmm2
4736	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
4737	xorps	%xmm1,%xmm2
4738	ret
4739.cfi_endproc
4740.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4741.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4742___
4743}
4744
4745$code.=<<___;
4746.align	64
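# Constant pool: .Lbswap_mask is a pshufb byte-reversal mask and the
# .Lincrement* rows are the counter increments used by the CTR and CCM
# code; .Lxts_magic is the GF(2^128) reduction constant 0x87 used for
# XTS tweak generation; the .Lkey_* rows are the pshufb masks and round
# constants consumed by the pshufb/aesenclast key-setup paths above.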
4747.Lbswap_mask:
4748	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4749.Lincrement32:
4750	.long	6,6,6,0
4751.Lincrement64:
4752	.long	1,0,0,0
4753.Lxts_magic:
4754	.long	0x87,0,1,0
4755.Lincrement1:
4756	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4757.Lkey_rotate:
4758	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4759.Lkey_rotate192:
4760	.long	0x04070605,0x04070605,0x04070605,0x04070605
4761.Lkey_rcon1:
4762	.long	1,1,1,1
4763.Lkey_rcon1b:
4764	.long	0x1b,0x1b,0x1b,0x1b
4765
4766.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4767.align	64
4768___
4769
4770# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4771#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
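#
# These handlers teach the Windows unwinder about the hand-rolled
# prologues above: when an exception is dispatched inside one of the
# routines, they copy the saved %xmm6-%xmm15 (and, where applicable, the
# non-volatile GPRs) from the on-stack save area back into the CONTEXT
# record, fix up the recorded stack pointer and then defer to
# RtlVirtualUnwind.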
4772if ($win64) {
4773$rec="%rcx";
4774$frame="%rdx";
4775$context="%r8";
4776$disp="%r9";
4777
4778$code.=<<___;
4779.extern	__imp_RtlVirtualUnwind
4780___
4781$code.=<<___ if ($PREFIX eq "aesni");
4782.type	ecb_ccm64_se_handler,\@abi-omnipotent
4783.align	16
4784ecb_ccm64_se_handler:
4785	push	%rsi
4786	push	%rdi
4787	push	%rbx
4788	push	%rbp
4789	push	%r12
4790	push	%r13
4791	push	%r14
4792	push	%r15
4793	pushfq
4794	sub	\$64,%rsp
4795
4796	mov	120($context),%rax	# pull context->Rax
4797	mov	248($context),%rbx	# pull context->Rip
4798
4799	mov	8($disp),%rsi		# disp->ImageBase
4800	mov	56($disp),%r11		# disp->HandlerData
4801
4802	mov	0(%r11),%r10d		# HandlerData[0]
4803	lea	(%rsi,%r10),%r10	# prologue label
4804	cmp	%r10,%rbx		# context->Rip<prologue label
4805	jb	.Lcommon_seh_tail
4806
4807	mov	152($context),%rax	# pull context->Rsp
4808
4809	mov	4(%r11),%r10d		# HandlerData[1]
4810	lea	(%rsi,%r10),%r10	# epilogue label
4811	cmp	%r10,%rbx		# context->Rip>=epilogue label
4812	jae	.Lcommon_seh_tail
4813
4814	lea	0(%rax),%rsi		# %xmm save area
4815	lea	512($context),%rdi	# &context.Xmm6
4816	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
4817	.long	0xa548f3fc		# cld; rep movsq
4818	lea	0x58(%rax),%rax		# adjust stack pointer
4819
4820	jmp	.Lcommon_seh_tail
4821.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4822
4823.type	ctr_xts_se_handler,\@abi-omnipotent
4824.align	16
4825ctr_xts_se_handler:
4826	push	%rsi
4827	push	%rdi
4828	push	%rbx
4829	push	%rbp
4830	push	%r12
4831	push	%r13
4832	push	%r14
4833	push	%r15
4834	pushfq
4835	sub	\$64,%rsp
4836
4837	mov	120($context),%rax	# pull context->Rax
4838	mov	248($context),%rbx	# pull context->Rip
4839
4840	mov	8($disp),%rsi		# disp->ImageBase
4841	mov	56($disp),%r11		# disp->HandlerData
4842
4843	mov	0(%r11),%r10d		# HandlerData[0]
4844	lea	(%rsi,%r10),%r10	# prologue label
4845	cmp	%r10,%rbx		# context->Rip<prologue label
4846	jb	.Lcommon_seh_tail
4847
4848	mov	152($context),%rax	# pull context->Rsp
4849
4850	mov	4(%r11),%r10d		# HandlerData[1]
4851	lea	(%rsi,%r10),%r10	# epilogue label
4852	cmp	%r10,%rbx		# context->Rip>=epilogue label
4853	jae	.Lcommon_seh_tail
4854
4855	mov	208($context),%rax	# pull context->R11
4856
4857	lea	-0xa8(%rax),%rsi	# %xmm save area
4858	lea	512($context),%rdi	# & context.Xmm6
4859	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4860	.long	0xa548f3fc		# cld; rep movsq
4861
4862	mov	-8(%rax),%rbp		# restore saved %rbp
4863	mov	%rbp,160($context)	# restore context->Rbp
4864	jmp	.Lcommon_seh_tail
4865.size	ctr_xts_se_handler,.-ctr_xts_se_handler
4866
4867.type	ocb_se_handler,\@abi-omnipotent
4868.align	16
4869ocb_se_handler:
4870	push	%rsi
4871	push	%rdi
4872	push	%rbx
4873	push	%rbp
4874	push	%r12
4875	push	%r13
4876	push	%r14
4877	push	%r15
4878	pushfq
4879	sub	\$64,%rsp
4880
4881	mov	120($context),%rax	# pull context->Rax
4882	mov	248($context),%rbx	# pull context->Rip
4883
4884	mov	8($disp),%rsi		# disp->ImageBase
4885	mov	56($disp),%r11		# disp->HandlerData
4886
4887	mov	0(%r11),%r10d		# HandlerData[0]
4888	lea	(%rsi,%r10),%r10	# prologue label
4889	cmp	%r10,%rbx		# context->Rip<prologue label
4890	jb	.Lcommon_seh_tail
4891
4892	mov	4(%r11),%r10d		# HandlerData[1]
4893	lea	(%rsi,%r10),%r10	# epilogue label
4894	cmp	%r10,%rbx		# context->Rip>=epilogue label
4895	jae	.Lcommon_seh_tail
4896
4897	mov	8(%r11),%r10d		# HandlerData[2]
4898	lea	(%rsi,%r10),%r10
4899	cmp	%r10,%rbx		# context->Rip>=pop label
4900	jae	.Locb_no_xmm
4901
4902	mov	152($context),%rax	# pull context->Rsp
4903
4904	lea	(%rax),%rsi		# %xmm save area
4905	lea	512($context),%rdi	# & context.Xmm6
4906	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4907	.long	0xa548f3fc		# cld; rep movsq
4908	lea	0xa0+0x28(%rax),%rax
4909
4910.Locb_no_xmm:
4911	mov	-8(%rax),%rbx
4912	mov	-16(%rax),%rbp
4913	mov	-24(%rax),%r12
4914	mov	-32(%rax),%r13
4915	mov	-40(%rax),%r14
4916
4917	mov	%rbx,144($context)	# restore context->Rbx
4918	mov	%rbp,160($context)	# restore context->Rbp
4919	mov	%r12,216($context)	# restore context->R12
4920	mov	%r13,224($context)	# restore context->R13
4921	mov	%r14,232($context)	# restore context->R14
4922
4923	jmp	.Lcommon_seh_tail
4924.size	ocb_se_handler,.-ocb_se_handler
4925___
4926$code.=<<___;
4927.type	cbc_se_handler,\@abi-omnipotent
4928.align	16
4929cbc_se_handler:
4930	push	%rsi
4931	push	%rdi
4932	push	%rbx
4933	push	%rbp
4934	push	%r12
4935	push	%r13
4936	push	%r14
4937	push	%r15
4938	pushfq
4939	sub	\$64,%rsp
4940
4941	mov	152($context),%rax	# pull context->Rsp
4942	mov	248($context),%rbx	# pull context->Rip
4943
4944	lea	.Lcbc_decrypt_bulk(%rip),%r10
4945	cmp	%r10,%rbx		# context->Rip<"prologue" label
4946	jb	.Lcommon_seh_tail
4947
4948	mov	120($context),%rax	# pull context->Rax
4949
4950	lea	.Lcbc_decrypt_body(%rip),%r10
4951	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
4952	jb	.Lcommon_seh_tail
4953
4954	mov	152($context),%rax	# pull context->Rsp
4955
4956	lea	.Lcbc_ret(%rip),%r10
4957	cmp	%r10,%rbx		# context->Rip>="epilogue" label
4958	jae	.Lcommon_seh_tail
4959
4960	lea	16(%rax),%rsi		# %xmm save area
4961	lea	512($context),%rdi	# &context.Xmm6
4962	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4963	.long	0xa548f3fc		# cld; rep movsq
4964
4965	mov	208($context),%rax	# pull context->R11
4966
4967	mov	-8(%rax),%rbp		# restore saved %rbp
4968	mov	%rbp,160($context)	# restore context->Rbp
4969
4970.Lcommon_seh_tail:
4971	mov	8(%rax),%rdi
4972	mov	16(%rax),%rsi
4973	mov	%rax,152($context)	# restore context->Rsp
4974	mov	%rsi,168($context)	# restore context->Rsi
4975	mov	%rdi,176($context)	# restore context->Rdi
4976
4977	mov	40($disp),%rdi		# disp->ContextRecord
4978	mov	$context,%rsi		# context
4979	mov	\$154,%ecx		# sizeof(CONTEXT)/sizeof(%rax)
4980	.long	0xa548f3fc		# cld; rep movsq
4981
4982	mov	$disp,%rsi
4983	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4984	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4985	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4986	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4987	mov	40(%rsi),%r10		# disp->ContextRecord
4988	lea	56(%rsi),%r11		# &disp->HandlerData
4989	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4990	mov	%r10,32(%rsp)		# arg5
4991	mov	%r11,40(%rsp)		# arg6
4992	mov	%r12,48(%rsp)		# arg7
4993	mov	%rcx,56(%rsp)		# arg8, (NULL)
4994	call	*__imp_RtlVirtualUnwind(%rip)
4995
4996	mov	\$1,%eax		# ExceptionContinueSearch
4997	add	\$64,%rsp
4998	popfq
4999	pop	%r15
5000	pop	%r14
5001	pop	%r13
5002	pop	%r12
5003	pop	%rbp
5004	pop	%rbx
5005	pop	%rdi
5006	pop	%rsi
5007	ret
5008.size	cbc_se_handler,.-cbc_se_handler
5009
5010.section	.pdata
5011.align	4
5012___
5013$code.=<<___ if ($PREFIX eq "aesni");
5014	.rva	.LSEH_begin_aesni_ecb_encrypt
5015	.rva	.LSEH_end_aesni_ecb_encrypt
5016	.rva	.LSEH_info_ecb
5017
5018	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
5019	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
5020	.rva	.LSEH_info_ccm64_enc
5021
5022	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
5023	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
5024	.rva	.LSEH_info_ccm64_dec
5025
5026	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
5027	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
5028	.rva	.LSEH_info_ctr32
5029
5030	.rva	.LSEH_begin_aesni_xts_encrypt
5031	.rva	.LSEH_end_aesni_xts_encrypt
5032	.rva	.LSEH_info_xts_enc
5033
5034	.rva	.LSEH_begin_aesni_xts_decrypt
5035	.rva	.LSEH_end_aesni_xts_decrypt
5036	.rva	.LSEH_info_xts_dec
5037
5038	.rva	.LSEH_begin_aesni_ocb_encrypt
5039	.rva	.LSEH_end_aesni_ocb_encrypt
5040	.rva	.LSEH_info_ocb_enc
5041
5042	.rva	.LSEH_begin_aesni_ocb_decrypt
5043	.rva	.LSEH_end_aesni_ocb_decrypt
5044	.rva	.LSEH_info_ocb_dec
5045___
5046$code.=<<___;
5047	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
5048	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
5049	.rva	.LSEH_info_cbc
5050
5051	.rva	${PREFIX}_set_decrypt_key
5052	.rva	.LSEH_end_set_decrypt_key
5053	.rva	.LSEH_info_key
5054
5055	.rva	${PREFIX}_set_encrypt_key
5056	.rva	.LSEH_end_set_encrypt_key
5057	.rva	.LSEH_info_key
5058.section	.xdata
5059.align	8
5060___
5061$code.=<<___ if ($PREFIX eq "aesni");
5062.LSEH_info_ecb:
5063	.byte	9,0,0,0
5064	.rva	ecb_ccm64_se_handler
5065	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
5066.LSEH_info_ccm64_enc:
5067	.byte	9,0,0,0
5068	.rva	ecb_ccm64_se_handler
5069	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
5070.LSEH_info_ccm64_dec:
5071	.byte	9,0,0,0
5072	.rva	ecb_ccm64_se_handler
5073	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
5074.LSEH_info_ctr32:
5075	.byte	9,0,0,0
5076	.rva	ctr_xts_se_handler
5077	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
5078.LSEH_info_xts_enc:
5079	.byte	9,0,0,0
5080	.rva	ctr_xts_se_handler
5081	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
5082.LSEH_info_xts_dec:
5083	.byte	9,0,0,0
5084	.rva	ctr_xts_se_handler
5085	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
5086.LSEH_info_ocb_enc:
5087	.byte	9,0,0,0
5088	.rva	ocb_se_handler
5089	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
5090	.rva	.Locb_enc_pop
5091	.long	0
5092.LSEH_info_ocb_dec:
5093	.byte	9,0,0,0
5094	.rva	ocb_se_handler
5095	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
5096	.rva	.Locb_dec_pop
5097	.long	0
5098___
5099$code.=<<___;
5100.LSEH_info_cbc:
5101	.byte	9,0,0,0
5102	.rva	cbc_se_handler
5103.LSEH_info_key:
5104	.byte	0x01,0x04,0x01,0x00
5105	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
5106___
5107}
5108
5109sub rex {
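  # Emit a REX prefix when either operand lives in %xmm8..15: 0x04 is
  # REX.R (extends the ModR/M reg field, i.e. the destination as encoded
  # below) and 0x01 is REX.B (extends the r/m field, i.e. the source).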
5110  local *opcode=shift;
5111  my ($dst,$src)=@_;
5112  my $rex=0;
5113
5114    $rex|=0x04			if($dst>=8);
5115    $rex|=0x01			if($src>=8);
5116    push @opcode,$rex|0x40	if($rex);
5117}
5118
5119sub aesni {
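  # Fallback encoder: translate AES-NI mnemonics (aeskeygenassist,
  # register-to-register aes*, and the (%rsp)-based memory forms) into
  # raw .byte sequences so the module still assembles with toolchains
  # that predate these instructions; non-matching lines are returned
  # unchanged.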
5120  my $line=shift;
5121  my @opcode=(0x66);
5122
5123    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5124	rex(\@opcode,$4,$3);
5125	push @opcode,0x0f,0x3a,0xdf;
5126	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
5127	my $c=$2;
5128	push @opcode,$c=~/^0/?oct($c):$c;
5129	return ".byte\t".join(',',@opcode);
5130    }
5131    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5132	my %opcodelet = (
5133		"aesimc" => 0xdb,
5134		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
5135		"aesdec" => 0xde,	"aesdeclast" => 0xdf
5136	);
5137	return undef if (!defined($opcodelet{$1}));
5138	rex(\@opcode,$3,$2);
5139	push @opcode,0x0f,0x38,$opcodelet{$1};
5140	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
5141	return ".byte\t".join(',',@opcode);
5142    }
5143    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
5144	my %opcodelet = (
5145		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
5146		"aesdec" => 0xde,	"aesdeclast" => 0xdf
5147	);
5148	return undef if (!defined($opcodelet{$1}));
5149	my $off = $2;
5150	push @opcode,0x44 if ($3>=8);
5151	push @opcode,0x0f,0x38,$opcodelet{$1};
5152	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M
5153	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
5154	return ".byte\t".join(',',@opcode);
5155    }
5156    return $line;
5157}
5158
5159sub movbe {
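	# Hand-encode "movbe %eax,disp8(%rsp)" (0F 38 F1 /r with an
	# %rsp-based SIB byte) for assemblers that lack MOVBE support.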
5160	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
5161}
5162
5163$code =~ s/\`([^\`]*)\`/eval($1)/gem;
5164$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5165#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
5166$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
5167
5168print $code;
5169
5170close STDOUT or die "error closing STDOUT: $!";
5171