xref: /freebsd/crypto/openssl/crypto/aes/asm/aesni-x86_64.pl (revision 19261079b74319502c6ffa1249920079f0f69a72)
1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
28# asymptotic limit it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
63# "straightforward" CCM implementation with CTR and CBC-MAC performed
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
146# additional instructions with AES ones, but even AES instructions
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153# utilizes 6x interleave because of limited register bank capacity.
154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176#		CBC en-/decrypt	CTR	XTS	ECB	OCB
177# Westmere	3.77/1.25	1.25	1.25	1.26
178# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
179# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
180# Skylake	2.62/0.63	0.63	0.63	0.63
181# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
182# Knights L	2.54/0.77	0.78	0.85	-	1.50
183# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
184# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
185# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
186#
187# (*)	Atom Silvermont ECB result is suboptimal because of penalties
188#	incurred by operations on %xmm8-15. As ECB is not considered
189#	critical, nothing was done to mitigate the problem.
190
# Symbol prefix used for the generated public entry points and tested
# below to decide which routines get emitted.
191$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
192			# generates drop-in replacement for
193			# crypto/aes/asm/aes-x86_64.pl:-)
194
# Command line is "[flavour] output". If the first argument contains a
# dot it is taken to be the output file name and the flavour is left
# undefined.
195$flavour = shift;
196$output  = shift;
197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198
# Win64 conventions are selected by an nasm/masm/mingw64 flavour or by an
# output file name ending in ".asm".
199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200
# Look for the translator next to this script first, then under
# ../../perlasm/ relative to the script's directory.
201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204die "can't locate x86_64-xlate.pl";
205
# Everything subsequently printed to STDOUT is piped through the
# translator, which is run with the same perl binary ($^X). Fail loudly
# if the pipe cannot be opened: an unchecked failure here would otherwise
# go unnoticed and the generated code would never reach $output.
206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
207*STDOUT=*OUT;
208
# Mnemonic used to load round keys. Both arms of the conditional yield
# "movups", so the choice of $PREFIX currently makes no difference here.
209$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# Registers holding the first four integer arguments for the selected
# calling convention. Used by routines that pick their arguments by hand.
210@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
211		("%rdi","%rsi","%rdx","%rcx");	# Unix order
212
# $code accumulates the assembly text; start the text section and declare
# the external OPENSSL_ia32cap_P symbol.
213$code=".text\n";
214$code.=".extern	OPENSSL_ia32cap_P\n";
215
# General-purpose register aliases interpolated into the templates below.
# NOTE: the N-block generators spell out %rax literally when adjusting
# the round counter, so they rely on $rounds staying %eax.
216$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
218$inp="%rdi";
219$out="%rsi";
220$len="%rdx";
221$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
222$ivp="%r8";	# cbc, ctr, ...
223
224$rnds_="%r10d";	# backup copy for $rounds
225$key_="%r11";	# backup copy for $key
226
# %xmm0/%xmm1 carry round keys; %xmm2..%xmm9 carry up to eight data blocks.
227# %xmm register layout
228$rndkey0="%xmm0";	$rndkey1="%xmm1";
229$inout0="%xmm2";	$inout1="%xmm3";
230$inout2="%xmm4";	$inout3="%xmm5";
231$inout4="%xmm6";	$inout5="%xmm7";
232$inout6="%xmm8";	$inout7="%xmm9";
233
# These names alias registers already assigned above: $in2 is the same
# register as $inout4, $in1 as $inout5, $in0 as $inout6 and $iv as $inout7.
234$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
235$in0="%xmm8";		$iv="%xmm9";
236
237# Inline version of internal aesni_[en|de]crypt1.
238#
239# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
240# cycles which take care of loop variables...
# aesni_generate1($p, $key, $rounds [, $inout [, $ivec]])
#
# Appends to $code an inline, single-block AES round sequence.
#
#   $p      - "enc" or "dec" at the call sites visible in this file;
#             spliced into the aes${p} / aes${p}last mnemonics and into
#             the loop label.
#   $key    - register pointing at the key schedule.
#   $rounds - register holding the round count.
#   $inout  - register holding the block; defaults to $inout0.
#   $ivec   - optional register; when given, round key 0 is xor-ed into
#             $ivec and $ivec is then xor-ed into $inout, instead of
#             xor-ing round key 0 straight into $inout.
#
# The emitted code advances $key and decrements $rounds down to zero, so
# neither register holds its original value afterwards. $rndkey0 and
# $rndkey1 are used as scratch for round keys.
#
# $sn counts expansions; it is private to this sub and makes the loop
# label unique for every call.
241{ my $sn;
242sub aesni_generate1 {
243my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
244++$sn;
# Load the first two round keys.
245$code.=<<___;
246	$movkey	($key),$rndkey0
247	$movkey	16($key),$rndkey1
248___
# Initial whitening, with the extra input folded in through $ivec ...
249$code.=<<___ if (defined($ivec));
250	xorps	$rndkey0,$ivec
251	lea	32($key),$key
252	xorps	$ivec,$inout
253___
# ... or applied directly to the block.
254$code.=<<___ if (!defined($ivec));
255	lea	32($key),$key
256	xorps	$rndkey0,$inout
257___
# One round per iteration; the next round key is fetched while the
# counter is updated, and the last fetched key feeds aes${p}last.
258$code.=<<___;
259.Loop_${p}1_$sn:
260	aes${p}	$rndkey1,$inout
261	dec	$rounds
262	$movkey	($key),$rndkey1
263	lea	16($key),$key
264	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
265	aes${p}last	$rndkey1,$inout
266___
267}}
268# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
269#
# Emit the public single-block routines ${PREFIX}_encrypt and
# ${PREFIX}_decrypt.
#
# The three arguments (input pointer, output pointer, key schedule) are
# bound by hand to the first three registers of @_4args, i.e. according
# to the calling convention selected by $win64. These lexicals shadow the
# file-level $inp/$out/$key for the duration of this block only.
#
# Each routine loads one block with an unaligned load, reads the round
# count from offset 240 of the key structure, runs the inline round
# sequence from aesni_generate1, stores the result, and zeroes the round
# key registers and the data register before returning.
270{ my ($inp,$out,$key) = @_4args;
271
272$code.=<<___;
273.globl	${PREFIX}_encrypt
274.type	${PREFIX}_encrypt,\@abi-omnipotent
275.align	16
276${PREFIX}_encrypt:
277.cfi_startproc
278	movups	($inp),$inout0		# load input
279	mov	240($key),$rounds	# key->rounds
280___
# Inline encryption of the block in $inout0 (the default data register).
281	&aesni_generate1("enc",$key,$rounds);
282$code.=<<___;
283	 pxor	$rndkey0,$rndkey0	# clear register bank
284	 pxor	$rndkey1,$rndkey1
285	movups	$inout0,($out)		# output
286	 pxor	$inout0,$inout0
287	ret
288.cfi_endproc
289.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
290
291.globl	${PREFIX}_decrypt
292.type	${PREFIX}_decrypt,\@abi-omnipotent
293.align	16
294${PREFIX}_decrypt:
295.cfi_startproc
296	movups	($inp),$inout0		# load input
297	mov	240($key),$rounds	# key->rounds
298___
# Inline decryption of the block in $inout0.
299	&aesni_generate1("dec",$key,$rounds);
300$code.=<<___;
301	 pxor	$rndkey0,$rndkey0	# clear register bank
302	 pxor	$rndkey1,$rndkey1
303	movups	$inout0,($out)		# output
304	 pxor	$inout0,$inout0
305	ret
306.cfi_endproc
307.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
308___
309}
310
311# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
312# factor. Why were 3x subroutines originally used in loops? Even though
313# aes[enc|dec] latency was originally 6, it could be scheduled only
314# every *2nd* cycle. Thus 3x interleave was the one providing optimal
315# utilization, i.e. when subroutine's throughput is virtually same as
316# of non-interleaved subroutine [for number of input blocks up to 3].
317# This is why it originally made no sense to implement 2x subroutine.
318# But times change and it became appropriate to spend extra 192 bytes
319# on 2x subroutine on Atom Silvermont account. For processors that
320# can schedule aes[enc|dec] every cycle optimal interleave factor
321# equals to corresponding instructions latency. 8x is optimal for
322# * Bridge and "super-optimal" for other Intel CPUs...
323
# aesni_generate2($dir)
#
# Appends to $code the private two-block routine _aesni_${dir}rypt2.
#
#   $dir - "enc" or "dec" at the call sites in this file; spliced into the
#          routine name, the loop label and the aes${dir}/aes${dir}last
#          mnemonics. This lexical is unrelated to the file-level $dir
#          that holds the script's directory.
#
# The emitted routine transforms the blocks in $inout0 and $inout1 in
# place. It uses the file-level $key and $rounds register names rather
# than parameters, and changes both registers: the round count is scaled
# by 16, $key is moved to 32 bytes plus that scaled count past its
# original value, and %rax then runs as a negative offset up to zero,
# with two round keys consumed per loop iteration. %rax is written
# literally, so $rounds must remain %eax.
324sub aesni_generate2 {
325my $dir=shift;
326# As already mentioned it takes in $key and $rounds, which are *not*
327# preserved. $inout[0-1] is cipher/clear text...
328$code.=<<___;
329.type	_aesni_${dir}rypt2,\@abi-omnipotent
330.align	16
331_aesni_${dir}rypt2:
332.cfi_startproc
333	$movkey	($key),$rndkey0
334	shl	\$4,$rounds
335	$movkey	16($key),$rndkey1
336	xorps	$rndkey0,$inout0
337	xorps	$rndkey0,$inout1
338	$movkey	32($key),$rndkey0
339	lea	32($key,$rounds),$key
340	neg	%rax				# $rounds
341	add	\$16,%rax
342
343.L${dir}_loop2:
344	aes${dir}	$rndkey1,$inout0
345	aes${dir}	$rndkey1,$inout1
346	$movkey		($key,%rax),$rndkey1
347	add		\$32,%rax
348	aes${dir}	$rndkey0,$inout0
349	aes${dir}	$rndkey0,$inout1
350	$movkey		-16($key,%rax),$rndkey0
351	jnz		.L${dir}_loop2
352
353	aes${dir}	$rndkey1,$inout0
354	aes${dir}	$rndkey1,$inout1
355	aes${dir}last	$rndkey0,$inout0
356	aes${dir}last	$rndkey0,$inout1
357	ret
358.cfi_endproc
359.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
360___
361}
# aesni_generate3($dir)
#
# Appends to $code the private three-block routine _aesni_${dir}rypt3.
#
#   $dir - "enc" or "dec" at the call sites in this file; spliced into the
#          routine name, the loop label and the AES mnemonics.
#
# Same structure as the two-block routine, with $inout2 added: the blocks
# in $inout0..$inout2 are transformed in place, two round keys are
# consumed per loop iteration, and the file-level $key and $rounds
# registers are both changed. %rax is written literally, so $rounds must
# remain %eax.
362sub aesni_generate3 {
363my $dir=shift;
364# As already mentioned it takes in $key and $rounds, which are *not*
365# preserved. $inout[0-2] is cipher/clear text...
366$code.=<<___;
367.type	_aesni_${dir}rypt3,\@abi-omnipotent
368.align	16
369_aesni_${dir}rypt3:
370.cfi_startproc
371	$movkey	($key),$rndkey0
372	shl	\$4,$rounds
373	$movkey	16($key),$rndkey1
374	xorps	$rndkey0,$inout0
375	xorps	$rndkey0,$inout1
376	xorps	$rndkey0,$inout2
377	$movkey	32($key),$rndkey0
378	lea	32($key,$rounds),$key
379	neg	%rax				# $rounds
380	add	\$16,%rax
381
382.L${dir}_loop3:
383	aes${dir}	$rndkey1,$inout0
384	aes${dir}	$rndkey1,$inout1
385	aes${dir}	$rndkey1,$inout2
386	$movkey		($key,%rax),$rndkey1
387	add		\$32,%rax
388	aes${dir}	$rndkey0,$inout0
389	aes${dir}	$rndkey0,$inout1
390	aes${dir}	$rndkey0,$inout2
391	$movkey		-16($key,%rax),$rndkey0
392	jnz		.L${dir}_loop3
393
394	aes${dir}	$rndkey1,$inout0
395	aes${dir}	$rndkey1,$inout1
396	aes${dir}	$rndkey1,$inout2
397	aes${dir}last	$rndkey0,$inout0
398	aes${dir}last	$rndkey0,$inout1
399	aes${dir}last	$rndkey0,$inout2
400	ret
401.cfi_endproc
402.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
403___
404}
405# 4x interleave is implemented to improve small block performance,
406# most notably [and naturally] 4 block by ~30%. One can argue that one
407# should have implemented 5x as well, but improvement would be <20%,
408# so it's not worth it...
# aesni_generate4($dir)
#
# Appends to $code the private four-block routine _aesni_${dir}rypt4.
#
#   $dir - "enc" or "dec" at the call sites in this file; spliced into the
#          routine name, the loop label and the AES mnemonics.
#
# The blocks in $inout0..$inout3 are transformed in place, two round keys
# are consumed per loop iteration, and the file-level $key and $rounds
# registers are both changed. %rax is written literally, so $rounds must
# remain %eax.
#
# NOTE(review): the raw ".byte 0x0f,0x1f,0x00" ahead of the loop is
# presumably a 3-byte NOP used as padding - confirm against the
# instruction set reference before touching it.
409sub aesni_generate4 {
410my $dir=shift;
411# As already mentioned it takes in $key and $rounds, which are *not*
412# preserved. $inout[0-3] is cipher/clear text...
413$code.=<<___;
414.type	_aesni_${dir}rypt4,\@abi-omnipotent
415.align	16
416_aesni_${dir}rypt4:
417.cfi_startproc
418	$movkey	($key),$rndkey0
419	shl	\$4,$rounds
420	$movkey	16($key),$rndkey1
421	xorps	$rndkey0,$inout0
422	xorps	$rndkey0,$inout1
423	xorps	$rndkey0,$inout2
424	xorps	$rndkey0,$inout3
425	$movkey	32($key),$rndkey0
426	lea	32($key,$rounds),$key
427	neg	%rax				# $rounds
428	.byte	0x0f,0x1f,0x00
429	add	\$16,%rax
430
431.L${dir}_loop4:
432	aes${dir}	$rndkey1,$inout0
433	aes${dir}	$rndkey1,$inout1
434	aes${dir}	$rndkey1,$inout2
435	aes${dir}	$rndkey1,$inout3
436	$movkey		($key,%rax),$rndkey1
437	add		\$32,%rax
438	aes${dir}	$rndkey0,$inout0
439	aes${dir}	$rndkey0,$inout1
440	aes${dir}	$rndkey0,$inout2
441	aes${dir}	$rndkey0,$inout3
442	$movkey		-16($key,%rax),$rndkey0
443	jnz		.L${dir}_loop4
444
445	aes${dir}	$rndkey1,$inout0
446	aes${dir}	$rndkey1,$inout1
447	aes${dir}	$rndkey1,$inout2
448	aes${dir}	$rndkey1,$inout3
449	aes${dir}last	$rndkey0,$inout0
450	aes${dir}last	$rndkey0,$inout1
451	aes${dir}last	$rndkey0,$inout2
452	aes${dir}last	$rndkey0,$inout3
453	ret
454.cfi_endproc
455.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
456___
457}
# aesni_generate6($dir)
#
# Appends to $code the private six-block routine _aesni_${dir}rypt6.
#
#   $dir - "enc" or "dec" at the call sites in this file; spliced into the
#          routine name, the loop labels and the AES mnemonics.
#
# The blocks in $inout0..$inout5 are transformed in place; the file-level
# $key and $rounds registers are both changed, and %rax is written
# literally, so $rounds must remain %eax.
#
# Unlike the smaller routines, the set-up code interleaves the round-0
# xor with the first AES round: blocks 0-2 already receive their first
# round before the jump, and execution enters the loop at
# .L${dir}_loop6_enter, where blocks 3-5 receive theirs. From then on
# each iteration applies two round keys to all six blocks.
458sub aesni_generate6 {
459my $dir=shift;
460# As already mentioned it takes in $key and $rounds, which are *not*
461# preserved. $inout[0-5] is cipher/clear text...
462$code.=<<___;
463.type	_aesni_${dir}rypt6,\@abi-omnipotent
464.align	16
465_aesni_${dir}rypt6:
466.cfi_startproc
467	$movkey		($key),$rndkey0
468	shl		\$4,$rounds
469	$movkey		16($key),$rndkey1
470	xorps		$rndkey0,$inout0
471	pxor		$rndkey0,$inout1
472	pxor		$rndkey0,$inout2
473	aes${dir}	$rndkey1,$inout0
474	lea		32($key,$rounds),$key
475	neg		%rax			# $rounds
476	aes${dir}	$rndkey1,$inout1
477	pxor		$rndkey0,$inout3
478	pxor		$rndkey0,$inout4
479	aes${dir}	$rndkey1,$inout2
480	pxor		$rndkey0,$inout5
481	$movkey		($key,%rax),$rndkey0
482	add		\$16,%rax
483	jmp		.L${dir}_loop6_enter
484.align	16
485.L${dir}_loop6:
486	aes${dir}	$rndkey1,$inout0
487	aes${dir}	$rndkey1,$inout1
488	aes${dir}	$rndkey1,$inout2
489.L${dir}_loop6_enter:
490	aes${dir}	$rndkey1,$inout3
491	aes${dir}	$rndkey1,$inout4
492	aes${dir}	$rndkey1,$inout5
493	$movkey		($key,%rax),$rndkey1
494	add		\$32,%rax
495	aes${dir}	$rndkey0,$inout0
496	aes${dir}	$rndkey0,$inout1
497	aes${dir}	$rndkey0,$inout2
498	aes${dir}	$rndkey0,$inout3
499	aes${dir}	$rndkey0,$inout4
500	aes${dir}	$rndkey0,$inout5
501	$movkey		-16($key,%rax),$rndkey0
502	jnz		.L${dir}_loop6
503
504	aes${dir}	$rndkey1,$inout0
505	aes${dir}	$rndkey1,$inout1
506	aes${dir}	$rndkey1,$inout2
507	aes${dir}	$rndkey1,$inout3
508	aes${dir}	$rndkey1,$inout4
509	aes${dir}	$rndkey1,$inout5
510	aes${dir}last	$rndkey0,$inout0
511	aes${dir}last	$rndkey0,$inout1
512	aes${dir}last	$rndkey0,$inout2
513	aes${dir}last	$rndkey0,$inout3
514	aes${dir}last	$rndkey0,$inout4
515	aes${dir}last	$rndkey0,$inout5
516	ret
517.cfi_endproc
518.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
519___
520}
# aesni_generate8($dir)
#
# Appends to $code the private eight-block routine _aesni_${dir}rypt8.
#
#   $dir - "enc" or "dec" at the call sites in this file; spliced into the
#          routine name, the loop labels and the AES mnemonics.
#
# The blocks in $inout0..$inout7 are transformed in place; the file-level
# $key and $rounds registers are both changed, and %rax is written
# literally, so $rounds must remain %eax.
#
# The set-up code interleaves the round-0 xor with the first AES round:
# blocks 0-1 receive their first round before the jump, and execution
# enters the loop at .L${dir}_loop8_inner, where blocks 2-7 receive
# theirs. From then on each iteration applies two round keys to all
# eight blocks.
#
# NOTE(review): .L${dir}_loop8_enter is defined but is not a branch
# target anywhere in this routine; check the rest of the file before
# removing it.
521sub aesni_generate8 {
522my $dir=shift;
523# As already mentioned it takes in $key and $rounds, which are *not*
524# preserved. $inout[0-7] is cipher/clear text...
525$code.=<<___;
526.type	_aesni_${dir}rypt8,\@abi-omnipotent
527.align	16
528_aesni_${dir}rypt8:
529.cfi_startproc
530	$movkey		($key),$rndkey0
531	shl		\$4,$rounds
532	$movkey		16($key),$rndkey1
533	xorps		$rndkey0,$inout0
534	xorps		$rndkey0,$inout1
535	pxor		$rndkey0,$inout2
536	pxor		$rndkey0,$inout3
537	pxor		$rndkey0,$inout4
538	lea		32($key,$rounds),$key
539	neg		%rax			# $rounds
540	aes${dir}	$rndkey1,$inout0
541	pxor		$rndkey0,$inout5
542	pxor		$rndkey0,$inout6
543	aes${dir}	$rndkey1,$inout1
544	pxor		$rndkey0,$inout7
545	$movkey		($key,%rax),$rndkey0
546	add		\$16,%rax
547	jmp		.L${dir}_loop8_inner
548.align	16
549.L${dir}_loop8:
550	aes${dir}	$rndkey1,$inout0
551	aes${dir}	$rndkey1,$inout1
552.L${dir}_loop8_inner:
553	aes${dir}	$rndkey1,$inout2
554	aes${dir}	$rndkey1,$inout3
555	aes${dir}	$rndkey1,$inout4
556	aes${dir}	$rndkey1,$inout5
557	aes${dir}	$rndkey1,$inout6
558	aes${dir}	$rndkey1,$inout7
559.L${dir}_loop8_enter:
560	$movkey		($key,%rax),$rndkey1
561	add		\$32,%rax
562	aes${dir}	$rndkey0,$inout0
563	aes${dir}	$rndkey0,$inout1
564	aes${dir}	$rndkey0,$inout2
565	aes${dir}	$rndkey0,$inout3
566	aes${dir}	$rndkey0,$inout4
567	aes${dir}	$rndkey0,$inout5
568	aes${dir}	$rndkey0,$inout6
569	aes${dir}	$rndkey0,$inout7
570	$movkey		-16($key,%rax),$rndkey0
571	jnz		.L${dir}_loop8
572
573	aes${dir}	$rndkey1,$inout0
574	aes${dir}	$rndkey1,$inout1
575	aes${dir}	$rndkey1,$inout2
576	aes${dir}	$rndkey1,$inout3
577	aes${dir}	$rndkey1,$inout4
578	aes${dir}	$rndkey1,$inout5
579	aes${dir}	$rndkey1,$inout6
580	aes${dir}	$rndkey1,$inout7
581	aes${dir}last	$rndkey0,$inout0
582	aes${dir}last	$rndkey0,$inout1
583	aes${dir}last	$rndkey0,$inout2
584	aes${dir}last	$rndkey0,$inout3
585	aes${dir}last	$rndkey0,$inout4
586	aes${dir}last	$rndkey0,$inout5
587	aes${dir}last	$rndkey0,$inout6
588	aes${dir}last	$rndkey0,$inout7
589	ret
590.cfi_endproc
591.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
592___
593}
# Emit the private N-block helpers, in this order. The decrypt variants
# are always generated; the encrypt variants are generated only when
# $PREFIX is "aesni".
594&aesni_generate2("enc") if ($PREFIX eq "aesni");
595&aesni_generate2("dec");
596&aesni_generate3("enc") if ($PREFIX eq "aesni");
597&aesni_generate3("dec");
598&aesni_generate4("enc") if ($PREFIX eq "aesni");
599&aesni_generate4("dec");
600&aesni_generate6("enc") if ($PREFIX eq "aesni");
601&aesni_generate6("dec");
602&aesni_generate8("enc") if ($PREFIX eq "aesni");
603&aesni_generate8("dec");
604
605if ($PREFIX eq "aesni") {
606########################################################################
# Emit the public routine aesni_ecb_encrypt.
#
# Outline of the generated code:
#   * the length is rounded down to a multiple of 16; if nothing is left
#     the routine returns at once;
#   * the round count and key pointer are saved in $rnds_/$key_, because
#     the N-block helpers change $rounds and $key;
#   * the 5th argument (tested in %r8d) selects the path: zero branches
#     to .Lecb_decrypt, non-zero falls through to the encrypt path;
#   * while at least 8 blocks remain they are processed 8 at a time with
#     _aesni_encrypt8/_aesni_decrypt8, stores of the previous batch being
#     interleaved with loads of the next one;
#   * a tail of 1..7 blocks is dispatched to the inline single-block
#     sequence or to the 2/3/4/6/8-block helpers. For 5 and 7 blocks the
#     unused last register is zeroed, the 6- or 8-block helper is called,
#     and only 5 or 7 results are stored;
#   * the decrypt path zeroes each data register after storing it; the
#     encrypt path does not;
#   * both round key registers are zeroed at .Lecb_ret.
607# void aesni_ecb_encrypt (const void *in, void *out,
608#			  size_t length, const AES_KEY *key,
609#			  int enc);
610$code.=<<___;
611.globl	aesni_ecb_encrypt
612.type	aesni_ecb_encrypt,\@function,5
613.align	16
614aesni_ecb_encrypt:
615.cfi_startproc
616___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6..%xmm9 there
# before they are used as $inout4..$inout7.
# NOTE(review): the .Lecb_enc_body label (and .Lecb_enc_ret in the
# epilogue) is presumably referenced by Win64 unwind data elsewhere in
# the file - confirm before renaming.
617$code.=<<___ if ($win64);
618	lea	-0x58(%rsp),%rsp
619	movaps	%xmm6,(%rsp)		# offload $inout4..7
620	movaps	%xmm7,0x10(%rsp)
621	movaps	%xmm8,0x20(%rsp)
622	movaps	%xmm9,0x30(%rsp)
623.Lecb_enc_body:
624___
# Common entry, 8-block encrypt loop and encrypt tail dispatch. The
# heredoc stops right after the .Lecb_enc_one label so that the inline
# single-block sequence can be expanded there.
625$code.=<<___;
626	and	\$-16,$len		# if ($len<16)
627	jz	.Lecb_ret		# return
628
629	mov	240($key),$rounds	# key->rounds
630	$movkey	($key),$rndkey0
631	mov	$key,$key_		# backup $key
632	mov	$rounds,$rnds_		# backup $rounds
633	test	%r8d,%r8d		# 5th argument
634	jz	.Lecb_decrypt
635#--------------------------- ECB ENCRYPT ------------------------------#
636	cmp	\$0x80,$len		# if ($len<8*16)
637	jb	.Lecb_enc_tail		# short input
638
639	movdqu	($inp),$inout0		# load 8 input blocks
640	movdqu	0x10($inp),$inout1
641	movdqu	0x20($inp),$inout2
642	movdqu	0x30($inp),$inout3
643	movdqu	0x40($inp),$inout4
644	movdqu	0x50($inp),$inout5
645	movdqu	0x60($inp),$inout6
646	movdqu	0x70($inp),$inout7
647	lea	0x80($inp),$inp		# $inp+=8*16
648	sub	\$0x80,$len		# $len-=8*16 (can be zero)
649	jmp	.Lecb_enc_loop8_enter
650.align 16
651.Lecb_enc_loop8:
652	movups	$inout0,($out)		# store 8 output blocks
653	mov	$key_,$key		# restore $key
654	movdqu	($inp),$inout0		# load 8 input blocks
655	mov	$rnds_,$rounds		# restore $rounds
656	movups	$inout1,0x10($out)
657	movdqu	0x10($inp),$inout1
658	movups	$inout2,0x20($out)
659	movdqu	0x20($inp),$inout2
660	movups	$inout3,0x30($out)
661	movdqu	0x30($inp),$inout3
662	movups	$inout4,0x40($out)
663	movdqu	0x40($inp),$inout4
664	movups	$inout5,0x50($out)
665	movdqu	0x50($inp),$inout5
666	movups	$inout6,0x60($out)
667	movdqu	0x60($inp),$inout6
668	movups	$inout7,0x70($out)
669	lea	0x80($out),$out		# $out+=8*16
670	movdqu	0x70($inp),$inout7
671	lea	0x80($inp),$inp		# $inp+=8*16
672.Lecb_enc_loop8_enter:
673
674	call	_aesni_encrypt8
675
676	sub	\$0x80,$len
677	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
678
679	movups	$inout0,($out)		# store 8 output blocks
680	mov	$key_,$key		# restore $key
681	movups	$inout1,0x10($out)
682	mov	$rnds_,$rounds		# restore $rounds
683	movups	$inout2,0x20($out)
684	movups	$inout3,0x30($out)
685	movups	$inout4,0x40($out)
686	movups	$inout5,0x50($out)
687	movups	$inout6,0x60($out)
688	movups	$inout7,0x70($out)
689	lea	0x80($out),$out		# $out+=8*16
690	add	\$0x80,$len		# restore real remaining $len
691	jz	.Lecb_ret		# done if ($len==0)
692
693.Lecb_enc_tail:				# $len is less than 8*16
694	movups	($inp),$inout0
695	cmp	\$0x20,$len
696	jb	.Lecb_enc_one
697	movups	0x10($inp),$inout1
698	je	.Lecb_enc_two
699	movups	0x20($inp),$inout2
700	cmp	\$0x40,$len
701	jb	.Lecb_enc_three
702	movups	0x30($inp),$inout3
703	je	.Lecb_enc_four
704	movups	0x40($inp),$inout4
705	cmp	\$0x60,$len
706	jb	.Lecb_enc_five
707	movups	0x50($inp),$inout5
708	je	.Lecb_enc_six
709	movdqu	0x60($inp),$inout6
710	xorps	$inout7,$inout7
711	call	_aesni_encrypt8
712	movups	$inout0,($out)		# store 7 output blocks
713	movups	$inout1,0x10($out)
714	movups	$inout2,0x20($out)
715	movups	$inout3,0x30($out)
716	movups	$inout4,0x40($out)
717	movups	$inout5,0x50($out)
718	movups	$inout6,0x60($out)
719	jmp	.Lecb_ret
720.align	16
721.Lecb_enc_one:
722___
# One block left on the encrypt path: inline single-block encryption of
# $inout0 instead of a helper call.
723	&aesni_generate1("enc",$key,$rounds);
# Remaining encrypt tails (2..6 blocks), then the whole decrypt path up
# to the .Lecb_dec_one label.
724$code.=<<___;
725	movups	$inout0,($out)		# store one output block
726	jmp	.Lecb_ret
727.align	16
728.Lecb_enc_two:
729	call	_aesni_encrypt2
730	movups	$inout0,($out)		# store 2 output blocks
731	movups	$inout1,0x10($out)
732	jmp	.Lecb_ret
733.align	16
734.Lecb_enc_three:
735	call	_aesni_encrypt3
736	movups	$inout0,($out)		# store 3 output blocks
737	movups	$inout1,0x10($out)
738	movups	$inout2,0x20($out)
739	jmp	.Lecb_ret
740.align	16
741.Lecb_enc_four:
742	call	_aesni_encrypt4
743	movups	$inout0,($out)		# store 4 output blocks
744	movups	$inout1,0x10($out)
745	movups	$inout2,0x20($out)
746	movups	$inout3,0x30($out)
747	jmp	.Lecb_ret
748.align	16
749.Lecb_enc_five:
750	xorps	$inout5,$inout5
751	call	_aesni_encrypt6
752	movups	$inout0,($out)		# store 5 output blocks
753	movups	$inout1,0x10($out)
754	movups	$inout2,0x20($out)
755	movups	$inout3,0x30($out)
756	movups	$inout4,0x40($out)
757	jmp	.Lecb_ret
758.align	16
759.Lecb_enc_six:
760	call	_aesni_encrypt6
761	movups	$inout0,($out)		# store 6 output blocks
762	movups	$inout1,0x10($out)
763	movups	$inout2,0x20($out)
764	movups	$inout3,0x30($out)
765	movups	$inout4,0x40($out)
766	movups	$inout5,0x50($out)
767	jmp	.Lecb_ret
768#--------------------------- ECB DECRYPT ------------------------------#
769.align	16
770.Lecb_decrypt:
771	cmp	\$0x80,$len		# if ($len<8*16)
772	jb	.Lecb_dec_tail		# short input
773
774	movdqu	($inp),$inout0		# load 8 input blocks
775	movdqu	0x10($inp),$inout1
776	movdqu	0x20($inp),$inout2
777	movdqu	0x30($inp),$inout3
778	movdqu	0x40($inp),$inout4
779	movdqu	0x50($inp),$inout5
780	movdqu	0x60($inp),$inout6
781	movdqu	0x70($inp),$inout7
782	lea	0x80($inp),$inp		# $inp+=8*16
783	sub	\$0x80,$len		# $len-=8*16 (can be zero)
784	jmp	.Lecb_dec_loop8_enter
785.align 16
786.Lecb_dec_loop8:
787	movups	$inout0,($out)		# store 8 output blocks
788	mov	$key_,$key		# restore $key
789	movdqu	($inp),$inout0		# load 8 input blocks
790	mov	$rnds_,$rounds		# restore $rounds
791	movups	$inout1,0x10($out)
792	movdqu	0x10($inp),$inout1
793	movups	$inout2,0x20($out)
794	movdqu	0x20($inp),$inout2
795	movups	$inout3,0x30($out)
796	movdqu	0x30($inp),$inout3
797	movups	$inout4,0x40($out)
798	movdqu	0x40($inp),$inout4
799	movups	$inout5,0x50($out)
800	movdqu	0x50($inp),$inout5
801	movups	$inout6,0x60($out)
802	movdqu	0x60($inp),$inout6
803	movups	$inout7,0x70($out)
804	lea	0x80($out),$out		# $out+=8*16
805	movdqu	0x70($inp),$inout7
806	lea	0x80($inp),$inp		# $inp+=8*16
807.Lecb_dec_loop8_enter:
808
809	call	_aesni_decrypt8
810
811	$movkey	($key_),$rndkey0
812	sub	\$0x80,$len
813	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
814
815	movups	$inout0,($out)		# store 8 output blocks
816	 pxor	$inout0,$inout0		# clear register bank
817	mov	$key_,$key		# restore $key
818	movups	$inout1,0x10($out)
819	 pxor	$inout1,$inout1
820	mov	$rnds_,$rounds		# restore $rounds
821	movups	$inout2,0x20($out)
822	 pxor	$inout2,$inout2
823	movups	$inout3,0x30($out)
824	 pxor	$inout3,$inout3
825	movups	$inout4,0x40($out)
826	 pxor	$inout4,$inout4
827	movups	$inout5,0x50($out)
828	 pxor	$inout5,$inout5
829	movups	$inout6,0x60($out)
830	 pxor	$inout6,$inout6
831	movups	$inout7,0x70($out)
832	 pxor	$inout7,$inout7
833	lea	0x80($out),$out		# $out+=8*16
834	add	\$0x80,$len		# restore real remaining $len
835	jz	.Lecb_ret		# done if ($len==0)
836
837.Lecb_dec_tail:
838	movups	($inp),$inout0
839	cmp	\$0x20,$len
840	jb	.Lecb_dec_one
841	movups	0x10($inp),$inout1
842	je	.Lecb_dec_two
843	movups	0x20($inp),$inout2
844	cmp	\$0x40,$len
845	jb	.Lecb_dec_three
846	movups	0x30($inp),$inout3
847	je	.Lecb_dec_four
848	movups	0x40($inp),$inout4
849	cmp	\$0x60,$len
850	jb	.Lecb_dec_five
851	movups	0x50($inp),$inout5
852	je	.Lecb_dec_six
853	movups	0x60($inp),$inout6
854	$movkey	($key),$rndkey0
855	xorps	$inout7,$inout7
856	call	_aesni_decrypt8
857	movups	$inout0,($out)		# store 7 output blocks
858	 pxor	$inout0,$inout0		# clear register bank
859	movups	$inout1,0x10($out)
860	 pxor	$inout1,$inout1
861	movups	$inout2,0x20($out)
862	 pxor	$inout2,$inout2
863	movups	$inout3,0x30($out)
864	 pxor	$inout3,$inout3
865	movups	$inout4,0x40($out)
866	 pxor	$inout4,$inout4
867	movups	$inout5,0x50($out)
868	 pxor	$inout5,$inout5
869	movups	$inout6,0x60($out)
870	 pxor	$inout6,$inout6
871	 pxor	$inout7,$inout7
872	jmp	.Lecb_ret
873.align	16
874.Lecb_dec_one:
875___
# One block left on the decrypt path: inline single-block decryption of
# $inout0.
876	&aesni_generate1("dec",$key,$rounds);
# Remaining decrypt tails (2..6 blocks). The six-block case falls through
# into .Lecb_ret, where both round key registers are zeroed.
877$code.=<<___;
878	movups	$inout0,($out)		# store one output block
879	 pxor	$inout0,$inout0		# clear register bank
880	jmp	.Lecb_ret
881.align	16
882.Lecb_dec_two:
883	call	_aesni_decrypt2
884	movups	$inout0,($out)		# store 2 output blocks
885	 pxor	$inout0,$inout0		# clear register bank
886	movups	$inout1,0x10($out)
887	 pxor	$inout1,$inout1
888	jmp	.Lecb_ret
889.align	16
890.Lecb_dec_three:
891	call	_aesni_decrypt3
892	movups	$inout0,($out)		# store 3 output blocks
893	 pxor	$inout0,$inout0		# clear register bank
894	movups	$inout1,0x10($out)
895	 pxor	$inout1,$inout1
896	movups	$inout2,0x20($out)
897	 pxor	$inout2,$inout2
898	jmp	.Lecb_ret
899.align	16
900.Lecb_dec_four:
901	call	_aesni_decrypt4
902	movups	$inout0,($out)		# store 4 output blocks
903	 pxor	$inout0,$inout0		# clear register bank
904	movups	$inout1,0x10($out)
905	 pxor	$inout1,$inout1
906	movups	$inout2,0x20($out)
907	 pxor	$inout2,$inout2
908	movups	$inout3,0x30($out)
909	 pxor	$inout3,$inout3
910	jmp	.Lecb_ret
911.align	16
912.Lecb_dec_five:
913	xorps	$inout5,$inout5
914	call	_aesni_decrypt6
915	movups	$inout0,($out)		# store 5 output blocks
916	 pxor	$inout0,$inout0		# clear register bank
917	movups	$inout1,0x10($out)
918	 pxor	$inout1,$inout1
919	movups	$inout2,0x20($out)
920	 pxor	$inout2,$inout2
921	movups	$inout3,0x30($out)
922	 pxor	$inout3,$inout3
923	movups	$inout4,0x40($out)
924	 pxor	$inout4,$inout4
925	 pxor	$inout5,$inout5
926	jmp	.Lecb_ret
927.align	16
928.Lecb_dec_six:
929	call	_aesni_decrypt6
930	movups	$inout0,($out)		# store 6 output blocks
931	 pxor	$inout0,$inout0		# clear register bank
932	movups	$inout1,0x10($out)
933	 pxor	$inout1,$inout1
934	movups	$inout2,0x20($out)
935	 pxor	$inout2,$inout2
936	movups	$inout3,0x30($out)
937	 pxor	$inout3,$inout3
938	movups	$inout4,0x40($out)
939	 pxor	$inout4,$inout4
940	movups	$inout5,0x50($out)
941	 pxor	$inout5,$inout5
942
943.Lecb_ret:
944	xorps	$rndkey0,$rndkey0	# %xmm0
945	pxor	$rndkey1,$rndkey1
946___
# Win64 only: restore %xmm6..%xmm9 and overwrite each save slot with
# %xmm0, which was zeroed just above, then release the stack frame.
947$code.=<<___ if ($win64);
948	movaps	(%rsp),%xmm6
949	movaps	%xmm0,(%rsp)		# clear stack
950	movaps	0x10(%rsp),%xmm7
951	movaps	%xmm0,0x10(%rsp)
952	movaps	0x20(%rsp),%xmm8
953	movaps	%xmm0,0x20(%rsp)
954	movaps	0x30(%rsp),%xmm9
955	movaps	%xmm0,0x30(%rsp)
956	lea	0x58(%rsp),%rsp
957.Lecb_enc_ret:
958___
959$code.=<<___;
960	ret
961.cfi_endproc
962.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
963___
964
965{
966######################################################################
967# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
968#                         size_t blocks, const AES_KEY *key,
969#                         const char *ivec,char *cmac);
970#
971# Handles only complete blocks, operates on 64-bit counter and
972# does not update *ivec! Nor does it finalize CMAC value
973# (see engine/eng_aesni.c for details)
974#
975{
976my $cmac="%r9";	# 6th argument
977
# XMM registers private to the two CCM64 routines:
#   $increment  - loaded from .Lincrement64; added to the counter with paddq
#   $iv         - running counter block; it is passed through pshufb with
#                 $bswap_mask right after being loaded, and every copy handed
#                 to the cipher is passed through pshufb again
#   $bswap_mask - loaded from .Lbswap_mask; the pshufb control operand
978my $increment="%xmm9";
979my $iv="%xmm6";
980my $bswap_mask="%xmm7";
981
# aesni_ccm64_encrypt_blocks
#
# Emits the entry point and, further down, the per-block loop.  For each
# 16-byte block the counter ($inout0) and the running CMAC ($inout1) go
# through the AES rounds side by side in .Lccm64_enc2_loop.  The input block
# is xor-ed into the CMAC before the rounds, and the encrypted counter is
# xor-ed into the input block to form the output.  When $len reaches zero the
# CMAC state is stored back to ($cmac); ($ivp) is only ever read.
982$code.=<<___;
983.globl	aesni_ccm64_encrypt_blocks
984.type	aesni_ccm64_encrypt_blocks,\@function,6
985.align	16
986aesni_ccm64_encrypt_blocks:
987.cfi_startproc
988___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there so the
# epilogue can restore them.
989$code.=<<___ if ($win64);
990	lea	-0x58(%rsp),%rsp
991	movaps	%xmm6,(%rsp)		# $iv
992	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
993	movaps	%xmm8,0x20(%rsp)	# $in0
994	movaps	%xmm9,0x30(%rsp)	# $increment
995.Lccm64_enc_body:
996___
# Main body.  $key is advanced to the end of the key schedule and %r10 keeps
# the starting offset for the round loop; each outer iteration reloads it
# into %rax, and the inner loop steps %rax by 32 until the add yields zero.
# After the loop the working registers are zeroed before returning.
997$code.=<<___;
998	mov	240($key),$rounds		# key->rounds
999	movdqu	($ivp),$iv
1000	movdqa	.Lincrement64(%rip),$increment
1001	movdqa	.Lbswap_mask(%rip),$bswap_mask
1002
1003	shl	\$4,$rounds
1004	mov	\$16,$rnds_
1005	lea	0($key),$key_
1006	movdqu	($cmac),$inout1
1007	movdqa	$iv,$inout0
1008	lea	32($key,$rounds),$key		# end of key schedule
1009	pshufb	$bswap_mask,$iv
1010	sub	%rax,%r10			# twisted $rounds
1011	jmp	.Lccm64_enc_outer
1012.align	16
1013.Lccm64_enc_outer:
1014	$movkey	($key_),$rndkey0
1015	mov	%r10,%rax
1016	movups	($inp),$in0			# load inp
1017
1018	xorps	$rndkey0,$inout0		# counter
1019	$movkey	16($key_),$rndkey1
1020	xorps	$in0,$rndkey0
1021	xorps	$rndkey0,$inout1		# cmac^=inp
1022	$movkey	32($key_),$rndkey0
1023
1024.Lccm64_enc2_loop:
1025	aesenc	$rndkey1,$inout0
1026	aesenc	$rndkey1,$inout1
1027	$movkey	($key,%rax),$rndkey1
1028	add	\$32,%rax
1029	aesenc	$rndkey0,$inout0
1030	aesenc	$rndkey0,$inout1
1031	$movkey	-16($key,%rax),$rndkey0
1032	jnz	.Lccm64_enc2_loop
1033	aesenc	$rndkey1,$inout0
1034	aesenc	$rndkey1,$inout1
1035	paddq	$increment,$iv
1036	dec	$len				# $len-- ($len is in blocks)
1037	aesenclast	$rndkey0,$inout0
1038	aesenclast	$rndkey0,$inout1
1039
1040	lea	16($inp),$inp
1041	xorps	$inout0,$in0			# inp ^= E(iv)
1042	movdqa	$iv,$inout0
1043	movups	$in0,($out)			# save output
1044	pshufb	$bswap_mask,$inout0
1045	lea	16($out),$out			# $out+=16
1046	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1047
1048	 pxor	$rndkey0,$rndkey0		# clear register bank
1049	 pxor	$rndkey1,$rndkey1
1050	 pxor	$inout0,$inout0
1051	movups	$inout1,($cmac)			# store resulting mac
1052	 pxor	$inout1,$inout1
1053	 pxor	$in0,$in0
1054	 pxor	$iv,$iv
1055___
# Win64 only: reload %xmm6-%xmm9 from the save area, overwrite each slot
# with %xmm0 and release the 0x58-byte frame.
# NOTE(review): this relies on %xmm0 being zero here, i.e. on $rndkey0
# (cleared just above) being %xmm0 -- confirm against its declaration.
1056$code.=<<___ if ($win64);
1057	movaps	(%rsp),%xmm6
1058	movaps	%xmm0,(%rsp)			# clear stack
1059	movaps	0x10(%rsp),%xmm7
1060	movaps	%xmm0,0x10(%rsp)
1061	movaps	0x20(%rsp),%xmm8
1062	movaps	%xmm0,0x20(%rsp)
1063	movaps	0x30(%rsp),%xmm9
1064	movaps	%xmm0,0x30(%rsp)
1065	lea	0x58(%rsp),%rsp
1066.Lccm64_enc_ret:
1067___
# Common return and symbol-size bookkeeping.
1068$code.=<<___;
1069	ret
1070.cfi_endproc
1071.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1072___
1073######################################################################
# aesni_ccm64_decrypt_blocks
#
# Counterpart of aesni_ccm64_encrypt_blocks.  The first counter block is
# encrypted on its own (first aesni_generate1 call below).  After that,
# .Lccm64_dec_outer xors the encrypted counter into the input block and
# stores the result.  Unless that was the last block, the just-produced
# output is folded into the CMAC ($inout1) while the next counter ($inout0)
# and the CMAC run through the rounds together in .Lccm64_dec2_loop.
# The CMAC update for the last block happens after the loop, at
# .Lccm64_dec_break, and the result is stored back to ($cmac).
1074$code.=<<___;
1075.globl	aesni_ccm64_decrypt_blocks
1076.type	aesni_ccm64_decrypt_blocks,\@function,6
1077.align	16
1078aesni_ccm64_decrypt_blocks:
1079.cfi_startproc
1080___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there so the
# epilogue can restore them.  The slot labels match the encrypt prologue.
1081$code.=<<___ if ($win64);
1082	lea	-0x58(%rsp),%rsp
1083	movaps	%xmm6,(%rsp)		# $iv
1084	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1085	movaps	%xmm8,0x20(%rsp)	# $in0
1086	movaps	%xmm9,0x30(%rsp)	# $increment
1087.Lccm64_dec_body:
1088___
# Load the counter, CMAC and constants; keep copies of the round count and
# key pointer in $rnds_/$key_ before the helper below is invoked with
# $key/$rounds.
1089$code.=<<___;
1090	mov	240($key),$rounds		# key->rounds
1091	movups	($ivp),$iv
1092	movdqu	($cmac),$inout1
1093	movdqa	.Lincrement64(%rip),$increment
1094	movdqa	.Lbswap_mask(%rip),$bswap_mask
1095
1096	movaps	$iv,$inout0
1097	mov	$rounds,$rnds_
1098	mov	$key,$key_
1099	pshufb	$bswap_mask,$iv
1100___
# Encrypt the first counter block.
# NOTE(review): aesni_generate1 is defined outside this view; it is assumed
# to operate on $inout0 when no register argument is given -- confirm.
1101	&aesni_generate1("enc",$key,$rounds);
# Per-block loop.  %r10 keeps the starting round-loop offset; %rax is
# reloaded from it on every outer iteration and stepped by 32 inside
# .Lccm64_dec2_loop until the add yields zero.
1102$code.=<<___;
1103	shl	\$4,$rnds_
1104	mov	\$16,$rounds
1105	movups	($inp),$in0			# load inp
1106	paddq	$increment,$iv
1107	lea	16($inp),$inp			# $inp+=16
1108	sub	%r10,%rax			# twisted $rounds
1109	lea	32($key_,$rnds_),$key		# end of key schedule
1110	mov	%rax,%r10
1111	jmp	.Lccm64_dec_outer
1112.align	16
1113.Lccm64_dec_outer:
1114	xorps	$inout0,$in0			# inp ^= E(iv)
1115	movdqa	$iv,$inout0
1116	movups	$in0,($out)			# save output
1117	lea	16($out),$out			# $out+=16
1118	pshufb	$bswap_mask,$inout0
1119
1120	sub	\$1,$len			# $len-- ($len is in blocks)
1121	jz	.Lccm64_dec_break		# if ($len==0) break
1122
1123	$movkey	($key_),$rndkey0
1124	mov	%r10,%rax
1125	$movkey	16($key_),$rndkey1
1126	xorps	$rndkey0,$in0
1127	xorps	$rndkey0,$inout0
1128	xorps	$in0,$inout1			# cmac^=out
1129	$movkey	32($key_),$rndkey0
1130	jmp	.Lccm64_dec2_loop
1131.align	16
1132.Lccm64_dec2_loop:
1133	aesenc	$rndkey1,$inout0
1134	aesenc	$rndkey1,$inout1
1135	$movkey	($key,%rax),$rndkey1
1136	add	\$32,%rax
1137	aesenc	$rndkey0,$inout0
1138	aesenc	$rndkey0,$inout1
1139	$movkey	-16($key,%rax),$rndkey0
1140	jnz	.Lccm64_dec2_loop
1141	movups	($inp),$in0			# load input
1142	paddq	$increment,$iv
1143	aesenc	$rndkey1,$inout0
1144	aesenc	$rndkey1,$inout1
1145	aesenclast	$rndkey0,$inout0
1146	aesenclast	$rndkey0,$inout1
1147	lea	16($inp),$inp			# $inp+=16
1148	jmp	.Lccm64_dec_outer
1149
1150.align	16
1151.Lccm64_dec_break:
1152	#xorps	$in0,$inout1			# cmac^=out
1153	mov	240($key_),$rounds
1154___
# Final CMAC update for the last output block.
# NOTE(review): the extra ($inout1,$in0) arguments presumably make the helper
# encrypt $inout1 with $in0 xor-ed in first (the commented-out xorps above
# suggests this) -- confirm against aesni_generate1's definition.
1155	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Scrub the working registers and write the CMAC back to ($cmac).
1156$code.=<<___;
1157	 pxor	$rndkey0,$rndkey0		# clear register bank
1158	 pxor	$rndkey1,$rndkey1
1159	 pxor	$inout0,$inout0
1160	movups	$inout1,($cmac)			# store resulting mac
1161	 pxor	$inout1,$inout1
1162	 pxor	$in0,$in0
1163	 pxor	$iv,$iv
1164___
# Win64 only: reload %xmm6-%xmm9 from the save area, overwrite each slot
# with %xmm0 and release the 0x58-byte frame.
# NOTE(review): this relies on %xmm0 being zero here, i.e. on $rndkey0
# (cleared just above) being %xmm0 -- confirm against its declaration.
1165$code.=<<___ if ($win64);
1166	movaps	(%rsp),%xmm6
1167	movaps	%xmm0,(%rsp)			# clear stack
1168	movaps	0x10(%rsp),%xmm7
1169	movaps	%xmm0,0x10(%rsp)
1170	movaps	0x20(%rsp),%xmm8
1171	movaps	%xmm0,0x20(%rsp)
1172	movaps	0x30(%rsp),%xmm9
1173	movaps	%xmm0,0x30(%rsp)
1174	lea	0x58(%rsp),%rsp
1175.Lccm64_dec_ret:
1176___
# Common return and symbol-size bookkeeping.
1177$code.=<<___;
1178	ret
1179.cfi_endproc
1180.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1181___
1182}
1183######################################################################
1184# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1185#                         size_t blocks, const AES_KEY *key,
1186#                         const char *ivec);
1187#
1188# Handles only complete blocks, operates on 32-bit counter and
1189# does not update *ivec! (see crypto/modes/ctr128.c for details)
1190#
1191# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1192# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1193# Keywords are full unroll and modulo-schedule counter calculations
1194# with zero-round key xor.
1195{
1196my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1197my ($key0,$ctr)=("%ebp","${ivp}d");
1198my $frame_size = 0x80 + ($win64?160:0);
1199
# Registers and frame layout for aesni_ctr32_encrypt_blocks:
#   $in0..$in5  - %xmm10..%xmm15, used for input blocks
#   $key0       - %ebp; loaded from 12($key), the last dword of the round-0
#                 key, and xor-ed into every counter word
#   $ctr        - the register named by $ivp with a "d" suffix appended;
#                 loaded from 12($ivp) and byte-swapped
#   $frame_size - 0x80 bytes for the eight 16-byte counter slots at (%rsp),
#                 plus 160 bytes on Win64 where %xmm6-%xmm15 are spilled
#                 (ten 16-byte slots)
#
# Entry point.  A request for exactly one block is handled inline without a
# stack frame: the block at ($ivp) is encrypted and xor-ed with the input.
1200$code.=<<___;
1201.globl	aesni_ctr32_encrypt_blocks
1202.type	aesni_ctr32_encrypt_blocks,\@function,5
1203.align	16
1204aesni_ctr32_encrypt_blocks:
1205.cfi_startproc
1206	cmp	\$1,$len
1207	jne	.Lctr32_bulk
1208
1209	# handle single block without allocating stack frame,
1210	# useful when handling edges
1211	movups	($ivp),$inout0
1212	movups	($inp),$inout1
1213	mov	240($key),%edx			# key->rounds
1214___
# NOTE(review): aesni_generate1 is defined outside this view; it is assumed
# to encrypt $inout0 in place when no register argument is given -- confirm.
1215	&aesni_generate1("enc",$key,"%edx");
# Finish the single-block path (scrub registers, store the result, jump to
# the shared epilogue).  Then open .Lctr32_bulk: $key_ becomes the frame
# pointer, %rbp is pushed, $frame_size bytes are reserved and %rsp is
# aligned down to 16 bytes.
1216$code.=<<___;
1217	 pxor	$rndkey0,$rndkey0		# clear register bank
1218	 pxor	$rndkey1,$rndkey1
1219	xorps	$inout1,$inout0
1220	 pxor	$inout1,$inout1
1221	movups	$inout0,($out)
1222	 xorps	$inout0,$inout0
1223	jmp	.Lctr32_epilogue
1224
1225.align	16
1226.Lctr32_bulk:
1227	lea	(%rsp),$key_			# use $key_ as frame pointer
1228.cfi_def_cfa_register	$key_
1229	push	%rbp
1230.cfi_push	%rbp
1231	sub	\$$frame_size,%rsp
1232	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1233___
# Win64 only: spill %xmm6-%xmm15 at fixed negative offsets from the frame
# pointer; the epilogue reloads them from the same offsets.
1234$code.=<<___ if ($win64);
1235	movaps	%xmm6,-0xa8($key_)		# offload everything
1236	movaps	%xmm7,-0x98($key_)
1237	movaps	%xmm8,-0x88($key_)
1238	movaps	%xmm9,-0x78($key_)
1239	movaps	%xmm10,-0x68($key_)
1240	movaps	%xmm11,-0x58($key_)
1241	movaps	%xmm12,-0x48($key_)
1242	movaps	%xmm13,-0x38($key_)
1243	movaps	%xmm14,-0x28($key_)
1244	movaps	%xmm15,-0x18($key_)
1245.Lctr32_body:
1246___
# Bulk-path setup and the two main loops.
#
# 1. Counter slots: the block at ($ivp) xor-ed with the round-0 key is
#    copied into the eight 16-byte slots at 0x00..0x70(%rsp).  The last
#    dword of slots 1..7 is then replaced by bswap($ctr+i) ^ $key0.
# 2. Dispatch: fewer than 8 blocks go to .Lctr32_tail.  Otherwise
#    OPENSSL_ia32cap_P+4 is masked to bits 26 and 22; when only bit 22 is
#    set the 6-blocks-per-iteration loop (.Lctr32_6x/.Lctr32_loop6) is used,
#    else the 8-blocks-per-iteration loop (.Lctr32_loop8).
# 3. .Lctr32_loop6 refreshes six counter slots with movbe while the first
#    rounds run, calls .Lenc_loop6 (defined outside this view) for the
#    remaining rounds, then xors the results with six input blocks.
# 4. .Lctr32_loop8 is only started here (first round plus the first counter
#    slot update); the following Perl loop emits the next rounds.
1247$code.=<<___;
1248
1249	# 8 16-byte words on top of stack are counter values
1250	# xor-ed with zero-round key
1251
1252	movdqu	($ivp),$inout0
1253	movdqu	($key),$rndkey0
1254	mov	12($ivp),$ctr			# counter LSB
1255	pxor	$rndkey0,$inout0
1256	mov	12($key),$key0			# 0-round key LSB
1257	movdqa	$inout0,0x00(%rsp)		# populate counter block
1258	bswap	$ctr
1259	movdqa	$inout0,$inout1
1260	movdqa	$inout0,$inout2
1261	movdqa	$inout0,$inout3
1262	movdqa	$inout0,0x40(%rsp)
1263	movdqa	$inout0,0x50(%rsp)
1264	movdqa	$inout0,0x60(%rsp)
1265	mov	%rdx,%r10			# about to borrow %rdx
1266	movdqa	$inout0,0x70(%rsp)
1267
1268	lea	1($ctr),%rax
1269	 lea	2($ctr),%rdx
1270	bswap	%eax
1271	 bswap	%edx
1272	xor	$key0,%eax
1273	 xor	$key0,%edx
1274	pinsrd	\$3,%eax,$inout1
1275	lea	3($ctr),%rax
1276	movdqa	$inout1,0x10(%rsp)
1277	 pinsrd	\$3,%edx,$inout2
1278	bswap	%eax
1279	 mov	%r10,%rdx			# restore %rdx
1280	 lea	4($ctr),%r10
1281	 movdqa	$inout2,0x20(%rsp)
1282	xor	$key0,%eax
1283	 bswap	%r10d
1284	pinsrd	\$3,%eax,$inout3
1285	 xor	$key0,%r10d
1286	movdqa	$inout3,0x30(%rsp)
1287	lea	5($ctr),%r9
1288	 mov	%r10d,0x40+12(%rsp)
1289	bswap	%r9d
1290	 lea	6($ctr),%r10
1291	mov	240($key),$rounds		# key->rounds
1292	xor	$key0,%r9d
1293	 bswap	%r10d
1294	mov	%r9d,0x50+12(%rsp)
1295	 xor	$key0,%r10d
1296	lea	7($ctr),%r9
1297	 mov	%r10d,0x60+12(%rsp)
1298	bswap	%r9d
1299	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d
1300	xor	$key0,%r9d
1301	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1302	mov	%r9d,0x70+12(%rsp)
1303
1304	$movkey	0x10($key),$rndkey1
1305
1306	movdqa	0x40(%rsp),$inout4
1307	movdqa	0x50(%rsp),$inout5
1308
1309	cmp	\$8,$len		# $len is in blocks
1310	jb	.Lctr32_tail		# short input if ($len<8)
1311
1312	sub	\$6,$len		# $len is biased by -6
1313	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1314	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1315
1316	lea	0x80($key),$key		# size optimization
1317	sub	\$2,$len		# $len is biased by -8
1318	jmp	.Lctr32_loop8
1319
1320.align	16
1321.Lctr32_6x:
1322	shl	\$4,$rounds
1323	mov	\$48,$rnds_
1324	bswap	$key0
1325	lea	32($key,$rounds),$key	# end of key schedule
1326	sub	%rax,%r10		# twisted $rounds
1327	jmp	.Lctr32_loop6
1328
1329.align	16
1330.Lctr32_loop6:
1331	 add	\$6,$ctr		# next counter value
1332	$movkey	-48($key,$rnds_),$rndkey0
1333	aesenc	$rndkey1,$inout0
1334	 mov	$ctr,%eax
1335	 xor	$key0,%eax
1336	aesenc	$rndkey1,$inout1
1337	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1338	 lea	1($ctr),%eax
1339	aesenc	$rndkey1,$inout2
1340	 xor	$key0,%eax
1341	 movbe	%eax,`0x10+12`(%rsp)
1342	aesenc	$rndkey1,$inout3
1343	 lea	2($ctr),%eax
1344	 xor	$key0,%eax
1345	aesenc	$rndkey1,$inout4
1346	 movbe	%eax,`0x20+12`(%rsp)
1347	 lea	3($ctr),%eax
1348	aesenc	$rndkey1,$inout5
1349	$movkey	-32($key,$rnds_),$rndkey1
1350	 xor	$key0,%eax
1351
1352	aesenc	$rndkey0,$inout0
1353	 movbe	%eax,`0x30+12`(%rsp)
1354	 lea	4($ctr),%eax
1355	aesenc	$rndkey0,$inout1
1356	 xor	$key0,%eax
1357	 movbe	%eax,`0x40+12`(%rsp)
1358	aesenc	$rndkey0,$inout2
1359	 lea	5($ctr),%eax
1360	 xor	$key0,%eax
1361	aesenc	$rndkey0,$inout3
1362	 movbe	%eax,`0x50+12`(%rsp)
1363	 mov	%r10,%rax		# mov	$rnds_,$rounds
1364	aesenc	$rndkey0,$inout4
1365	aesenc	$rndkey0,$inout5
1366	$movkey	-16($key,$rnds_),$rndkey0
1367
1368	call	.Lenc_loop6
1369
1370	movdqu	($inp),$inout6		# load 6 input blocks
1371	movdqu	0x10($inp),$inout7
1372	movdqu	0x20($inp),$in0
1373	movdqu	0x30($inp),$in1
1374	movdqu	0x40($inp),$in2
1375	movdqu	0x50($inp),$in3
1376	lea	0x60($inp),$inp		# $inp+=6*16
1377	$movkey	-64($key,$rnds_),$rndkey1
1378	pxor	$inout0,$inout6		# inp^=E(ctr)
1379	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1380	pxor	$inout1,$inout7
1381	movaps	0x10(%rsp),$inout1
1382	pxor	$inout2,$in0
1383	movaps	0x20(%rsp),$inout2
1384	pxor	$inout3,$in1
1385	movaps	0x30(%rsp),$inout3
1386	pxor	$inout4,$in2
1387	movaps	0x40(%rsp),$inout4
1388	pxor	$inout5,$in3
1389	movaps	0x50(%rsp),$inout5
1390	movdqu	$inout6,($out)		# store 6 output blocks
1391	movdqu	$inout7,0x10($out)
1392	movdqu	$in0,0x20($out)
1393	movdqu	$in1,0x30($out)
1394	movdqu	$in2,0x40($out)
1395	movdqu	$in3,0x50($out)
1396	lea	0x60($out),$out		# $out+=6*16
1397
1398	sub	\$6,$len
1399	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1400
1401	add	\$6,$len		# restore real remaining $len
1402	jz	.Lctr32_done		# done if ($len==0)
1403
1404	lea	-48($rnds_),$rounds
1405	lea	-80($key,$rnds_),$key	# restore $key
1406	neg	$rounds
1407	shr	\$4,$rounds		# restore $rounds
1408	jmp	.Lctr32_tail
1409
1410.align	32
1411.Lctr32_loop8:
1412	 add		\$8,$ctr		# next counter value
1413	movdqa		0x60(%rsp),$inout6
1414	aesenc		$rndkey1,$inout0
1415	 mov		$ctr,%r9d
1416	movdqa		0x70(%rsp),$inout7
1417	aesenc		$rndkey1,$inout1
1418	 bswap		%r9d
1419	$movkey		0x20-0x80($key),$rndkey0
1420	aesenc		$rndkey1,$inout2
1421	 xor		$key0,%r9d
1422	 nop
1423	aesenc		$rndkey1,$inout3
1424	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1425	 lea		1($ctr),%r9
1426	aesenc		$rndkey1,$inout4
1427	aesenc		$rndkey1,$inout5
1428	aesenc		$rndkey1,$inout6
1429	aesenc		$rndkey1,$inout7
1430	$movkey		0x30-0x80($key),$rndkey1
1431___
# Emit six more rounds of .Lctr32_loop8, one per $i in 2..7.
# The round-key register alternates by parity of $i: odd uses $rndkey1,
# even uses $rndkey0.  After its eight aesenc instructions, the same register
# is reloaded from offset (0x20+0x10*$i)-0x80 off $key.
# Interleaved with the rounds, each iteration finishes the counter word
# that the previous step left in %r9 (bswap, xor with $key0), stores it into
# slot $i-1 at 0x10*($i-1)+12(%rsp), and starts the next one with
# lea $i($ctr),%r9.
# The backtick expressions are left in the emitted text to be evaluated
# later, outside this view.
1432for($i=2;$i<8;$i++) {
1433my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1434$code.=<<___;
1435	 bswap		%r9d
1436	aesenc		$rndkeyx,$inout0
1437	aesenc		$rndkeyx,$inout1
1438	 xor		$key0,%r9d
1439	 .byte		0x66,0x90
1440	aesenc		$rndkeyx,$inout2
1441	aesenc		$rndkeyx,$inout3
1442	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1443	 lea		$i($ctr),%r9
1444	aesenc		$rndkeyx,$inout4
1445	aesenc		$rndkeyx,$inout5
1446	aesenc		$rndkeyx,$inout6
1447	aesenc		$rndkeyx,$inout7
1448	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1449___
1450}
# Remainder of the 8x loop, the short-input tail and the start of the exit
# path.
#
# - The last counter slot (0x70+12) is written and `cmp $11,$rounds` sets
#   the flags that choose how many extra round pairs run before
#   .Lctr32_enc_done: none when $rounds is below 11, one pair when it
#   equals 11, two pairs otherwise.
# - .Lctr32_enc_done xors the last round key into the input blocks so that
#   aesenclast performs the final round and the xor with the input in one
#   step, reloads the next counter blocks from the stack and stores eight
#   output blocks.
# - .Lctr32_tail handles 1..7 remaining blocks: fewer than 4 go to
#   .Lctr32_loop3, exactly 4 to .Lctr32_loop4, and 5..7 run through
#   .Lenc_loop8_enter (defined outside this view) with $inout7 zeroed; only
#   $len output blocks are stored.
# - .Lctr32_done begins scrubbing: $key0 and %xmm0-%xmm5 are zeroed here,
#   the rest is emitted by the platform-specific statements that follow.
1451$code.=<<___;
1452	 bswap		%r9d
1453	aesenc		$rndkey0,$inout0
1454	aesenc		$rndkey0,$inout1
1455	aesenc		$rndkey0,$inout2
1456	 xor		$key0,%r9d
1457	 movdqu		0x00($inp),$in0		# start loading input
1458	aesenc		$rndkey0,$inout3
1459	 mov		%r9d,0x70+12(%rsp)
1460	 cmp		\$11,$rounds
1461	aesenc		$rndkey0,$inout4
1462	aesenc		$rndkey0,$inout5
1463	aesenc		$rndkey0,$inout6
1464	aesenc		$rndkey0,$inout7
1465	$movkey		0xa0-0x80($key),$rndkey0
1466
1467	jb		.Lctr32_enc_done
1468
1469	aesenc		$rndkey1,$inout0
1470	aesenc		$rndkey1,$inout1
1471	aesenc		$rndkey1,$inout2
1472	aesenc		$rndkey1,$inout3
1473	aesenc		$rndkey1,$inout4
1474	aesenc		$rndkey1,$inout5
1475	aesenc		$rndkey1,$inout6
1476	aesenc		$rndkey1,$inout7
1477	$movkey		0xb0-0x80($key),$rndkey1
1478
1479	aesenc		$rndkey0,$inout0
1480	aesenc		$rndkey0,$inout1
1481	aesenc		$rndkey0,$inout2
1482	aesenc		$rndkey0,$inout3
1483	aesenc		$rndkey0,$inout4
1484	aesenc		$rndkey0,$inout5
1485	aesenc		$rndkey0,$inout6
1486	aesenc		$rndkey0,$inout7
1487	$movkey		0xc0-0x80($key),$rndkey0
1488	je		.Lctr32_enc_done
1489
1490	aesenc		$rndkey1,$inout0
1491	aesenc		$rndkey1,$inout1
1492	aesenc		$rndkey1,$inout2
1493	aesenc		$rndkey1,$inout3
1494	aesenc		$rndkey1,$inout4
1495	aesenc		$rndkey1,$inout5
1496	aesenc		$rndkey1,$inout6
1497	aesenc		$rndkey1,$inout7
1498	$movkey		0xd0-0x80($key),$rndkey1
1499
1500	aesenc		$rndkey0,$inout0
1501	aesenc		$rndkey0,$inout1
1502	aesenc		$rndkey0,$inout2
1503	aesenc		$rndkey0,$inout3
1504	aesenc		$rndkey0,$inout4
1505	aesenc		$rndkey0,$inout5
1506	aesenc		$rndkey0,$inout6
1507	aesenc		$rndkey0,$inout7
1508	$movkey		0xe0-0x80($key),$rndkey0
1509	jmp		.Lctr32_enc_done
1510
1511.align	16
1512.Lctr32_enc_done:
1513	movdqu		0x10($inp),$in1
1514	pxor		$rndkey0,$in0		# input^=round[last]
1515	movdqu		0x20($inp),$in2
1516	pxor		$rndkey0,$in1
1517	movdqu		0x30($inp),$in3
1518	pxor		$rndkey0,$in2
1519	movdqu		0x40($inp),$in4
1520	pxor		$rndkey0,$in3
1521	movdqu		0x50($inp),$in5
1522	pxor		$rndkey0,$in4
1523	pxor		$rndkey0,$in5
1524	aesenc		$rndkey1,$inout0
1525	aesenc		$rndkey1,$inout1
1526	aesenc		$rndkey1,$inout2
1527	aesenc		$rndkey1,$inout3
1528	aesenc		$rndkey1,$inout4
1529	aesenc		$rndkey1,$inout5
1530	aesenc		$rndkey1,$inout6
1531	aesenc		$rndkey1,$inout7
1532	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1533	lea		0x80($inp),$inp		# $inp+=8*16
1534
1535	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1536	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1537	movdqu		0x70-0x80($inp),$in0
1538	aesenclast	$in1,$inout1
1539	pxor		$rndkey0,$in0
1540	movdqa		0x00(%rsp),$in1		# load next counter block
1541	aesenclast	$in2,$inout2
1542	aesenclast	$in3,$inout3
1543	movdqa		0x10(%rsp),$in2
1544	movdqa		0x20(%rsp),$in3
1545	aesenclast	$in4,$inout4
1546	aesenclast	$in5,$inout5
1547	movdqa		0x30(%rsp),$in4
1548	movdqa		0x40(%rsp),$in5
1549	aesenclast	$rndkey1,$inout6
1550	movdqa		0x50(%rsp),$rndkey0
1551	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1552	aesenclast	$in0,$inout7
1553
1554	movups		$inout0,($out)		# store 8 output blocks
1555	movdqa		$in1,$inout0
1556	movups		$inout1,0x10($out)
1557	movdqa		$in2,$inout1
1558	movups		$inout2,0x20($out)
1559	movdqa		$in3,$inout2
1560	movups		$inout3,0x30($out)
1561	movdqa		$in4,$inout3
1562	movups		$inout4,0x40($out)
1563	movdqa		$in5,$inout4
1564	movups		$inout5,0x50($out)
1565	movdqa		$rndkey0,$inout5
1566	movups		$inout6,0x60($out)
1567	movups		$inout7,0x70($out)
1568	lea		0x80($out),$out		# $out+=8*16
1569
1570	sub	\$8,$len
1571	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1572
1573	add	\$8,$len			# restore real remaining $len
1574	jz	.Lctr32_done			# done if ($len==0)
1575	lea	-0x80($key),$key
1576
1577.Lctr32_tail:
1578	# note that at this point $inout0..5 are populated with
1579	# counter values xor-ed with 0-round key
1580	lea	16($key),$key
1581	cmp	\$4,$len
1582	jb	.Lctr32_loop3
1583	je	.Lctr32_loop4
1584
1585	# if ($len>4) compute 7 E(counter)
1586	shl		\$4,$rounds
1587	movdqa		0x60(%rsp),$inout6
1588	pxor		$inout7,$inout7
1589
1590	$movkey		16($key),$rndkey0
1591	aesenc		$rndkey1,$inout0
1592	aesenc		$rndkey1,$inout1
1593	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1594	neg		%rax
1595	aesenc		$rndkey1,$inout2
1596	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1597	 movups		($inp),$in0
1598	aesenc		$rndkey1,$inout3
1599	aesenc		$rndkey1,$inout4
1600	 movups		0x10($inp),$in1		# pre-load input
1601	 movups		0x20($inp),$in2
1602	aesenc		$rndkey1,$inout5
1603	aesenc		$rndkey1,$inout6
1604
1605	call            .Lenc_loop8_enter
1606
1607	movdqu	0x30($inp),$in3
1608	pxor	$in0,$inout0
1609	movdqu	0x40($inp),$in0
1610	pxor	$in1,$inout1
1611	movdqu	$inout0,($out)			# store output
1612	pxor	$in2,$inout2
1613	movdqu	$inout1,0x10($out)
1614	pxor	$in3,$inout3
1615	movdqu	$inout2,0x20($out)
1616	pxor	$in0,$inout4
1617	movdqu	$inout3,0x30($out)
1618	movdqu	$inout4,0x40($out)
1619	cmp	\$6,$len
1620	jb	.Lctr32_done			# $len was 5, stop store
1621
1622	movups	0x50($inp),$in1
1623	xorps	$in1,$inout5
1624	movups	$inout5,0x50($out)
1625	je	.Lctr32_done			# $len was 6, stop store
1626
1627	movups	0x60($inp),$in2
1628	xorps	$in2,$inout6
1629	movups	$inout6,0x60($out)
1630	jmp	.Lctr32_done			# $len was 7, stop store
1631
1632.align	32
1633.Lctr32_loop4:
1634	aesenc		$rndkey1,$inout0
1635	lea		16($key),$key
1636	dec		$rounds
1637	aesenc		$rndkey1,$inout1
1638	aesenc		$rndkey1,$inout2
1639	aesenc		$rndkey1,$inout3
1640	$movkey		($key),$rndkey1
1641	jnz		.Lctr32_loop4
1642	aesenclast	$rndkey1,$inout0
1643	aesenclast	$rndkey1,$inout1
1644	 movups		($inp),$in0		# load input
1645	 movups		0x10($inp),$in1
1646	aesenclast	$rndkey1,$inout2
1647	aesenclast	$rndkey1,$inout3
1648	 movups		0x20($inp),$in2
1649	 movups		0x30($inp),$in3
1650
1651	xorps	$in0,$inout0
1652	movups	$inout0,($out)			# store output
1653	xorps	$in1,$inout1
1654	movups	$inout1,0x10($out)
1655	pxor	$in2,$inout2
1656	movdqu	$inout2,0x20($out)
1657	pxor	$in3,$inout3
1658	movdqu	$inout3,0x30($out)
1659	jmp	.Lctr32_done			# $len was 4, stop store
1660
1661.align	32
1662.Lctr32_loop3:
1663	aesenc		$rndkey1,$inout0
1664	lea		16($key),$key
1665	dec		$rounds
1666	aesenc		$rndkey1,$inout1
1667	aesenc		$rndkey1,$inout2
1668	$movkey		($key),$rndkey1
1669	jnz		.Lctr32_loop3
1670	aesenclast	$rndkey1,$inout0
1671	aesenclast	$rndkey1,$inout1
1672	aesenclast	$rndkey1,$inout2
1673
1674	movups	($inp),$in0			# load input
1675	xorps	$in0,$inout0
1676	movups	$inout0,($out)			# store output
1677	cmp	\$2,$len
1678	jb	.Lctr32_done			# $len was 1, stop store
1679
1680	movups	0x10($inp),$in1
1681	xorps	$in1,$inout1
1682	movups	$inout1,0x10($out)
1683	je	.Lctr32_done			# $len was 2, stop store
1684
1685	movups	0x20($inp),$in2
1686	xorps	$in2,$inout2
1687	movups	$inout2,0x20($out)		# $len was 3, stop store
1688
1689.Lctr32_done:
1690	xorps	%xmm0,%xmm0			# clear register bank
1691	xor	$key0,$key0
1692	pxor	%xmm1,%xmm1
1693	pxor	%xmm2,%xmm2
1694	pxor	%xmm3,%xmm3
1695	pxor	%xmm4,%xmm4
1696	pxor	%xmm5,%xmm5
1697___
# Exit path, continued.  Non-Win64 builds: zero %xmm6-%xmm15 and the eight
# 16-byte counter slots at 0x00..0x70(%rsp), using %xmm0 (zeroed just before
# this point) as the source.
1698$code.=<<___ if (!$win64);
1699	pxor	%xmm6,%xmm6
1700	pxor	%xmm7,%xmm7
1701	movaps	%xmm0,0x00(%rsp)		# clear stack
1702	pxor	%xmm8,%xmm8
1703	movaps	%xmm0,0x10(%rsp)
1704	pxor	%xmm9,%xmm9
1705	movaps	%xmm0,0x20(%rsp)
1706	pxor	%xmm10,%xmm10
1707	movaps	%xmm0,0x30(%rsp)
1708	pxor	%xmm11,%xmm11
1709	movaps	%xmm0,0x40(%rsp)
1710	pxor	%xmm12,%xmm12
1711	movaps	%xmm0,0x50(%rsp)
1712	pxor	%xmm13,%xmm13
1713	movaps	%xmm0,0x60(%rsp)
1714	pxor	%xmm14,%xmm14
1715	movaps	%xmm0,0x70(%rsp)
1716	pxor	%xmm15,%xmm15
1717___
# Win64 builds: reload %xmm6-%xmm15 from the offsets they were spilled to in
# the prologue, overwrite each spill slot with zero, then zero the eight
# counter slots at 0x00..0x70(%rsp).
1718$code.=<<___ if ($win64);
1719	movaps	-0xa8($key_),%xmm6
1720	movaps	%xmm0,-0xa8($key_)		# clear stack
1721	movaps	-0x98($key_),%xmm7
1722	movaps	%xmm0,-0x98($key_)
1723	movaps	-0x88($key_),%xmm8
1724	movaps	%xmm0,-0x88($key_)
1725	movaps	-0x78($key_),%xmm9
1726	movaps	%xmm0,-0x78($key_)
1727	movaps	-0x68($key_),%xmm10
1728	movaps	%xmm0,-0x68($key_)
1729	movaps	-0x58($key_),%xmm11
1730	movaps	%xmm0,-0x58($key_)
1731	movaps	-0x48($key_),%xmm12
1732	movaps	%xmm0,-0x48($key_)
1733	movaps	-0x38($key_),%xmm13
1734	movaps	%xmm0,-0x38($key_)
1735	movaps	-0x28($key_),%xmm14
1736	movaps	%xmm0,-0x28($key_)
1737	movaps	-0x18($key_),%xmm15
1738	movaps	%xmm0,-0x18($key_)
1739	movaps	%xmm0,0x00(%rsp)
1740	movaps	%xmm0,0x10(%rsp)
1741	movaps	%xmm0,0x20(%rsp)
1742	movaps	%xmm0,0x30(%rsp)
1743	movaps	%xmm0,0x40(%rsp)
1744	movaps	%xmm0,0x50(%rsp)
1745	movaps	%xmm0,0x60(%rsp)
1746	movaps	%xmm0,0x70(%rsp)
1747___
# Tear down the frame: reload %rbp from -8($key_), where the prologue's push
# placed it, and restore %rsp from the frame pointer.  .Lctr32_epilogue is
# also the landing point of the frameless single-block path.
1748$code.=<<___;
1749	mov	-8($key_),%rbp
1750.cfi_restore	%rbp
1751	lea	($key_),%rsp
1752.cfi_def_cfa_register	%rsp
1753.Lctr32_epilogue:
1754	ret
1755.cfi_endproc
1756.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1757___
1758}
1759
1760######################################################################
1761# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1762#	const AES_KEY *key1, const AES_KEY *key2,
1763#	const unsigned char iv[16]);
1764#
1765{
1766my @tweak=map("%xmm$_",(10..15));
1767my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1768my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1769my $frame_size = 0x70 + ($win64?160:0);
1770my $key_ = "%rbp";	# override so that we can use %r11 as FP
1771
1772$code.=<<___;
1773.globl	aesni_xts_encrypt
1774.type	aesni_xts_encrypt,\@function,6
1775.align	16
1776aesni_xts_encrypt:
1777.cfi_startproc
1778	lea	(%rsp),%r11			# frame pointer
1779.cfi_def_cfa_register	%r11
1780	push	%rbp
1781.cfi_push	%rbp
1782	sub	\$$frame_size,%rsp
1783	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1784___
1785$code.=<<___ if ($win64);
1786	movaps	%xmm6,-0xa8(%r11)		# offload everything
1787	movaps	%xmm7,-0x98(%r11)
1788	movaps	%xmm8,-0x88(%r11)
1789	movaps	%xmm9,-0x78(%r11)
1790	movaps	%xmm10,-0x68(%r11)
1791	movaps	%xmm11,-0x58(%r11)
1792	movaps	%xmm12,-0x48(%r11)
1793	movaps	%xmm13,-0x38(%r11)
1794	movaps	%xmm14,-0x28(%r11)
1795	movaps	%xmm15,-0x18(%r11)
1796.Lxts_enc_body:
1797___
1798$code.=<<___;
1799	movups	($ivp),$inout0			# load clear-text tweak
1800	mov	240(%r8),$rounds		# key2->rounds
1801	mov	240($key),$rnds_		# key1->rounds
1802___
1803	# generate the tweak
1804	&aesni_generate1("enc",$key2,$rounds,$inout0);
1805$code.=<<___;
1806	$movkey	($key),$rndkey0			# zero round key
1807	mov	$key,$key_			# backup $key
1808	mov	$rnds_,$rounds			# backup $rounds
1809	shl	\$4,$rnds_
1810	mov	$len,$len_			# backup $len
1811	and	\$-16,$len
1812
1813	$movkey	16($key,$rnds_),$rndkey1	# last round key
1814
1815	movdqa	.Lxts_magic(%rip),$twmask
1816	movdqa	$inout0,@tweak[5]
1817	pshufd	\$0x5f,$inout0,$twres
1818	pxor	$rndkey0,$rndkey1
1819___
1820    # alternative tweak calculation algorithm is based on suggestions
1821    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1822    # and should help in the future...
1823    for ($i=0;$i<4;$i++) {
1824    $code.=<<___;
1825	movdqa	$twres,$twtmp
1826	paddd	$twres,$twres
1827	movdqa	@tweak[5],@tweak[$i]
1828	psrad	\$31,$twtmp			# broadcast upper bits
1829	paddq	@tweak[5],@tweak[5]
1830	pand	$twmask,$twtmp
1831	pxor	$rndkey0,@tweak[$i]
1832	pxor	$twtmp,@tweak[5]
1833___
1834    }
1835$code.=<<___;
1836	movdqa	@tweak[5],@tweak[4]
1837	psrad	\$31,$twres
1838	paddq	@tweak[5],@tweak[5]
1839	pand	$twmask,$twres
1840	pxor	$rndkey0,@tweak[4]
1841	pxor	$twres,@tweak[5]
1842	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1843
1844	sub	\$16*6,$len
1845	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1846
1847	mov	\$16+96,$rounds
1848	lea	32($key_,$rnds_),$key		# end of key schedule
1849	sub	%r10,%rax			# twisted $rounds
1850	$movkey	16($key_),$rndkey1
1851	mov	%rax,%r10			# backup twisted $rounds
1852	lea	.Lxts_magic(%rip),%r8
1853	jmp	.Lxts_enc_grandloop
1854
1855.align	32
1856.Lxts_enc_grandloop:
1857	movdqu	`16*0`($inp),$inout0		# load input
1858	movdqa	$rndkey0,$twmask
1859	movdqu	`16*1`($inp),$inout1
1860	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1861	movdqu	`16*2`($inp),$inout2
1862	pxor	@tweak[1],$inout1
1863	 aesenc		$rndkey1,$inout0
1864	movdqu	`16*3`($inp),$inout3
1865	pxor	@tweak[2],$inout2
1866	 aesenc		$rndkey1,$inout1
1867	movdqu	`16*4`($inp),$inout4
1868	pxor	@tweak[3],$inout3
1869	 aesenc		$rndkey1,$inout2
1870	movdqu	`16*5`($inp),$inout5
1871	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1872	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1873	pxor	@tweak[4],$inout4
1874	 aesenc		$rndkey1,$inout3
1875	$movkey	32($key_),$rndkey0
1876	lea	`16*6`($inp),$inp
1877	pxor	$twmask,$inout5
1878
1879	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1880	aesenc		$rndkey1,$inout4
1881	 pxor	$twres,@tweak[1]
1882	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1883	aesenc		$rndkey1,$inout5
1884	$movkey		48($key_),$rndkey1
1885	 pxor	$twres,@tweak[2]
1886
1887	aesenc		$rndkey0,$inout0
1888	 pxor	$twres,@tweak[3]
1889	 movdqa	@tweak[1],`16*1`(%rsp)
1890	aesenc		$rndkey0,$inout1
1891	 pxor	$twres,@tweak[4]
1892	 movdqa	@tweak[2],`16*2`(%rsp)
1893	aesenc		$rndkey0,$inout2
1894	aesenc		$rndkey0,$inout3
1895	 pxor	$twres,$twmask
1896	 movdqa	@tweak[4],`16*4`(%rsp)
1897	aesenc		$rndkey0,$inout4
1898	aesenc		$rndkey0,$inout5
1899	$movkey		64($key_),$rndkey0
1900	 movdqa	$twmask,`16*5`(%rsp)
1901	pshufd	\$0x5f,@tweak[5],$twres
1902	jmp	.Lxts_enc_loop6
1903.align	32
1904.Lxts_enc_loop6:
1905	aesenc		$rndkey1,$inout0
1906	aesenc		$rndkey1,$inout1
1907	aesenc		$rndkey1,$inout2
1908	aesenc		$rndkey1,$inout3
1909	aesenc		$rndkey1,$inout4
1910	aesenc		$rndkey1,$inout5
1911	$movkey		-64($key,%rax),$rndkey1
1912	add		\$32,%rax
1913
1914	aesenc		$rndkey0,$inout0
1915	aesenc		$rndkey0,$inout1
1916	aesenc		$rndkey0,$inout2
1917	aesenc		$rndkey0,$inout3
1918	aesenc		$rndkey0,$inout4
1919	aesenc		$rndkey0,$inout5
1920	$movkey		-80($key,%rax),$rndkey0
1921	jnz		.Lxts_enc_loop6
1922
1923	movdqa	(%r8),$twmask			# start calculating next tweak
1924	movdqa	$twres,$twtmp
1925	paddd	$twres,$twres
1926	 aesenc		$rndkey1,$inout0
1927	paddq	@tweak[5],@tweak[5]
1928	psrad	\$31,$twtmp
1929	 aesenc		$rndkey1,$inout1
1930	pand	$twmask,$twtmp
1931	$movkey	($key_),@tweak[0]		# load round[0]
1932	 aesenc		$rndkey1,$inout2
1933	 aesenc		$rndkey1,$inout3
1934	 aesenc		$rndkey1,$inout4
1935	pxor	$twtmp,@tweak[5]
1936	movaps	@tweak[0],@tweak[1]		# copy round[0]
1937	 aesenc		$rndkey1,$inout5
1938	 $movkey	-64($key),$rndkey1
1939
1940	movdqa	$twres,$twtmp
1941	 aesenc		$rndkey0,$inout0
1942	paddd	$twres,$twres
1943	pxor	@tweak[5],@tweak[0]
1944	 aesenc		$rndkey0,$inout1
1945	psrad	\$31,$twtmp
1946	paddq	@tweak[5],@tweak[5]
1947	 aesenc		$rndkey0,$inout2
1948	 aesenc		$rndkey0,$inout3
1949	pand	$twmask,$twtmp
1950	movaps	@tweak[1],@tweak[2]
1951	 aesenc		$rndkey0,$inout4
1952	pxor	$twtmp,@tweak[5]
1953	movdqa	$twres,$twtmp
1954	 aesenc		$rndkey0,$inout5
1955	 $movkey	-48($key),$rndkey0
1956
1957	paddd	$twres,$twres
1958	 aesenc		$rndkey1,$inout0
1959	pxor	@tweak[5],@tweak[1]
1960	psrad	\$31,$twtmp
1961	 aesenc		$rndkey1,$inout1
1962	paddq	@tweak[5],@tweak[5]
1963	pand	$twmask,$twtmp
1964	 aesenc		$rndkey1,$inout2
1965	 aesenc		$rndkey1,$inout3
1966	 movdqa	@tweak[3],`16*3`(%rsp)
1967	pxor	$twtmp,@tweak[5]
1968	 aesenc		$rndkey1,$inout4
1969	movaps	@tweak[2],@tweak[3]
1970	movdqa	$twres,$twtmp
1971	 aesenc		$rndkey1,$inout5
1972	 $movkey	-32($key),$rndkey1
1973
1974	paddd	$twres,$twres
1975	 aesenc		$rndkey0,$inout0
1976	pxor	@tweak[5],@tweak[2]
1977	psrad	\$31,$twtmp
1978	 aesenc		$rndkey0,$inout1
1979	paddq	@tweak[5],@tweak[5]
1980	pand	$twmask,$twtmp
1981	 aesenc		$rndkey0,$inout2
1982	 aesenc		$rndkey0,$inout3
1983	 aesenc		$rndkey0,$inout4
1984	pxor	$twtmp,@tweak[5]
1985	movaps	@tweak[3],@tweak[4]
1986	 aesenc		$rndkey0,$inout5
1987
1988	movdqa	$twres,$rndkey0
1989	paddd	$twres,$twres
1990	 aesenc		$rndkey1,$inout0
1991	pxor	@tweak[5],@tweak[3]
1992	psrad	\$31,$rndkey0
1993	 aesenc		$rndkey1,$inout1
1994	paddq	@tweak[5],@tweak[5]
1995	pand	$twmask,$rndkey0
1996	 aesenc		$rndkey1,$inout2
1997	 aesenc		$rndkey1,$inout3
1998	pxor	$rndkey0,@tweak[5]
1999	$movkey		($key_),$rndkey0
2000	 aesenc		$rndkey1,$inout4
2001	 aesenc		$rndkey1,$inout5
2002	$movkey		16($key_),$rndkey1
2003
2004	pxor	@tweak[5],@tweak[4]
2005	 aesenclast	`16*0`(%rsp),$inout0
2006	psrad	\$31,$twres
2007	paddq	@tweak[5],@tweak[5]
2008	 aesenclast	`16*1`(%rsp),$inout1
2009	 aesenclast	`16*2`(%rsp),$inout2
2010	pand	$twmask,$twres
2011	mov	%r10,%rax			# restore $rounds
2012	 aesenclast	`16*3`(%rsp),$inout3
2013	 aesenclast	`16*4`(%rsp),$inout4
2014	 aesenclast	`16*5`(%rsp),$inout5
2015	pxor	$twres,@tweak[5]
2016
2017	lea	`16*6`($out),$out		# $out+=6*16
2018	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2019	movups	$inout1,`-16*5`($out)
2020	movups	$inout2,`-16*4`($out)
2021	movups	$inout3,`-16*3`($out)
2022	movups	$inout4,`-16*2`($out)
2023	movups	$inout5,`-16*1`($out)
2024	sub	\$16*6,$len
2025	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
2026
2027	mov	\$16+96,$rounds
2028	sub	$rnds_,$rounds
2029	mov	$key_,$key			# restore $key
2030	shr	\$4,$rounds			# restore original value
2031
2032.Lxts_enc_short:
2033	# at the point @tweak[0..5] are populated with tweak values
2034	mov	$rounds,$rnds_			# backup $rounds
2035	pxor	$rndkey0,@tweak[0]
2036	add	\$16*6,$len			# restore real remaining $len
2037	jz	.Lxts_enc_done			# done if ($len==0)
2038
2039	pxor	$rndkey0,@tweak[1]
2040	cmp	\$0x20,$len
2041	jb	.Lxts_enc_one			# $len is 1*16
2042	pxor	$rndkey0,@tweak[2]
2043	je	.Lxts_enc_two			# $len is 2*16
2044
2045	pxor	$rndkey0,@tweak[3]
2046	cmp	\$0x40,$len
2047	jb	.Lxts_enc_three			# $len is 3*16
2048	pxor	$rndkey0,@tweak[4]
2049	je	.Lxts_enc_four			# $len is 4*16
2050
2051	movdqu	($inp),$inout0			# $len is 5*16
2052	movdqu	16*1($inp),$inout1
2053	movdqu	16*2($inp),$inout2
2054	pxor	@tweak[0],$inout0
2055	movdqu	16*3($inp),$inout3
2056	pxor	@tweak[1],$inout1
2057	movdqu	16*4($inp),$inout4
2058	lea	16*5($inp),$inp			# $inp+=5*16
2059	pxor	@tweak[2],$inout2
2060	pxor	@tweak[3],$inout3
2061	pxor	@tweak[4],$inout4
2062	pxor	$inout5,$inout5
2063
2064	call	_aesni_encrypt6
2065
2066	xorps	@tweak[0],$inout0
2067	movdqa	@tweak[5],@tweak[0]
2068	xorps	@tweak[1],$inout1
2069	xorps	@tweak[2],$inout2
2070	movdqu	$inout0,($out)			# store 5 output blocks
2071	xorps	@tweak[3],$inout3
2072	movdqu	$inout1,16*1($out)
2073	xorps	@tweak[4],$inout4
2074	movdqu	$inout2,16*2($out)
2075	movdqu	$inout3,16*3($out)
2076	movdqu	$inout4,16*4($out)
2077	lea	16*5($out),$out			# $out+=5*16
2078	jmp	.Lxts_enc_done
2079
2080.align	16
2081.Lxts_enc_one:
2082	movups	($inp),$inout0
2083	lea	16*1($inp),$inp			# inp+=1*16
2084	xorps	@tweak[0],$inout0
2085___
2086	&aesni_generate1("enc",$key,$rounds);
2087$code.=<<___;
2088	xorps	@tweak[0],$inout0
2089	movdqa	@tweak[1],@tweak[0]
2090	movups	$inout0,($out)			# store one output block
2091	lea	16*1($out),$out			# $out+=1*16
2092	jmp	.Lxts_enc_done
2093
2094.align	16
2095.Lxts_enc_two:
2096	movups	($inp),$inout0
2097	movups	16($inp),$inout1
2098	lea	32($inp),$inp			# $inp+=2*16
2099	xorps	@tweak[0],$inout0
2100	xorps	@tweak[1],$inout1
2101
2102	call	_aesni_encrypt2
2103
2104	xorps	@tweak[0],$inout0
2105	movdqa	@tweak[2],@tweak[0]
2106	xorps	@tweak[1],$inout1
2107	movups	$inout0,($out)			# store 2 output blocks
2108	movups	$inout1,16*1($out)
2109	lea	16*2($out),$out			# $out+=2*16
2110	jmp	.Lxts_enc_done
2111
2112.align	16
2113.Lxts_enc_three:
2114	movups	($inp),$inout0
2115	movups	16*1($inp),$inout1
2116	movups	16*2($inp),$inout2
2117	lea	16*3($inp),$inp			# $inp+=3*16
2118	xorps	@tweak[0],$inout0
2119	xorps	@tweak[1],$inout1
2120	xorps	@tweak[2],$inout2
2121
2122	call	_aesni_encrypt3
2123
2124	xorps	@tweak[0],$inout0
2125	movdqa	@tweak[3],@tweak[0]
2126	xorps	@tweak[1],$inout1
2127	xorps	@tweak[2],$inout2
2128	movups	$inout0,($out)			# store 3 output blocks
2129	movups	$inout1,16*1($out)
2130	movups	$inout2,16*2($out)
2131	lea	16*3($out),$out			# $out+=3*16
2132	jmp	.Lxts_enc_done
2133
2134.align	16
2135.Lxts_enc_four:
2136	movups	($inp),$inout0
2137	movups	16*1($inp),$inout1
2138	movups	16*2($inp),$inout2
2139	xorps	@tweak[0],$inout0
2140	movups	16*3($inp),$inout3
2141	lea	16*4($inp),$inp			# $inp+=4*16
2142	xorps	@tweak[1],$inout1
2143	xorps	@tweak[2],$inout2
2144	xorps	@tweak[3],$inout3
2145
2146	call	_aesni_encrypt4
2147
2148	pxor	@tweak[0],$inout0
2149	movdqa	@tweak[4],@tweak[0]
2150	pxor	@tweak[1],$inout1
2151	pxor	@tweak[2],$inout2
2152	movdqu	$inout0,($out)			# store 4 output blocks
2153	pxor	@tweak[3],$inout3
2154	movdqu	$inout1,16*1($out)
2155	movdqu	$inout2,16*2($out)
2156	movdqu	$inout3,16*3($out)
2157	lea	16*4($out),$out			# $out+=4*16
2158	jmp	.Lxts_enc_done
2159
2160.align	16
2161.Lxts_enc_done:
2162	and	\$15,$len_			# see if $len%16 is 0
2163	jz	.Lxts_enc_ret
2164	mov	$len_,$len
2165
2166.Lxts_enc_steal:
2167	movzb	($inp),%eax			# borrow $rounds ...
2168	movzb	-16($out),%ecx			# ... and $key
2169	lea	1($inp),$inp
2170	mov	%al,-16($out)
2171	mov	%cl,0($out)
2172	lea	1($out),$out
2173	sub	\$1,$len
2174	jnz	.Lxts_enc_steal
2175
2176	sub	$len_,$out			# rewind $out
2177	mov	$key_,$key			# restore $key
2178	mov	$rnds_,$rounds			# restore $rounds
2179
2180	movups	-16($out),$inout0
2181	xorps	@tweak[0],$inout0
2182___
2183	&aesni_generate1("enc",$key,$rounds);
2184$code.=<<___;
2185	xorps	@tweak[0],$inout0
2186	movups	$inout0,-16($out)
2187
2188.Lxts_enc_ret:
2189	xorps	%xmm0,%xmm0			# clear register bank
2190	pxor	%xmm1,%xmm1
2191	pxor	%xmm2,%xmm2
2192	pxor	%xmm3,%xmm3
2193	pxor	%xmm4,%xmm4
2194	pxor	%xmm5,%xmm5
2195___
2196$code.=<<___ if (!$win64);
2197	pxor	%xmm6,%xmm6
2198	pxor	%xmm7,%xmm7
2199	movaps	%xmm0,0x00(%rsp)		# clear stack
2200	pxor	%xmm8,%xmm8
2201	movaps	%xmm0,0x10(%rsp)
2202	pxor	%xmm9,%xmm9
2203	movaps	%xmm0,0x20(%rsp)
2204	pxor	%xmm10,%xmm10
2205	movaps	%xmm0,0x30(%rsp)
2206	pxor	%xmm11,%xmm11
2207	movaps	%xmm0,0x40(%rsp)
2208	pxor	%xmm12,%xmm12
2209	movaps	%xmm0,0x50(%rsp)
2210	pxor	%xmm13,%xmm13
2211	movaps	%xmm0,0x60(%rsp)
2212	pxor	%xmm14,%xmm14
2213	pxor	%xmm15,%xmm15
2214___
2215$code.=<<___ if ($win64);
2216	movaps	-0xa8(%r11),%xmm6
2217	movaps	%xmm0,-0xa8(%r11)		# clear stack
2218	movaps	-0x98(%r11),%xmm7
2219	movaps	%xmm0,-0x98(%r11)
2220	movaps	-0x88(%r11),%xmm8
2221	movaps	%xmm0,-0x88(%r11)
2222	movaps	-0x78(%r11),%xmm9
2223	movaps	%xmm0,-0x78(%r11)
2224	movaps	-0x68(%r11),%xmm10
2225	movaps	%xmm0,-0x68(%r11)
2226	movaps	-0x58(%r11),%xmm11
2227	movaps	%xmm0,-0x58(%r11)
2228	movaps	-0x48(%r11),%xmm12
2229	movaps	%xmm0,-0x48(%r11)
2230	movaps	-0x38(%r11),%xmm13
2231	movaps	%xmm0,-0x38(%r11)
2232	movaps	-0x28(%r11),%xmm14
2233	movaps	%xmm0,-0x28(%r11)
2234	movaps	-0x18(%r11),%xmm15
2235	movaps	%xmm0,-0x18(%r11)
2236	movaps	%xmm0,0x00(%rsp)
2237	movaps	%xmm0,0x10(%rsp)
2238	movaps	%xmm0,0x20(%rsp)
2239	movaps	%xmm0,0x30(%rsp)
2240	movaps	%xmm0,0x40(%rsp)
2241	movaps	%xmm0,0x50(%rsp)
2242	movaps	%xmm0,0x60(%rsp)
2243___
2244$code.=<<___;
2245	mov	-8(%r11),%rbp
2246.cfi_restore	%rbp
2247	lea	(%r11),%rsp
2248.cfi_def_cfa_register	%rsp
2249.Lxts_enc_epilogue:
2250	ret
2251.cfi_endproc
2252.size	aesni_xts_encrypt,.-aesni_xts_encrypt
2253___
2254
# aesni_xts_decrypt -- entry sequence and tweak setup.
# The sequences emitted below build a 16-byte aligned stack frame that is
# addressed through %r11, save %xmm6-%xmm15 on Win64, encrypt the tweak loaded
# from $ivp with $key2, and populate @tweak[0..5] before branching either to
# the six-block loop (.Lxts_dec_grandloop) or to .Lxts_dec_short.
2255$code.=<<___;
2256.globl	aesni_xts_decrypt
2257.type	aesni_xts_decrypt,\@function,6
2258.align	16
2259aesni_xts_decrypt:
2260.cfi_startproc
2261	lea	(%rsp),%r11			# frame pointer
2262.cfi_def_cfa_register	%r11
2263	push	%rbp
2264.cfi_push	%rbp
2265	sub	\$$frame_size,%rsp
2266	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2267___
# Win64 only: park %xmm6-%xmm15 at negative displacements from %r11; the
# matching reloads are emitted after .Lxts_dec_ret.
2268$code.=<<___ if ($win64);
2269	movaps	%xmm6,-0xa8(%r11)		# offload everything
2270	movaps	%xmm7,-0x98(%r11)
2271	movaps	%xmm8,-0x88(%r11)
2272	movaps	%xmm9,-0x78(%r11)
2273	movaps	%xmm10,-0x68(%r11)
2274	movaps	%xmm11,-0x58(%r11)
2275	movaps	%xmm12,-0x48(%r11)
2276	movaps	%xmm13,-0x38(%r11)
2277	movaps	%xmm14,-0x28(%r11)
2278	movaps	%xmm15,-0x18(%r11)
2279.Lxts_dec_body:
2280___
2281$code.=<<___;
2282	movups	($ivp),$inout0			# load clear-text tweak
2283	mov	240($key2),$rounds		# key2->rounds
2284	mov	240($key),$rnds_		# key1->rounds
2285___
2286	# generate the tweak
2287	&aesni_generate1("enc",$key2,$rounds,$inout0);
# When the length is not a multiple of 16 one full block is held back from the
# bulk paths (len -= 16); the unrounded length is kept in $len_ and is tested
# again at .Lxts_dec_done to decide whether the stealing tail has to run.
2288$code.=<<___;
2289	xor	%eax,%eax			# if ($len%16) len-=16;
2290	test	\$15,$len
2291	setnz	%al
2292	shl	\$4,%rax
2293	sub	%rax,$len
2294
2295	$movkey	($key),$rndkey0			# zero round key
2296	mov	$key,$key_			# backup $key
2297	mov	$rnds_,$rounds			# backup $rounds
2298	shl	\$4,$rnds_
2299	mov	$len,$len_			# backup $len
2300	and	\$-16,$len
2301
2302	$movkey	16($key,$rnds_),$rndkey1	# last round key
2303
2304	movdqa	.Lxts_magic(%rip),$twmask
2305	movdqa	$inout0,@tweak[5]
2306	pshufd	\$0x5f,$inout0,$twres
2307	pxor	$rndkey0,$rndkey1
2308___
    # Emit four copies of the tweak-update sequence.  Each copy stores the
    # current @tweak[5], xored with $rndkey0, into @tweak[$i] and then advances
    # @tweak[5]: its quadwords are doubled (paddq) and $twmask is xored in
    # under the sign mask broadcast from $twres (psrad 31 / pand).
2309    for ($i=0;$i<4;$i++) {
2310    $code.=<<___;
2311	movdqa	$twres,$twtmp
2312	paddd	$twres,$twres
2313	movdqa	@tweak[5],@tweak[$i]
2314	psrad	\$31,$twtmp			# broadcast upper bits
2315	paddq	@tweak[5],@tweak[5]
2316	pand	$twmask,$twtmp
2317	pxor	$rndkey0,@tweak[$i]
2318	pxor	$twtmp,@tweak[5]
2319___
2320    }
# Fifth tweak plus one more update of @tweak[5] (which stays free of
# $rndkey0), then the choice between the six-block loop and the short path.
2321$code.=<<___;
2322	movdqa	@tweak[5],@tweak[4]
2323	psrad	\$31,$twres
2324	paddq	@tweak[5],@tweak[5]
2325	pand	$twmask,$twres
2326	pxor	$rndkey0,@tweak[4]
2327	pxor	$twres,@tweak[5]
2328	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2329
2330	sub	\$16*6,$len
2331	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2332
2333	mov	\$16+96,$rounds
2334	lea	32($key_,$rnds_),$key		# end of key schedule
2335	sub	%r10,%rax			# twisted $rounds
2336	$movkey	16($key_),$rndkey1
2337	mov	%rax,%r10			# backup twisted $rounds
2338	lea	.Lxts_magic(%rip),%r8
2339	jmp	.Lxts_dec_grandloop
2340
	# Six-block main loop, first part: load six input blocks, xor each
	# with its tweak (which already carries round[0]) and issue the first
	# two rounds.  In parallel the tweaks are xored with the saved
	# round[0]^round[last] value and parked at 16*0..16*5 on the stack;
	# the slot at 16*3 is written later, after .Lxts_dec_loop6 falls through.
2341.align	32
2342.Lxts_dec_grandloop:
2343	movdqu	`16*0`($inp),$inout0		# load input
2344	movdqa	$rndkey0,$twmask
2345	movdqu	`16*1`($inp),$inout1
2346	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
2347	movdqu	`16*2`($inp),$inout2
2348	pxor	@tweak[1],$inout1
2349	 aesdec		$rndkey1,$inout0
2350	movdqu	`16*3`($inp),$inout3
2351	pxor	@tweak[2],$inout2
2352	 aesdec		$rndkey1,$inout1
2353	movdqu	`16*4`($inp),$inout4
2354	pxor	@tweak[3],$inout3
2355	 aesdec		$rndkey1,$inout2
2356	movdqu	`16*5`($inp),$inout5
2357	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2358	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2359	pxor	@tweak[4],$inout4
2360	 aesdec		$rndkey1,$inout3
2361	$movkey	32($key_),$rndkey0
2362	lea	`16*6`($inp),$inp
2363	pxor	$twmask,$inout5
2364
2365	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2366	aesdec		$rndkey1,$inout4
2367	 pxor	$twres,@tweak[1]
2368	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
2369	aesdec		$rndkey1,$inout5
2370	$movkey		48($key_),$rndkey1
2371	 pxor	$twres,@tweak[2]
2372
2373	aesdec		$rndkey0,$inout0
2374	 pxor	$twres,@tweak[3]
2375	 movdqa	@tweak[1],`16*1`(%rsp)
2376	aesdec		$rndkey0,$inout1
2377	 pxor	$twres,@tweak[4]
2378	 movdqa	@tweak[2],`16*2`(%rsp)
2379	aesdec		$rndkey0,$inout2
2380	aesdec		$rndkey0,$inout3
2381	 pxor	$twres,$twmask
2382	 movdqa	@tweak[4],`16*4`(%rsp)
2383	aesdec		$rndkey0,$inout4
2384	aesdec		$rndkey0,$inout5
2385	$movkey		64($key_),$rndkey0
2386	 movdqa	$twmask,`16*5`(%rsp)
2387	pshufd	\$0x5f,@tweak[5],$twres
2388	jmp	.Lxts_dec_loop6
	# Middle rounds for six blocks, two round keys per pass.  The keys are
	# addressed relative to the end of the key schedule through the
	# "twisted" counter in %rax, which is advanced by 32 per pass; the
	# jnz presumably consumes the flags left by that add, since only a
	# key load and round instructions sit in between.
2389.align	32
2390.Lxts_dec_loop6:
2391	aesdec		$rndkey1,$inout0
2392	aesdec		$rndkey1,$inout1
2393	aesdec		$rndkey1,$inout2
2394	aesdec		$rndkey1,$inout3
2395	aesdec		$rndkey1,$inout4
2396	aesdec		$rndkey1,$inout5
2397	$movkey		-64($key,%rax),$rndkey1
2398	add		\$32,%rax
2399
2400	aesdec		$rndkey0,$inout0
2401	aesdec		$rndkey0,$inout1
2402	aesdec		$rndkey0,$inout2
2403	aesdec		$rndkey0,$inout3
2404	aesdec		$rndkey0,$inout4
2405	aesdec		$rndkey0,$inout5
2406	$movkey		-80($key,%rax),$rndkey0
2407	jnz		.Lxts_dec_loop6
2408
	# Remaining rounds, interleaved with the computation of the next six
	# tweaks.  The new tweak[0..4] are built as round[0] xor tweak so the
	# next iteration can xor them straight into the input.
2409	movdqa	(%r8),$twmask			# start calculating next tweak
2410	movdqa	$twres,$twtmp
2411	paddd	$twres,$twres
2412	 aesdec		$rndkey1,$inout0
2413	paddq	@tweak[5],@tweak[5]
2414	psrad	\$31,$twtmp
2415	 aesdec		$rndkey1,$inout1
2416	pand	$twmask,$twtmp
2417	$movkey	($key_),@tweak[0]		# load round[0]
2418	 aesdec		$rndkey1,$inout2
2419	 aesdec		$rndkey1,$inout3
2420	 aesdec		$rndkey1,$inout4
2421	pxor	$twtmp,@tweak[5]
2422	movaps	@tweak[0],@tweak[1]		# copy round[0]
2423	 aesdec		$rndkey1,$inout5
2424	 $movkey	-64($key),$rndkey1
2425
2426	movdqa	$twres,$twtmp
2427	 aesdec		$rndkey0,$inout0
2428	paddd	$twres,$twres
2429	pxor	@tweak[5],@tweak[0]
2430	 aesdec		$rndkey0,$inout1
2431	psrad	\$31,$twtmp
2432	paddq	@tweak[5],@tweak[5]
2433	 aesdec		$rndkey0,$inout2
2434	 aesdec		$rndkey0,$inout3
2435	pand	$twmask,$twtmp
2436	movaps	@tweak[1],@tweak[2]
2437	 aesdec		$rndkey0,$inout4
2438	pxor	$twtmp,@tweak[5]
2439	movdqa	$twres,$twtmp
2440	 aesdec		$rndkey0,$inout5
2441	 $movkey	-48($key),$rndkey0
2442
2443	paddd	$twres,$twres
2444	 aesdec		$rndkey1,$inout0
2445	pxor	@tweak[5],@tweak[1]
2446	psrad	\$31,$twtmp
2447	 aesdec		$rndkey1,$inout1
2448	paddq	@tweak[5],@tweak[5]
2449	pand	$twmask,$twtmp
2450	 aesdec		$rndkey1,$inout2
2451	 aesdec		$rndkey1,$inout3
	# the old tweak[3] (already xored with round[0]^round[last]) goes to
	# its stack slot here, just before the register is reused
2452	 movdqa	@tweak[3],`16*3`(%rsp)
2453	pxor	$twtmp,@tweak[5]
2454	 aesdec		$rndkey1,$inout4
2455	movaps	@tweak[2],@tweak[3]
2456	movdqa	$twres,$twtmp
2457	 aesdec		$rndkey1,$inout5
2458	 $movkey	-32($key),$rndkey1
2459
2460	paddd	$twres,$twres
2461	 aesdec		$rndkey0,$inout0
2462	pxor	@tweak[5],@tweak[2]
2463	psrad	\$31,$twtmp
2464	 aesdec		$rndkey0,$inout1
2465	paddq	@tweak[5],@tweak[5]
2466	pand	$twmask,$twtmp
2467	 aesdec		$rndkey0,$inout2
2468	 aesdec		$rndkey0,$inout3
2469	 aesdec		$rndkey0,$inout4
2470	pxor	$twtmp,@tweak[5]
2471	movaps	@tweak[3],@tweak[4]
2472	 aesdec		$rndkey0,$inout5
2473
2474	movdqa	$twres,$rndkey0
2475	paddd	$twres,$twres
2476	 aesdec		$rndkey1,$inout0
2477	pxor	@tweak[5],@tweak[3]
2478	psrad	\$31,$rndkey0
2479	 aesdec		$rndkey1,$inout1
2480	paddq	@tweak[5],@tweak[5]
2481	pand	$twmask,$rndkey0
2482	 aesdec		$rndkey1,$inout2
2483	 aesdec		$rndkey1,$inout3
2484	pxor	$rndkey0,@tweak[5]
2485	$movkey		($key_),$rndkey0
2486	 aesdec		$rndkey1,$inout4
2487	 aesdec		$rndkey1,$inout5
2488	$movkey		16($key_),$rndkey1
2489
	# Final round: the key operand is the parked tweak^round[last] value,
	# so the closing tweak xor is folded into the last round instruction.
2490	pxor	@tweak[5],@tweak[4]
2491	 aesdeclast	`16*0`(%rsp),$inout0
2492	psrad	\$31,$twres
2493	paddq	@tweak[5],@tweak[5]
2494	 aesdeclast	`16*1`(%rsp),$inout1
2495	 aesdeclast	`16*2`(%rsp),$inout2
2496	pand	$twmask,$twres
2497	mov	%r10,%rax			# restore $rounds
2498	 aesdeclast	`16*3`(%rsp),$inout3
2499	 aesdeclast	`16*4`(%rsp),$inout4
2500	 aesdeclast	`16*5`(%rsp),$inout5
2501	pxor	$twres,@tweak[5]
2502
2503	lea	`16*6`($out),$out		# $out+=6*16
2504	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2505	movups	$inout1,`-16*5`($out)
2506	movups	$inout2,`-16*4`($out)
2507	movups	$inout3,`-16*3`($out)
2508	movups	$inout4,`-16*2`($out)
2509	movups	$inout5,`-16*1`($out)
2510	sub	\$16*6,$len
2511	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2512
	# Loop exit: undo the "twisted" round counter and the key pointer
	# before falling into the short-input path.
2513	mov	\$16+96,$rounds
2514	sub	$rnds_,$rounds
2515	mov	$key_,$key			# restore $key
2516	shr	\$4,$rounds			# restore original value
2517
	# Fewer than six full blocks remain.  tweak[0..4] still carry
	# round[0]; xoring with rndkey0 (round[0] on both entry paths) strips
	# it again, interleaved with a dispatch on the remaining length.
2518.Lxts_dec_short:
2519	# at the point @tweak[0..5] are populated with tweak values
2520	mov	$rounds,$rnds_			# backup $rounds
2521	pxor	$rndkey0,@tweak[0]
2522	pxor	$rndkey0,@tweak[1]
2523	add	\$16*6,$len			# restore real remaining $len
2524	jz	.Lxts_dec_done			# done if ($len==0)
2525
2526	pxor	$rndkey0,@tweak[2]
2527	cmp	\$0x20,$len
2528	jb	.Lxts_dec_one			# $len is 1*16
2529	pxor	$rndkey0,@tweak[3]
2530	je	.Lxts_dec_two			# $len is 2*16
2531
2532	pxor	$rndkey0,@tweak[4]
2533	cmp	\$0x40,$len
2534	jb	.Lxts_dec_three			# $len is 3*16
2535	je	.Lxts_dec_four			# $len is 4*16
2536
2537	movdqu	($inp),$inout0			# $len is 5*16
2538	movdqu	16*1($inp),$inout1
2539	movdqu	16*2($inp),$inout2
2540	pxor	@tweak[0],$inout0
2541	movdqu	16*3($inp),$inout3
2542	pxor	@tweak[1],$inout1
2543	movdqu	16*4($inp),$inout4
2544	lea	16*5($inp),$inp			# $inp+=5*16
2545	pxor	@tweak[2],$inout2
2546	pxor	@tweak[3],$inout3
2547	pxor	@tweak[4],$inout4
2548
2549	call	_aesni_decrypt6
2550
	# Post-whitening and stores; the indented instructions in between
	# turn the sign bits of tweak[5] into a mask (compare against zero,
	# then shuffle) that is only needed if a partial block follows.
2551	xorps	@tweak[0],$inout0
2552	xorps	@tweak[1],$inout1
2553	xorps	@tweak[2],$inout2
2554	movdqu	$inout0,($out)			# store 5 output blocks
2555	xorps	@tweak[3],$inout3
2556	movdqu	$inout1,16*1($out)
2557	xorps	@tweak[4],$inout4
2558	movdqu	$inout2,16*2($out)
2559	 pxor		$twtmp,$twtmp
2560	movdqu	$inout3,16*3($out)
2561	 pcmpgtd	@tweak[5],$twtmp
2562	movdqu	$inout4,16*4($out)
2563	lea	16*5($out),$out			# $out+=5*16
2564	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2565	and	\$15,$len_
2566	jz	.Lxts_dec_ret
2567
	# Partial tail present: tweak[0] keeps the current tweak[5] and
	# tweak[1] receives the one after it; .Lxts_dec_done2 uses tweak[1]
	# first and tweak[0] second.
2568	movdqa	@tweak[5],@tweak[0]
2569	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2570	pand	$twmask,@tweak[1]		# isolate carry and residue
2571	pxor	@tweak[5],@tweak[1]
2572	jmp	.Lxts_dec_done2
2573
	# One full block left: xor with tweak[0], decrypt with the single-block
	# sequence generated below, xor with tweak[0] again.  Afterwards the
	# next two tweaks are moved down to slots 0 and 1 for a possible
	# stealing tail at .Lxts_dec_done.
2574.align	16
2575.Lxts_dec_one:
2576	movups	($inp),$inout0
2577	lea	16*1($inp),$inp			# $inp+=1*16
2578	xorps	@tweak[0],$inout0
2579___
2580	&aesni_generate1("dec",$key,$rounds);
2581$code.=<<___;
2582	xorps	@tweak[0],$inout0
2583	movdqa	@tweak[1],@tweak[0]
2584	movups	$inout0,($out)			# store one output block
2585	movdqa	@tweak[2],@tweak[1]
2586	lea	16*1($out),$out			# $out+=1*16
2587	jmp	.Lxts_dec_done
2588
	# Two full blocks left: whiten with tweak[0..1], decrypt both through
	# the two-block helper, whiten again and store.  tweak[2] and tweak[3]
	# are moved down to slots 0 and 1 for a possible stealing tail.
2589.align	16
2590.Lxts_dec_two:
2591	movups	($inp),$inout0
2592	movups	16($inp),$inout1
2593	lea	32($inp),$inp			# $inp+=2*16
2594	xorps	@tweak[0],$inout0
2595	xorps	@tweak[1],$inout1
2596
2597	call	_aesni_decrypt2
2598
2599	xorps	@tweak[0],$inout0
2600	movdqa	@tweak[2],@tweak[0]
2601	xorps	@tweak[1],$inout1
2602	movdqa	@tweak[3],@tweak[1]
2603	movups	$inout0,($out)			# store 2 output blocks
2604	movups	$inout1,16*1($out)
2605	lea	16*2($out),$out			# $out+=2*16
2606	jmp	.Lxts_dec_done
2607
	# Three full blocks left: whiten with tweak[0..2], decrypt through the
	# three-block helper, whiten again and store.  tweak[3] and tweak[4]
	# are moved down to slots 0 and 1 for a possible stealing tail.
2608.align	16
2609.Lxts_dec_three:
2610	movups	($inp),$inout0
2611	movups	16*1($inp),$inout1
2612	movups	16*2($inp),$inout2
2613	lea	16*3($inp),$inp			# $inp+=3*16
2614	xorps	@tweak[0],$inout0
2615	xorps	@tweak[1],$inout1
2616	xorps	@tweak[2],$inout2
2617
2618	call	_aesni_decrypt3
2619
2620	xorps	@tweak[0],$inout0
2621	movdqa	@tweak[3],@tweak[0]
2622	xorps	@tweak[1],$inout1
2623	movdqa	@tweak[4],@tweak[1]
2624	xorps	@tweak[2],$inout2
2625	movups	$inout0,($out)			# store 3 output blocks
2626	movups	$inout1,16*1($out)
2627	movups	$inout2,16*2($out)
2628	lea	16*3($out),$out			# $out+=3*16
2629	jmp	.Lxts_dec_done
2630
	# Four full blocks left: whiten with tweak[0..3], decrypt through the
	# four-block helper, whiten again and store.  tweak[4] and tweak[5]
	# are moved down to slots 0 and 1 for a possible stealing tail.
2631.align	16
2632.Lxts_dec_four:
2633	movups	($inp),$inout0
2634	movups	16*1($inp),$inout1
2635	movups	16*2($inp),$inout2
2636	xorps	@tweak[0],$inout0
2637	movups	16*3($inp),$inout3
2638	lea	16*4($inp),$inp			# $inp+=4*16
2639	xorps	@tweak[1],$inout1
2640	xorps	@tweak[2],$inout2
2641	xorps	@tweak[3],$inout3
2642
2643	call	_aesni_decrypt4
2644
2645	pxor	@tweak[0],$inout0
2646	movdqa	@tweak[4],@tweak[0]
2647	pxor	@tweak[1],$inout1
2648	movdqa	@tweak[5],@tweak[1]
2649	pxor	@tweak[2],$inout2
2650	movdqu	$inout0,($out)			# store 4 output blocks
2651	pxor	@tweak[3],$inout3
2652	movdqu	$inout1,16*1($out)
2653	movdqu	$inout2,16*2($out)
2654	movdqu	$inout3,16*3($out)
2655	lea	16*4($out),$out			# $out+=4*16
2656	jmp	.Lxts_dec_done
2657
	# All full blocks handled by the bulk paths are written.  If the
	# original length had a remainder, run the stealing tail: the block
	# that was held back is decrypted with tweak[1], bytes are swapped
	# with the trailing partial input, and the rebuilt block is decrypted
	# with tweak[0].
2658.align	16
2659.Lxts_dec_done:
2660	and	\$15,$len_			# see if $len%16 is 0
2661	jz	.Lxts_dec_ret
2662.Lxts_dec_done2:
2663	mov	$len_,$len
2664	mov	$key_,$key			# restore $key
2665	mov	$rnds_,$rounds			# restore $rounds
2666
2667	movups	($inp),$inout0
2668	xorps	@tweak[1],$inout0
2669___
2670	&aesni_generate1("dec",$key,$rounds);
2671$code.=<<___;
2672	xorps	@tweak[1],$inout0
2673	movups	$inout0,($out)
2674
	# Byte loop, one iteration per remainder byte: the input byte at
	# offset 16 replaces the byte at offset 0 of the output, and the
	# byte it displaces is written to offset 16 of the output.
2675.Lxts_dec_steal:
2676	movzb	16($inp),%eax			# borrow $rounds ...
2677	movzb	($out),%ecx			# ... and $key
2678	lea	1($inp),$inp
2679	mov	%al,($out)
2680	mov	%cl,16($out)
2681	lea	1($out),$out
2682	sub	\$1,$len
2683	jnz	.Lxts_dec_steal
2684
2685	sub	$len_,$out			# rewind $out
2686	mov	$key_,$key			# restore $key
2687	mov	$rnds_,$rounds			# restore $rounds
2688
2689	movups	($out),$inout0
2690	xorps	@tweak[0],$inout0
2691___
2692	&aesni_generate1("dec",$key,$rounds);
2693$code.=<<___;
2694	xorps	@tweak[0],$inout0
2695	movups	$inout0,($out)
2696
	# Common exit: zero the vector registers and the seven 16-byte stack
	# slots used by the routine, restore the callee-saved state and unwind
	# the frame through r11.
2697.Lxts_dec_ret:
2698	xorps	%xmm0,%xmm0			# clear register bank
2699	pxor	%xmm1,%xmm1
2700	pxor	%xmm2,%xmm2
2701	pxor	%xmm3,%xmm3
2702	pxor	%xmm4,%xmm4
2703	pxor	%xmm5,%xmm5
2704___
# Non-Win64: the upper vector registers are simply zeroed, interleaved with
# the stores that wipe the stack slots.
2705$code.=<<___ if (!$win64);
2706	pxor	%xmm6,%xmm6
2707	pxor	%xmm7,%xmm7
2708	movaps	%xmm0,0x00(%rsp)		# clear stack
2709	pxor	%xmm8,%xmm8
2710	movaps	%xmm0,0x10(%rsp)
2711	pxor	%xmm9,%xmm9
2712	movaps	%xmm0,0x20(%rsp)
2713	pxor	%xmm10,%xmm10
2714	movaps	%xmm0,0x30(%rsp)
2715	pxor	%xmm11,%xmm11
2716	movaps	%xmm0,0x40(%rsp)
2717	pxor	%xmm12,%xmm12
2718	movaps	%xmm0,0x50(%rsp)
2719	pxor	%xmm13,%xmm13
2720	movaps	%xmm0,0x60(%rsp)
2721	pxor	%xmm14,%xmm14
2722	pxor	%xmm15,%xmm15
2723___
# Win64: %xmm6-%xmm15 are reloaded from the save area under %r11 and each
# save slot is overwritten with zero right after it has been read.
2724$code.=<<___ if ($win64);
2725	movaps	-0xa8(%r11),%xmm6
2726	movaps	%xmm0,-0xa8(%r11)		# clear stack
2727	movaps	-0x98(%r11),%xmm7
2728	movaps	%xmm0,-0x98(%r11)
2729	movaps	-0x88(%r11),%xmm8
2730	movaps	%xmm0,-0x88(%r11)
2731	movaps	-0x78(%r11),%xmm9
2732	movaps	%xmm0,-0x78(%r11)
2733	movaps	-0x68(%r11),%xmm10
2734	movaps	%xmm0,-0x68(%r11)
2735	movaps	-0x58(%r11),%xmm11
2736	movaps	%xmm0,-0x58(%r11)
2737	movaps	-0x48(%r11),%xmm12
2738	movaps	%xmm0,-0x48(%r11)
2739	movaps	-0x38(%r11),%xmm13
2740	movaps	%xmm0,-0x38(%r11)
2741	movaps	-0x28(%r11),%xmm14
2742	movaps	%xmm0,-0x28(%r11)
2743	movaps	-0x18(%r11),%xmm15
2744	movaps	%xmm0,-0x18(%r11)
2745	movaps	%xmm0,0x00(%rsp)
2746	movaps	%xmm0,0x10(%rsp)
2747	movaps	%xmm0,0x20(%rsp)
2748	movaps	%xmm0,0x30(%rsp)
2749	movaps	%xmm0,0x40(%rsp)
2750	movaps	%xmm0,0x50(%rsp)
2751	movaps	%xmm0,0x60(%rsp)
2752___
# %rbp was pushed immediately below the entry stack pointer kept in %r11.
2753$code.=<<___;
2754	mov	-8(%r11),%rbp
2755.cfi_restore	%rbp
2756	lea	(%r11),%rsp
2757.cfi_def_cfa_register	%rsp
2758.Lxts_dec_epilogue:
2759	ret
2760.cfi_endproc
2761.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2762___
2763}
2764
2765######################################################################
2766# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2767#	const AES_KEY *key, unsigned int start_block_num,
2768#	unsigned char offset_i[16], const unsigned char L_[][16],
2769#	unsigned char checksum[16]);
2770#
2771{
# Register and stack-offset assignments used by the OCB code emitted below.
my @offset = map { "%xmm" . $_ } (10 .. 15);	# six offset registers
my $checksum = "%xmm8";
my $rndkey0l = "%xmm9";
my $block_num = "%r8";				# 5th argument
my $offset_p = "%r9";				# 6th argument
my $L_p = "%rbx";
my $checksum_p = "%rbp";
my $i1 = "%r12";
my $i3 = "%r13";
my $i5 = "%r14";
# Displacement of the 7th argument from the stack pointer captured at entry;
# Win64 uses the larger value.
my $seventh_arg = 8;
$seventh_arg = 56 if ($win64);
my $blocks = $len;
2779
# aesni_ocb_encrypt -- entry sequence.
# Captures the entry stack pointer in %rax, saves %rbx, %rbp and %r12-%r14
# (plus %xmm6-%xmm15 on Win64), fetches the two stack-passed arguments,
# prepares the round-key and offset registers, and aligns the block counter
# so that the bulk code always starts on an odd-numbered block.
2780$code.=<<___;
2781.globl	aesni_ocb_encrypt
2782.type	aesni_ocb_encrypt,\@function,6
2783.align	32
2784aesni_ocb_encrypt:
2785.cfi_startproc
2786	lea	(%rsp),%rax
2787	push	%rbx
2788.cfi_push	%rbx
2789	push	%rbp
2790.cfi_push	%rbp
2791	push	%r12
2792.cfi_push	%r12
2793	push	%r13
2794.cfi_push	%r13
2795	push	%r14
2796.cfi_push	%r14
2797___
2798$code.=<<___ if ($win64);
2799	lea	-0xa0(%rsp),%rsp
2800	movaps	%xmm6,0x00(%rsp)		# offload everything
2801	movaps	%xmm7,0x10(%rsp)
2802	movaps	%xmm8,0x20(%rsp)
2803	movaps	%xmm9,0x30(%rsp)
2804	movaps	%xmm10,0x40(%rsp)
2805	movaps	%xmm11,0x50(%rsp)
2806	movaps	%xmm12,0x60(%rsp)
2807	movaps	%xmm13,0x70(%rsp)
2808	movaps	%xmm14,0x80(%rsp)
2809	movaps	%xmm15,0x90(%rsp)
2810.Locb_enc_body:
2811___
2812$code.=<<___;
2813	mov	$seventh_arg(%rax),$L_p		# 7th argument
2814	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
2815
2816	mov	240($key),$rnds_
2817	mov	$key,$key_
2818	shl	\$4,$rnds_
2819	$movkey	($key),$rndkey0l		# round[0]
2820	$movkey	16($key,$rnds_),$rndkey1	# round[last]
2821
2822	movdqu	($offset_p),@offset[5]		# load last offset_i
2823	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
2824	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2825
2826	mov	\$16+32,$rounds
2827	lea	32($key_,$rnds_),$key
2828	$movkey	16($key_),$rndkey1		# round[1]
2829	sub	%r10,%rax			# twisted $rounds
2830	mov	%rax,%r10			# backup twisted $rounds
2831
2832	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
2833	movdqu	($checksum_p),$checksum		# load checksum
2834
2835	test	\$1,$block_num			# is first block number odd?
2836	jnz	.Locb_enc_odd
2837
	# Even first block number: process that one block on its own, using
	# the table entry selected by the bit-scan of the block number.
2838	bsf	$block_num,$i1
2839	add	\$1,$block_num
2840	shl	\$4,$i1
2841	movdqu	($L_p,$i1),$inout5		# borrow
2842	movdqu	($inp),$inout0
2843	lea	16($inp),$inp
2844
2845	call	__ocb_encrypt1
2846
2847	movdqa	$inout5,@offset[5]
2848	movups	$inout0,($out)
2849	lea	16($out),$out
2850	sub	\$1,$blocks
2851	jz	.Locb_enc_done
2852
	# Precompute the table offsets for the three even-numbered blocks of
	# the next group of six and advance the block number by six.
2853.Locb_enc_odd:
2854	lea	1($block_num),$i1		# even-numbered blocks
2855	lea	3($block_num),$i3
2856	lea	5($block_num),$i5
2857	lea	6($block_num),$block_num
2858	bsf	$i1,$i1				# ntz(block)
2859	bsf	$i3,$i3
2860	bsf	$i5,$i5
2861	shl	\$4,$i1				# ntz(block) -> table offset
2862	shl	\$4,$i3
2863	shl	\$4,$i5
2864
2865	sub	\$6,$blocks
2866	jc	.Locb_enc_short
2867	jmp	.Locb_enc_grandloop
2868
	# Bulk loop: six blocks per iteration are loaded, handed to the
	# six-block helper and stored.  The loop ends when subtracting six
	# from the block counter borrows.
2869.align	32
2870.Locb_enc_grandloop:
2871	movdqu	`16*0`($inp),$inout0		# load input
2872	movdqu	`16*1`($inp),$inout1
2873	movdqu	`16*2`($inp),$inout2
2874	movdqu	`16*3`($inp),$inout3
2875	movdqu	`16*4`($inp),$inout4
2876	movdqu	`16*5`($inp),$inout5
2877	lea	`16*6`($inp),$inp
2878
2879	call	__ocb_encrypt6
2880
2881	movups	$inout0,`16*0`($out)		# store output
2882	movups	$inout1,`16*1`($out)
2883	movups	$inout2,`16*2`($out)
2884	movups	$inout3,`16*3`($out)
2885	movups	$inout4,`16*4`($out)
2886	movups	$inout5,`16*5`($out)
2887	lea	`16*6`($out),$out
2888	sub	\$6,$blocks
2889	jnc	.Locb_enc_grandloop
2890
	# One to five blocks remain (zero goes straight to the exit).  Input
	# loads are interleaved with the dispatch on the remaining count;
	# unused lanes are zeroed before a wider helper is called, and the
	# offset of the last real block is copied into the last-offset
	# register afterwards.
2891.Locb_enc_short:
2892	add	\$6,$blocks
2893	jz	.Locb_enc_done
2894
2895	movdqu	`16*0`($inp),$inout0
2896	cmp	\$2,$blocks
2897	jb	.Locb_enc_one
2898	movdqu	`16*1`($inp),$inout1
2899	je	.Locb_enc_two
2900
2901	movdqu	`16*2`($inp),$inout2
2902	cmp	\$4,$blocks
2903	jb	.Locb_enc_three
2904	movdqu	`16*3`($inp),$inout3
2905	je	.Locb_enc_four
2906
	# five blocks: six-block helper with the sixth lane zeroed
2907	movdqu	`16*4`($inp),$inout4
2908	pxor	$inout5,$inout5
2909
2910	call	__ocb_encrypt6
2911
2912	movdqa	@offset[4],@offset[5]
2913	movups	$inout0,`16*0`($out)
2914	movups	$inout1,`16*1`($out)
2915	movups	$inout2,`16*2`($out)
2916	movups	$inout3,`16*3`($out)
2917	movups	$inout4,`16*4`($out)
2918
2919	jmp	.Locb_enc_done
2920
2921.align	16
2922.Locb_enc_one:
2923	movdqa	@offset[0],$inout5		# borrow
2924
2925	call	__ocb_encrypt1
2926
2927	movdqa	$inout5,@offset[5]
2928	movups	$inout0,`16*0`($out)
2929	jmp	.Locb_enc_done
2930
	# two blocks: four-block helper with lanes 2 and 3 zeroed
2931.align	16
2932.Locb_enc_two:
2933	pxor	$inout2,$inout2
2934	pxor	$inout3,$inout3
2935
2936	call	__ocb_encrypt4
2937
2938	movdqa	@offset[1],@offset[5]
2939	movups	$inout0,`16*0`($out)
2940	movups	$inout1,`16*1`($out)
2941
2942	jmp	.Locb_enc_done
2943
	# three blocks: four-block helper with lane 3 zeroed
2944.align	16
2945.Locb_enc_three:
2946	pxor	$inout3,$inout3
2947
2948	call	__ocb_encrypt4
2949
2950	movdqa	@offset[2],@offset[5]
2951	movups	$inout0,`16*0`($out)
2952	movups	$inout1,`16*1`($out)
2953	movups	$inout2,`16*2`($out)
2954
2955	jmp	.Locb_enc_done
2956
	# four blocks: falls through to the exit after the stores
2957.align	16
2958.Locb_enc_four:
2959	call	__ocb_encrypt4
2960
2961	movdqa	@offset[3],@offset[5]
2962	movups	$inout0,`16*0`($out)
2963	movups	$inout1,`16*1`($out)
2964	movups	$inout2,`16*2`($out)
2965	movups	$inout3,`16*3`($out)
2966
	# Exit path: write the running checksum and the last offset back to
	# the caller's buffers, then wipe the vector registers.
2967.Locb_enc_done:
2968	pxor	$rndkey0,@offset[5]		# "remove" round[last]
2969	movdqu	$checksum,($checksum_p)		# store checksum
2970	movdqu	@offset[5],($offset_p)		# store last offset_i
2971
2972	xorps	%xmm0,%xmm0			# clear register bank
2973	pxor	%xmm1,%xmm1
2974	pxor	%xmm2,%xmm2
2975	pxor	%xmm3,%xmm3
2976	pxor	%xmm4,%xmm4
2977	pxor	%xmm5,%xmm5
2978___
# Non-Win64: zero the remaining vector registers; %rax is then pointed 0x28
# bytes above %rsp, i.e. just past the five registers pushed at entry.
2979$code.=<<___ if (!$win64);
2980	pxor	%xmm6,%xmm6
2981	pxor	%xmm7,%xmm7
2982	pxor	%xmm8,%xmm8
2983	pxor	%xmm9,%xmm9
2984	pxor	%xmm10,%xmm10
2985	pxor	%xmm11,%xmm11
2986	pxor	%xmm12,%xmm12
2987	pxor	%xmm13,%xmm13
2988	pxor	%xmm14,%xmm14
2989	pxor	%xmm15,%xmm15
2990	lea	0x28(%rsp),%rax
2991.cfi_def_cfa	%rax,8
2992___
# Win64: reload %xmm6-%xmm15 from the 0xa0-byte save area, zeroing each slot
# after it has been read, then point %rax past the save area and the pushes.
2993$code.=<<___ if ($win64);
2994	movaps	0x00(%rsp),%xmm6
2995	movaps	%xmm0,0x00(%rsp)		# clear stack
2996	movaps	0x10(%rsp),%xmm7
2997	movaps	%xmm0,0x10(%rsp)
2998	movaps	0x20(%rsp),%xmm8
2999	movaps	%xmm0,0x20(%rsp)
3000	movaps	0x30(%rsp),%xmm9
3001	movaps	%xmm0,0x30(%rsp)
3002	movaps	0x40(%rsp),%xmm10
3003	movaps	%xmm0,0x40(%rsp)
3004	movaps	0x50(%rsp),%xmm11
3005	movaps	%xmm0,0x50(%rsp)
3006	movaps	0x60(%rsp),%xmm12
3007	movaps	%xmm0,0x60(%rsp)
3008	movaps	0x70(%rsp),%xmm13
3009	movaps	%xmm0,0x70(%rsp)
3010	movaps	0x80(%rsp),%xmm14
3011	movaps	%xmm0,0x80(%rsp)
3012	movaps	0x90(%rsp),%xmm15
3013	movaps	%xmm0,0x90(%rsp)
3014	lea	0xa0+0x28(%rsp),%rax
3015.Locb_enc_pop:
3016___
# Restore the pushed registers in reverse push order relative to %rax and
# reset the stack pointer from it.
3017$code.=<<___;
3018	mov	-40(%rax),%r14
3019.cfi_restore	%r14
3020	mov	-32(%rax),%r13
3021.cfi_restore	%r13
3022	mov	-24(%rax),%r12
3023.cfi_restore	%r12
3024	mov	-16(%rax),%rbp
3025.cfi_restore	%rbp
3026	mov	-8(%rax),%rbx
3027.cfi_restore	%rbx
3028	lea	(%rax),%rsp
3029.cfi_def_cfa_register	%rsp
3030.Locb_enc_epilogue:
3031	ret
3032.cfi_endproc
3033.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
3034
	# __ocb_encrypt6: internal helper for six blocks held in the six
	# in/out registers.  It folds every input block into the checksum,
	# derives six chained offsets from the table entries selected by the
	# three precomputed index registers, runs the cipher rounds, and ends
	# with a last round keyed by offset xor round[last].  It also
	# recomputes the three index registers and advances the block number
	# by six, ready for a following call.
3035.type	__ocb_encrypt6,\@abi-omnipotent
3036.align	32
3037__ocb_encrypt6:
3038.cfi_startproc
3039	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3040	 movdqu		($L_p,$i1),@offset[1]
3041	 movdqa		@offset[0],@offset[2]
3042	 movdqu		($L_p,$i3),@offset[3]
3043	 movdqa		@offset[0],@offset[4]
3044	 pxor		@offset[5],@offset[0]
3045	 movdqu		($L_p,$i5),@offset[5]
3046	 pxor		@offset[0],@offset[1]
3047	pxor		$inout0,$checksum	# accumulate checksum
3048	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3049	 pxor		@offset[1],@offset[2]
3050	pxor		$inout1,$checksum
3051	pxor		@offset[1],$inout1
3052	 pxor		@offset[2],@offset[3]
3053	pxor		$inout2,$checksum
3054	pxor		@offset[2],$inout2
3055	 pxor		@offset[3],@offset[4]
3056	pxor		$inout3,$checksum
3057	pxor		@offset[3],$inout3
3058	 pxor		@offset[4],@offset[5]
3059	pxor		$inout4,$checksum
3060	pxor		@offset[4],$inout4
3061	pxor		$inout5,$checksum
3062	pxor		@offset[5],$inout5
3063	$movkey		32($key_),$rndkey0
3064
3065	lea		1($block_num),$i1	# even-numbered blocks
3066	lea		3($block_num),$i3
3067	lea		5($block_num),$i5
3068	add		\$6,$block_num
3069	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3070	bsf		$i1,$i1			# ntz(block)
3071	bsf		$i3,$i3
3072	bsf		$i5,$i5
3073
3074	aesenc		$rndkey1,$inout0
3075	aesenc		$rndkey1,$inout1
3076	aesenc		$rndkey1,$inout2
3077	aesenc		$rndkey1,$inout3
3078	 pxor		$rndkey0l,@offset[1]
3079	 pxor		$rndkey0l,@offset[2]
3080	aesenc		$rndkey1,$inout4
3081	 pxor		$rndkey0l,@offset[3]
3082	 pxor		$rndkey0l,@offset[4]
3083	aesenc		$rndkey1,$inout5
3084	$movkey		48($key_),$rndkey1
3085	 pxor		$rndkey0l,@offset[5]
3086
3087	aesenc		$rndkey0,$inout0
3088	aesenc		$rndkey0,$inout1
3089	aesenc		$rndkey0,$inout2
3090	aesenc		$rndkey0,$inout3
3091	aesenc		$rndkey0,$inout4
3092	aesenc		$rndkey0,$inout5
3093	$movkey		64($key_),$rndkey0
3094	shl		\$4,$i1			# ntz(block) -> table offset
3095	shl		\$4,$i3
3096	jmp		.Locb_enc_loop6
3097
	# two rounds per pass; the counter in rax is stepped by 32 and the
	# jnz presumably tests the flags left by that add
3098.align	32
3099.Locb_enc_loop6:
3100	aesenc		$rndkey1,$inout0
3101	aesenc		$rndkey1,$inout1
3102	aesenc		$rndkey1,$inout2
3103	aesenc		$rndkey1,$inout3
3104	aesenc		$rndkey1,$inout4
3105	aesenc		$rndkey1,$inout5
3106	$movkey		($key,%rax),$rndkey1
3107	add		\$32,%rax
3108
3109	aesenc		$rndkey0,$inout0
3110	aesenc		$rndkey0,$inout1
3111	aesenc		$rndkey0,$inout2
3112	aesenc		$rndkey0,$inout3
3113	aesenc		$rndkey0,$inout4
3114	aesenc		$rndkey0,$inout5
3115	$movkey		-16($key,%rax),$rndkey0
3116	jnz		.Locb_enc_loop6
3117
3118	aesenc		$rndkey1,$inout0
3119	aesenc		$rndkey1,$inout1
3120	aesenc		$rndkey1,$inout2
3121	aesenc		$rndkey1,$inout3
3122	aesenc		$rndkey1,$inout4
3123	aesenc		$rndkey1,$inout5
3124	$movkey		16($key_),$rndkey1
3125	shl		\$4,$i5
3126
	# last round; the first offset register is reloaded with the first
	# table entry as soon as its old value has been consumed
3127	aesenclast	@offset[0],$inout0
3128	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3129	mov		%r10,%rax		# restore twisted rounds
3130	aesenclast	@offset[1],$inout1
3131	aesenclast	@offset[2],$inout2
3132	aesenclast	@offset[3],$inout3
3133	aesenclast	@offset[4],$inout4
3134	aesenclast	@offset[5],$inout5
3135	ret
3136.cfi_endproc
3137.size	__ocb_encrypt6,.-__ocb_encrypt6
3138
	# __ocb_encrypt4: internal helper for four blocks.  Same scheme as the
	# six-block helper but with four chained offsets; unlike that helper
	# it neither touches the block number nor recomputes the table
	# indices, and it does not reload the first offset register.
3139.type	__ocb_encrypt4,\@abi-omnipotent
3140.align	32
3141__ocb_encrypt4:
3142.cfi_startproc
3143	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3144	 movdqu		($L_p,$i1),@offset[1]
3145	 movdqa		@offset[0],@offset[2]
3146	 movdqu		($L_p,$i3),@offset[3]
3147	 pxor		@offset[5],@offset[0]
3148	 pxor		@offset[0],@offset[1]
3149	pxor		$inout0,$checksum	# accumulate checksum
3150	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3151	 pxor		@offset[1],@offset[2]
3152	pxor		$inout1,$checksum
3153	pxor		@offset[1],$inout1
3154	 pxor		@offset[2],@offset[3]
3155	pxor		$inout2,$checksum
3156	pxor		@offset[2],$inout2
3157	pxor		$inout3,$checksum
3158	pxor		@offset[3],$inout3
3159	$movkey		32($key_),$rndkey0
3160
3161	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3162	 pxor		$rndkey0l,@offset[1]
3163	 pxor		$rndkey0l,@offset[2]
3164	 pxor		$rndkey0l,@offset[3]
3165
3166	aesenc		$rndkey1,$inout0
3167	aesenc		$rndkey1,$inout1
3168	aesenc		$rndkey1,$inout2
3169	aesenc		$rndkey1,$inout3
3170	$movkey		48($key_),$rndkey1
3171
3172	aesenc		$rndkey0,$inout0
3173	aesenc		$rndkey0,$inout1
3174	aesenc		$rndkey0,$inout2
3175	aesenc		$rndkey0,$inout3
3176	$movkey		64($key_),$rndkey0
3177	jmp		.Locb_enc_loop4
3178
	# two rounds per pass, counter in rax stepped by 32
3179.align	32
3180.Locb_enc_loop4:
3181	aesenc		$rndkey1,$inout0
3182	aesenc		$rndkey1,$inout1
3183	aesenc		$rndkey1,$inout2
3184	aesenc		$rndkey1,$inout3
3185	$movkey		($key,%rax),$rndkey1
3186	add		\$32,%rax
3187
3188	aesenc		$rndkey0,$inout0
3189	aesenc		$rndkey0,$inout1
3190	aesenc		$rndkey0,$inout2
3191	aesenc		$rndkey0,$inout3
3192	$movkey		-16($key,%rax),$rndkey0
3193	jnz		.Locb_enc_loop4
3194
3195	aesenc		$rndkey1,$inout0
3196	aesenc		$rndkey1,$inout1
3197	aesenc		$rndkey1,$inout2
3198	aesenc		$rndkey1,$inout3
3199	$movkey		16($key_),$rndkey1
3200	mov		%r10,%rax		# restore twisted rounds
3201
3202	aesenclast	@offset[0],$inout0
3203	aesenclast	@offset[1],$inout1
3204	aesenclast	@offset[2],$inout2
3205	aesenclast	@offset[3],$inout3
3206	ret
3207.cfi_endproc
3208.size	__ocb_encrypt4,.-__ocb_encrypt4
3209
# __ocb_encrypt1 - OCB encrypt core for a single block in inout0.
# inout5 is borrowed: it is XORed with offset[5] and rndkey0l on the
# way in, and on return it holds the value used as the aesenclast
# operand (offset_i ^ round[last] per the comments below). The input
# block is folded into the checksum register before it is whitened.
# NOTE(review): the caller is outside this excerpt; presumably it
# preloads inout5 with an L table entry and copies inout5 back into
# offset[5] afterwards, as aesni_ocb_decrypt does for __ocb_decrypt1.
3210.type	__ocb_encrypt1,\@abi-omnipotent
3211.align	32
3212__ocb_encrypt1:
3213.cfi_startproc
3214	 pxor		@offset[5],$inout5	# offset_i
3215	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3216	pxor		$inout0,$checksum	# accumulate checksum
3217	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3218	$movkey		32($key_),$rndkey0
3219
3220	aesenc		$rndkey1,$inout0
3221	$movkey		48($key_),$rndkey1
3222	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3223
3224	aesenc		$rndkey0,$inout0
3225	$movkey		64($key_),$rndkey0
3226	jmp		.Locb_enc_loop1
3227
# Two rounds per iteration; rax is stepped by 32 and the jnz below is
# meant to test the result of that add.
3228.align	32
3229.Locb_enc_loop1:
3230	aesenc		$rndkey1,$inout0
3231	$movkey		($key,%rax),$rndkey1
3232	add		\$32,%rax
3233
3234	aesenc		$rndkey0,$inout0
3235	$movkey		-16($key,%rax),$rndkey0
3236	jnz		.Locb_enc_loop1
3237
3238	aesenc		$rndkey1,$inout0
3239	$movkey		16($key_),$rndkey1	# redundant in tail
3240	mov		%r10,%rax		# restore twisted rounds
3241
3242	aesenclast	$inout5,$inout0
3243	ret
3244.cfi_endproc
3245.size	__ocb_encrypt1,.-__ocb_encrypt1
3246
# aesni_ocb_decrypt - bulk OCB decryption of whole 16-byte blocks.
# Six arguments arrive in registers; L_p and checksum_p are read as the
# 7th and 8th arguments through rax, which holds the stack pointer
# value seen on entry. The offset at offset_p and the checksum at
# checksum_p are loaded up front and written back at .Locb_dec_done.
# If the starting block number is even, one block is decrypted alone so
# that the six-block loop starts on an odd block number. Leftovers of
# one to five blocks go through the one-, four- or six-block helper,
# with unused lanes zeroed. The checksum is accumulated from the
# decrypted output after each helper call.
3247.globl	aesni_ocb_decrypt
3248.type	aesni_ocb_decrypt,\@function,6
3249.align	32
3250aesni_ocb_decrypt:
3251.cfi_startproc
3252	lea	(%rsp),%rax
3253	push	%rbx
3254.cfi_push	%rbx
3255	push	%rbp
3256.cfi_push	%rbp
3257	push	%r12
3258.cfi_push	%r12
3259	push	%r13
3260.cfi_push	%r13
3261	push	%r14
3262.cfi_push	%r14
3263___
# Win64 only: reserve 0xa0 bytes of stack and spill %xmm6-%xmm15 there.
3264$code.=<<___ if ($win64);
3265	lea	-0xa0(%rsp),%rsp
3266	movaps	%xmm6,0x00(%rsp)		# offload everything
3267	movaps	%xmm7,0x10(%rsp)
3268	movaps	%xmm8,0x20(%rsp)
3269	movaps	%xmm9,0x30(%rsp)
3270	movaps	%xmm10,0x40(%rsp)
3271	movaps	%xmm11,0x50(%rsp)
3272	movaps	%xmm12,0x60(%rsp)
3273	movaps	%xmm13,0x70(%rsp)
3274	movaps	%xmm14,0x80(%rsp)
3275	movaps	%xmm15,0x90(%rsp)
3276.Locb_dec_body:
3277___
# Common body: argument pickup, key/offset setup, main loop and tails.
3278$code.=<<___;
3279	mov	$seventh_arg(%rax),$L_p		# 7th argument
3280	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
3281
3282	mov	240($key),$rnds_
3283	mov	$key,$key_
3284	shl	\$4,$rnds_
3285	$movkey	($key),$rndkey0l		# round[0]
3286	$movkey	16($key,$rnds_),$rndkey1	# round[last]
3287
3288	movdqu	($offset_p),@offset[5]		# load last offset_i
3289	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
3290	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
3291
3292	mov	\$16+32,$rounds
3293	lea	32($key_,$rnds_),$key
3294	$movkey	16($key_),$rndkey1		# round[1]
3295	sub	%r10,%rax			# twisted $rounds
3296	mov	%rax,%r10			# backup twisted $rounds
3297
3298	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
3299	movdqu	($checksum_p),$checksum		# load checksum
3300
3301	test	\$1,$block_num			# is first block number odd?
3302	jnz	.Locb_dec_odd
3303
3304	bsf	$block_num,$i1
3305	add	\$1,$block_num
3306	shl	\$4,$i1
3307	movdqu	($L_p,$i1),$inout5		# borrow
3308	movdqu	($inp),$inout0
3309	lea	16($inp),$inp
3310
3311	call	__ocb_decrypt1
3312
3313	movdqa	$inout5,@offset[5]
3314	movups	$inout0,($out)
3315	xorps	$inout0,$checksum		# accumulate checksum
3316	lea	16($out),$out
3317	sub	\$1,$blocks
3318	jz	.Locb_dec_done
3319
3320.Locb_dec_odd:
3321	lea	1($block_num),$i1		# even-numbered blocks
3322	lea	3($block_num),$i3
3323	lea	5($block_num),$i5
3324	lea	6($block_num),$block_num
3325	bsf	$i1,$i1				# ntz(block)
3326	bsf	$i3,$i3
3327	bsf	$i5,$i5
3328	shl	\$4,$i1				# ntz(block) -> table offset
3329	shl	\$4,$i3
3330	shl	\$4,$i5
3331
3332	sub	\$6,$blocks
3333	jc	.Locb_dec_short
3334	jmp	.Locb_dec_grandloop
3335
3336.align	32
3337.Locb_dec_grandloop:
3338	movdqu	`16*0`($inp),$inout0		# load input
3339	movdqu	`16*1`($inp),$inout1
3340	movdqu	`16*2`($inp),$inout2
3341	movdqu	`16*3`($inp),$inout3
3342	movdqu	`16*4`($inp),$inout4
3343	movdqu	`16*5`($inp),$inout5
3344	lea	`16*6`($inp),$inp
3345
3346	call	__ocb_decrypt6
3347
3348	movups	$inout0,`16*0`($out)		# store output
3349	pxor	$inout0,$checksum		# accumulate checksum
3350	movups	$inout1,`16*1`($out)
3351	pxor	$inout1,$checksum
3352	movups	$inout2,`16*2`($out)
3353	pxor	$inout2,$checksum
3354	movups	$inout3,`16*3`($out)
3355	pxor	$inout3,$checksum
3356	movups	$inout4,`16*4`($out)
3357	pxor	$inout4,$checksum
3358	movups	$inout5,`16*5`($out)
3359	pxor	$inout5,$checksum
3360	lea	`16*6`($out),$out
3361	sub	\$6,$blocks
3362	jnc	.Locb_dec_grandloop
3363
3364.Locb_dec_short:
3365	add	\$6,$blocks
3366	jz	.Locb_dec_done
3367
3368	movdqu	`16*0`($inp),$inout0
3369	cmp	\$2,$blocks
3370	jb	.Locb_dec_one
3371	movdqu	`16*1`($inp),$inout1
3372	je	.Locb_dec_two
3373
3374	movdqu	`16*2`($inp),$inout2
3375	cmp	\$4,$blocks
3376	jb	.Locb_dec_three
3377	movdqu	`16*3`($inp),$inout3
3378	je	.Locb_dec_four
3379
3380	movdqu	`16*4`($inp),$inout4
3381	pxor	$inout5,$inout5			# five blocks left: zero the unused sixth lane
3382
3383	call	__ocb_decrypt6
3384
3385	movdqa	@offset[4],@offset[5]		# offset of the fifth block becomes the last offset
3386	movups	$inout0,`16*0`($out)		# store output
3387	pxor	$inout0,$checksum		# accumulate checksum
3388	movups	$inout1,`16*1`($out)
3389	pxor	$inout1,$checksum
3390	movups	$inout2,`16*2`($out)
3391	pxor	$inout2,$checksum
3392	movups	$inout3,`16*3`($out)
3393	pxor	$inout3,$checksum
3394	movups	$inout4,`16*4`($out)
3395	pxor	$inout4,$checksum
3396
3397	jmp	.Locb_dec_done
3398
3399.align	16
3400.Locb_dec_one:
3401	movdqa	@offset[0],$inout5		# borrow
3402
3403	call	__ocb_decrypt1
3404
3405	movdqa	$inout5,@offset[5]
3406	movups	$inout0,`16*0`($out)		# store output
3407	xorps	$inout0,$checksum		# accumulate checksum
3408	jmp	.Locb_dec_done
3409
3410.align	16
3411.Locb_dec_two:
3412	pxor	$inout2,$inout2
3413	pxor	$inout3,$inout3
3414
3415	call	__ocb_decrypt4
3416
3417	movdqa	@offset[1],@offset[5]
3418	movups	$inout0,`16*0`($out)		# store output
3419	xorps	$inout0,$checksum		# accumulate checksum
3420	movups	$inout1,`16*1`($out)
3421	xorps	$inout1,$checksum
3422
3423	jmp	.Locb_dec_done
3424
3425.align	16
3426.Locb_dec_three:
3427	pxor	$inout3,$inout3
3428
3429	call	__ocb_decrypt4
3430
3431	movdqa	@offset[2],@offset[5]
3432	movups	$inout0,`16*0`($out)		# store output
3433	xorps	$inout0,$checksum		# accumulate checksum
3434	movups	$inout1,`16*1`($out)
3435	xorps	$inout1,$checksum
3436	movups	$inout2,`16*2`($out)
3437	xorps	$inout2,$checksum
3438
3439	jmp	.Locb_dec_done
3440
3441.align	16
3442.Locb_dec_four:
3443	call	__ocb_decrypt4
3444
3445	movdqa	@offset[3],@offset[5]
3446	movups	$inout0,`16*0`($out)		# store output
3447	pxor	$inout0,$checksum		# accumulate checksum
3448	movups	$inout1,`16*1`($out)
3449	pxor	$inout1,$checksum
3450	movups	$inout2,`16*2`($out)
3451	pxor	$inout2,$checksum
3452	movups	$inout3,`16*3`($out)
3453	pxor	$inout3,$checksum
3454
3455.Locb_dec_done:
3456	pxor	$rndkey0,@offset[5]		# "remove" round[last]
3457	movdqu	$checksum,($checksum_p)		# store checksum
3458	movdqu	@offset[5],($offset_p)		# store last offset_i
3459
3460	xorps	%xmm0,%xmm0			# clear register bank
3461	pxor	%xmm1,%xmm1
3462	pxor	%xmm2,%xmm2
3463	pxor	%xmm3,%xmm3
3464	pxor	%xmm4,%xmm4
3465	pxor	%xmm5,%xmm5
3466___
# Non-Win64: wipe %xmm6-%xmm15 as well, then point %rax just above the
# five registers pushed in the prologue (5*8 = 0x28 bytes).
3467$code.=<<___ if (!$win64);
3468	pxor	%xmm6,%xmm6
3469	pxor	%xmm7,%xmm7
3470	pxor	%xmm8,%xmm8
3471	pxor	%xmm9,%xmm9
3472	pxor	%xmm10,%xmm10
3473	pxor	%xmm11,%xmm11
3474	pxor	%xmm12,%xmm12
3475	pxor	%xmm13,%xmm13
3476	pxor	%xmm14,%xmm14
3477	pxor	%xmm15,%xmm15
3478	lea	0x28(%rsp),%rax
3479.cfi_def_cfa	%rax,8
3480___
# Win64: reload %xmm6-%xmm15 from the spill area, overwriting each slot
# with the already-zeroed %xmm0, then skip the 0xa0-byte area as well.
3481$code.=<<___ if ($win64);
3482	movaps	0x00(%rsp),%xmm6
3483	movaps	%xmm0,0x00(%rsp)		# clear stack
3484	movaps	0x10(%rsp),%xmm7
3485	movaps	%xmm0,0x10(%rsp)
3486	movaps	0x20(%rsp),%xmm8
3487	movaps	%xmm0,0x20(%rsp)
3488	movaps	0x30(%rsp),%xmm9
3489	movaps	%xmm0,0x30(%rsp)
3490	movaps	0x40(%rsp),%xmm10
3491	movaps	%xmm0,0x40(%rsp)
3492	movaps	0x50(%rsp),%xmm11
3493	movaps	%xmm0,0x50(%rsp)
3494	movaps	0x60(%rsp),%xmm12
3495	movaps	%xmm0,0x60(%rsp)
3496	movaps	0x70(%rsp),%xmm13
3497	movaps	%xmm0,0x70(%rsp)
3498	movaps	0x80(%rsp),%xmm14
3499	movaps	%xmm0,0x80(%rsp)
3500	movaps	0x90(%rsp),%xmm15
3501	movaps	%xmm0,0x90(%rsp)
3502	lea	0xa0+0x28(%rsp),%rax
3503.Locb_dec_pop:
3504___
# Reload the five general registers pushed in the prologue, addressed
# relative to %rax, and restore the stack pointer from it.
3505$code.=<<___;
3506	mov	-40(%rax),%r14
3507.cfi_restore	%r14
3508	mov	-32(%rax),%r13
3509.cfi_restore	%r13
3510	mov	-24(%rax),%r12
3511.cfi_restore	%r12
3512	mov	-16(%rax),%rbp
3513.cfi_restore	%rbp
3514	mov	-8(%rax),%rbx
3515.cfi_restore	%rbx
3516	lea	(%rax),%rsp
3517.cfi_def_cfa_register	%rsp
3518.Locb_dec_epilogue:
3519	ret
3520.cfi_endproc
3521.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
3522
# __ocb_decrypt6 - OCB decrypt core for six blocks (inout0..inout5).
# Chains six offsets out of offset[5], offset[0] and the L table
# entries at byte offsets i1, i3 and i5, whitens the inputs, runs the
# AES rounds and feeds each offset back in through aesdeclast.
# Interleaved with the rounds it also advances block_num by six,
# recomputes i1/i3/i5 for the next call, reloads L_0 into offset[0] and
# the key at offset 16 into rndkey1, and restores rax from r10.
# The checksum is not touched here; the caller accumulates it from the
# decrypted output.
3523.type	__ocb_decrypt6,\@abi-omnipotent
3524.align	32
3525__ocb_decrypt6:
3526.cfi_startproc
3527	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3528	 movdqu		($L_p,$i1),@offset[1]
3529	 movdqa		@offset[0],@offset[2]
3530	 movdqu		($L_p,$i3),@offset[3]
3531	 movdqa		@offset[0],@offset[4]
3532	 pxor		@offset[5],@offset[0]
3533	 movdqu		($L_p,$i5),@offset[5]
3534	 pxor		@offset[0],@offset[1]
3535	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3536	 pxor		@offset[1],@offset[2]
3537	pxor		@offset[1],$inout1
3538	 pxor		@offset[2],@offset[3]
3539	pxor		@offset[2],$inout2
3540	 pxor		@offset[3],@offset[4]
3541	pxor		@offset[3],$inout3
3542	 pxor		@offset[4],@offset[5]
3543	pxor		@offset[4],$inout4
3544	pxor		@offset[5],$inout5
3545	$movkey		32($key_),$rndkey0
3546
3547	lea		1($block_num),$i1	# even-numbered blocks
3548	lea		3($block_num),$i3
3549	lea		5($block_num),$i5
3550	add		\$6,$block_num
3551	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3552	bsf		$i1,$i1			# ntz(block)
3553	bsf		$i3,$i3
3554	bsf		$i5,$i5
3555
3556	aesdec		$rndkey1,$inout0
3557	aesdec		$rndkey1,$inout1
3558	aesdec		$rndkey1,$inout2
3559	aesdec		$rndkey1,$inout3
3560	 pxor		$rndkey0l,@offset[1]
3561	 pxor		$rndkey0l,@offset[2]
3562	aesdec		$rndkey1,$inout4
3563	 pxor		$rndkey0l,@offset[3]
3564	 pxor		$rndkey0l,@offset[4]
3565	aesdec		$rndkey1,$inout5
3566	$movkey		48($key_),$rndkey1
3567	 pxor		$rndkey0l,@offset[5]
3568
3569	aesdec		$rndkey0,$inout0
3570	aesdec		$rndkey0,$inout1
3571	aesdec		$rndkey0,$inout2
3572	aesdec		$rndkey0,$inout3
3573	aesdec		$rndkey0,$inout4
3574	aesdec		$rndkey0,$inout5
3575	$movkey		64($key_),$rndkey0
3576	shl		\$4,$i1			# ntz(block) -> table offset
3577	shl		\$4,$i3
3578	jmp		.Locb_dec_loop6
3579
# Two rounds per iteration; rax is stepped by 32 and the jnz below is
# meant to test the result of that add. The shift of i5 is deferred
# until after the loop so that it cannot disturb those flags.
3580.align	32
3581.Locb_dec_loop6:
3582	aesdec		$rndkey1,$inout0
3583	aesdec		$rndkey1,$inout1
3584	aesdec		$rndkey1,$inout2
3585	aesdec		$rndkey1,$inout3
3586	aesdec		$rndkey1,$inout4
3587	aesdec		$rndkey1,$inout5
3588	$movkey		($key,%rax),$rndkey1
3589	add		\$32,%rax
3590
3591	aesdec		$rndkey0,$inout0
3592	aesdec		$rndkey0,$inout1
3593	aesdec		$rndkey0,$inout2
3594	aesdec		$rndkey0,$inout3
3595	aesdec		$rndkey0,$inout4
3596	aesdec		$rndkey0,$inout5
3597	$movkey		-16($key,%rax),$rndkey0
3598	jnz		.Locb_dec_loop6
3599
3600	aesdec		$rndkey1,$inout0
3601	aesdec		$rndkey1,$inout1
3602	aesdec		$rndkey1,$inout2
3603	aesdec		$rndkey1,$inout3
3604	aesdec		$rndkey1,$inout4
3605	aesdec		$rndkey1,$inout5
3606	$movkey		16($key_),$rndkey1
3607	shl		\$4,$i5
3608
3609	aesdeclast	@offset[0],$inout0
3610	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3611	mov		%r10,%rax		# restore twisted rounds
3612	aesdeclast	@offset[1],$inout1
3613	aesdeclast	@offset[2],$inout2
3614	aesdeclast	@offset[3],$inout3
3615	aesdeclast	@offset[4],$inout4
3616	aesdeclast	@offset[5],$inout5
3617	ret
3618.cfi_endproc
3619.size	__ocb_decrypt6,.-__ocb_decrypt6
3620
# __ocb_decrypt4 - OCB decrypt core for four blocks (inout0..inout3).
# Chains four offsets out of offset[5], offset[0] and the L table
# entries at byte offsets i1 and i3, whitens the inputs, runs the AES
# rounds and feeds each offset back in through aesdeclast.
# offset[5] is modified and not restored here; the callers in
# aesni_ocb_decrypt overwrite it from offset[1], offset[2] or offset[3]
# afterwards. The checksum is left to the caller.
3621.type	__ocb_decrypt4,\@abi-omnipotent
3622.align	32
3623__ocb_decrypt4:
3624.cfi_startproc
3625	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3626	 movdqu		($L_p,$i1),@offset[1]
3627	 movdqa		@offset[0],@offset[2]
3628	 movdqu		($L_p,$i3),@offset[3]
3629	 pxor		@offset[5],@offset[0]
3630	 pxor		@offset[0],@offset[1]
3631	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3632	 pxor		@offset[1],@offset[2]
3633	pxor		@offset[1],$inout1
3634	 pxor		@offset[2],@offset[3]
3635	pxor		@offset[2],$inout2
3636	pxor		@offset[3],$inout3
3637	$movkey		32($key_),$rndkey0
3638
3639	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3640	 pxor		$rndkey0l,@offset[1]
3641	 pxor		$rndkey0l,@offset[2]
3642	 pxor		$rndkey0l,@offset[3]
3643
3644	aesdec		$rndkey1,$inout0
3645	aesdec		$rndkey1,$inout1
3646	aesdec		$rndkey1,$inout2
3647	aesdec		$rndkey1,$inout3
3648	$movkey		48($key_),$rndkey1
3649
3650	aesdec		$rndkey0,$inout0
3651	aesdec		$rndkey0,$inout1
3652	aesdec		$rndkey0,$inout2
3653	aesdec		$rndkey0,$inout3
3654	$movkey		64($key_),$rndkey0
3655	jmp		.Locb_dec_loop4
3656
# Two rounds per iteration; rax is stepped by 32 and the jnz below is
# meant to test the result of that add.
3657.align	32
3658.Locb_dec_loop4:
3659	aesdec		$rndkey1,$inout0
3660	aesdec		$rndkey1,$inout1
3661	aesdec		$rndkey1,$inout2
3662	aesdec		$rndkey1,$inout3
3663	$movkey		($key,%rax),$rndkey1
3664	add		\$32,%rax
3665
3666	aesdec		$rndkey0,$inout0
3667	aesdec		$rndkey0,$inout1
3668	aesdec		$rndkey0,$inout2
3669	aesdec		$rndkey0,$inout3
3670	$movkey		-16($key,%rax),$rndkey0
3671	jnz		.Locb_dec_loop4
3672
3673	aesdec		$rndkey1,$inout0
3674	aesdec		$rndkey1,$inout1
3675	aesdec		$rndkey1,$inout2
3676	aesdec		$rndkey1,$inout3
3677	$movkey		16($key_),$rndkey1	# reload the key at offset 16 for the next call
3678	mov		%r10,%rax		# restore twisted rounds
3679
3680	aesdeclast	@offset[0],$inout0
3681	aesdeclast	@offset[1],$inout1
3682	aesdeclast	@offset[2],$inout2
3683	aesdeclast	@offset[3],$inout3
3684	ret
3685.cfi_endproc
3686.size	__ocb_decrypt4,.-__ocb_decrypt4
3687
# __ocb_decrypt1 - OCB decrypt core for a single block in inout0.
# inout5 is borrowed: aesni_ocb_decrypt preloads it with an L table
# entry, it is XORed with offset[5] and rndkey0l here, and on return it
# holds the value used as the aesdeclast operand (offset_i ^
# round[last] per the comments below), which the caller copies back
# into offset[5]. The checksum is left to the caller.
3688.type	__ocb_decrypt1,\@abi-omnipotent
3689.align	32
3690__ocb_decrypt1:
3691.cfi_startproc
3692	 pxor		@offset[5],$inout5	# offset_i
3693	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3694	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3695	$movkey		32($key_),$rndkey0
3696
3697	aesdec		$rndkey1,$inout0
3698	$movkey		48($key_),$rndkey1
3699	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3700
3701	aesdec		$rndkey0,$inout0
3702	$movkey		64($key_),$rndkey0
3703	jmp		.Locb_dec_loop1
3704
# Two rounds per iteration; rax is stepped by 32 and the jnz below is
# meant to test the result of that add.
3705.align	32
3706.Locb_dec_loop1:
3707	aesdec		$rndkey1,$inout0
3708	$movkey		($key,%rax),$rndkey1
3709	add		\$32,%rax
3710
3711	aesdec		$rndkey0,$inout0
3712	$movkey		-16($key,%rax),$rndkey0
3713	jnz		.Locb_dec_loop1
3714
3715	aesdec		$rndkey1,$inout0
3716	$movkey		16($key_),$rndkey1	# redundant in tail
3717	mov		%r10,%rax		# restore twisted rounds
3718
3719	aesdeclast	$inout5,$inout0
3720	ret
3721.cfi_endproc
3722.size	__ocb_decrypt1,.-__ocb_decrypt1
3723___
3724} }}
3725
3726########################################################################
3727# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3728#			    size_t length, const AES_KEY *key,
3729#			    unsigned char *ivp,const int enc);
#
# CBC-mode entry point. A zero length returns immediately. A non-zero
# sixth argument selects encryption, which chains one block at a time
# and, for a trailing partial block, copies it to the output buffer,
# zero-pads it there and encrypts it in place. A zero sixth argument
# selects decryption: exactly 16 bytes are handled without a stack
# frame; anything else sets up a 16-byte-aligned frame and processes
# eight or six blocks per iteration (chosen from OPENSSL_ia32cap_P),
# with dedicated code for one to seven leftover blocks and for a
# partial final block. The IV at *ivp is rewritten before returning.
3730{
# 0x10 bytes of scratch at (%rsp) for the partial-tail copy, plus room
# for %xmm6-%xmm15 at 0x10..0xa0(%rsp) on Win64.
3731my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
# IV and saved ciphertext blocks live in %xmm10-%xmm15.
3732my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3733
3734$code.=<<___;
3735.globl	${PREFIX}_cbc_encrypt
3736.type	${PREFIX}_cbc_encrypt,\@function,6
3737.align	16
3738${PREFIX}_cbc_encrypt:
3739.cfi_startproc
3740	test	$len,$len		# check length
3741	jz	.Lcbc_ret
3742
3743	mov	240($key),$rnds_	# key->rounds
3744	mov	$key,$key_		# backup $key
3745	test	%r9d,%r9d		# 6th argument
3746	jz	.Lcbc_decrypt
3747#--------------------------- CBC ENCRYPT ------------------------------#
3748	movups	($ivp),$inout0		# load iv as initial state
3749	mov	$rnds_,$rounds
3750	cmp	\$16,$len
3751	jb	.Lcbc_enc_tail
3752	sub	\$16,$len
3753	jmp	.Lcbc_enc_loop
3754.align	16
3755.Lcbc_enc_loop:
3756	movups	($inp),$inout1		# load input
3757	lea	16($inp),$inp
3758	#xorps	$inout1,$inout0
3759___
# One block of encryption is generated inline here by a helper defined
# elsewhere in this file; it is handed the key, the round count and the
# two registers loaded above.
3760	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3761$code.=<<___;
3762	mov	$rnds_,$rounds		# restore $rounds
3763	mov	$key_,$key		# restore $key
3764	movups	$inout0,0($out)		# store output
3765	lea	16($out),$out
3766	sub	\$16,$len
3767	jnc	.Lcbc_enc_loop
3768	add	\$16,$len
3769	jnz	.Lcbc_enc_tail
3770	 pxor	$rndkey0,$rndkey0	# clear register bank
3771	 pxor	$rndkey1,$rndkey1
3772	movups	$inout0,($ivp)
3773	 pxor	$inout0,$inout0
3774	 pxor	$inout1,$inout1
3775	jmp	.Lcbc_ret
3776
3777.Lcbc_enc_tail:
3778	mov	$len,%rcx	# zaps $key
3779	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
3780	.long	0x9066A4F3	# rep movsb
3781	mov	\$16,%ecx	# zero tail
3782	sub	$len,%rcx
3783	xor	%eax,%eax
3784	.long	0x9066AAF3	# rep stosb
3785	lea	-16(%rdi),%rdi	# rewind $out by 1 block
3786	mov	$rnds_,$rounds	# restore $rounds
3787	mov	%rdi,%rsi	# $inp and $out are the same
3788	mov	$key_,$key	# restore $key
3789	xor	$len,$len	# len=16
3790	jmp	.Lcbc_enc_loop	# one more spin
3791#--------------------------- CBC DECRYPT ------------------------------#
3792.align	16
3793.Lcbc_decrypt:
3794	cmp	\$16,$len
3795	jne	.Lcbc_decrypt_bulk
3796
3797	# handle single block without allocating stack frame,
3798	# useful in ciphertext stealing mode
3799	movdqu	($inp),$inout0		# load input
3800	movdqu	($ivp),$inout1		# load iv
3801	movdqa	$inout0,$inout2		# future iv
3802___
# Single-block decryption generated inline by the same helper.
3803	&aesni_generate1("dec",$key,$rnds_);
3804$code.=<<___;
3805	 pxor	$rndkey0,$rndkey0	# clear register bank
3806	 pxor	$rndkey1,$rndkey1
3807	movdqu	$inout2,($ivp)		# store iv
3808	xorps	$inout1,$inout0		# ^=iv
3809	 pxor	$inout1,$inout1
3810	movups	$inout0,($out)		# store output
3811	 pxor	$inout0,$inout0
3812	jmp	.Lcbc_ret
3813.align	16
3814.Lcbc_decrypt_bulk:
3815	lea	(%rsp),%r11		# frame pointer
3816.cfi_def_cfa_register	%r11
3817	push	%rbp
3818.cfi_push	%rbp
3819	sub	\$$frame_size,%rsp
3820	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
3821___
# Win64 only: spill %xmm6-%xmm15 into the frame above the scratch slot.
3822$code.=<<___ if ($win64);
3823	movaps	%xmm6,0x10(%rsp)
3824	movaps	%xmm7,0x20(%rsp)
3825	movaps	%xmm8,0x30(%rsp)
3826	movaps	%xmm9,0x40(%rsp)
3827	movaps	%xmm10,0x50(%rsp)
3828	movaps	%xmm11,0x60(%rsp)
3829	movaps	%xmm12,0x70(%rsp)
3830	movaps	%xmm13,0x80(%rsp)
3831	movaps	%xmm14,0x90(%rsp)
3832	movaps	%xmm15,0xa0(%rsp)
3833.Lcbc_decrypt_body:
3834___
3835
# From here on %rbp does double duty: it is the key backup, and inside
# the eight-block loop it is the look-ahead input pointer.
3836my $inp_=$key_="%rbp";			# reassign $key_
3837
3838$code.=<<___;
3839	mov	$key,$key_		# [re-]backup $key [after reassignment]
3840	movups	($ivp),$iv
3841	mov	$rnds_,$rounds
3842	cmp	\$0x50,$len
3843	jbe	.Lcbc_dec_tail
3844
3845	$movkey	($key),$rndkey0
3846	movdqu	0x00($inp),$inout0	# load input
3847	movdqu	0x10($inp),$inout1
3848	movdqa	$inout0,$in0
3849	movdqu	0x20($inp),$inout2
3850	movdqa	$inout1,$in1
3851	movdqu	0x30($inp),$inout3
3852	movdqa	$inout2,$in2
3853	movdqu	0x40($inp),$inout4
3854	movdqa	$inout3,$in3
3855	movdqu	0x50($inp),$inout5
3856	movdqa	$inout4,$in4
3857	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
3858	cmp	\$0x70,$len
3859	jbe	.Lcbc_dec_six_or_seven
3860
3861	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
3862	sub	\$0x50,$len		# $len is biased by -5*16
3863	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
3864	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
3865	sub	\$0x20,$len		# $len is biased by -7*16
3866	lea	0x70($key),$key		# size optimization
3867	jmp	.Lcbc_dec_loop8_enter
3868.align	16
3869.Lcbc_dec_loop8:
3870	movups	$inout7,($out)
3871	lea	0x10($out),$out
3872.Lcbc_dec_loop8_enter:
3873	movdqu		0x60($inp),$inout6
3874	pxor		$rndkey0,$inout0
3875	movdqu		0x70($inp),$inout7
3876	pxor		$rndkey0,$inout1
3877	$movkey		0x10-0x70($key),$rndkey1
3878	pxor		$rndkey0,$inout2
3879	mov		\$-1,$inp_
3880	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
3881	pxor		$rndkey0,$inout3
3882	pxor		$rndkey0,$inout4
3883	pxor		$rndkey0,$inout5
3884	pxor		$rndkey0,$inout6
3885
3886	aesdec		$rndkey1,$inout0
3887	pxor		$rndkey0,$inout7
3888	$movkey		0x20-0x70($key),$rndkey0
3889	aesdec		$rndkey1,$inout1
3890	aesdec		$rndkey1,$inout2
3891	aesdec		$rndkey1,$inout3
3892	aesdec		$rndkey1,$inout4
3893	aesdec		$rndkey1,$inout5
3894	aesdec		$rndkey1,$inout6
3895	adc		\$0,$inp_
3896	and		\$128,$inp_
3897	aesdec		$rndkey1,$inout7
3898	add		$inp,$inp_
3899	$movkey		0x30-0x70($key),$rndkey1
3900___
# Emit eleven further rounds for all eight lanes, alternating between
# the two round-key registers. A compare of the round count against 11
# is emitted ahead of the i==7 group; the conditional exits after the
# i==7 and i==9 groups and the unconditional one after i==11 all land
# on .Lcbc_dec_done, so shorter key schedules leave the chain early.
# NOTE(review): 240(key) holds 9 for 128-bit keys (see the key setup
# code); presumably 11 and 13 for 192/256-bit keys - confirm.
# NOTE(review): the nop lines are presumably padding for instruction
# alignment or scheduling - confirm before removing.
3901for($i=1;$i<12;$i++) {
3902my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3903$code.=<<___	if ($i==7);
3904	cmp		\$11,$rounds
3905___
3906$code.=<<___;
3907	aesdec		$rndkeyx,$inout0
3908	aesdec		$rndkeyx,$inout1
3909	aesdec		$rndkeyx,$inout2
3910	aesdec		$rndkeyx,$inout3
3911	aesdec		$rndkeyx,$inout4
3912	aesdec		$rndkeyx,$inout5
3913	aesdec		$rndkeyx,$inout6
3914	aesdec		$rndkeyx,$inout7
3915	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
3916___
3917$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
3918	nop
3919___
3920$code.=<<___	if ($i==7);
3921	jb		.Lcbc_dec_done
3922___
3923$code.=<<___	if ($i==9);
3924	je		.Lcbc_dec_done
3925___
3926$code.=<<___	if ($i==11);
3927	jmp		.Lcbc_dec_done
3928___
3929}
# Last two rounds of the eight-block path. The IV and the saved
# ciphertext blocks are XORed with the final round key so that
# aesdeclast performs the CBC chaining XOR at the same time. The next
# batch of input is preloaded through the look-ahead pointer.
3930$code.=<<___;
3931.align	16
3932.Lcbc_dec_done:
3933	aesdec		$rndkey1,$inout0
3934	aesdec		$rndkey1,$inout1
3935	pxor		$rndkey0,$iv
3936	pxor		$rndkey0,$in0
3937	aesdec		$rndkey1,$inout2
3938	aesdec		$rndkey1,$inout3
3939	pxor		$rndkey0,$in1
3940	pxor		$rndkey0,$in2
3941	aesdec		$rndkey1,$inout4
3942	aesdec		$rndkey1,$inout5
3943	pxor		$rndkey0,$in3
3944	pxor		$rndkey0,$in4
3945	aesdec		$rndkey1,$inout6
3946	aesdec		$rndkey1,$inout7
3947	movdqu		0x50($inp),$rndkey1
3948
3949	aesdeclast	$iv,$inout0
3950	movdqu		0x60($inp),$iv		# borrow $iv
3951	pxor		$rndkey0,$rndkey1
3952	aesdeclast	$in0,$inout1
3953	pxor		$rndkey0,$iv
3954	movdqu		0x70($inp),$rndkey0	# next IV
3955	aesdeclast	$in1,$inout2
3956	lea		0x80($inp),$inp
3957	movdqu		0x00($inp_),$in0
3958	aesdeclast	$in2,$inout3
3959	aesdeclast	$in3,$inout4
3960	movdqu		0x10($inp_),$in1
3961	movdqu		0x20($inp_),$in2
3962	aesdeclast	$in4,$inout5
3963	aesdeclast	$rndkey1,$inout6
3964	movdqu		0x30($inp_),$in3
3965	movdqu		0x40($inp_),$in4
3966	aesdeclast	$iv,$inout7
3967	movdqa		$rndkey0,$iv		# return $iv
3968	movdqu		0x50($inp_),$rndkey1
3969	$movkey		-0x70($key),$rndkey0
3970
3971	movups		$inout0,($out)		# store output
3972	movdqa		$in0,$inout0
3973	movups		$inout1,0x10($out)
3974	movdqa		$in1,$inout1
3975	movups		$inout2,0x20($out)
3976	movdqa		$in2,$inout2
3977	movups		$inout3,0x30($out)
3978	movdqa		$in3,$inout3
3979	movups		$inout4,0x40($out)
3980	movdqa		$in4,$inout4
3981	movups		$inout5,0x50($out)
3982	movdqa		$rndkey1,$inout5
3983	movups		$inout6,0x60($out)
3984	lea		0x70($out),$out
3985
3986	sub	\$0x80,$len
3987	ja	.Lcbc_dec_loop8
3988
3989	movaps	$inout7,$inout0
3990	lea	-0x70($key),$key
3991	add	\$0x70,$len
3992	jle	.Lcbc_dec_clear_tail_collected
3993	movups	$inout7,($out)
3994	lea	0x10($out),$out
3995	cmp	\$0x50,$len
3996	jbe	.Lcbc_dec_tail
3997
3998	movaps	$in0,$inout0
3999.Lcbc_dec_six_or_seven:
4000	cmp	\$0x60,$len
4001	ja	.Lcbc_dec_seven
4002
4003	movaps	$inout5,$inout6
4004	call	_aesni_decrypt6
4005	pxor	$iv,$inout0		# ^= IV
4006	movaps	$inout6,$iv
4007	pxor	$in0,$inout1
4008	movdqu	$inout0,($out)
4009	pxor	$in1,$inout2
4010	movdqu	$inout1,0x10($out)
4011	 pxor	$inout1,$inout1		# clear register bank
4012	pxor	$in2,$inout3
4013	movdqu	$inout2,0x20($out)
4014	 pxor	$inout2,$inout2
4015	pxor	$in3,$inout4
4016	movdqu	$inout3,0x30($out)
4017	 pxor	$inout3,$inout3
4018	pxor	$in4,$inout5
4019	movdqu	$inout4,0x40($out)
4020	 pxor	$inout4,$inout4
4021	lea	0x50($out),$out
4022	movdqa	$inout5,$inout0
4023	 pxor	$inout5,$inout5
4024	jmp	.Lcbc_dec_tail_collected
4025
4026.align	16
4027.Lcbc_dec_seven:
4028	movups	0x60($inp),$inout6
4029	xorps	$inout7,$inout7
4030	call	_aesni_decrypt8
4031	movups	0x50($inp),$inout7
4032	pxor	$iv,$inout0		# ^= IV
4033	movups	0x60($inp),$iv
4034	pxor	$in0,$inout1
4035	movdqu	$inout0,($out)
4036	pxor	$in1,$inout2
4037	movdqu	$inout1,0x10($out)
4038	 pxor	$inout1,$inout1		# clear register bank
4039	pxor	$in2,$inout3
4040	movdqu	$inout2,0x20($out)
4041	 pxor	$inout2,$inout2
4042	pxor	$in3,$inout4
4043	movdqu	$inout3,0x30($out)
4044	 pxor	$inout3,$inout3
4045	pxor	$in4,$inout5
4046	movdqu	$inout4,0x40($out)
4047	 pxor	$inout4,$inout4
4048	pxor	$inout7,$inout6
4049	movdqu	$inout5,0x50($out)
4050	 pxor	$inout5,$inout5
4051	lea	0x60($out),$out
4052	movdqa	$inout6,$inout0
4053	 pxor	$inout6,$inout6
4054	 pxor	$inout7,$inout7
4055	jmp	.Lcbc_dec_tail_collected
4056
4057.align	16
4058.Lcbc_dec_loop6:
4059	movups	$inout5,($out)
4060	lea	0x10($out),$out
4061	movdqu	0x00($inp),$inout0	# load input
4062	movdqu	0x10($inp),$inout1
4063	movdqa	$inout0,$in0
4064	movdqu	0x20($inp),$inout2
4065	movdqa	$inout1,$in1
4066	movdqu	0x30($inp),$inout3
4067	movdqa	$inout2,$in2
4068	movdqu	0x40($inp),$inout4
4069	movdqa	$inout3,$in3
4070	movdqu	0x50($inp),$inout5
4071	movdqa	$inout4,$in4
4072.Lcbc_dec_loop6_enter:
4073	lea	0x60($inp),$inp
4074	movdqa	$inout5,$inout6
4075
4076	call	_aesni_decrypt6
4077
4078	pxor	$iv,$inout0		# ^= IV
4079	movdqa	$inout6,$iv
4080	pxor	$in0,$inout1
4081	movdqu	$inout0,($out)
4082	pxor	$in1,$inout2
4083	movdqu	$inout1,0x10($out)
4084	pxor	$in2,$inout3
4085	movdqu	$inout2,0x20($out)
4086	pxor	$in3,$inout4
4087	mov	$key_,$key
4088	movdqu	$inout3,0x30($out)
4089	pxor	$in4,$inout5
4090	mov	$rnds_,$rounds
4091	movdqu	$inout4,0x40($out)
4092	lea	0x50($out),$out
4093	sub	\$0x60,$len
4094	ja	.Lcbc_dec_loop6
4095
4096	movdqa	$inout5,$inout0
4097	add	\$0x50,$len
4098	jle	.Lcbc_dec_clear_tail_collected
4099	movups	$inout5,($out)
4100	lea	0x10($out),$out
4101
4102.Lcbc_dec_tail:
4103	movups	($inp),$inout0
4104	sub	\$0x10,$len
4105	jbe	.Lcbc_dec_one		# $len is 1*16 or less
4106
4107	movups	0x10($inp),$inout1
4108	movaps	$inout0,$in0
4109	sub	\$0x10,$len
4110	jbe	.Lcbc_dec_two		# $len is 2*16 or less
4111
4112	movups	0x20($inp),$inout2
4113	movaps	$inout1,$in1
4114	sub	\$0x10,$len
4115	jbe	.Lcbc_dec_three		# $len is 3*16 or less
4116
4117	movups	0x30($inp),$inout3
4118	movaps	$inout2,$in2
4119	sub	\$0x10,$len
4120	jbe	.Lcbc_dec_four		# $len is 4*16 or less
4121
4122	movups	0x40($inp),$inout4	# $len is 5*16 or less
4123	movaps	$inout3,$in3
4124	movaps	$inout4,$in4
4125	xorps	$inout5,$inout5
4126	call	_aesni_decrypt6
4127	pxor	$iv,$inout0
4128	movaps	$in4,$iv
4129	pxor	$in0,$inout1
4130	movdqu	$inout0,($out)
4131	pxor	$in1,$inout2
4132	movdqu	$inout1,0x10($out)
4133	 pxor	$inout1,$inout1		# clear register bank
4134	pxor	$in2,$inout3
4135	movdqu	$inout2,0x20($out)
4136	 pxor	$inout2,$inout2
4137	pxor	$in3,$inout4
4138	movdqu	$inout3,0x30($out)
4139	 pxor	$inout3,$inout3
4140	lea	0x40($out),$out
4141	movdqa	$inout4,$inout0
4142	 pxor	$inout4,$inout4
4143	 pxor	$inout5,$inout5
4144	sub	\$0x10,$len
4145	jmp	.Lcbc_dec_tail_collected
4146
4147.align	16
4148.Lcbc_dec_one:
4149	movaps	$inout0,$in0
4150___
# Single leftover block: decryption generated inline by the helper.
4151	&aesni_generate1("dec",$key,$rounds);
4152$code.=<<___;
4153	xorps	$iv,$inout0
4154	movaps	$in0,$iv
4155	jmp	.Lcbc_dec_tail_collected
4156.align	16
4157.Lcbc_dec_two:
4158	movaps	$inout1,$in1
4159	call	_aesni_decrypt2
4160	pxor	$iv,$inout0
4161	movaps	$in1,$iv
4162	pxor	$in0,$inout1
4163	movdqu	$inout0,($out)
4164	movdqa	$inout1,$inout0
4165	 pxor	$inout1,$inout1		# clear register bank
4166	lea	0x10($out),$out
4167	jmp	.Lcbc_dec_tail_collected
4168.align	16
4169.Lcbc_dec_three:
4170	movaps	$inout2,$in2
4171	call	_aesni_decrypt3
4172	pxor	$iv,$inout0
4173	movaps	$in2,$iv
4174	pxor	$in0,$inout1
4175	movdqu	$inout0,($out)
4176	pxor	$in1,$inout2
4177	movdqu	$inout1,0x10($out)
4178	 pxor	$inout1,$inout1		# clear register bank
4179	movdqa	$inout2,$inout0
4180	 pxor	$inout2,$inout2
4181	lea	0x20($out),$out
4182	jmp	.Lcbc_dec_tail_collected
4183.align	16
4184.Lcbc_dec_four:
4185	movaps	$inout3,$in3
4186	call	_aesni_decrypt4
4187	pxor	$iv,$inout0
4188	movaps	$in3,$iv
4189	pxor	$in0,$inout1
4190	movdqu	$inout0,($out)
4191	pxor	$in1,$inout2
4192	movdqu	$inout1,0x10($out)
4193	 pxor	$inout1,$inout1		# clear register bank
4194	pxor	$in2,$inout3
4195	movdqu	$inout2,0x20($out)
4196	 pxor	$inout2,$inout2
4197	movdqa	$inout3,$inout0
4198	 pxor	$inout3,$inout3
4199	lea	0x30($out),$out
4200	jmp	.Lcbc_dec_tail_collected
4201
4202.align	16
4203.Lcbc_dec_clear_tail_collected:
4204	pxor	$inout1,$inout1		# clear register bank
4205	pxor	$inout2,$inout2
4206	pxor	$inout3,$inout3
4207___
# Not emitted on Win64, where these four registers are reloaded from
# the frame at .Lcbc_dec_ret instead of being wiped.
4208$code.=<<___ if (!$win64);
4209	pxor	$inout4,$inout4		# %xmm6..9
4210	pxor	$inout5,$inout5
4211	pxor	$inout6,$inout6
4212	pxor	$inout7,$inout7
4213___
# Common exit: store the IV, then write the last plaintext block either
# directly or, for a partial block, via the scratch slot at (%rsp),
# which is overwritten with zeros afterwards.
4214$code.=<<___;
4215.Lcbc_dec_tail_collected:
4216	movups	$iv,($ivp)
4217	and	\$15,$len
4218	jnz	.Lcbc_dec_tail_partial
4219	movups	$inout0,($out)
4220	pxor	$inout0,$inout0
4221	jmp	.Lcbc_dec_ret
4222.align	16
4223.Lcbc_dec_tail_partial:
4224	movaps	$inout0,(%rsp)
4225	pxor	$inout0,$inout0
4226	mov	\$16,%rcx
4227	mov	$out,%rdi
4228	sub	$len,%rcx
4229	lea	(%rsp),%rsi
4230	.long	0x9066A4F3		# rep movsb
4231	movdqa	$inout0,(%rsp)
4232
4233.Lcbc_dec_ret:
4234	xorps	$rndkey0,$rndkey0	# %xmm0
4235	pxor	$rndkey1,$rndkey1
4236___
# Win64 only: reload %xmm6-%xmm15 and overwrite each spill slot with
# the zeroed %xmm0.
4237$code.=<<___ if ($win64);
4238	movaps	0x10(%rsp),%xmm6
4239	movaps	%xmm0,0x10(%rsp)	# clear stack
4240	movaps	0x20(%rsp),%xmm7
4241	movaps	%xmm0,0x20(%rsp)
4242	movaps	0x30(%rsp),%xmm8
4243	movaps	%xmm0,0x30(%rsp)
4244	movaps	0x40(%rsp),%xmm9
4245	movaps	%xmm0,0x40(%rsp)
4246	movaps	0x50(%rsp),%xmm10
4247	movaps	%xmm0,0x50(%rsp)
4248	movaps	0x60(%rsp),%xmm11
4249	movaps	%xmm0,0x60(%rsp)
4250	movaps	0x70(%rsp),%xmm12
4251	movaps	%xmm0,0x70(%rsp)
4252	movaps	0x80(%rsp),%xmm13
4253	movaps	%xmm0,0x80(%rsp)
4254	movaps	0x90(%rsp),%xmm14
4255	movaps	%xmm0,0x90(%rsp)
4256	movaps	0xa0(%rsp),%xmm15
4257	movaps	%xmm0,0xa0(%rsp)
4258___
# Unwind the bulk-decrypt frame through %r11, which holds the stack
# pointer captured at .Lcbc_decrypt_bulk.
4259$code.=<<___;
4260	mov	-8(%r11),%rbp
4261.cfi_restore	%rbp
4262	lea	(%r11),%rsp
4263.cfi_def_cfa_register	%rsp
4264.Lcbc_ret:
4265	ret
4266.cfi_endproc
4267.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4268___
4269}
4270# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4271#				int bits, AES_KEY *key)
4272#
4273# input:	$inp	user-supplied key
4274#		$bits	$inp length in bits
4275#		$key	pointer to key schedule
4276# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
4277#		*$key	key schedule
4278#
# Builds the encryption schedule by calling __aesni_set_encrypt_key and,
# if that returns zero, converts it in place for decryption: the round
# keys are swapped end for end, and every key except the two outermost
# ones is additionally passed through aesimc. %xmm0 and %xmm1 are
# zeroed before a successful return. On failure the schedule is left as
# the callee produced it and %eax is returned unchanged.
4279{ my ($inp,$bits,$key) = @_4args;
# The bit count is handled through the 32-bit form of its register.
4280  $bits =~ s/%r/%e/;
4281
4282$code.=<<___;
4283.globl	${PREFIX}_set_decrypt_key
4284.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
4285.align	16
4286${PREFIX}_set_decrypt_key:
4287.cfi_startproc
4288	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4289.cfi_adjust_cfa_offset	8
4290	call	__aesni_set_encrypt_key
4291	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
4292	test	%eax,%eax
4293	jnz	.Ldec_key_ret
4294	lea	16($key,$bits),$inp	# points at the end of key schedule
4295
4296	$movkey	($key),%xmm0		# just swap
4297	$movkey	($inp),%xmm1
4298	$movkey	%xmm0,($inp)
4299	$movkey	%xmm1,($key)
4300	lea	16($key),$key
4301	lea	-16($inp),$inp
4302
# Walk inwards from both ends until the two pointers meet.
4303.Ldec_key_inverse:
4304	$movkey	($key),%xmm0		# swap and inverse
4305	$movkey	($inp),%xmm1
4306	aesimc	%xmm0,%xmm0
4307	aesimc	%xmm1,%xmm1
4308	lea	16($key),$key
4309	lea	-16($inp),$inp
4310	$movkey	%xmm0,16($inp)
4311	$movkey	%xmm1,-16($key)
4312	cmp	$key,$inp
4313	ja	.Ldec_key_inverse
4314
4315	$movkey	($key),%xmm0		# inverse middle
4316	aesimc	%xmm0,%xmm0
4317	pxor	%xmm1,%xmm1
4318	$movkey	%xmm0,($inp)
4319	pxor	%xmm0,%xmm0
4320.Ldec_key_ret:
4321	add	\$8,%rsp
4322.cfi_adjust_cfa_offset	-8
4323	ret
4324.cfi_endproc
4325.LSEH_end_set_decrypt_key:
4326.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4327___
4328
4329# This is based on submission from Intel by
4330#	Huang Ying
4331#	Vinodh Gopal
4332#	Kahraman Akdemir
4333#
4334# Aggressively optimized with respect to aeskeygenassist's critical
4335# path, and confined to %xmm0-5 to meet the Win64 ABI requirement.
4336#
4337# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4338#				int bits, AES_KEY * const key);
4339#
4340# input:	$inp	user-supplied key
4341#		$bits	$inp length in bits
4342#		$key	pointer to key schedule
4343# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
4344#		$bits	rounds-1 (used in aesni_set_decrypt_key)
4345#		*$key	key schedule
4346#		$key	pointer to key schedule (used in
4347#			aesni_set_decrypt_key)
4348#
4349# The subroutine is frame-less, which means that only volatile registers
4350# are used. Note that it's declared "abi-omnipotent", which means that
4351# the number of volatile registers is smaller on Windows.
4352#
# Emits ${PREFIX}_set_encrypt_key (the same entry point is also labelled
# __aesni_set_encrypt_key) together with its .Lkey_expansion_* helpers.
#
# Flow of the generated routine, as written below:
#  - %rax is preset to -1 and returned as-is when $inp or $key is NULL;
#    .Lbad_keybits returns -2 when $bits is not 128, 192 or 256.
#  - OPENSSL_ia32cap_P+4 is masked with the AVX (1<<28) and XOP (1<<11)
#    bits. When exactly the AVX bit remains, the .L*rounds_alt path is
#    taken: it derives round keys with pshufb/aesenclast using the
#    .Lkey_rotate*, .Lkey_rcon1 and .Lkey_rcon1b tables.
#  - Otherwise each aeskeygenassist is followed by a call to one of the
#    .Lkey_expansion_* helpers. Except for the *_cold entry points, a
#    helper first stores already-computed key material through %rax and
#    advances %rax, then mixes in the aeskeygenassist result.
#  - $bits is reloaded with 9, 11 or 13 (rounds-1) and stored into the
#    schedule at the offset the inline comments identify as 240 bytes from
#    $key; %eax is then cleared to report success.
#  - Every exit goes through .Lenc_key_ret, which zeroes %xmm0-%xmm5
#    before returning.
#
# .LSEH_end_set_encrypt_key is placed before the helpers, so the Win64
# function-table entry built further down ends at the main routine's ret.
4353$code.=<<___;
4354.globl	${PREFIX}_set_encrypt_key
4355.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
4356.align	16
4357${PREFIX}_set_encrypt_key:
4358__aesni_set_encrypt_key:
4359.cfi_startproc
4360	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4361.cfi_adjust_cfa_offset	8
4362	mov	\$-1,%rax
4363	test	$inp,$inp
4364	jz	.Lenc_key_ret
4365	test	$key,$key
4366	jz	.Lenc_key_ret
4367
4368	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
4369	movups	($inp),%xmm0		# pull first 128 bits of *userKey
4370	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
4371	and	OPENSSL_ia32cap_P+4(%rip),%r10d
4372	lea	16($key),%rax		# %rax is used as modifiable copy of $key
4373	cmp	\$256,$bits
4374	je	.L14rounds
4375	cmp	\$192,$bits
4376	je	.L12rounds
4377	cmp	\$128,$bits
4378	jne	.Lbad_keybits
4379
4380.L10rounds:
4381	mov	\$9,$bits			# 10 rounds for 128-bit key
4382	cmp	\$`1<<28`,%r10d			# AVX, bit no XOP
4383	je	.L10rounds_alt
4384
4385	$movkey	%xmm0,($key)			# round 0
4386	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
4387	call		.Lkey_expansion_128_cold
4388	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
4389	call		.Lkey_expansion_128
4390	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
4391	call		.Lkey_expansion_128
4392	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
4393	call		.Lkey_expansion_128
4394	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
4395	call		.Lkey_expansion_128
4396	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
4397	call		.Lkey_expansion_128
4398	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
4399	call		.Lkey_expansion_128
4400	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
4401	call		.Lkey_expansion_128
4402	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
4403	call		.Lkey_expansion_128
4404	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
4405	call		.Lkey_expansion_128
4406	$movkey	%xmm0,(%rax)
4407	mov	$bits,80(%rax)	# 240(%rdx)
4408	xor	%eax,%eax
4409	jmp	.Lenc_key_ret
4410
4411.align	16
4412.L10rounds_alt:
4413	movdqa	.Lkey_rotate(%rip),%xmm5
4414	mov	\$8,%r10d
4415	movdqa	.Lkey_rcon1(%rip),%xmm4
4416	movdqa	%xmm0,%xmm2
4417	movdqu	%xmm0,($key)
4418	jmp	.Loop_key128
4419
4420.align	16
4421.Loop_key128:
4422	pshufb		%xmm5,%xmm0
4423	aesenclast	%xmm4,%xmm0
4424	pslld		\$1,%xmm4
4425	lea		16(%rax),%rax
4426
4427	movdqa		%xmm2,%xmm3
4428	pslldq		\$4,%xmm2
4429	pxor		%xmm2,%xmm3
4430	pslldq		\$4,%xmm2
4431	pxor		%xmm2,%xmm3
4432	pslldq		\$4,%xmm2
4433	pxor		%xmm3,%xmm2
4434
4435	pxor		%xmm2,%xmm0
4436	movdqu		%xmm0,-16(%rax)
4437	movdqa		%xmm0,%xmm2
4438
4439	dec	%r10d
4440	jnz	.Loop_key128
4441
4442	movdqa		.Lkey_rcon1b(%rip),%xmm4
4443
4444	pshufb		%xmm5,%xmm0
4445	aesenclast	%xmm4,%xmm0
4446	pslld		\$1,%xmm4
4447
4448	movdqa		%xmm2,%xmm3
4449	pslldq		\$4,%xmm2
4450	pxor		%xmm2,%xmm3
4451	pslldq		\$4,%xmm2
4452	pxor		%xmm2,%xmm3
4453	pslldq		\$4,%xmm2
4454	pxor		%xmm3,%xmm2
4455
4456	pxor		%xmm2,%xmm0
4457	movdqu		%xmm0,(%rax)
4458
4459	movdqa		%xmm0,%xmm2
4460	pshufb		%xmm5,%xmm0
4461	aesenclast	%xmm4,%xmm0
4462
4463	movdqa		%xmm2,%xmm3
4464	pslldq		\$4,%xmm2
4465	pxor		%xmm2,%xmm3
4466	pslldq		\$4,%xmm2
4467	pxor		%xmm2,%xmm3
4468	pslldq		\$4,%xmm2
4469	pxor		%xmm3,%xmm2
4470
4471	pxor		%xmm2,%xmm0
4472	movdqu		%xmm0,16(%rax)
4473
4474	mov	$bits,96(%rax)	# 240($key)
4475	xor	%eax,%eax
4476	jmp	.Lenc_key_ret
4477
4478.align	16
4479.L12rounds:
4480	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
4481	mov	\$11,$bits			# 12 rounds for 192
4482	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4483	je	.L12rounds_alt
4484
4485	$movkey	%xmm0,($key)			# round 0
4486	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
4487	call		.Lkey_expansion_192a_cold
4488	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
4489	call		.Lkey_expansion_192b
4490	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
4491	call		.Lkey_expansion_192a
4492	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
4493	call		.Lkey_expansion_192b
4494	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
4495	call		.Lkey_expansion_192a
4496	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
4497	call		.Lkey_expansion_192b
4498	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
4499	call		.Lkey_expansion_192a
4500	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
4501	call		.Lkey_expansion_192b
4502	$movkey	%xmm0,(%rax)
4503	mov	$bits,48(%rax)	# 240(%rdx)
4504	xor	%rax, %rax
4505	jmp	.Lenc_key_ret
4506
4507.align	16
4508.L12rounds_alt:
4509	movdqa	.Lkey_rotate192(%rip),%xmm5
4510	movdqa	.Lkey_rcon1(%rip),%xmm4
4511	mov	\$8,%r10d
4512	movdqu	%xmm0,($key)
4513	jmp	.Loop_key192
4514
4515.align	16
4516.Loop_key192:
4517	movq		%xmm2,0(%rax)
4518	movdqa		%xmm2,%xmm1
4519	pshufb		%xmm5,%xmm2
4520	aesenclast	%xmm4,%xmm2
4521	pslld		\$1, %xmm4
4522	lea		24(%rax),%rax
4523
4524	movdqa		%xmm0,%xmm3
4525	pslldq		\$4,%xmm0
4526	pxor		%xmm0,%xmm3
4527	pslldq		\$4,%xmm0
4528	pxor		%xmm0,%xmm3
4529	pslldq		\$4,%xmm0
4530	pxor		%xmm3,%xmm0
4531
4532	pshufd		\$0xff,%xmm0,%xmm3
4533	pxor		%xmm1,%xmm3
4534	pslldq		\$4,%xmm1
4535	pxor		%xmm1,%xmm3
4536
4537	pxor		%xmm2,%xmm0
4538	pxor		%xmm3,%xmm2
4539	movdqu		%xmm0,-16(%rax)
4540
4541	dec	%r10d
4542	jnz	.Loop_key192
4543
4544	mov	$bits,32(%rax)	# 240($key)
4545	xor	%eax,%eax
4546	jmp	.Lenc_key_ret
4547
4548.align	16
4549.L14rounds:
4550	movups	16($inp),%xmm2			# remaining half of *userKey
4551	mov	\$13,$bits			# 14 rounds for 256
4552	lea	16(%rax),%rax
4553	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4554	je	.L14rounds_alt
4555
4556	$movkey	%xmm0,($key)			# round 0
4557	$movkey	%xmm2,16($key)			# round 1
4558	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
4559	call		.Lkey_expansion_256a_cold
4560	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
4561	call		.Lkey_expansion_256b
4562	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
4563	call		.Lkey_expansion_256a
4564	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
4565	call		.Lkey_expansion_256b
4566	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
4567	call		.Lkey_expansion_256a
4568	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
4569	call		.Lkey_expansion_256b
4570	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
4571	call		.Lkey_expansion_256a
4572	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
4573	call		.Lkey_expansion_256b
4574	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
4575	call		.Lkey_expansion_256a
4576	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
4577	call		.Lkey_expansion_256b
4578	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
4579	call		.Lkey_expansion_256a
4580	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
4581	call		.Lkey_expansion_256b
4582	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
4583	call		.Lkey_expansion_256a
4584	$movkey	%xmm0,(%rax)
4585	mov	$bits,16(%rax)	# 240(%rdx)
4586	xor	%rax,%rax
4587	jmp	.Lenc_key_ret
4588
4589.align	16
4590.L14rounds_alt:
4591	movdqa	.Lkey_rotate(%rip),%xmm5
4592	movdqa	.Lkey_rcon1(%rip),%xmm4
4593	mov	\$7,%r10d
4594	movdqu	%xmm0,0($key)
4595	movdqa	%xmm2,%xmm1
4596	movdqu	%xmm2,16($key)
4597	jmp	.Loop_key256
4598
4599.align	16
4600.Loop_key256:
4601	pshufb		%xmm5,%xmm2
4602	aesenclast	%xmm4,%xmm2
4603
4604	movdqa		%xmm0,%xmm3
4605	pslldq		\$4,%xmm0
4606	pxor		%xmm0,%xmm3
4607	pslldq		\$4,%xmm0
4608	pxor		%xmm0,%xmm3
4609	pslldq		\$4,%xmm0
4610	pxor		%xmm3,%xmm0
4611	pslld		\$1,%xmm4
4612
4613	pxor		%xmm2,%xmm0
4614	movdqu		%xmm0,(%rax)
4615
4616	dec	%r10d
4617	jz	.Ldone_key256
4618
4619	pshufd		\$0xff,%xmm0,%xmm2
4620	pxor		%xmm3,%xmm3
4621	aesenclast	%xmm3,%xmm2
4622
4623	movdqa		%xmm1,%xmm3
4624	pslldq		\$4,%xmm1
4625	pxor		%xmm1,%xmm3
4626	pslldq		\$4,%xmm1
4627	pxor		%xmm1,%xmm3
4628	pslldq		\$4,%xmm1
4629	pxor		%xmm3,%xmm1
4630
4631	pxor		%xmm1,%xmm2
4632	movdqu		%xmm2,16(%rax)
4633	lea		32(%rax),%rax
4634	movdqa		%xmm2,%xmm1
4635
4636	jmp	.Loop_key256
4637
4638.Ldone_key256:
4639	mov	$bits,16(%rax)	# 240($key)
4640	xor	%eax,%eax
4641	jmp	.Lenc_key_ret
4642
4643.align	16
4644.Lbad_keybits:
4645	mov	\$-2,%rax
4646.Lenc_key_ret:
4647	pxor	%xmm0,%xmm0
4648	pxor	%xmm1,%xmm1
4649	pxor	%xmm2,%xmm2
4650	pxor	%xmm3,%xmm3
4651	pxor	%xmm4,%xmm4
4652	pxor	%xmm5,%xmm5
4653	add	\$8,%rsp
4654.cfi_adjust_cfa_offset	-8
4655	ret
4656.LSEH_end_set_encrypt_key:
4657
4658.align	16
4659.Lkey_expansion_128:
4660	$movkey	%xmm0,(%rax)
4661	lea	16(%rax),%rax
4662.Lkey_expansion_128_cold:
4663	shufps	\$0b00010000,%xmm0,%xmm4
4664	xorps	%xmm4, %xmm0
4665	shufps	\$0b10001100,%xmm0,%xmm4
4666	xorps	%xmm4, %xmm0
4667	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4668	xorps	%xmm1,%xmm0
4669	ret
4670
4671.align 16
4672.Lkey_expansion_192a:
4673	$movkey	%xmm0,(%rax)
4674	lea	16(%rax),%rax
4675.Lkey_expansion_192a_cold:
4676	movaps	%xmm2, %xmm5
4677.Lkey_expansion_192b_warm:
4678	shufps	\$0b00010000,%xmm0,%xmm4
4679	movdqa	%xmm2,%xmm3
4680	xorps	%xmm4,%xmm0
4681	shufps	\$0b10001100,%xmm0,%xmm4
4682	pslldq	\$4,%xmm3
4683	xorps	%xmm4,%xmm0
4684	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
4685	pxor	%xmm3,%xmm2
4686	pxor	%xmm1,%xmm0
4687	pshufd	\$0b11111111,%xmm0,%xmm3
4688	pxor	%xmm3,%xmm2
4689	ret
4690
4691.align 16
4692.Lkey_expansion_192b:
4693	movaps	%xmm0,%xmm3
4694	shufps	\$0b01000100,%xmm0,%xmm5
4695	$movkey	%xmm5,(%rax)
4696	shufps	\$0b01001110,%xmm2,%xmm3
4697	$movkey	%xmm3,16(%rax)
4698	lea	32(%rax),%rax
4699	jmp	.Lkey_expansion_192b_warm
4700
4701.align	16
4702.Lkey_expansion_256a:
4703	$movkey	%xmm2,(%rax)
4704	lea	16(%rax),%rax
4705.Lkey_expansion_256a_cold:
4706	shufps	\$0b00010000,%xmm0,%xmm4
4707	xorps	%xmm4,%xmm0
4708	shufps	\$0b10001100,%xmm0,%xmm4
4709	xorps	%xmm4,%xmm0
4710	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4711	xorps	%xmm1,%xmm0
4712	ret
4713
4714.align 16
4715.Lkey_expansion_256b:
4716	$movkey	%xmm0,(%rax)
4717	lea	16(%rax),%rax
4718
4719	shufps	\$0b00010000,%xmm2,%xmm4
4720	xorps	%xmm4,%xmm2
4721	shufps	\$0b10001100,%xmm2,%xmm4
4722	xorps	%xmm4,%xmm2
4723	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
4724	xorps	%xmm1,%xmm2
4725	ret
4726.cfi_endproc
4727.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4728.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4729___
4730}
4731
# Constant tables (the first is 64-byte aligned) followed by the module's
# identification string.
#
# In the key-schedule code above, the "alt" paths load .Lkey_rotate and
# .Lkey_rotate192 into the register used as the pshufb mask, and
# .Lkey_rcon1 / .Lkey_rcon1b (1 and 0x1b replicated in all four dwords)
# into the register used as the aesenclast operand.
# The remaining labels are not referenced by the key-schedule code;
# presumably they are used by routines emitted earlier in the file
# (not visible here -- confirm before changing them).
4732$code.=<<___;
4733.align	64
4734.Lbswap_mask:
4735	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4736.Lincrement32:
4737	.long	6,6,6,0
4738.Lincrement64:
4739	.long	1,0,0,0
4740.Lxts_magic:
4741	.long	0x87,0,1,0
4742.Lincrement1:
4743	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4744.Lkey_rotate:
4745	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4746.Lkey_rotate192:
4747	.long	0x04070605,0x04070605,0x04070605,0x04070605
4748.Lkey_rcon1:
4749	.long	1,1,1,1
4750.Lkey_rcon1b:
4751	.long	0x1b,0x1b,0x1b,0x1b
4752
4753.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4754.align	64
4755___
4756
4757# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4758#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support. When $win64 is set, this
# block appends to $code:
#  - the handler routines. ecb_ccm64_se_handler, ctr_xts_se_handler and
#    ocb_se_handler are emitted only when $PREFIX is "aesni";
#    cbc_se_handler is always emitted. Each handler compares context->Rip
#    with prologue/epilogue labels, copies saved %xmm registers (and, where
#    applicable, saved general-purpose registers) back into the CONTEXT
#    record, and ends in .Lcommon_seh_tail;
#  - .Lcommon_seh_tail, which restores context->Rsp/Rsi/Rdi, copies the
#    CONTEXT into disp->ContextRecord, calls RtlVirtualUnwind and returns 1
#    (ExceptionContinueSearch);
#  - the .pdata table of (begin, end, unwind-info) triples;
#  - the .xdata unwind-info records referenced from .pdata.
#
# $rec, $frame, $context and $disp name the registers that carry the four
# handler arguments listed in the prototype comment above.
4759if ($win64) {
4760$rec="%rcx";
4761$frame="%rdx";
4762$context="%r8";
4763$disp="%r9";
4764
4765$code.=<<___;
4766.extern	__imp_RtlVirtualUnwind
4767___
# Handlers emitted only for the "aesni" prefix. They read their prologue
# and epilogue label offsets from HandlerData[] (relative to
# disp->ImageBase), so one handler serves several routines.
4768$code.=<<___ if ($PREFIX eq "aesni");
4769.type	ecb_ccm64_se_handler,\@abi-omnipotent
4770.align	16
4771ecb_ccm64_se_handler:
4772	push	%rsi
4773	push	%rdi
4774	push	%rbx
4775	push	%rbp
4776	push	%r12
4777	push	%r13
4778	push	%r14
4779	push	%r15
4780	pushfq
4781	sub	\$64,%rsp
4782
4783	mov	120($context),%rax	# pull context->Rax
4784	mov	248($context),%rbx	# pull context->Rip
4785
4786	mov	8($disp),%rsi		# disp->ImageBase
4787	mov	56($disp),%r11		# disp->HandlerData
4788
4789	mov	0(%r11),%r10d		# HandlerData[0]
4790	lea	(%rsi,%r10),%r10	# prologue label
4791	cmp	%r10,%rbx		# context->Rip<prologue label
4792	jb	.Lcommon_seh_tail
4793
4794	mov	152($context),%rax	# pull context->Rsp
4795
4796	mov	4(%r11),%r10d		# HandlerData[1]
4797	lea	(%rsi,%r10),%r10	# epilogue label
4798	cmp	%r10,%rbx		# context->Rip>=epilogue label
4799	jae	.Lcommon_seh_tail
4800
4801	lea	0(%rax),%rsi		# %xmm save area
4802	lea	512($context),%rdi	# &context.Xmm6
4803	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
4804	.long	0xa548f3fc		# cld; rep movsq
4805	lea	0x58(%rax),%rax		# adjust stack pointer
4806
4807	jmp	.Lcommon_seh_tail
4808.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4809
4810.type	ctr_xts_se_handler,\@abi-omnipotent
4811.align	16
4812ctr_xts_se_handler:
4813	push	%rsi
4814	push	%rdi
4815	push	%rbx
4816	push	%rbp
4817	push	%r12
4818	push	%r13
4819	push	%r14
4820	push	%r15
4821	pushfq
4822	sub	\$64,%rsp
4823
4824	mov	120($context),%rax	# pull context->Rax
4825	mov	248($context),%rbx	# pull context->Rip
4826
4827	mov	8($disp),%rsi		# disp->ImageBase
4828	mov	56($disp),%r11		# disp->HandlerData
4829
4830	mov	0(%r11),%r10d		# HandlerData[0]
4831	lea	(%rsi,%r10),%r10	# prologue label
4832	cmp	%r10,%rbx		# context->Rip<prologue label
4833	jb	.Lcommon_seh_tail
4834
4835	mov	152($context),%rax	# pull context->Rsp
4836
4837	mov	4(%r11),%r10d		# HandlerData[1]
4838	lea	(%rsi,%r10),%r10	# epilogue label
4839	cmp	%r10,%rbx		# context->Rip>=epilogue label
4840	jae	.Lcommon_seh_tail
4841
4842	mov	208($context),%rax	# pull context->R11
4843
4844	lea	-0xa8(%rax),%rsi	# %xmm save area
4845	lea	512($context),%rdi	# & context.Xmm6
4846	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4847	.long	0xa548f3fc		# cld; rep movsq
4848
4849	mov	-8(%rax),%rbp		# restore saved %rbp
4850	mov	%rbp,160($context)	# restore context->Rbp
4851	jmp	.Lcommon_seh_tail
4852.size	ctr_xts_se_handler,.-ctr_xts_se_handler
4853
4854.type	ocb_se_handler,\@abi-omnipotent
4855.align	16
4856ocb_se_handler:
4857	push	%rsi
4858	push	%rdi
4859	push	%rbx
4860	push	%rbp
4861	push	%r12
4862	push	%r13
4863	push	%r14
4864	push	%r15
4865	pushfq
4866	sub	\$64,%rsp
4867
4868	mov	120($context),%rax	# pull context->Rax
4869	mov	248($context),%rbx	# pull context->Rip
4870
4871	mov	8($disp),%rsi		# disp->ImageBase
4872	mov	56($disp),%r11		# disp->HandlerData
4873
4874	mov	0(%r11),%r10d		# HandlerData[0]
4875	lea	(%rsi,%r10),%r10	# prologue label
4876	cmp	%r10,%rbx		# context->Rip<prologue label
4877	jb	.Lcommon_seh_tail
4878
4879	mov	4(%r11),%r10d		# HandlerData[1]
4880	lea	(%rsi,%r10),%r10	# epilogue label
4881	cmp	%r10,%rbx		# context->Rip>=epilogue label
4882	jae	.Lcommon_seh_tail
4883
4884	mov	8(%r11),%r10d		# HandlerData[2]
4885	lea	(%rsi,%r10),%r10
4886	cmp	%r10,%rbx		# context->Rip>=pop label
4887	jae	.Locb_no_xmm
4888
4889	mov	152($context),%rax	# pull context->Rsp
4890
4891	lea	(%rax),%rsi		# %xmm save area
4892	lea	512($context),%rdi	# & context.Xmm6
4893	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4894	.long	0xa548f3fc		# cld; rep movsq
4895	lea	0xa0+0x28(%rax),%rax
4896
4897.Locb_no_xmm:
4898	mov	-8(%rax),%rbx
4899	mov	-16(%rax),%rbp
4900	mov	-24(%rax),%r12
4901	mov	-32(%rax),%r13
4902	mov	-40(%rax),%r14
4903
4904	mov	%rbx,144($context)	# restore context->Rbx
4905	mov	%rbp,160($context)	# restore context->Rbp
4906	mov	%r12,216($context)	# restore context->R12
4907	mov	%r13,224($context)	# restore context->R13
4908	mov	%r14,232($context)	# restore context->R14
4909
4910	jmp	.Lcommon_seh_tail
4911.size	ocb_se_handler,.-ocb_se_handler
4912___
# cbc_se_handler is emitted for every prefix. It takes its boundary labels
# directly (.Lcbc_decrypt_bulk, .Lcbc_decrypt_body, .Lcbc_ret) rather than
# from HandlerData[], and falls through into .Lcommon_seh_tail, which the
# handlers above reach by jump.
4913$code.=<<___;
4914.type	cbc_se_handler,\@abi-omnipotent
4915.align	16
4916cbc_se_handler:
4917	push	%rsi
4918	push	%rdi
4919	push	%rbx
4920	push	%rbp
4921	push	%r12
4922	push	%r13
4923	push	%r14
4924	push	%r15
4925	pushfq
4926	sub	\$64,%rsp
4927
4928	mov	152($context),%rax	# pull context->Rsp
4929	mov	248($context),%rbx	# pull context->Rip
4930
4931	lea	.Lcbc_decrypt_bulk(%rip),%r10
4932	cmp	%r10,%rbx		# context->Rip<"prologue" label
4933	jb	.Lcommon_seh_tail
4934
4935	mov	120($context),%rax	# pull context->Rax
4936
4937	lea	.Lcbc_decrypt_body(%rip),%r10
4938	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
4939	jb	.Lcommon_seh_tail
4940
4941	mov	152($context),%rax	# pull context->Rsp
4942
4943	lea	.Lcbc_ret(%rip),%r10
4944	cmp	%r10,%rbx		# context->Rip>="epilogue" label
4945	jae	.Lcommon_seh_tail
4946
4947	lea	16(%rax),%rsi		# %xmm save area
4948	lea	512($context),%rdi	# &context.Xmm6
4949	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4950	.long	0xa548f3fc		# cld; rep movsq
4951
4952	mov	208($context),%rax	# pull context->R11
4953
4954	mov	-8(%rax),%rbp		# restore saved %rbp
4955	mov	%rbp,160($context)	# restore context->Rbp
4956
4957.Lcommon_seh_tail:
4958	mov	8(%rax),%rdi
4959	mov	16(%rax),%rsi
4960	mov	%rax,152($context)	# restore context->Rsp
4961	mov	%rsi,168($context)	# restore context->Rsi
4962	mov	%rdi,176($context)	# restore context->Rdi
4963
4964	mov	40($disp),%rdi		# disp->ContextRecord
4965	mov	$context,%rsi		# context
4966	mov	\$154,%ecx		# sizeof(CONTEXT)
4967	.long	0xa548f3fc		# cld; rep movsq
4968
4969	mov	$disp,%rsi
4970	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4971	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4972	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4973	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4974	mov	40(%rsi),%r10		# disp->ContextRecord
4975	lea	56(%rsi),%r11		# &disp->HandlerData
4976	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4977	mov	%r10,32(%rsp)		# arg5
4978	mov	%r11,40(%rsp)		# arg6
4979	mov	%r12,48(%rsp)		# arg7
4980	mov	%rcx,56(%rsp)		# arg8, (NULL)
4981	call	*__imp_RtlVirtualUnwind(%rip)
4982
4983	mov	\$1,%eax		# ExceptionContinueSearch
4984	add	\$64,%rsp
4985	popfq
4986	pop	%r15
4987	pop	%r14
4988	pop	%r13
4989	pop	%r12
4990	pop	%rbp
4991	pop	%rbx
4992	pop	%rdi
4993	pop	%rsi
4994	ret
4995.size	cbc_se_handler,.-cbc_se_handler
4996
4997.section	.pdata
4998.align	4
4999___
# .pdata triples for the routines that exist only under the "aesni" prefix.
5000$code.=<<___ if ($PREFIX eq "aesni");
5001	.rva	.LSEH_begin_aesni_ecb_encrypt
5002	.rva	.LSEH_end_aesni_ecb_encrypt
5003	.rva	.LSEH_info_ecb
5004
5005	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
5006	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
5007	.rva	.LSEH_info_ccm64_enc
5008
5009	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
5010	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
5011	.rva	.LSEH_info_ccm64_dec
5012
5013	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
5014	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
5015	.rva	.LSEH_info_ctr32
5016
5017	.rva	.LSEH_begin_aesni_xts_encrypt
5018	.rva	.LSEH_end_aesni_xts_encrypt
5019	.rva	.LSEH_info_xts_enc
5020
5021	.rva	.LSEH_begin_aesni_xts_decrypt
5022	.rva	.LSEH_end_aesni_xts_decrypt
5023	.rva	.LSEH_info_xts_dec
5024
5025	.rva	.LSEH_begin_aesni_ocb_encrypt
5026	.rva	.LSEH_end_aesni_ocb_encrypt
5027	.rva	.LSEH_info_ocb_enc
5028
5029	.rva	.LSEH_begin_aesni_ocb_decrypt
5030	.rva	.LSEH_end_aesni_ocb_decrypt
5031	.rva	.LSEH_info_ocb_dec
5032___
# .pdata triples emitted for every prefix: CBC and the two key-setup
# routines, which share the .LSEH_info_key record. This heredoc also
# switches to the .xdata section for the records that follow.
5033$code.=<<___;
5034	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
5035	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
5036	.rva	.LSEH_info_cbc
5037
5038	.rva	${PREFIX}_set_decrypt_key
5039	.rva	.LSEH_end_set_decrypt_key
5040	.rva	.LSEH_info_key
5041
5042	.rva	${PREFIX}_set_encrypt_key
5043	.rva	.LSEH_end_set_encrypt_key
5044	.rva	.LSEH_info_key
5045.section	.xdata
5046.align	8
5047___
# .xdata records for the "aesni"-only routines. Each names its handler and
# then lists the labels that the handler reads back as HandlerData[].
5048$code.=<<___ if ($PREFIX eq "aesni");
5049.LSEH_info_ecb:
5050	.byte	9,0,0,0
5051	.rva	ecb_ccm64_se_handler
5052	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
5053.LSEH_info_ccm64_enc:
5054	.byte	9,0,0,0
5055	.rva	ecb_ccm64_se_handler
5056	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
5057.LSEH_info_ccm64_dec:
5058	.byte	9,0,0,0
5059	.rva	ecb_ccm64_se_handler
5060	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
5061.LSEH_info_ctr32:
5062	.byte	9,0,0,0
5063	.rva	ctr_xts_se_handler
5064	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
5065.LSEH_info_xts_enc:
5066	.byte	9,0,0,0
5067	.rva	ctr_xts_se_handler
5068	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
5069.LSEH_info_xts_dec:
5070	.byte	9,0,0,0
5071	.rva	ctr_xts_se_handler
5072	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
5073.LSEH_info_ocb_enc:
5074	.byte	9,0,0,0
5075	.rva	ocb_se_handler
5076	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
5077	.rva	.Locb_enc_pop
5078	.long	0
5079.LSEH_info_ocb_dec:
5080	.byte	9,0,0,0
5081	.rva	ocb_se_handler
5082	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
5083	.rva	.Locb_dec_pop
5084	.long	0
5085___
# .xdata records emitted for every prefix. .LSEH_info_cbc names
# cbc_se_handler and carries no HandlerData[]. .LSEH_info_key names no
# handler routine; its bytes are annotated as describing the "sub rsp,8"
# prologue of the key-setup routines.
5086$code.=<<___;
5087.LSEH_info_cbc:
5088	.byte	9,0,0,0
5089	.rva	cbc_se_handler
5090.LSEH_info_key:
5091	.byte	0x01,0x04,0x01,0x00
5092	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
5093___
5094}
5095
# Append a REX prefix byte to an opcode list when an extended register is
# involved.
#
#   $bytes  reference to the array of opcode bytes being assembled
#   $dst    register number placed in the ModR/M "reg" field by the callers
#   $src    register number placed in the ModR/M "r/m" field by the callers
#
# Register numbers of 8 and above need their high bit carried in the
# prefix: 0x04 for $dst, 0x01 for $src, both OR-ed into the 0x40 base.
# Nothing is appended when both numbers are below 8.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    push @{$bytes}, 0x40 | $prefix	if ($prefix);
}
5105
# Translate one AES-NI instruction into an explicit ".byte" directive.
#
# $line is the instruction text handed over by the substitution applied to
# $code (mnemonic through the last %xmm operand). Three AT&T-syntax forms
# are encoded:
#
#   aeskeygenassist $imm8,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# Returns the ".byte\t..." string for those forms. Anything else, including
# an aes* mnemonic that is not in the opcode tables, is returned unchanged
# so that it reaches the assembler verbatim. (The previous code returned
# undef for unknown mnemonics, which made the caller's s///e silently
# delete the instruction from the output.)
sub aesni {
  my $line=shift;
  my @opcode=(0x66);			# all encodings below start with 0x66

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures before the /^0/ match below can reset them.
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M, register form
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# imm8, 0x.. via oct()
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass it through instead of dropping it.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M, register form
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass it through instead of dropping it.
	return $line if (!defined($opcodelet{$mnemonic}));
	# An empty offset means 0; a leading 0 selects hex/octal via oct().
	my $disp = $off eq "" ? 0 : ($off=~/^0/?oct($off):$off+0);
	push @opcode,0x44 if ($dst>=8);			# REX with the "reg" bit
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	if ($disp<=0x7f) {
	    push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M (disp8), SIB=%rsp
	    push @opcode,$disp&0xff;
	}
	else {
	    # Offsets above 0x7f do not fit a signed disp8 (they used to be
	    # masked to 8 bits and thus encoded a negative displacement);
	    # emit the disp32 form, little-endian.
	    push @opcode,0x84|(($dst&7)<<3),0x24;	# ModR/M (disp32), SIB=%rsp
	    push @opcode,map { ($disp>>(8*$_))&0xff } (0..3);
	}
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
5145
# Build the ".byte" directive that stands in for "movbe %eax,N(%rsp)".
# The single argument is the displacement text captured by the caller's
# regex; it is appended verbatim as the last byte of the directive.
sub movbe {
    my ($disp) = @_;

    return ".byte	0x0f,0x38,0xf1,0x44,0x24," . $disp;
}
5149
# Post-process the accumulated assembly text, then write it out.
#
# 1. Evaluate every backtick-delimited span as a Perl expression and splice
#    the result in its place (used above for constants such as 1<<28).
5150$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# 2. Pass each "aes..." instruction that has an %xmm operand to aesni().
#    Only the text up to the last %xmmN is captured, so anything after it
#    on the line (e.g. a trailing comment) is dropped by the substitution.
5151$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5152#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
# 3. Replace "movbe %eax,N(%rsp)" (decimal N) with the bytes from movbe().
5153$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
5154
5155print $code;
5156
# Buffered write errors surface at close, so a failed close is fatal.
5157close STDOUT or die "error closing STDOUT: $!";
5158