xref: /freebsd/crypto/openssl/crypto/aes/asm/aesni-x86_64.pl (revision 6683132d54bd6d589889e43dabdc53d35e38a028)
1#! /usr/bin/env perl
2# Copyright 2009-2019 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
28# asymptotic limit it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
63# "straightforward" CCM implementation with CTR and CBC-MAC performed
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one can
87# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
146# additional instructions with AES ones, but even AES instructions
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions  still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153# utilizes 6x interleave because of limited register bank capacity.
154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176#		CBC en-/decrypt	CTR	XTS	ECB	OCB
177# Westmere	3.77/1.25	1.25	1.25	1.26
178# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
179# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
180# Skylake	2.62/0.63	0.63	0.63	0.63
181# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
182# Knights L	2.54/0.77	0.78	0.85	-	1.50
183# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
184# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
185# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
186#
187# (*)	Atom Silvermont ECB result is suboptimal because of penalties
188#	incurred by operations on %xmm8-15. As ECB is not considered
189#	critical, nothing was done to mitigate the problem.
190
# Script configuration and output plumbing. Everything set here is
# top-level state shared by the generator code that follows.
191$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
192			# generates drop-in replacement for
193			# crypto/aes/asm/aes-x86_64.pl:-)
194
# Command line is "[flavour] output". A first argument containing a dot
# is treated as the output file name and the flavour is left undefined.
195$flavour = shift;
196$output  = shift;
197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198
# Win64 conventions are selected by an nasm/masm/mingw64 flavour or by
# an output file name ending in ".asm".
199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200
# Look for x86_64-xlate.pl next to this script first, then under
# ../../perlasm/ relative to it; give up if neither file exists.
201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204die "can't locate x86_64-xlate.pl";
205
# Route everything printed to STDOUT through the translator. The open is
# checked so that a failure to create the pipe aborts the script here
# instead of letting it run on with an unusable output handle.
206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
207*STDOUT=*OUT;
208
# Symbolic names for the instruction, argument registers and %xmm
# registers that are interpolated into the assembly heredocs below.
#
# $movkey is the instruction used to load round keys. Both arms of the
# conditional currently yield "movups", so $PREFIX does not affect it.
209$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# First four integer argument registers, chosen by $win64.
210@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
211		("%rdi","%rsi","%rdx","%rcx");	# Unix order
212
# $code accumulates the whole assembly text produced by this script.
213$code=".text\n";
214$code.=".extern	OPENSSL_ia32cap_P\n";
215
216$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
218$inp="%rdi";
219$out="%rsi";
220$len="%rdx";
221$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
222$ivp="%r8";	# cbc, ctr, ...
223
224$rnds_="%r10d";	# backup copy for $rounds
225$key_="%r11";	# backup copy for $key
226
227# %xmm register layout
# $rndkey0/1 hold round keys; $inout0..7 hold the data blocks.
228$rndkey0="%xmm0";	$rndkey1="%xmm1";
229$inout0="%xmm2";	$inout1="%xmm3";
230$inout2="%xmm4";	$inout3="%xmm5";
231$inout4="%xmm6";	$inout5="%xmm7";
232$inout6="%xmm8";	$inout7="%xmm9";
233
# Note: these four names alias $inout4..$inout7 (%xmm6..%xmm9), so they
# cannot be live at the same time as those registers.
234$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
235$in0="%xmm8";		$iv="%xmm9";
236
237# Inline version of internal aesni_[en|de]crypt1.
238#
239# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
240# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a single-block
# cipher sequence to $code at the point of call (no subroutine is emitted).
#
#   $p      - "enc" or "dec"; spliced into the aes${p} / aes${p}last
#             mnemonics and into the loop label
#   $key    - register pointing at the key schedule; advanced by the
#             emitted code
#   $rounds - register holding the loop count; decremented to zero by
#             the emitted code
#   $inout  - register holding the block; defaults to $inout0
#   $ivec   - optional register; when given, the round-0 key is xor-ed
#             into $ivec and $ivec is then xor-ed into $inout, so $ivec
#             is modified as well
#
# $sn is a counter private to this sub. It is bumped on every call so
# that each expansion gets a unique .Loop_${p}1_$sn label.
241{ my $sn;
242sub aesni_generate1 {
243my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
244++$sn;
# Load round keys 0 and 1.
245$code.=<<___;
246	$movkey	($key),$rndkey0
247	$movkey	16($key),$rndkey1
248___
# Exactly one of the next two fragments is emitted: with $ivec the
# round-0 key reaches $inout via $ivec, without it the key is xor-ed
# into $inout directly. Both advance $key by 32 bytes.
249$code.=<<___ if (defined($ivec));
250	xorps	$rndkey0,$ivec
251	lea	32($key),$key
252	xorps	$ivec,$inout
253___
254$code.=<<___ if (!defined($ivec));
255	lea	32($key),$key
256	xorps	$rndkey0,$inout
257___
# One aes${p} per iteration while fetching the next round key; the key
# loaded by the final iteration is consumed by aes${p}last.
258$code.=<<___;
259.Loop_${p}1_$sn:
260	aes${p}	$rndkey1,$inout
261	dec	$rounds
262	$movkey	($key),$rndkey1
263	lea	16($key),$key
264	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
265	aes${p}last	$rndkey1,$inout
266___
267}}
268# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
269#
# Emits the public single-block ${PREFIX}_encrypt and ${PREFIX}_decrypt
# entry points. Each loads one 16-byte block from ($inp), reads the round
# count from offset 240 of the key structure, runs the inline
# aesni_generate1 sequence on $inout0, stores the block to ($out) and
# zeroes $rndkey0, $rndkey1 and $inout0 before returning.
#
# The lexical $inp/$out/$key declared here shadow the file-level ones for
# the duration of this block and are taken from @_4args, so they follow
# the Win64 or Unix register order selected earlier.
270{ my ($inp,$out,$key) = @_4args;
271
272$code.=<<___;
273.globl	${PREFIX}_encrypt
274.type	${PREFIX}_encrypt,\@abi-omnipotent
275.align	16
276${PREFIX}_encrypt:
277.cfi_startproc
278	movups	($inp),$inout0		# load input
279	mov	240($key),$rounds	# key->rounds
280___
# Inline single-block encryption of $inout0.
281	&aesni_generate1("enc",$key,$rounds);
282$code.=<<___;
283	 pxor	$rndkey0,$rndkey0	# clear register bank
284	 pxor	$rndkey1,$rndkey1
285	movups	$inout0,($out)		# output
286	 pxor	$inout0,$inout0
287	ret
288.cfi_endproc
289.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
290
291.globl	${PREFIX}_decrypt
292.type	${PREFIX}_decrypt,\@abi-omnipotent
293.align	16
294${PREFIX}_decrypt:
295.cfi_startproc
296	movups	($inp),$inout0		# load input
297	mov	240($key),$rounds	# key->rounds
298___
# Inline single-block decryption of $inout0.
299	&aesni_generate1("dec",$key,$rounds);
300$code.=<<___;
301	 pxor	$rndkey0,$rndkey0	# clear register bank
302	 pxor	$rndkey1,$rndkey1
303	movups	$inout0,($out)		# output
304	 pxor	$inout0,$inout0
305	ret
306.cfi_endproc
307.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
308___
309}
310
311# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
312# factor. Why were 3x subroutines originally used in loops? Even though
313# aes[enc|dec] latency was originally 6, it could be scheduled only
314# every *2nd* cycle. Thus 3x interleave was the one providing optimal
315# utilization, i.e. when subroutine's throughput is virtually same as
316# of non-interleaved subroutine [for number of input blocks up to 3].
317# This is why it originally made no sense to implement 2x subroutine.
318# But times change and it became appropriate to spend extra 192 bytes
319# on 2x subroutine on Atom Silvermont account. For processors that
320# can schedule aes[enc|dec] every cycle optimal interleave factor
321# equals to corresponding instructions latency. 8x is optimal for
322# * Bridge and "super-optimal" for other Intel CPUs...
323
# aesni_generate2($dir) appends the private 2-way interleaved routine
# _aesni_${dir}rypt2 to $code. $dir is "enc" or "dec" and is spliced into
# the aes${dir} / aes${dir}last mnemonics and the labels. Inside the sub
# the lexical $dir shadows the file-level $dir (the script directory).
#
# The emitted routine works on $inout0 and $inout1 in place.
324sub aesni_generate2 {
325my $dir=shift;
326# As already mentioned it takes in $key and $rounds, which are *not*
327# preserved. $inout[0-1] is cipher/clear text...
#
# Loop mechanics of the emitted code: $rounds is scaled by 16 (shl 4),
# $key is advanced to 32($key,$rounds), and %rax (the register named in
# the "# $rounds" remarks below) is negated and biased by 16, so that it
# serves as a negative byte offset from the advanced $key. Each iteration
# performs two rounds, adds 32 to %rax and leaves the loop once the sum
# reaches zero; the last two round keys are applied after the loop by
# aes${dir} and aes${dir}last.
328$code.=<<___;
329.type	_aesni_${dir}rypt2,\@abi-omnipotent
330.align	16
331_aesni_${dir}rypt2:
332.cfi_startproc
333	$movkey	($key),$rndkey0
334	shl	\$4,$rounds
335	$movkey	16($key),$rndkey1
336	xorps	$rndkey0,$inout0
337	xorps	$rndkey0,$inout1
338	$movkey	32($key),$rndkey0
339	lea	32($key,$rounds),$key
340	neg	%rax				# $rounds
341	add	\$16,%rax
342
343.L${dir}_loop2:
344	aes${dir}	$rndkey1,$inout0
345	aes${dir}	$rndkey1,$inout1
346	$movkey		($key,%rax),$rndkey1
347	add		\$32,%rax
348	aes${dir}	$rndkey0,$inout0
349	aes${dir}	$rndkey0,$inout1
350	$movkey		-16($key,%rax),$rndkey0
351	jnz		.L${dir}_loop2
352
353	aes${dir}	$rndkey1,$inout0
354	aes${dir}	$rndkey1,$inout1
355	aes${dir}last	$rndkey0,$inout0
356	aes${dir}last	$rndkey0,$inout1
357	ret
358.cfi_endproc
359.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
360___
361}
# aesni_generate3($dir) appends the private 3-way interleaved routine
# _aesni_${dir}rypt3 to $code. $dir is "enc" or "dec". The emitted code
# has the same shape as the 2-way routine, with $inout2 added: round-0
# key xor, a loop doing two rounds per iteration driven by a negative
# offset in %rax, then a final aes${dir} / aes${dir}last pair.
362sub aesni_generate3 {
363my $dir=shift;
364# As already mentioned it takes in $key and $rounds, which are *not*
365# preserved. $inout[0-2] is cipher/clear text...
366$code.=<<___;
367.type	_aesni_${dir}rypt3,\@abi-omnipotent
368.align	16
369_aesni_${dir}rypt3:
370.cfi_startproc
371	$movkey	($key),$rndkey0
372	shl	\$4,$rounds
373	$movkey	16($key),$rndkey1
374	xorps	$rndkey0,$inout0
375	xorps	$rndkey0,$inout1
376	xorps	$rndkey0,$inout2
377	$movkey	32($key),$rndkey0
378	lea	32($key,$rounds),$key
379	neg	%rax				# $rounds
380	add	\$16,%rax
381
382.L${dir}_loop3:
383	aes${dir}	$rndkey1,$inout0
384	aes${dir}	$rndkey1,$inout1
385	aes${dir}	$rndkey1,$inout2
386	$movkey		($key,%rax),$rndkey1
387	add		\$32,%rax
388	aes${dir}	$rndkey0,$inout0
389	aes${dir}	$rndkey0,$inout1
390	aes${dir}	$rndkey0,$inout2
391	$movkey		-16($key,%rax),$rndkey0
392	jnz		.L${dir}_loop3
393
394	aes${dir}	$rndkey1,$inout0
395	aes${dir}	$rndkey1,$inout1
396	aes${dir}	$rndkey1,$inout2
397	aes${dir}last	$rndkey0,$inout0
398	aes${dir}last	$rndkey0,$inout1
399	aes${dir}last	$rndkey0,$inout2
400	ret
401.cfi_endproc
402.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
403___
404}
405# 4x interleave is implemented to improve small block performance,
406# most notably [and naturally] 4 block by ~30%. One can argue that one
407# should have implemented 5x as well, but improvement would be <20%,
408# so it's not worth it...
# aesni_generate4($dir) appends the private 4-way interleaved routine
# _aesni_${dir}rypt4 to $code. $dir is "enc" or "dec". The emitted code
# processes $inout0..$inout3 in place: round-0 key xor, a loop doing two
# rounds per iteration driven by a negative offset in %rax, then a final
# aes${dir} / aes${dir}last pair.
409sub aesni_generate4 {
410my $dir=shift;
411# As already mentioned it takes in $key and $rounds, which are *not*
412# preserved. $inout[0-3] is cipher/clear text...
#
# NOTE(review): the raw ".byte 0x0f,0x1f,0x00" ahead of the loop looks
# like a 3-byte NOP used as padding - confirm against the instruction
# encoding reference before relying on this.
413$code.=<<___;
414.type	_aesni_${dir}rypt4,\@abi-omnipotent
415.align	16
416_aesni_${dir}rypt4:
417.cfi_startproc
418	$movkey	($key),$rndkey0
419	shl	\$4,$rounds
420	$movkey	16($key),$rndkey1
421	xorps	$rndkey0,$inout0
422	xorps	$rndkey0,$inout1
423	xorps	$rndkey0,$inout2
424	xorps	$rndkey0,$inout3
425	$movkey	32($key),$rndkey0
426	lea	32($key,$rounds),$key
427	neg	%rax				# $rounds
428	.byte	0x0f,0x1f,0x00
429	add	\$16,%rax
430
431.L${dir}_loop4:
432	aes${dir}	$rndkey1,$inout0
433	aes${dir}	$rndkey1,$inout1
434	aes${dir}	$rndkey1,$inout2
435	aes${dir}	$rndkey1,$inout3
436	$movkey		($key,%rax),$rndkey1
437	add		\$32,%rax
438	aes${dir}	$rndkey0,$inout0
439	aes${dir}	$rndkey0,$inout1
440	aes${dir}	$rndkey0,$inout2
441	aes${dir}	$rndkey0,$inout3
442	$movkey		-16($key,%rax),$rndkey0
443	jnz		.L${dir}_loop4
444
445	aes${dir}	$rndkey1,$inout0
446	aes${dir}	$rndkey1,$inout1
447	aes${dir}	$rndkey1,$inout2
448	aes${dir}	$rndkey1,$inout3
449	aes${dir}last	$rndkey0,$inout0
450	aes${dir}last	$rndkey0,$inout1
451	aes${dir}last	$rndkey0,$inout2
452	aes${dir}last	$rndkey0,$inout3
453	ret
454.cfi_endproc
455.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
456___
457}
# aesni_generate6($dir) appends the private 6-way interleaved routine
# _aesni_${dir}rypt6 to $code. $dir is "enc" or "dec". The emitted code
# processes $inout0..$inout5 in place.
#
# Unlike the 2/3/4-way routines, the prologue is software-pipelined: the
# round-0 key xor of $inout3..$inout5 is interleaved with the first
# aes${dir} round of $inout0..$inout2, and control then jumps to
# .L${dir}_loop6_enter, which applies that first round to the remaining
# three blocks before the regular two-rounds-per-iteration loop body.
458sub aesni_generate6 {
459my $dir=shift;
460# As already mentioned it takes in $key and $rounds, which are *not*
461# preserved. $inout[0-5] is cipher/clear text...
462$code.=<<___;
463.type	_aesni_${dir}rypt6,\@abi-omnipotent
464.align	16
465_aesni_${dir}rypt6:
466.cfi_startproc
467	$movkey		($key),$rndkey0
468	shl		\$4,$rounds
469	$movkey		16($key),$rndkey1
470	xorps		$rndkey0,$inout0
471	pxor		$rndkey0,$inout1
472	pxor		$rndkey0,$inout2
473	aes${dir}	$rndkey1,$inout0
474	lea		32($key,$rounds),$key
475	neg		%rax			# $rounds
476	aes${dir}	$rndkey1,$inout1
477	pxor		$rndkey0,$inout3
478	pxor		$rndkey0,$inout4
479	aes${dir}	$rndkey1,$inout2
480	pxor		$rndkey0,$inout5
481	$movkey		($key,%rax),$rndkey0
482	add		\$16,%rax
483	jmp		.L${dir}_loop6_enter
484.align	16
485.L${dir}_loop6:
486	aes${dir}	$rndkey1,$inout0
487	aes${dir}	$rndkey1,$inout1
488	aes${dir}	$rndkey1,$inout2
489.L${dir}_loop6_enter:
490	aes${dir}	$rndkey1,$inout3
491	aes${dir}	$rndkey1,$inout4
492	aes${dir}	$rndkey1,$inout5
493	$movkey		($key,%rax),$rndkey1
494	add		\$32,%rax
495	aes${dir}	$rndkey0,$inout0
496	aes${dir}	$rndkey0,$inout1
497	aes${dir}	$rndkey0,$inout2
498	aes${dir}	$rndkey0,$inout3
499	aes${dir}	$rndkey0,$inout4
500	aes${dir}	$rndkey0,$inout5
501	$movkey		-16($key,%rax),$rndkey0
502	jnz		.L${dir}_loop6
503
504	aes${dir}	$rndkey1,$inout0
505	aes${dir}	$rndkey1,$inout1
506	aes${dir}	$rndkey1,$inout2
507	aes${dir}	$rndkey1,$inout3
508	aes${dir}	$rndkey1,$inout4
509	aes${dir}	$rndkey1,$inout5
510	aes${dir}last	$rndkey0,$inout0
511	aes${dir}last	$rndkey0,$inout1
512	aes${dir}last	$rndkey0,$inout2
513	aes${dir}last	$rndkey0,$inout3
514	aes${dir}last	$rndkey0,$inout4
515	aes${dir}last	$rndkey0,$inout5
516	ret
517.cfi_endproc
518.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
519___
520}
# aesni_generate8($dir) appends the private 8-way interleaved routine
# _aesni_${dir}rypt8 to $code. $dir is "enc" or "dec". The emitted code
# processes $inout0..$inout7 in place.
#
# The prologue is software-pipelined like the 6-way routine: the first
# aes${dir} round of $inout0 and $inout1 is issued while the remaining
# blocks are still being xor-ed with the round-0 key, and control jumps
# to .L${dir}_loop8_inner, which applies that round to $inout2..$inout7.
#
# NOTE(review): .L${dir}_loop8_enter is defined but not referenced inside
# this sub; presumably it is a secondary entry point used from elsewhere
# in the file - verify before removing or moving it.
521sub aesni_generate8 {
522my $dir=shift;
523# As already mentioned it takes in $key and $rounds, which are *not*
524# preserved. $inout[0-7] is cipher/clear text...
525$code.=<<___;
526.type	_aesni_${dir}rypt8,\@abi-omnipotent
527.align	16
528_aesni_${dir}rypt8:
529.cfi_startproc
530	$movkey		($key),$rndkey0
531	shl		\$4,$rounds
532	$movkey		16($key),$rndkey1
533	xorps		$rndkey0,$inout0
534	xorps		$rndkey0,$inout1
535	pxor		$rndkey0,$inout2
536	pxor		$rndkey0,$inout3
537	pxor		$rndkey0,$inout4
538	lea		32($key,$rounds),$key
539	neg		%rax			# $rounds
540	aes${dir}	$rndkey1,$inout0
541	pxor		$rndkey0,$inout5
542	pxor		$rndkey0,$inout6
543	aes${dir}	$rndkey1,$inout1
544	pxor		$rndkey0,$inout7
545	$movkey		($key,%rax),$rndkey0
546	add		\$16,%rax
547	jmp		.L${dir}_loop8_inner
548.align	16
549.L${dir}_loop8:
550	aes${dir}	$rndkey1,$inout0
551	aes${dir}	$rndkey1,$inout1
552.L${dir}_loop8_inner:
553	aes${dir}	$rndkey1,$inout2
554	aes${dir}	$rndkey1,$inout3
555	aes${dir}	$rndkey1,$inout4
556	aes${dir}	$rndkey1,$inout5
557	aes${dir}	$rndkey1,$inout6
558	aes${dir}	$rndkey1,$inout7
559.L${dir}_loop8_enter:
560	$movkey		($key,%rax),$rndkey1
561	add		\$32,%rax
562	aes${dir}	$rndkey0,$inout0
563	aes${dir}	$rndkey0,$inout1
564	aes${dir}	$rndkey0,$inout2
565	aes${dir}	$rndkey0,$inout3
566	aes${dir}	$rndkey0,$inout4
567	aes${dir}	$rndkey0,$inout5
568	aes${dir}	$rndkey0,$inout6
569	aes${dir}	$rndkey0,$inout7
570	$movkey		-16($key,%rax),$rndkey0
571	jnz		.L${dir}_loop8
572
573	aes${dir}	$rndkey1,$inout0
574	aes${dir}	$rndkey1,$inout1
575	aes${dir}	$rndkey1,$inout2
576	aes${dir}	$rndkey1,$inout3
577	aes${dir}	$rndkey1,$inout4
578	aes${dir}	$rndkey1,$inout5
579	aes${dir}	$rndkey1,$inout6
580	aes${dir}	$rndkey1,$inout7
581	aes${dir}last	$rndkey0,$inout0
582	aes${dir}last	$rndkey0,$inout1
583	aes${dir}last	$rndkey0,$inout2
584	aes${dir}last	$rndkey0,$inout3
585	aes${dir}last	$rndkey0,$inout4
586	aes${dir}last	$rndkey0,$inout5
587	aes${dir}last	$rndkey0,$inout6
588	aes${dir}last	$rndkey0,$inout7
589	ret
590.cfi_endproc
591.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
592___
593}
# Emit the private interleaved helpers in ascending order of interleave
# factor. For every factor the "enc" flavour comes first and is produced
# only when $PREFIX is "aesni"; the "dec" flavour is always produced.
foreach my $emit_helper (\&aesni_generate2, \&aesni_generate3,
			 \&aesni_generate4, \&aesni_generate6,
			 \&aesni_generate8) {
	$emit_helper->("enc") if ($PREFIX eq "aesni");
	$emit_helper->("dec");
}
604
605if ($PREFIX eq "aesni") {
606########################################################################
607# void aesni_ecb_encrypt (const void *in, void *out,
608#			  size_t length, const AES_KEY *key,
609#			  int enc);
#
# Emits the public ECB routine. Structure of the generated code:
#   - $len is rounded down to a multiple of 16; nothing is done if the
#     result is zero.
#   - The 5th argument (tested via %r8d) selects the direction: zero
#     branches to the decrypt half, non-zero falls into the encrypt half.
#   - Each half runs an 8-blocks-per-iteration main loop around
#     _aesni_[en|de]crypt8, restoring $key and $rounds from the $key_ and
#     $rnds_ backups because the helpers modify them.
#   - A tail dispatcher then handles the remaining 1..7 blocks: one block
#     via the inline aesni_generate1 sequence, 2/3/4 blocks via the
#     matching helper, 5 and 6 blocks via the 6-way helper (with $inout5
#     zeroed for 5), and 7 blocks via the 8-way helper with $inout7 zeroed.
#   - The decrypt half additionally zeroes each $inout register after its
#     result has been stored; the encrypt half does not.
#   - .Lecb_ret zeroes $rndkey0 and $rndkey1 before returning.
610$code.=<<___;
611.globl	aesni_ecb_encrypt
612.type	aesni_ecb_encrypt,\@function,5
613.align	16
614aesni_ecb_encrypt:
615.cfi_startproc
616___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6..%xmm9, the
# registers behind $inout4..$inout7, before they are overwritten.
# NOTE(review): presumably needed because these registers are
# callee-saved under the Win64 calling convention - confirm.
617$code.=<<___ if ($win64);
618	lea	-0x58(%rsp),%rsp
619	movaps	%xmm6,(%rsp)		# offload $inout4..7
620	movaps	%xmm7,0x10(%rsp)
621	movaps	%xmm8,0x20(%rsp)
622	movaps	%xmm9,0x30(%rsp)
623.Lecb_enc_body:
624___
# Length check, direction dispatch, and the encrypt half up to the label
# of its single-block tail case.
625$code.=<<___;
626	and	\$-16,$len		# if ($len<16)
627	jz	.Lecb_ret		# return
628
629	mov	240($key),$rounds	# key->rounds
630	$movkey	($key),$rndkey0
631	mov	$key,$key_		# backup $key
632	mov	$rounds,$rnds_		# backup $rounds
633	test	%r8d,%r8d		# 5th argument
634	jz	.Lecb_decrypt
635#--------------------------- ECB ENCRYPT ------------------------------#
636	cmp	\$0x80,$len		# if ($len<8*16)
637	jb	.Lecb_enc_tail		# short input
638
639	movdqu	($inp),$inout0		# load 8 input blocks
640	movdqu	0x10($inp),$inout1
641	movdqu	0x20($inp),$inout2
642	movdqu	0x30($inp),$inout3
643	movdqu	0x40($inp),$inout4
644	movdqu	0x50($inp),$inout5
645	movdqu	0x60($inp),$inout6
646	movdqu	0x70($inp),$inout7
647	lea	0x80($inp),$inp		# $inp+=8*16
648	sub	\$0x80,$len		# $len-=8*16 (can be zero)
649	jmp	.Lecb_enc_loop8_enter
650.align 16
651.Lecb_enc_loop8:
652	movups	$inout0,($out)		# store 8 output blocks
653	mov	$key_,$key		# restore $key
654	movdqu	($inp),$inout0		# load 8 input blocks
655	mov	$rnds_,$rounds		# restore $rounds
656	movups	$inout1,0x10($out)
657	movdqu	0x10($inp),$inout1
658	movups	$inout2,0x20($out)
659	movdqu	0x20($inp),$inout2
660	movups	$inout3,0x30($out)
661	movdqu	0x30($inp),$inout3
662	movups	$inout4,0x40($out)
663	movdqu	0x40($inp),$inout4
664	movups	$inout5,0x50($out)
665	movdqu	0x50($inp),$inout5
666	movups	$inout6,0x60($out)
667	movdqu	0x60($inp),$inout6
668	movups	$inout7,0x70($out)
669	lea	0x80($out),$out		# $out+=8*16
670	movdqu	0x70($inp),$inout7
671	lea	0x80($inp),$inp		# $inp+=8*16
672.Lecb_enc_loop8_enter:
673
674	call	_aesni_encrypt8
675
676	sub	\$0x80,$len
677	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
678
679	movups	$inout0,($out)		# store 8 output blocks
680	mov	$key_,$key		# restore $key
681	movups	$inout1,0x10($out)
682	mov	$rnds_,$rounds		# restore $rounds
683	movups	$inout2,0x20($out)
684	movups	$inout3,0x30($out)
685	movups	$inout4,0x40($out)
686	movups	$inout5,0x50($out)
687	movups	$inout6,0x60($out)
688	movups	$inout7,0x70($out)
689	lea	0x80($out),$out		# $out+=8*16
690	add	\$0x80,$len		# restore real remaining $len
691	jz	.Lecb_ret		# done if ($len==0)
692
693.Lecb_enc_tail:				# $len is less than 8*16
694	movups	($inp),$inout0
695	cmp	\$0x20,$len
696	jb	.Lecb_enc_one
697	movups	0x10($inp),$inout1
698	je	.Lecb_enc_two
699	movups	0x20($inp),$inout2
700	cmp	\$0x40,$len
701	jb	.Lecb_enc_three
702	movups	0x30($inp),$inout3
703	je	.Lecb_enc_four
704	movups	0x40($inp),$inout4
705	cmp	\$0x60,$len
706	jb	.Lecb_enc_five
707	movups	0x50($inp),$inout5
708	je	.Lecb_enc_six
709	movdqu	0x60($inp),$inout6
710	xorps	$inout7,$inout7
711	call	_aesni_encrypt8
712	movups	$inout0,($out)		# store 7 output blocks
713	movups	$inout1,0x10($out)
714	movups	$inout2,0x20($out)
715	movups	$inout3,0x30($out)
716	movups	$inout4,0x40($out)
717	movups	$inout5,0x50($out)
718	movups	$inout6,0x60($out)
719	jmp	.Lecb_ret
720.align	16
721.Lecb_enc_one:
722___
# Single remaining block: encrypt $inout0 with the inline sequence.
723	&aesni_generate1("enc",$key,$rounds);
# Remaining encrypt tail cases, then the whole decrypt half up to the
# label of its single-block tail case.
724$code.=<<___;
725	movups	$inout0,($out)		# store one output block
726	jmp	.Lecb_ret
727.align	16
728.Lecb_enc_two:
729	call	_aesni_encrypt2
730	movups	$inout0,($out)		# store 2 output blocks
731	movups	$inout1,0x10($out)
732	jmp	.Lecb_ret
733.align	16
734.Lecb_enc_three:
735	call	_aesni_encrypt3
736	movups	$inout0,($out)		# store 3 output blocks
737	movups	$inout1,0x10($out)
738	movups	$inout2,0x20($out)
739	jmp	.Lecb_ret
740.align	16
741.Lecb_enc_four:
742	call	_aesni_encrypt4
743	movups	$inout0,($out)		# store 4 output blocks
744	movups	$inout1,0x10($out)
745	movups	$inout2,0x20($out)
746	movups	$inout3,0x30($out)
747	jmp	.Lecb_ret
748.align	16
749.Lecb_enc_five:
750	xorps	$inout5,$inout5
751	call	_aesni_encrypt6
752	movups	$inout0,($out)		# store 5 output blocks
753	movups	$inout1,0x10($out)
754	movups	$inout2,0x20($out)
755	movups	$inout3,0x30($out)
756	movups	$inout4,0x40($out)
757	jmp	.Lecb_ret
758.align	16
759.Lecb_enc_six:
760	call	_aesni_encrypt6
761	movups	$inout0,($out)		# store 6 output blocks
762	movups	$inout1,0x10($out)
763	movups	$inout2,0x20($out)
764	movups	$inout3,0x30($out)
765	movups	$inout4,0x40($out)
766	movups	$inout5,0x50($out)
767	jmp	.Lecb_ret
768#--------------------------- ECB DECRYPT ------------------------------#
769.align	16
770.Lecb_decrypt:
771	cmp	\$0x80,$len		# if ($len<8*16)
772	jb	.Lecb_dec_tail		# short input
773
774	movdqu	($inp),$inout0		# load 8 input blocks
775	movdqu	0x10($inp),$inout1
776	movdqu	0x20($inp),$inout2
777	movdqu	0x30($inp),$inout3
778	movdqu	0x40($inp),$inout4
779	movdqu	0x50($inp),$inout5
780	movdqu	0x60($inp),$inout6
781	movdqu	0x70($inp),$inout7
782	lea	0x80($inp),$inp		# $inp+=8*16
783	sub	\$0x80,$len		# $len-=8*16 (can be zero)
784	jmp	.Lecb_dec_loop8_enter
785.align 16
786.Lecb_dec_loop8:
787	movups	$inout0,($out)		# store 8 output blocks
788	mov	$key_,$key		# restore $key
789	movdqu	($inp),$inout0		# load 8 input blocks
790	mov	$rnds_,$rounds		# restore $rounds
791	movups	$inout1,0x10($out)
792	movdqu	0x10($inp),$inout1
793	movups	$inout2,0x20($out)
794	movdqu	0x20($inp),$inout2
795	movups	$inout3,0x30($out)
796	movdqu	0x30($inp),$inout3
797	movups	$inout4,0x40($out)
798	movdqu	0x40($inp),$inout4
799	movups	$inout5,0x50($out)
800	movdqu	0x50($inp),$inout5
801	movups	$inout6,0x60($out)
802	movdqu	0x60($inp),$inout6
803	movups	$inout7,0x70($out)
804	lea	0x80($out),$out		# $out+=8*16
805	movdqu	0x70($inp),$inout7
806	lea	0x80($inp),$inp		# $inp+=8*16
807.Lecb_dec_loop8_enter:
808
809	call	_aesni_decrypt8
810
811	$movkey	($key_),$rndkey0
812	sub	\$0x80,$len
813	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
814
815	movups	$inout0,($out)		# store 8 output blocks
816	 pxor	$inout0,$inout0		# clear register bank
817	mov	$key_,$key		# restore $key
818	movups	$inout1,0x10($out)
819	 pxor	$inout1,$inout1
820	mov	$rnds_,$rounds		# restore $rounds
821	movups	$inout2,0x20($out)
822	 pxor	$inout2,$inout2
823	movups	$inout3,0x30($out)
824	 pxor	$inout3,$inout3
825	movups	$inout4,0x40($out)
826	 pxor	$inout4,$inout4
827	movups	$inout5,0x50($out)
828	 pxor	$inout5,$inout5
829	movups	$inout6,0x60($out)
830	 pxor	$inout6,$inout6
831	movups	$inout7,0x70($out)
832	 pxor	$inout7,$inout7
833	lea	0x80($out),$out		# $out+=8*16
834	add	\$0x80,$len		# restore real remaining $len
835	jz	.Lecb_ret		# done if ($len==0)
836
837.Lecb_dec_tail:
838	movups	($inp),$inout0
839	cmp	\$0x20,$len
840	jb	.Lecb_dec_one
841	movups	0x10($inp),$inout1
842	je	.Lecb_dec_two
843	movups	0x20($inp),$inout2
844	cmp	\$0x40,$len
845	jb	.Lecb_dec_three
846	movups	0x30($inp),$inout3
847	je	.Lecb_dec_four
848	movups	0x40($inp),$inout4
849	cmp	\$0x60,$len
850	jb	.Lecb_dec_five
851	movups	0x50($inp),$inout5
852	je	.Lecb_dec_six
853	movups	0x60($inp),$inout6
854	$movkey	($key),$rndkey0
855	xorps	$inout7,$inout7
856	call	_aesni_decrypt8
857	movups	$inout0,($out)		# store 7 output blocks
858	 pxor	$inout0,$inout0		# clear register bank
859	movups	$inout1,0x10($out)
860	 pxor	$inout1,$inout1
861	movups	$inout2,0x20($out)
862	 pxor	$inout2,$inout2
863	movups	$inout3,0x30($out)
864	 pxor	$inout3,$inout3
865	movups	$inout4,0x40($out)
866	 pxor	$inout4,$inout4
867	movups	$inout5,0x50($out)
868	 pxor	$inout5,$inout5
869	movups	$inout6,0x60($out)
870	 pxor	$inout6,$inout6
871	 pxor	$inout7,$inout7
872	jmp	.Lecb_ret
873.align	16
874.Lecb_dec_one:
875___
# Single remaining block: decrypt $inout0 with the inline sequence.
876	&aesni_generate1("dec",$key,$rounds);
# Remaining decrypt tail cases. The six-block case has no jump and falls
# through into .Lecb_ret, where the round-key registers are zeroed.
877$code.=<<___;
878	movups	$inout0,($out)		# store one output block
879	 pxor	$inout0,$inout0		# clear register bank
880	jmp	.Lecb_ret
881.align	16
882.Lecb_dec_two:
883	call	_aesni_decrypt2
884	movups	$inout0,($out)		# store 2 output blocks
885	 pxor	$inout0,$inout0		# clear register bank
886	movups	$inout1,0x10($out)
887	 pxor	$inout1,$inout1
888	jmp	.Lecb_ret
889.align	16
890.Lecb_dec_three:
891	call	_aesni_decrypt3
892	movups	$inout0,($out)		# store 3 output blocks
893	 pxor	$inout0,$inout0		# clear register bank
894	movups	$inout1,0x10($out)
895	 pxor	$inout1,$inout1
896	movups	$inout2,0x20($out)
897	 pxor	$inout2,$inout2
898	jmp	.Lecb_ret
899.align	16
900.Lecb_dec_four:
901	call	_aesni_decrypt4
902	movups	$inout0,($out)		# store 4 output blocks
903	 pxor	$inout0,$inout0		# clear register bank
904	movups	$inout1,0x10($out)
905	 pxor	$inout1,$inout1
906	movups	$inout2,0x20($out)
907	 pxor	$inout2,$inout2
908	movups	$inout3,0x30($out)
909	 pxor	$inout3,$inout3
910	jmp	.Lecb_ret
911.align	16
912.Lecb_dec_five:
913	xorps	$inout5,$inout5
914	call	_aesni_decrypt6
915	movups	$inout0,($out)		# store 5 output blocks
916	 pxor	$inout0,$inout0		# clear register bank
917	movups	$inout1,0x10($out)
918	 pxor	$inout1,$inout1
919	movups	$inout2,0x20($out)
920	 pxor	$inout2,$inout2
921	movups	$inout3,0x30($out)
922	 pxor	$inout3,$inout3
923	movups	$inout4,0x40($out)
924	 pxor	$inout4,$inout4
925	 pxor	$inout5,$inout5
926	jmp	.Lecb_ret
927.align	16
928.Lecb_dec_six:
929	call	_aesni_decrypt6
930	movups	$inout0,($out)		# store 6 output blocks
931	 pxor	$inout0,$inout0		# clear register bank
932	movups	$inout1,0x10($out)
933	 pxor	$inout1,$inout1
934	movups	$inout2,0x20($out)
935	 pxor	$inout2,$inout2
936	movups	$inout3,0x30($out)
937	 pxor	$inout3,$inout3
938	movups	$inout4,0x40($out)
939	 pxor	$inout4,$inout4
940	movups	$inout5,0x50($out)
941	 pxor	$inout5,$inout5
942
943.Lecb_ret:
944	xorps	$rndkey0,$rndkey0	# %xmm0
945	pxor	$rndkey1,$rndkey1
946___
# Win64 only: restore %xmm6..%xmm9 from the stack and overwrite each save
# slot with %xmm0, which was zeroed just above, then release the 0x58
# bytes reserved in the prologue.
947$code.=<<___ if ($win64);
948	movaps	(%rsp),%xmm6
949	movaps	%xmm0,(%rsp)		# clear stack
950	movaps	0x10(%rsp),%xmm7
951	movaps	%xmm0,0x10(%rsp)
952	movaps	0x20(%rsp),%xmm8
953	movaps	%xmm0,0x20(%rsp)
954	movaps	0x30(%rsp),%xmm9
955	movaps	%xmm0,0x30(%rsp)
956	lea	0x58(%rsp),%rsp
957.Lecb_enc_ret:
958___
# Common return and end-of-function directives.
959$code.=<<___;
960	ret
961.cfi_endproc
962.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
963___
964
965{
966######################################################################
967# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
968#                         size_t blocks, const AES_KEY *key,
969#                         const char *ivec,char *cmac);
970#
971# Handles only complete blocks, operates on 64-bit counter and
972# does not update *ivec! Nor does it finalize CMAC value
973# (see engine/eng_aesni.c for details)
974#
975{
# Generator for aesni_ccm64_encrypt_blocks.
#
# Every pass of .Lccm64_enc_outer pushes two blocks through the same round
# keys: $inout0 holds the counter block and $inout1 the running CMAC, into
# which the input block is XORed.  The output block is input^E(counter); the
# MAC is written back through $cmac once all blocks have been processed.
#
# $cmac is the sixth argument.  $increment, $iv and $bswap_mask live in
# %xmm9, %xmm6 and %xmm7, which the Win64-only sections below spill to the
# stack and restore.
976my $cmac="%r9";	# 6th argument
977
978my $increment="%xmm9";
979my $iv="%xmm6";
980my $bswap_mask="%xmm7";
981
# Symbol, type and alignment directives plus the entry label.
982$code.=<<___;
983.globl	aesni_ccm64_encrypt_blocks
984.type	aesni_ccm64_encrypt_blocks,\@function,6
985.align	16
986aesni_ccm64_encrypt_blocks:
987___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there.
988$code.=<<___ if ($win64);
989	lea	-0x58(%rsp),%rsp
990	movaps	%xmm6,(%rsp)		# $iv
991	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
992	movaps	%xmm8,0x20(%rsp)	# $in0
993	movaps	%xmm9,0x30(%rsp)	# $increment
994.Lccm64_enc_body:
995___
# Main body.
#  * $rounds is scaled by 16 and $key is moved to the end of the key
#    schedule, so .Lccm64_enc2_loop can fetch round keys at ($key,%rax)
#    while %rax is stepped by 32 until the add leaves it at zero (the
#    "twisted $rounds" value kept in %r10 and reloaded per outer iteration).
#    NOTE(review): %rax/%r10 look like the 64-bit views of $rounds/$rnds_;
#    those variables are defined outside this excerpt -- confirm.
#  * $iv is kept byte-swapped via $bswap_mask so that paddq with $increment
#    can bump it; it is swapped back into $inout0 for the next block.
#  * The flags produced by "dec $len" are the ones tested by the closing
#    "jnz .Lccm64_enc_outer".
#  * After the loop the key and data registers are zeroed and the MAC in
#    $inout1 is stored to ($cmac).
996$code.=<<___;
997	mov	240($key),$rounds		# key->rounds
998	movdqu	($ivp),$iv
999	movdqa	.Lincrement64(%rip),$increment
1000	movdqa	.Lbswap_mask(%rip),$bswap_mask
1001
1002	shl	\$4,$rounds
1003	mov	\$16,$rnds_
1004	lea	0($key),$key_
1005	movdqu	($cmac),$inout1
1006	movdqa	$iv,$inout0
1007	lea	32($key,$rounds),$key		# end of key schedule
1008	pshufb	$bswap_mask,$iv
1009	sub	%rax,%r10			# twisted $rounds
1010	jmp	.Lccm64_enc_outer
1011.align	16
1012.Lccm64_enc_outer:
1013	$movkey	($key_),$rndkey0
1014	mov	%r10,%rax
1015	movups	($inp),$in0			# load inp
1016
1017	xorps	$rndkey0,$inout0		# counter
1018	$movkey	16($key_),$rndkey1
1019	xorps	$in0,$rndkey0
1020	xorps	$rndkey0,$inout1		# cmac^=inp
1021	$movkey	32($key_),$rndkey0
1022
1023.Lccm64_enc2_loop:
1024	aesenc	$rndkey1,$inout0
1025	aesenc	$rndkey1,$inout1
1026	$movkey	($key,%rax),$rndkey1
1027	add	\$32,%rax
1028	aesenc	$rndkey0,$inout0
1029	aesenc	$rndkey0,$inout1
1030	$movkey	-16($key,%rax),$rndkey0
1031	jnz	.Lccm64_enc2_loop
1032	aesenc	$rndkey1,$inout0
1033	aesenc	$rndkey1,$inout1
1034	paddq	$increment,$iv
1035	dec	$len				# $len-- ($len is in blocks)
1036	aesenclast	$rndkey0,$inout0
1037	aesenclast	$rndkey0,$inout1
1038
1039	lea	16($inp),$inp
1040	xorps	$inout0,$in0			# inp ^= E(iv)
1041	movdqa	$iv,$inout0
1042	movups	$in0,($out)			# save output
1043	pshufb	$bswap_mask,$inout0
1044	lea	16($out),$out			# $out+=16
1045	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1046
1047	 pxor	$rndkey0,$rndkey0		# clear register bank
1048	 pxor	$rndkey1,$rndkey1
1049	 pxor	$inout0,$inout0
1050	movups	$inout1,($cmac)			# store resulting mac
1051	 pxor	$inout1,$inout1
1052	 pxor	$in0,$in0
1053	 pxor	$iv,$iv
1054___
# Win64 only: reload %xmm6-%xmm9 from the spill area, overwrite each slot
# with %xmm0 (the inline comment calls this "clear stack") and release the
# 0x58 bytes reserved at entry.
1055$code.=<<___ if ($win64);
1056	movaps	(%rsp),%xmm6
1057	movaps	%xmm0,(%rsp)			# clear stack
1058	movaps	0x10(%rsp),%xmm7
1059	movaps	%xmm0,0x10(%rsp)
1060	movaps	0x20(%rsp),%xmm8
1061	movaps	%xmm0,0x20(%rsp)
1062	movaps	0x30(%rsp),%xmm9
1063	movaps	%xmm0,0x30(%rsp)
1064	lea	0x58(%rsp),%rsp
1065.Lccm64_enc_ret:
1066___
# Return and symbol size directive.
1067$code.=<<___;
1068	ret
1069.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1070___
1071######################################################################
# Generator for aesni_ccm64_decrypt_blocks.
#
# The first counter block is encrypted up front with aesni_generate1.  Each
# pass of .Lccm64_dec_outer then writes output = input^E(counter) and, unless
# that was the last block, encrypts the next counter block ($inout0) together
# with the CMAC update ($inout1, "cmac^=out") in .Lccm64_dec2_loop.  The last
# output block is folded into the CMAC after .Lccm64_dec_break.
1072$code.=<<___;
1073.globl	aesni_ccm64_decrypt_blocks
1074.type	aesni_ccm64_decrypt_blocks,\@function,6
1075.align	16
1076aesni_ccm64_decrypt_blocks:
1077___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there.
# NOTE(review): the "# $in8" annotation on the %xmm8 line presumably means
# $in0, as in the matching encrypt prologue -- confirm.
1078$code.=<<___ if ($win64);
1079	lea	-0x58(%rsp),%rsp
1080	movaps	%xmm6,(%rsp)		# $iv
1081	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1082	movaps	%xmm8,0x20(%rsp)	# $in8
1083	movaps	%xmm9,0x30(%rsp)	# $increment
1084.Lccm64_dec_body:
1085___
# Load rounds, IV, current MAC and the two constants; keep copies of the
# round count in $rnds_ and of the key pointer in $key_, and byte-swap $iv so
# that it can later be incremented with paddq.
1086$code.=<<___;
1087	mov	240($key),$rounds		# key->rounds
1088	movups	($ivp),$iv
1089	movdqu	($cmac),$inout1
1090	movdqa	.Lincrement64(%rip),$increment
1091	movdqa	.Lbswap_mask(%rip),$bswap_mask
1092
1093	movaps	$iv,$inout0
1094	mov	$rounds,$rnds_
1095	mov	$key,$key_
1096	pshufb	$bswap_mask,$iv
1097___
# Encrypt the first counter block.  aesni_generate1 is defined outside this
# excerpt; with these arguments it presumably works on $inout0 and advances
# $key/$rounds, which is why copies were taken above -- confirm.
1098	&aesni_generate1("enc",$key,$rounds);
# Outer/inner loops.
#  * $rnds_ is scaled by 16 and $key is set to the end of the key schedule
#    (computed from the $key_ copy); %r10 holds the "twisted $rounds" value
#    that is reloaded into %rax before each run of .Lccm64_dec2_loop.
#  * "sub \$1,$len / jz .Lccm64_dec_break" leaves the loop right after the
#    last output block has been stored, before any further CMAC update.
#  * Inside the inner-loop tail the next input block is loaded and $iv is
#    incremented while the final rounds are still being issued.
#  * At .Lccm64_dec_break the round count is reloaded from 240($key_) for the
#    final single-block encryption emitted after this heredoc.
1099$code.=<<___;
1100	shl	\$4,$rnds_
1101	mov	\$16,$rounds
1102	movups	($inp),$in0			# load inp
1103	paddq	$increment,$iv
1104	lea	16($inp),$inp			# $inp+=16
1105	sub	%r10,%rax			# twisted $rounds
1106	lea	32($key_,$rnds_),$key		# end of key schedule
1107	mov	%rax,%r10
1108	jmp	.Lccm64_dec_outer
1109.align	16
1110.Lccm64_dec_outer:
1111	xorps	$inout0,$in0			# inp ^= E(iv)
1112	movdqa	$iv,$inout0
1113	movups	$in0,($out)			# save output
1114	lea	16($out),$out			# $out+=16
1115	pshufb	$bswap_mask,$inout0
1116
1117	sub	\$1,$len			# $len-- ($len is in blocks)
1118	jz	.Lccm64_dec_break		# if ($len==0) break
1119
1120	$movkey	($key_),$rndkey0
1121	mov	%r10,%rax
1122	$movkey	16($key_),$rndkey1
1123	xorps	$rndkey0,$in0
1124	xorps	$rndkey0,$inout0
1125	xorps	$in0,$inout1			# cmac^=out
1126	$movkey	32($key_),$rndkey0
1127	jmp	.Lccm64_dec2_loop
1128.align	16
1129.Lccm64_dec2_loop:
1130	aesenc	$rndkey1,$inout0
1131	aesenc	$rndkey1,$inout1
1132	$movkey	($key,%rax),$rndkey1
1133	add	\$32,%rax
1134	aesenc	$rndkey0,$inout0
1135	aesenc	$rndkey0,$inout1
1136	$movkey	-16($key,%rax),$rndkey0
1137	jnz	.Lccm64_dec2_loop
1138	movups	($inp),$in0			# load input
1139	paddq	$increment,$iv
1140	aesenc	$rndkey1,$inout0
1141	aesenc	$rndkey1,$inout1
1142	aesenclast	$rndkey0,$inout0
1143	aesenclast	$rndkey0,$inout1
1144	lea	16($inp),$inp			# $inp+=16
1145	jmp	.Lccm64_dec_outer
1146
1147.align	16
1148.Lccm64_dec_break:
1149	#xorps	$in0,$inout1			# cmac^=out
1150	mov	240($key_),$rounds
1151___
# Final CMAC block.  NOTE(review): the extra $inout1/$in0 arguments appear to
# make aesni_generate1 encrypt $inout1 with $in0 folded in, replacing the
# commented-out xorps above -- confirm against the helper's definition.
1152	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Zero the key and data registers and store the MAC from $inout1 to ($cmac).
1153$code.=<<___;
1154	 pxor	$rndkey0,$rndkey0		# clear register bank
1155	 pxor	$rndkey1,$rndkey1
1156	 pxor	$inout0,$inout0
1157	movups	$inout1,($cmac)			# store resulting mac
1158	 pxor	$inout1,$inout1
1159	 pxor	$in0,$in0
1160	 pxor	$iv,$iv
1161___
# Win64 only: reload %xmm6-%xmm9, overwrite the spill slots with %xmm0 and
# release the 0x58 bytes reserved at entry.
1162$code.=<<___ if ($win64);
1163	movaps	(%rsp),%xmm6
1164	movaps	%xmm0,(%rsp)			# clear stack
1165	movaps	0x10(%rsp),%xmm7
1166	movaps	%xmm0,0x10(%rsp)
1167	movaps	0x20(%rsp),%xmm8
1168	movaps	%xmm0,0x20(%rsp)
1169	movaps	0x30(%rsp),%xmm9
1170	movaps	%xmm0,0x30(%rsp)
1171	lea	0x58(%rsp),%rsp
1172.Lccm64_dec_ret:
1173___
# Return and symbol size directive.
1174$code.=<<___;
1175	ret
1176.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1177___
1178}
1179######################################################################
1180# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1181#                         size_t blocks, const AES_KEY *key,
1182#                         const char *ivec);
1183#
1184# Handles only complete blocks, operates on 32-bit counter and
1185# does not update *ivec! (see crypto/modes/ctr128.c for details)
1186#
1187# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1188# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1189# Keywords are full unroll and modulo-schedule counter calculations
1190# with zero-round key xor.
1191{
# aesni_ctr32_encrypt_blocks, part 1: register aliases, frame size, the
# single-block fast path and the stack-frame prologue of the bulk path.
#
# $in0..$in5 are %xmm10..%xmm15.  $key0 is %ebp; $ctr is $ivp with a "d"
# suffix appended (presumably the 32-bit view of that register).  The frame
# is 0x80 bytes for the counter blocks, plus 160 bytes on Win64, where
# %xmm6-%xmm15 are saved.
1192my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1193my ($key0,$ctr)=("%ebp","${ivp}d");
1194my $frame_size = 0x80 + ($win64?160:0);
1195
# Entry point.  When $len is exactly 1 the block is handled without a stack
# frame; any other value branches to .Lctr32_bulk.
1196$code.=<<___;
1197.globl	aesni_ctr32_encrypt_blocks
1198.type	aesni_ctr32_encrypt_blocks,\@function,5
1199.align	16
1200aesni_ctr32_encrypt_blocks:
1201.cfi_startproc
1202	cmp	\$1,$len
1203	jne	.Lctr32_bulk
1204
1205	# handle single block without allocating stack frame,
1206	# useful when handling edges
1207	movups	($ivp),$inout0
1208	movups	($inp),$inout1
1209	mov	240($key),%edx			# key->rounds
1210___
# One-block AES encryption (helper defined outside this excerpt); its result
# in $inout0 is XORed with the input block in $inout1 right afterwards.
1211	&aesni_generate1("enc",$key,"%edx");
# Finish the single-block path (scrub registers, store the output, jump to
# the shared epilogue), then open the bulk path: $key_ keeps the incoming
# %rsp as frame pointer, %rbp is pushed, and %rsp is lowered by $frame_size
# and aligned to 16 bytes.
1212$code.=<<___;
1213	 pxor	$rndkey0,$rndkey0		# clear register bank
1214	 pxor	$rndkey1,$rndkey1
1215	xorps	$inout1,$inout0
1216	 pxor	$inout1,$inout1
1217	movups	$inout0,($out)
1218	 xorps	$inout0,$inout0
1219	jmp	.Lctr32_epilogue
1220
1221.align	16
1222.Lctr32_bulk:
1223	lea	(%rsp),$key_			# use $key_ as frame pointer
1224.cfi_def_cfa_register	$key_
1225	push	%rbp
1226.cfi_push	%rbp
1227	sub	\$$frame_size,%rsp
1228	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1229___
# Win64 only: save %xmm6-%xmm15 at fixed negative offsets from the frame
# pointer $key_.
1230$code.=<<___ if ($win64);
1231	movaps	%xmm6,-0xa8($key_)		# offload everything
1232	movaps	%xmm7,-0x98($key_)
1233	movaps	%xmm8,-0x88($key_)
1234	movaps	%xmm9,-0x78($key_)
1235	movaps	%xmm10,-0x68($key_)
1236	movaps	%xmm11,-0x58($key_)
1237	movaps	%xmm12,-0x48($key_)
1238	movaps	%xmm13,-0x38($key_)
1239	movaps	%xmm14,-0x28($key_)
1240	movaps	%xmm15,-0x18($key_)
1241.Lctr32_body:
1242___
# aesni_ctr32_encrypt_blocks, part 2: counter-block setup, path selection,
# the complete 6x loop and the first two rounds of the 8x loop.
#
#  * Eight counter blocks (counter+0 .. counter+7), each already XORed with
#    round key 0, are laid out at 0x00..0x70(%rsp).  All slots start as the
#    same value; only the dword at offset +12 is rewritten per slot, using
#    the byte-swapped counter XORed with $key0 (last dword of round key 0).
#  * %rdx is parked in %r10 while it is borrowed as scratch, then restored.
#  * Inputs shorter than 8 blocks go straight to .Lctr32_tail.
#  * OPENSSL_ia32cap_P+4 is masked down to bits 26 and 22 (XSAVE and MOVBE
#    per the inline comment).  If only bit 22 is set the movbe-based
#    .Lctr32_loop6 is taken; otherwise .Lctr32_loop8 runs with $key advanced
#    by 0x80 and $len biased by -8.
#  * On the 6x path $key0 is byte-swapped once, which lets the counter be
#    XORed in native byte order and then stored with movbe.  The remaining
#    rounds are run by .Lenc_loop6, a label outside this excerpt.
#  * When the 6x loop ends with blocks left over, $rounds and $key are
#    recomputed from $rnds_ before jumping to .Lctr32_tail.
1243$code.=<<___;
1244
1245	# 8 16-byte words on top of stack are counter values
1246	# xor-ed with zero-round key
1247
1248	movdqu	($ivp),$inout0
1249	movdqu	($key),$rndkey0
1250	mov	12($ivp),$ctr			# counter LSB
1251	pxor	$rndkey0,$inout0
1252	mov	12($key),$key0			# 0-round key LSB
1253	movdqa	$inout0,0x00(%rsp)		# populate counter block
1254	bswap	$ctr
1255	movdqa	$inout0,$inout1
1256	movdqa	$inout0,$inout2
1257	movdqa	$inout0,$inout3
1258	movdqa	$inout0,0x40(%rsp)
1259	movdqa	$inout0,0x50(%rsp)
1260	movdqa	$inout0,0x60(%rsp)
1261	mov	%rdx,%r10			# about to borrow %rdx
1262	movdqa	$inout0,0x70(%rsp)
1263
1264	lea	1($ctr),%rax
1265	 lea	2($ctr),%rdx
1266	bswap	%eax
1267	 bswap	%edx
1268	xor	$key0,%eax
1269	 xor	$key0,%edx
1270	pinsrd	\$3,%eax,$inout1
1271	lea	3($ctr),%rax
1272	movdqa	$inout1,0x10(%rsp)
1273	 pinsrd	\$3,%edx,$inout2
1274	bswap	%eax
1275	 mov	%r10,%rdx			# restore %rdx
1276	 lea	4($ctr),%r10
1277	 movdqa	$inout2,0x20(%rsp)
1278	xor	$key0,%eax
1279	 bswap	%r10d
1280	pinsrd	\$3,%eax,$inout3
1281	 xor	$key0,%r10d
1282	movdqa	$inout3,0x30(%rsp)
1283	lea	5($ctr),%r9
1284	 mov	%r10d,0x40+12(%rsp)
1285	bswap	%r9d
1286	 lea	6($ctr),%r10
1287	mov	240($key),$rounds		# key->rounds
1288	xor	$key0,%r9d
1289	 bswap	%r10d
1290	mov	%r9d,0x50+12(%rsp)
1291	 xor	$key0,%r10d
1292	lea	7($ctr),%r9
1293	 mov	%r10d,0x60+12(%rsp)
1294	bswap	%r9d
1295	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d
1296	xor	$key0,%r9d
1297	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1298	mov	%r9d,0x70+12(%rsp)
1299
1300	$movkey	0x10($key),$rndkey1
1301
1302	movdqa	0x40(%rsp),$inout4
1303	movdqa	0x50(%rsp),$inout5
1304
1305	cmp	\$8,$len		# $len is in blocks
1306	jb	.Lctr32_tail		# short input if ($len<8)
1307
1308	sub	\$6,$len		# $len is biased by -6
1309	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1310	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1311
1312	lea	0x80($key),$key		# size optimization
1313	sub	\$2,$len		# $len is biased by -8
1314	jmp	.Lctr32_loop8
1315
1316.align	16
1317.Lctr32_6x:
1318	shl	\$4,$rounds
1319	mov	\$48,$rnds_
1320	bswap	$key0
1321	lea	32($key,$rounds),$key	# end of key schedule
1322	sub	%rax,%r10		# twisted $rounds
1323	jmp	.Lctr32_loop6
1324
1325.align	16
1326.Lctr32_loop6:
1327	 add	\$6,$ctr		# next counter value
1328	$movkey	-48($key,$rnds_),$rndkey0
1329	aesenc	$rndkey1,$inout0
1330	 mov	$ctr,%eax
1331	 xor	$key0,%eax
1332	aesenc	$rndkey1,$inout1
1333	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1334	 lea	1($ctr),%eax
1335	aesenc	$rndkey1,$inout2
1336	 xor	$key0,%eax
1337	 movbe	%eax,`0x10+12`(%rsp)
1338	aesenc	$rndkey1,$inout3
1339	 lea	2($ctr),%eax
1340	 xor	$key0,%eax
1341	aesenc	$rndkey1,$inout4
1342	 movbe	%eax,`0x20+12`(%rsp)
1343	 lea	3($ctr),%eax
1344	aesenc	$rndkey1,$inout5
1345	$movkey	-32($key,$rnds_),$rndkey1
1346	 xor	$key0,%eax
1347
1348	aesenc	$rndkey0,$inout0
1349	 movbe	%eax,`0x30+12`(%rsp)
1350	 lea	4($ctr),%eax
1351	aesenc	$rndkey0,$inout1
1352	 xor	$key0,%eax
1353	 movbe	%eax,`0x40+12`(%rsp)
1354	aesenc	$rndkey0,$inout2
1355	 lea	5($ctr),%eax
1356	 xor	$key0,%eax
1357	aesenc	$rndkey0,$inout3
1358	 movbe	%eax,`0x50+12`(%rsp)
1359	 mov	%r10,%rax		# mov	$rnds_,$rounds
1360	aesenc	$rndkey0,$inout4
1361	aesenc	$rndkey0,$inout5
1362	$movkey	-16($key,$rnds_),$rndkey0
1363
1364	call	.Lenc_loop6
1365
1366	movdqu	($inp),$inout6		# load 6 input blocks
1367	movdqu	0x10($inp),$inout7
1368	movdqu	0x20($inp),$in0
1369	movdqu	0x30($inp),$in1
1370	movdqu	0x40($inp),$in2
1371	movdqu	0x50($inp),$in3
1372	lea	0x60($inp),$inp		# $inp+=6*16
1373	$movkey	-64($key,$rnds_),$rndkey1
1374	pxor	$inout0,$inout6		# inp^=E(ctr)
1375	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1376	pxor	$inout1,$inout7
1377	movaps	0x10(%rsp),$inout1
1378	pxor	$inout2,$in0
1379	movaps	0x20(%rsp),$inout2
1380	pxor	$inout3,$in1
1381	movaps	0x30(%rsp),$inout3
1382	pxor	$inout4,$in2
1383	movaps	0x40(%rsp),$inout4
1384	pxor	$inout5,$in3
1385	movaps	0x50(%rsp),$inout5
1386	movdqu	$inout6,($out)		# store 6 output blocks
1387	movdqu	$inout7,0x10($out)
1388	movdqu	$in0,0x20($out)
1389	movdqu	$in1,0x30($out)
1390	movdqu	$in2,0x40($out)
1391	movdqu	$in3,0x50($out)
1392	lea	0x60($out),$out		# $out+=6*16
1393
1394	sub	\$6,$len
1395	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1396
1397	add	\$6,$len		# restore real remaining $len
1398	jz	.Lctr32_done		# done if ($len==0)
1399
1400	lea	-48($rnds_),$rounds
1401	lea	-80($key,$rnds_),$key	# restore $key
1402	neg	$rounds
1403	shr	\$4,$rounds		# restore $rounds
1404	jmp	.Lctr32_tail
1405
1406.align	32
1407.Lctr32_loop8:
1408	 add		\$8,$ctr		# next counter value
1409	movdqa		0x60(%rsp),$inout6
1410	aesenc		$rndkey1,$inout0
1411	 mov		$ctr,%r9d
1412	movdqa		0x70(%rsp),$inout7
1413	aesenc		$rndkey1,$inout1
1414	 bswap		%r9d
1415	$movkey		0x20-0x80($key),$rndkey0
1416	aesenc		$rndkey1,$inout2
1417	 xor		$key0,%r9d
1418	 nop
1419	aesenc		$rndkey1,$inout3
1420	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1421	 lea		1($ctr),%r9
1422	aesenc		$rndkey1,$inout4
1423	aesenc		$rndkey1,$inout5
1424	aesenc		$rndkey1,$inout6
1425	aesenc		$rndkey1,$inout7
1426	$movkey		0x30-0x80($key),$rndkey1
1427___
# Unrolled body of the 8x loop for $i = 2..7.  Each iteration emits one
# aesenc round over $inout0..$inout7, interleaved with finishing the counter
# value left in %r9 by the previous step (bswap, XOR with $key0), storing it
# into the last dword of stack slot $i-1, and preparing counter+$i in %r9
# for the next step.
1428for($i=2;$i<8;$i++) {
# Odd $i uses $rndkey1, even $i uses $rndkey0; the register just consumed is
# reloaded at the end of the step with the key at 0x20+0x10*$i-0x80($key).
1429my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1430$code.=<<___;
1431	 bswap		%r9d
1432	aesenc		$rndkeyx,$inout0
1433	aesenc		$rndkeyx,$inout1
1434	 xor		$key0,%r9d
1435	 .byte		0x66,0x90
1436	aesenc		$rndkeyx,$inout2
1437	aesenc		$rndkeyx,$inout3
1438	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1439	 lea		$i($ctr),%r9
1440	aesenc		$rndkeyx,$inout4
1441	aesenc		$rndkeyx,$inout5
1442	aesenc		$rndkeyx,$inout6
1443	aesenc		$rndkeyx,$inout7
1444	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1445___
1446}
# aesni_ctr32_encrypt_blocks, part 3: remainder of the 8x loop, its store
# phase, the short-input tail and the start of the common cleanup.
#
#  * The last counter slot (0x70+12) is written and the first input block is
#    loaded while the current round is still being issued.
#  * "cmp $11,$rounds" decides how many extra round pairs run before
#    .Lctr32_enc_done: none when $rounds is below 11 (jb), one pair when it
#    equals 11 (je), two pairs otherwise.
#  * At .Lctr32_enc_done the input blocks are first XORed with the last
#    round key, so each aesenclast yields input^E(counter) directly; inp[6]
#    borrows $rndkey1.  In between, the next counter blocks for $inout0..5
#    are reloaded from the stack and $rndkey1 gets the real first-round key.
#  * "sub $8,$len / jnc" repeats the loop; once it borrows, $len is restored
#    and, if blocks remain, $key is moved back by 0x80 for the tail.
#  * .Lctr32_tail: fewer than 4 blocks -> .Lctr32_loop3, exactly 4 ->
#    .Lctr32_loop4, 5..7 -> seven blocks are run through .Lenc_loop8_enter (a
#    label outside this excerpt).  Every path stores only $len blocks.
#  * .Lctr32_done starts the scrubbing: %xmm0-%xmm5 and $key0 are zeroed.
1447$code.=<<___;
1448	 bswap		%r9d
1449	aesenc		$rndkey0,$inout0
1450	aesenc		$rndkey0,$inout1
1451	aesenc		$rndkey0,$inout2
1452	 xor		$key0,%r9d
1453	 movdqu		0x00($inp),$in0		# start loading input
1454	aesenc		$rndkey0,$inout3
1455	 mov		%r9d,0x70+12(%rsp)
1456	 cmp		\$11,$rounds
1457	aesenc		$rndkey0,$inout4
1458	aesenc		$rndkey0,$inout5
1459	aesenc		$rndkey0,$inout6
1460	aesenc		$rndkey0,$inout7
1461	$movkey		0xa0-0x80($key),$rndkey0
1462
1463	jb		.Lctr32_enc_done
1464
1465	aesenc		$rndkey1,$inout0
1466	aesenc		$rndkey1,$inout1
1467	aesenc		$rndkey1,$inout2
1468	aesenc		$rndkey1,$inout3
1469	aesenc		$rndkey1,$inout4
1470	aesenc		$rndkey1,$inout5
1471	aesenc		$rndkey1,$inout6
1472	aesenc		$rndkey1,$inout7
1473	$movkey		0xb0-0x80($key),$rndkey1
1474
1475	aesenc		$rndkey0,$inout0
1476	aesenc		$rndkey0,$inout1
1477	aesenc		$rndkey0,$inout2
1478	aesenc		$rndkey0,$inout3
1479	aesenc		$rndkey0,$inout4
1480	aesenc		$rndkey0,$inout5
1481	aesenc		$rndkey0,$inout6
1482	aesenc		$rndkey0,$inout7
1483	$movkey		0xc0-0x80($key),$rndkey0
1484	je		.Lctr32_enc_done
1485
1486	aesenc		$rndkey1,$inout0
1487	aesenc		$rndkey1,$inout1
1488	aesenc		$rndkey1,$inout2
1489	aesenc		$rndkey1,$inout3
1490	aesenc		$rndkey1,$inout4
1491	aesenc		$rndkey1,$inout5
1492	aesenc		$rndkey1,$inout6
1493	aesenc		$rndkey1,$inout7
1494	$movkey		0xd0-0x80($key),$rndkey1
1495
1496	aesenc		$rndkey0,$inout0
1497	aesenc		$rndkey0,$inout1
1498	aesenc		$rndkey0,$inout2
1499	aesenc		$rndkey0,$inout3
1500	aesenc		$rndkey0,$inout4
1501	aesenc		$rndkey0,$inout5
1502	aesenc		$rndkey0,$inout6
1503	aesenc		$rndkey0,$inout7
1504	$movkey		0xe0-0x80($key),$rndkey0
1505	jmp		.Lctr32_enc_done
1506
1507.align	16
1508.Lctr32_enc_done:
1509	movdqu		0x10($inp),$in1
1510	pxor		$rndkey0,$in0		# input^=round[last]
1511	movdqu		0x20($inp),$in2
1512	pxor		$rndkey0,$in1
1513	movdqu		0x30($inp),$in3
1514	pxor		$rndkey0,$in2
1515	movdqu		0x40($inp),$in4
1516	pxor		$rndkey0,$in3
1517	movdqu		0x50($inp),$in5
1518	pxor		$rndkey0,$in4
1519	pxor		$rndkey0,$in5
1520	aesenc		$rndkey1,$inout0
1521	aesenc		$rndkey1,$inout1
1522	aesenc		$rndkey1,$inout2
1523	aesenc		$rndkey1,$inout3
1524	aesenc		$rndkey1,$inout4
1525	aesenc		$rndkey1,$inout5
1526	aesenc		$rndkey1,$inout6
1527	aesenc		$rndkey1,$inout7
1528	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1529	lea		0x80($inp),$inp		# $inp+=8*16
1530
1531	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1532	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1533	movdqu		0x70-0x80($inp),$in0
1534	aesenclast	$in1,$inout1
1535	pxor		$rndkey0,$in0
1536	movdqa		0x00(%rsp),$in1		# load next counter block
1537	aesenclast	$in2,$inout2
1538	aesenclast	$in3,$inout3
1539	movdqa		0x10(%rsp),$in2
1540	movdqa		0x20(%rsp),$in3
1541	aesenclast	$in4,$inout4
1542	aesenclast	$in5,$inout5
1543	movdqa		0x30(%rsp),$in4
1544	movdqa		0x40(%rsp),$in5
1545	aesenclast	$rndkey1,$inout6
1546	movdqa		0x50(%rsp),$rndkey0
1547	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1548	aesenclast	$in0,$inout7
1549
1550	movups		$inout0,($out)		# store 8 output blocks
1551	movdqa		$in1,$inout0
1552	movups		$inout1,0x10($out)
1553	movdqa		$in2,$inout1
1554	movups		$inout2,0x20($out)
1555	movdqa		$in3,$inout2
1556	movups		$inout3,0x30($out)
1557	movdqa		$in4,$inout3
1558	movups		$inout4,0x40($out)
1559	movdqa		$in5,$inout4
1560	movups		$inout5,0x50($out)
1561	movdqa		$rndkey0,$inout5
1562	movups		$inout6,0x60($out)
1563	movups		$inout7,0x70($out)
1564	lea		0x80($out),$out		# $out+=8*16
1565
1566	sub	\$8,$len
1567	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1568
1569	add	\$8,$len			# restore real remaining $len
1570	jz	.Lctr32_done			# done if ($len==0)
1571	lea	-0x80($key),$key
1572
1573.Lctr32_tail:
1574	# note that at this point $inout0..5 are populated with
1575	# counter values xor-ed with 0-round key
1576	lea	16($key),$key
1577	cmp	\$4,$len
1578	jb	.Lctr32_loop3
1579	je	.Lctr32_loop4
1580
1581	# if ($len>4) compute 7 E(counter)
1582	shl		\$4,$rounds
1583	movdqa		0x60(%rsp),$inout6
1584	pxor		$inout7,$inout7
1585
1586	$movkey		16($key),$rndkey0
1587	aesenc		$rndkey1,$inout0
1588	aesenc		$rndkey1,$inout1
1589	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1590	neg		%rax
1591	aesenc		$rndkey1,$inout2
1592	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1593	 movups		($inp),$in0
1594	aesenc		$rndkey1,$inout3
1595	aesenc		$rndkey1,$inout4
1596	 movups		0x10($inp),$in1		# pre-load input
1597	 movups		0x20($inp),$in2
1598	aesenc		$rndkey1,$inout5
1599	aesenc		$rndkey1,$inout6
1600
1601	call            .Lenc_loop8_enter
1602
1603	movdqu	0x30($inp),$in3
1604	pxor	$in0,$inout0
1605	movdqu	0x40($inp),$in0
1606	pxor	$in1,$inout1
1607	movdqu	$inout0,($out)			# store output
1608	pxor	$in2,$inout2
1609	movdqu	$inout1,0x10($out)
1610	pxor	$in3,$inout3
1611	movdqu	$inout2,0x20($out)
1612	pxor	$in0,$inout4
1613	movdqu	$inout3,0x30($out)
1614	movdqu	$inout4,0x40($out)
1615	cmp	\$6,$len
1616	jb	.Lctr32_done			# $len was 5, stop store
1617
1618	movups	0x50($inp),$in1
1619	xorps	$in1,$inout5
1620	movups	$inout5,0x50($out)
1621	je	.Lctr32_done			# $len was 6, stop store
1622
1623	movups	0x60($inp),$in2
1624	xorps	$in2,$inout6
1625	movups	$inout6,0x60($out)
1626	jmp	.Lctr32_done			# $len was 7, stop store
1627
1628.align	32
1629.Lctr32_loop4:
1630	aesenc		$rndkey1,$inout0
1631	lea		16($key),$key
1632	dec		$rounds
1633	aesenc		$rndkey1,$inout1
1634	aesenc		$rndkey1,$inout2
1635	aesenc		$rndkey1,$inout3
1636	$movkey		($key),$rndkey1
1637	jnz		.Lctr32_loop4
1638	aesenclast	$rndkey1,$inout0
1639	aesenclast	$rndkey1,$inout1
1640	 movups		($inp),$in0		# load input
1641	 movups		0x10($inp),$in1
1642	aesenclast	$rndkey1,$inout2
1643	aesenclast	$rndkey1,$inout3
1644	 movups		0x20($inp),$in2
1645	 movups		0x30($inp),$in3
1646
1647	xorps	$in0,$inout0
1648	movups	$inout0,($out)			# store output
1649	xorps	$in1,$inout1
1650	movups	$inout1,0x10($out)
1651	pxor	$in2,$inout2
1652	movdqu	$inout2,0x20($out)
1653	pxor	$in3,$inout3
1654	movdqu	$inout3,0x30($out)
1655	jmp	.Lctr32_done			# $len was 4, stop store
1656
1657.align	32
1658.Lctr32_loop3:
1659	aesenc		$rndkey1,$inout0
1660	lea		16($key),$key
1661	dec		$rounds
1662	aesenc		$rndkey1,$inout1
1663	aesenc		$rndkey1,$inout2
1664	$movkey		($key),$rndkey1
1665	jnz		.Lctr32_loop3
1666	aesenclast	$rndkey1,$inout0
1667	aesenclast	$rndkey1,$inout1
1668	aesenclast	$rndkey1,$inout2
1669
1670	movups	($inp),$in0			# load input
1671	xorps	$in0,$inout0
1672	movups	$inout0,($out)			# store output
1673	cmp	\$2,$len
1674	jb	.Lctr32_done			# $len was 1, stop store
1675
1676	movups	0x10($inp),$in1
1677	xorps	$in1,$inout1
1678	movups	$inout1,0x10($out)
1679	je	.Lctr32_done			# $len was 2, stop store
1680
1681	movups	0x20($inp),$in2
1682	xorps	$in2,$inout2
1683	movups	$inout2,0x20($out)		# $len was 3, stop store
1684
1685.Lctr32_done:
1686	xorps	%xmm0,%xmm0			# clear register bank
1687	xor	$key0,$key0
1688	pxor	%xmm1,%xmm1
1689	pxor	%xmm2,%xmm2
1690	pxor	%xmm3,%xmm3
1691	pxor	%xmm4,%xmm4
1692	pxor	%xmm5,%xmm5
1693___
# aesni_ctr32_encrypt_blocks, part 4: scrubbing and epilogue.
#
# Non-Win64: zero %xmm6-%xmm15 and overwrite the 0x80 bytes of counter
# blocks at (%rsp) with %xmm0, which was zeroed at .Lctr32_done.
1694$code.=<<___ if (!$win64);
1695	pxor	%xmm6,%xmm6
1696	pxor	%xmm7,%xmm7
1697	movaps	%xmm0,0x00(%rsp)		# clear stack
1698	pxor	%xmm8,%xmm8
1699	movaps	%xmm0,0x10(%rsp)
1700	pxor	%xmm9,%xmm9
1701	movaps	%xmm0,0x20(%rsp)
1702	pxor	%xmm10,%xmm10
1703	movaps	%xmm0,0x30(%rsp)
1704	pxor	%xmm11,%xmm11
1705	movaps	%xmm0,0x40(%rsp)
1706	pxor	%xmm12,%xmm12
1707	movaps	%xmm0,0x50(%rsp)
1708	pxor	%xmm13,%xmm13
1709	movaps	%xmm0,0x60(%rsp)
1710	pxor	%xmm14,%xmm14
1711	movaps	%xmm0,0x70(%rsp)
1712	pxor	%xmm15,%xmm15
1713___
# Win64: reload %xmm6-%xmm15 from their save slots below the frame pointer
# $key_, overwriting each slot with %xmm0 right after it is read, then wipe
# the 0x80 bytes of counter blocks at (%rsp) the same way.
1714$code.=<<___ if ($win64);
1715	movaps	-0xa8($key_),%xmm6
1716	movaps	%xmm0,-0xa8($key_)		# clear stack
1717	movaps	-0x98($key_),%xmm7
1718	movaps	%xmm0,-0x98($key_)
1719	movaps	-0x88($key_),%xmm8
1720	movaps	%xmm0,-0x88($key_)
1721	movaps	-0x78($key_),%xmm9
1722	movaps	%xmm0,-0x78($key_)
1723	movaps	-0x68($key_),%xmm10
1724	movaps	%xmm0,-0x68($key_)
1725	movaps	-0x58($key_),%xmm11
1726	movaps	%xmm0,-0x58($key_)
1727	movaps	-0x48($key_),%xmm12
1728	movaps	%xmm0,-0x48($key_)
1729	movaps	-0x38($key_),%xmm13
1730	movaps	%xmm0,-0x38($key_)
1731	movaps	-0x28($key_),%xmm14
1732	movaps	%xmm0,-0x28($key_)
1733	movaps	-0x18($key_),%xmm15
1734	movaps	%xmm0,-0x18($key_)
1735	movaps	%xmm0,0x00(%rsp)
1736	movaps	%xmm0,0x10(%rsp)
1737	movaps	%xmm0,0x20(%rsp)
1738	movaps	%xmm0,0x30(%rsp)
1739	movaps	%xmm0,0x40(%rsp)
1740	movaps	%xmm0,0x50(%rsp)
1741	movaps	%xmm0,0x60(%rsp)
1742	movaps	%xmm0,0x70(%rsp)
1743___
# Common exit: reload %rbp from just below the frame pointer and restore
# %rsp from $key_.  .Lctr32_epilogue is also the target of the single-block
# fast path, which never set up a frame.
1744$code.=<<___;
1745	mov	-8($key_),%rbp
1746.cfi_restore	%rbp
1747	lea	($key_),%rsp
1748.cfi_def_cfa_register	%rsp
1749.Lctr32_epilogue:
1750	ret
1751.cfi_endproc
1752.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1753___
1754}
1755
1756######################################################################
1757# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1758#	const AES_KEY *key1, const AES_KEY *key2,
1759#	const unsigned char iv[16]);
1760#
1761{
1762my @tweak=map("%xmm$_",(10..15));
1763my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1764my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1765my $frame_size = 0x70 + ($win64?160:0);
1766my $key_ = "%rbp";	# override so that we can use %r11 as FP
1767
1768$code.=<<___;
1769.globl	aesni_xts_encrypt
1770.type	aesni_xts_encrypt,\@function,6
1771.align	16
1772aesni_xts_encrypt:
1773.cfi_startproc
1774	lea	(%rsp),%r11			# frame pointer
1775.cfi_def_cfa_register	%r11
1776	push	%rbp
1777.cfi_push	%rbp
1778	sub	\$$frame_size,%rsp
1779	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1780___
1781$code.=<<___ if ($win64);
1782	movaps	%xmm6,-0xa8(%r11)		# offload everything
1783	movaps	%xmm7,-0x98(%r11)
1784	movaps	%xmm8,-0x88(%r11)
1785	movaps	%xmm9,-0x78(%r11)
1786	movaps	%xmm10,-0x68(%r11)
1787	movaps	%xmm11,-0x58(%r11)
1788	movaps	%xmm12,-0x48(%r11)
1789	movaps	%xmm13,-0x38(%r11)
1790	movaps	%xmm14,-0x28(%r11)
1791	movaps	%xmm15,-0x18(%r11)
1792.Lxts_enc_body:
1793___
1794$code.=<<___;
1795	movups	($ivp),$inout0			# load clear-text tweak
1796	mov	240(%r8),$rounds		# key2->rounds
1797	mov	240($key),$rnds_		# key1->rounds
1798___
1799	# generate the tweak
1800	&aesni_generate1("enc",$key2,$rounds,$inout0);
1801$code.=<<___;
1802	$movkey	($key),$rndkey0			# zero round key
1803	mov	$key,$key_			# backup $key
1804	mov	$rnds_,$rounds			# backup $rounds
1805	shl	\$4,$rnds_
1806	mov	$len,$len_			# backup $len
1807	and	\$-16,$len
1808
1809	$movkey	16($key,$rnds_),$rndkey1	# last round key
1810
1811	movdqa	.Lxts_magic(%rip),$twmask
1812	movdqa	$inout0,@tweak[5]
1813	pshufd	\$0x5f,$inout0,$twres
1814	pxor	$rndkey0,$rndkey1
1815___
1816    # alternative tweak calculation algorithm is based on suggestions
1817    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1818    # and should help in the future...
1819    for ($i=0;$i<4;$i++) {
1820    $code.=<<___;
1821	movdqa	$twres,$twtmp
1822	paddd	$twres,$twres
1823	movdqa	@tweak[5],@tweak[$i]
1824	psrad	\$31,$twtmp			# broadcast upper bits
1825	paddq	@tweak[5],@tweak[5]
1826	pand	$twmask,$twtmp
1827	pxor	$rndkey0,@tweak[$i]
1828	pxor	$twtmp,@tweak[5]
1829___
1830    }
1831$code.=<<___;
1832	movdqa	@tweak[5],@tweak[4]
1833	psrad	\$31,$twres
1834	paddq	@tweak[5],@tweak[5]
1835	pand	$twmask,$twres
1836	pxor	$rndkey0,@tweak[4]
1837	pxor	$twres,@tweak[5]
1838	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1839
1840	sub	\$16*6,$len
1841	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1842
1843	mov	\$16+96,$rounds
1844	lea	32($key_,$rnds_),$key		# end of key schedule
1845	sub	%r10,%rax			# twisted $rounds
1846	$movkey	16($key_),$rndkey1
1847	mov	%rax,%r10			# backup twisted $rounds
1848	lea	.Lxts_magic(%rip),%r8
1849	jmp	.Lxts_enc_grandloop
1850
1851.align	32
1852.Lxts_enc_grandloop:
1853	movdqu	`16*0`($inp),$inout0		# load input
1854	movdqa	$rndkey0,$twmask
1855	movdqu	`16*1`($inp),$inout1
1856	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1857	movdqu	`16*2`($inp),$inout2
1858	pxor	@tweak[1],$inout1
1859	 aesenc		$rndkey1,$inout0
1860	movdqu	`16*3`($inp),$inout3
1861	pxor	@tweak[2],$inout2
1862	 aesenc		$rndkey1,$inout1
1863	movdqu	`16*4`($inp),$inout4
1864	pxor	@tweak[3],$inout3
1865	 aesenc		$rndkey1,$inout2
1866	movdqu	`16*5`($inp),$inout5
1867	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1868	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1869	pxor	@tweak[4],$inout4
1870	 aesenc		$rndkey1,$inout3
1871	$movkey	32($key_),$rndkey0
1872	lea	`16*6`($inp),$inp
1873	pxor	$twmask,$inout5
1874
1875	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1876	aesenc		$rndkey1,$inout4
1877	 pxor	$twres,@tweak[1]
1878	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1879	aesenc		$rndkey1,$inout5
1880	$movkey		48($key_),$rndkey1
1881	 pxor	$twres,@tweak[2]
1882
1883	aesenc		$rndkey0,$inout0
1884	 pxor	$twres,@tweak[3]
1885	 movdqa	@tweak[1],`16*1`(%rsp)
1886	aesenc		$rndkey0,$inout1
1887	 pxor	$twres,@tweak[4]
1888	 movdqa	@tweak[2],`16*2`(%rsp)
1889	aesenc		$rndkey0,$inout2
1890	aesenc		$rndkey0,$inout3
1891	 pxor	$twres,$twmask
1892	 movdqa	@tweak[4],`16*4`(%rsp)
1893	aesenc		$rndkey0,$inout4
1894	aesenc		$rndkey0,$inout5
1895	$movkey		64($key_),$rndkey0
1896	 movdqa	$twmask,`16*5`(%rsp)
1897	pshufd	\$0x5f,@tweak[5],$twres
1898	jmp	.Lxts_enc_loop6
1899.align	32
1900.Lxts_enc_loop6:
1901	aesenc		$rndkey1,$inout0
1902	aesenc		$rndkey1,$inout1
1903	aesenc		$rndkey1,$inout2
1904	aesenc		$rndkey1,$inout3
1905	aesenc		$rndkey1,$inout4
1906	aesenc		$rndkey1,$inout5
1907	$movkey		-64($key,%rax),$rndkey1
1908	add		\$32,%rax
1909
1910	aesenc		$rndkey0,$inout0
1911	aesenc		$rndkey0,$inout1
1912	aesenc		$rndkey0,$inout2
1913	aesenc		$rndkey0,$inout3
1914	aesenc		$rndkey0,$inout4
1915	aesenc		$rndkey0,$inout5
1916	$movkey		-80($key,%rax),$rndkey0
1917	jnz		.Lxts_enc_loop6
1918
1919	movdqa	(%r8),$twmask			# start calculating next tweak
1920	movdqa	$twres,$twtmp
1921	paddd	$twres,$twres
1922	 aesenc		$rndkey1,$inout0
1923	paddq	@tweak[5],@tweak[5]
1924	psrad	\$31,$twtmp
1925	 aesenc		$rndkey1,$inout1
1926	pand	$twmask,$twtmp
1927	$movkey	($key_),@tweak[0]		# load round[0]
1928	 aesenc		$rndkey1,$inout2
1929	 aesenc		$rndkey1,$inout3
1930	 aesenc		$rndkey1,$inout4
1931	pxor	$twtmp,@tweak[5]
1932	movaps	@tweak[0],@tweak[1]		# copy round[0]
1933	 aesenc		$rndkey1,$inout5
1934	 $movkey	-64($key),$rndkey1
1935
1936	movdqa	$twres,$twtmp
1937	 aesenc		$rndkey0,$inout0
1938	paddd	$twres,$twres
1939	pxor	@tweak[5],@tweak[0]
1940	 aesenc		$rndkey0,$inout1
1941	psrad	\$31,$twtmp
1942	paddq	@tweak[5],@tweak[5]
1943	 aesenc		$rndkey0,$inout2
1944	 aesenc		$rndkey0,$inout3
1945	pand	$twmask,$twtmp
1946	movaps	@tweak[1],@tweak[2]
1947	 aesenc		$rndkey0,$inout4
1948	pxor	$twtmp,@tweak[5]
1949	movdqa	$twres,$twtmp
1950	 aesenc		$rndkey0,$inout5
1951	 $movkey	-48($key),$rndkey0
1952
1953	paddd	$twres,$twres
1954	 aesenc		$rndkey1,$inout0
1955	pxor	@tweak[5],@tweak[1]
1956	psrad	\$31,$twtmp
1957	 aesenc		$rndkey1,$inout1
1958	paddq	@tweak[5],@tweak[5]
1959	pand	$twmask,$twtmp
1960	 aesenc		$rndkey1,$inout2
1961	 aesenc		$rndkey1,$inout3
1962	 movdqa	@tweak[3],`16*3`(%rsp)
1963	pxor	$twtmp,@tweak[5]
1964	 aesenc		$rndkey1,$inout4
1965	movaps	@tweak[2],@tweak[3]
1966	movdqa	$twres,$twtmp
1967	 aesenc		$rndkey1,$inout5
1968	 $movkey	-32($key),$rndkey1
1969
1970	paddd	$twres,$twres
1971	 aesenc		$rndkey0,$inout0
1972	pxor	@tweak[5],@tweak[2]
1973	psrad	\$31,$twtmp
1974	 aesenc		$rndkey0,$inout1
1975	paddq	@tweak[5],@tweak[5]
1976	pand	$twmask,$twtmp
1977	 aesenc		$rndkey0,$inout2
1978	 aesenc		$rndkey0,$inout3
1979	 aesenc		$rndkey0,$inout4
1980	pxor	$twtmp,@tweak[5]
1981	movaps	@tweak[3],@tweak[4]
1982	 aesenc		$rndkey0,$inout5
1983
1984	movdqa	$twres,$rndkey0
1985	paddd	$twres,$twres
1986	 aesenc		$rndkey1,$inout0
1987	pxor	@tweak[5],@tweak[3]
1988	psrad	\$31,$rndkey0
1989	 aesenc		$rndkey1,$inout1
1990	paddq	@tweak[5],@tweak[5]
1991	pand	$twmask,$rndkey0
1992	 aesenc		$rndkey1,$inout2
1993	 aesenc		$rndkey1,$inout3
1994	pxor	$rndkey0,@tweak[5]
1995	$movkey		($key_),$rndkey0
1996	 aesenc		$rndkey1,$inout4
1997	 aesenc		$rndkey1,$inout5
1998	$movkey		16($key_),$rndkey1
1999
2000	pxor	@tweak[5],@tweak[4]
2001	 aesenclast	`16*0`(%rsp),$inout0
2002	psrad	\$31,$twres
2003	paddq	@tweak[5],@tweak[5]
2004	 aesenclast	`16*1`(%rsp),$inout1
2005	 aesenclast	`16*2`(%rsp),$inout2
2006	pand	$twmask,$twres
2007	mov	%r10,%rax			# restore $rounds
2008	 aesenclast	`16*3`(%rsp),$inout3
2009	 aesenclast	`16*4`(%rsp),$inout4
2010	 aesenclast	`16*5`(%rsp),$inout5
2011	pxor	$twres,@tweak[5]
2012
2013	lea	`16*6`($out),$out		# $out+=6*16
2014	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2015	movups	$inout1,`-16*5`($out)
2016	movups	$inout2,`-16*4`($out)
2017	movups	$inout3,`-16*3`($out)
2018	movups	$inout4,`-16*2`($out)
2019	movups	$inout5,`-16*1`($out)
2020	sub	\$16*6,$len
2021	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
2022
2023	mov	\$16+96,$rounds
2024	sub	$rnds_,$rounds
2025	mov	$key_,$key			# restore $key
2026	shr	\$4,$rounds			# restore original value
2027
2028.Lxts_enc_short:
2029	# at the point @tweak[0..5] are populated with tweak values
2030	mov	$rounds,$rnds_			# backup $rounds
2031	pxor	$rndkey0,@tweak[0]
2032	add	\$16*6,$len			# restore real remaining $len
2033	jz	.Lxts_enc_done			# done if ($len==0)
2034
2035	pxor	$rndkey0,@tweak[1]
2036	cmp	\$0x20,$len
2037	jb	.Lxts_enc_one			# $len is 1*16
2038	pxor	$rndkey0,@tweak[2]
2039	je	.Lxts_enc_two			# $len is 2*16
2040
2041	pxor	$rndkey0,@tweak[3]
2042	cmp	\$0x40,$len
2043	jb	.Lxts_enc_three			# $len is 3*16
2044	pxor	$rndkey0,@tweak[4]
2045	je	.Lxts_enc_four			# $len is 4*16
2046
2047	movdqu	($inp),$inout0			# $len is 5*16
2048	movdqu	16*1($inp),$inout1
2049	movdqu	16*2($inp),$inout2
2050	pxor	@tweak[0],$inout0
2051	movdqu	16*3($inp),$inout3
2052	pxor	@tweak[1],$inout1
2053	movdqu	16*4($inp),$inout4
2054	lea	16*5($inp),$inp			# $inp+=5*16
2055	pxor	@tweak[2],$inout2
2056	pxor	@tweak[3],$inout3
2057	pxor	@tweak[4],$inout4
2058	pxor	$inout5,$inout5
2059
2060	call	_aesni_encrypt6
2061
2062	xorps	@tweak[0],$inout0
2063	movdqa	@tweak[5],@tweak[0]
2064	xorps	@tweak[1],$inout1
2065	xorps	@tweak[2],$inout2
2066	movdqu	$inout0,($out)			# store 5 output blocks
2067	xorps	@tweak[3],$inout3
2068	movdqu	$inout1,16*1($out)
2069	xorps	@tweak[4],$inout4
2070	movdqu	$inout2,16*2($out)
2071	movdqu	$inout3,16*3($out)
2072	movdqu	$inout4,16*4($out)
2073	lea	16*5($out),$out			# $out+=5*16
2074	jmp	.Lxts_enc_done
2075
2076.align	16
2077.Lxts_enc_one:
2078	movups	($inp),$inout0
2079	lea	16*1($inp),$inp			# inp+=1*16
2080	xorps	@tweak[0],$inout0
2081___
2082	&aesni_generate1("enc",$key,$rounds);
2083$code.=<<___;
2084	xorps	@tweak[0],$inout0
2085	movdqa	@tweak[1],@tweak[0]
2086	movups	$inout0,($out)			# store one output block
2087	lea	16*1($out),$out			# $out+=1*16
2088	jmp	.Lxts_enc_done
2089
2090.align	16
2091.Lxts_enc_two:
2092	movups	($inp),$inout0
2093	movups	16($inp),$inout1
2094	lea	32($inp),$inp			# $inp+=2*16
2095	xorps	@tweak[0],$inout0
2096	xorps	@tweak[1],$inout1
2097
2098	call	_aesni_encrypt2
2099
2100	xorps	@tweak[0],$inout0
2101	movdqa	@tweak[2],@tweak[0]
2102	xorps	@tweak[1],$inout1
2103	movups	$inout0,($out)			# store 2 output blocks
2104	movups	$inout1,16*1($out)
2105	lea	16*2($out),$out			# $out+=2*16
2106	jmp	.Lxts_enc_done
2107
2108.align	16
2109.Lxts_enc_three:
2110	movups	($inp),$inout0
2111	movups	16*1($inp),$inout1
2112	movups	16*2($inp),$inout2
2113	lea	16*3($inp),$inp			# $inp+=3*16
2114	xorps	@tweak[0],$inout0
2115	xorps	@tweak[1],$inout1
2116	xorps	@tweak[2],$inout2
2117
2118	call	_aesni_encrypt3
2119
2120	xorps	@tweak[0],$inout0
2121	movdqa	@tweak[3],@tweak[0]
2122	xorps	@tweak[1],$inout1
2123	xorps	@tweak[2],$inout2
2124	movups	$inout0,($out)			# store 3 output blocks
2125	movups	$inout1,16*1($out)
2126	movups	$inout2,16*2($out)
2127	lea	16*3($out),$out			# $out+=3*16
2128	jmp	.Lxts_enc_done
2129
2130.align	16
2131.Lxts_enc_four:
2132	movups	($inp),$inout0
2133	movups	16*1($inp),$inout1
2134	movups	16*2($inp),$inout2
2135	xorps	@tweak[0],$inout0
2136	movups	16*3($inp),$inout3
2137	lea	16*4($inp),$inp			# $inp+=4*16
2138	xorps	@tweak[1],$inout1
2139	xorps	@tweak[2],$inout2
2140	xorps	@tweak[3],$inout3
2141
2142	call	_aesni_encrypt4
2143
2144	pxor	@tweak[0],$inout0
2145	movdqa	@tweak[4],@tweak[0]
2146	pxor	@tweak[1],$inout1
2147	pxor	@tweak[2],$inout2
2148	movdqu	$inout0,($out)			# store 4 output blocks
2149	pxor	@tweak[3],$inout3
2150	movdqu	$inout1,16*1($out)
2151	movdqu	$inout2,16*2($out)
2152	movdqu	$inout3,16*3($out)
2153	lea	16*4($out),$out			# $out+=4*16
2154	jmp	.Lxts_enc_done
2155
2156.align	16
2157.Lxts_enc_done:
2158	and	\$15,$len_			# see if $len%16 is 0
2159	jz	.Lxts_enc_ret
2160	mov	$len_,$len
2161
2162.Lxts_enc_steal:
2163	movzb	($inp),%eax			# borrow $rounds ...
2164	movzb	-16($out),%ecx			# ... and $key
2165	lea	1($inp),$inp
2166	mov	%al,-16($out)
2167	mov	%cl,0($out)
2168	lea	1($out),$out
2169	sub	\$1,$len
2170	jnz	.Lxts_enc_steal
2171
2172	sub	$len_,$out			# rewind $out
2173	mov	$key_,$key			# restore $key
2174	mov	$rnds_,$rounds			# restore $rounds
2175
2176	movups	-16($out),$inout0
2177	xorps	@tweak[0],$inout0
2178___
2179	&aesni_generate1("enc",$key,$rounds);
2180$code.=<<___;
2181	xorps	@tweak[0],$inout0
2182	movups	$inout0,-16($out)
2183
2184.Lxts_enc_ret:
2185	xorps	%xmm0,%xmm0			# clear register bank
2186	pxor	%xmm1,%xmm1
2187	pxor	%xmm2,%xmm2
2188	pxor	%xmm3,%xmm3
2189	pxor	%xmm4,%xmm4
2190	pxor	%xmm5,%xmm5
2191___
2192$code.=<<___ if (!$win64);
2193	pxor	%xmm6,%xmm6
2194	pxor	%xmm7,%xmm7
2195	movaps	%xmm0,0x00(%rsp)		# clear stack
2196	pxor	%xmm8,%xmm8
2197	movaps	%xmm0,0x10(%rsp)
2198	pxor	%xmm9,%xmm9
2199	movaps	%xmm0,0x20(%rsp)
2200	pxor	%xmm10,%xmm10
2201	movaps	%xmm0,0x30(%rsp)
2202	pxor	%xmm11,%xmm11
2203	movaps	%xmm0,0x40(%rsp)
2204	pxor	%xmm12,%xmm12
2205	movaps	%xmm0,0x50(%rsp)
2206	pxor	%xmm13,%xmm13
2207	movaps	%xmm0,0x60(%rsp)
2208	pxor	%xmm14,%xmm14
2209	pxor	%xmm15,%xmm15
2210___
2211$code.=<<___ if ($win64);
2212	movaps	-0xa8(%r11),%xmm6
2213	movaps	%xmm0,-0xa8(%r11)		# clear stack
2214	movaps	-0x98(%r11),%xmm7
2215	movaps	%xmm0,-0x98(%r11)
2216	movaps	-0x88(%r11),%xmm8
2217	movaps	%xmm0,-0x88(%r11)
2218	movaps	-0x78(%r11),%xmm9
2219	movaps	%xmm0,-0x78(%r11)
2220	movaps	-0x68(%r11),%xmm10
2221	movaps	%xmm0,-0x68(%r11)
2222	movaps	-0x58(%r11),%xmm11
2223	movaps	%xmm0,-0x58(%r11)
2224	movaps	-0x48(%r11),%xmm12
2225	movaps	%xmm0,-0x48(%r11)
2226	movaps	-0x38(%r11),%xmm13
2227	movaps	%xmm0,-0x38(%r11)
2228	movaps	-0x28(%r11),%xmm14
2229	movaps	%xmm0,-0x28(%r11)
2230	movaps	-0x18(%r11),%xmm15
2231	movaps	%xmm0,-0x18(%r11)
2232	movaps	%xmm0,0x00(%rsp)
2233	movaps	%xmm0,0x10(%rsp)
2234	movaps	%xmm0,0x20(%rsp)
2235	movaps	%xmm0,0x30(%rsp)
2236	movaps	%xmm0,0x40(%rsp)
2237	movaps	%xmm0,0x50(%rsp)
2238	movaps	%xmm0,0x60(%rsp)
2239___
2240$code.=<<___;
2241	mov	-8(%r11),%rbp
2242.cfi_restore	%rbp
2243	lea	(%r11),%rsp
2244.cfi_def_cfa_register	%rsp
2245.Lxts_enc_epilogue:
2246	ret
2247.cfi_endproc
2248.size	aesni_xts_encrypt,.-aesni_xts_encrypt
2249___
2250
# aesni_xts_decrypt: XTS-mode decryption routine appended to $code.
#
# Outline of the code emitted below:
#  - the 16 bytes at $ivp are passed through aesni_generate1("enc",...) with
#    $key2 to obtain the initial tweak (aesni_generate1 is defined outside
#    this view);
#  - six consecutive tweaks are kept in @tweak[0..5]; each step doubles the
#    previous tweak (paddq) and folds the shifted-out bits back in through
#    the .Lxts_magic mask (that constant is defined outside this view,
#    presumably the XTS reduction constant -- confirm);
#  - .Lxts_dec_grandloop decrypts six blocks per iteration and interleaves
#    the computation of the next six tweaks with the aesdec rounds;
#  - .Lxts_dec_short handles the remaining 0..5 whole blocks;
#  - if the length is not a multiple of 16, one whole block is held back at
#    entry and finished together with the partial tail by ciphertext
#    stealing (.Lxts_dec_done2 / .Lxts_dec_steal);
#  - .Lxts_dec_ret wipes the xmm registers and the stack scratch area,
#    restores %rbp (and %xmm6-%xmm15 on Win64) and returns.
2251$code.=<<___;
2252.globl	aesni_xts_decrypt
2253.type	aesni_xts_decrypt,\@function,6
2254.align	16
2255aesni_xts_decrypt:
2256.cfi_startproc
2257	lea	(%rsp),%r11			# frame pointer
2258.cfi_def_cfa_register	%r11
2259	push	%rbp
2260.cfi_push	%rbp
2261	sub	\$$frame_size,%rsp
2262	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2263___
# Win64 only: spill %xmm6-%xmm15 just below the entry stack pointer
# (addressed through %r11); the epilogue reloads them from the same slots.
2264$code.=<<___ if ($win64);
2265	movaps	%xmm6,-0xa8(%r11)		# offload everything
2266	movaps	%xmm7,-0x98(%r11)
2267	movaps	%xmm8,-0x88(%r11)
2268	movaps	%xmm9,-0x78(%r11)
2269	movaps	%xmm10,-0x68(%r11)
2270	movaps	%xmm11,-0x58(%r11)
2271	movaps	%xmm12,-0x48(%r11)
2272	movaps	%xmm13,-0x38(%r11)
2273	movaps	%xmm14,-0x28(%r11)
2274	movaps	%xmm15,-0x18(%r11)
2275.Lxts_dec_body:
2276___
# Fetch the 16-byte tweak input and the round counts kept at offset 240 of
# each key structure.
2277$code.=<<___;
2278	movups	($ivp),$inout0			# load clear-text tweak
2279	mov	240($key2),$rounds		# key2->rounds
2280	mov	240($key),$rnds_		# key1->rounds
2281___
2282	# generate the tweak
2283	&aesni_generate1("enc",$key2,$rounds,$inout0);
2284$code.=<<___;
	# A trailing partial block means the last whole block must be held
	# back for the stealing step, so it is dropped from the bulk length.
2285	xor	%eax,%eax			# if ($len%16) len-=16;
2286	test	\$15,$len
2287	setnz	%al
2288	shl	\$4,%rax
2289	sub	%rax,$len
2290
2291	$movkey	($key),$rndkey0			# zero round key
2292	mov	$key,$key_			# backup $key
2293	mov	$rnds_,$rounds			# backup $rounds
2294	shl	\$4,$rnds_			# rounds*16: byte offset used to reach the last round key
2295	mov	$len,$len_			# backup $len
2296	and	\$-16,$len			# whole blocks only
2297
2298	$movkey	16($key,$rnds_),$rndkey1	# last round key
2299
2300	movdqa	.Lxts_magic(%rip),$twmask
2301	movdqa	$inout0,@tweak[5]
2302	pshufd	\$0x5f,$inout0,$twres		# replicate dwords 3 and 1 (top of each 64-bit half)
2303	pxor	$rndkey0,$rndkey1		# round[0]^round[last]
2304___
# Emit four copies of the tweak step: each copies the current tweak into
# @tweak[$i], pre-xors that copy with round[0], and advances @tweak[5] to
# the next tweak (double it, then fold the carried-out bits back in).
2305    for ($i=0;$i<4;$i++) {
2306    $code.=<<___;
2307	movdqa	$twres,$twtmp
2308	paddd	$twres,$twres
2309	movdqa	@tweak[5],@tweak[$i]
2310	psrad	\$31,$twtmp			# broadcast upper bits
2311	paddq	@tweak[5],@tweak[5]
2312	pand	$twmask,$twtmp
2313	pxor	$rndkey0,@tweak[$i]
2314	pxor	$twtmp,@tweak[5]
2315___
2316    }
# Fifth tweak step (the carry source register is consumed directly, no copy
# is needed), then the bulk loop, the short tails and the start of the
# one-block tail.
2317$code.=<<___;
2318	movdqa	@tweak[5],@tweak[4]
2319	psrad	\$31,$twres
2320	paddq	@tweak[5],@tweak[5]
2321	pand	$twmask,$twres
2322	pxor	$rndkey0,@tweak[4]
2323	pxor	$twres,@tweak[5]
2324	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2325
2326	sub	\$16*6,$len
2327	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2328
2329	mov	\$16+96,$rounds
2330	lea	32($key_,$rnds_),$key		# end of key schedule
2331	sub	%r10,%rax			# twisted $rounds
2332	$movkey	16($key_),$rndkey1
2333	mov	%rax,%r10			# backup twisted $rounds
2334	lea	.Lxts_magic(%rip),%r8
2335	jmp	.Lxts_dec_grandloop
2336
2337.align	32
2338.Lxts_dec_grandloop:
2339	movdqu	`16*0`($inp),$inout0		# load input
2340	movdqa	$rndkey0,$twmask
2341	movdqu	`16*1`($inp),$inout1
2342	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
2343	movdqu	`16*2`($inp),$inout2
2344	pxor	@tweak[1],$inout1
2345	 aesdec		$rndkey1,$inout0
2346	movdqu	`16*3`($inp),$inout3
2347	pxor	@tweak[2],$inout2
2348	 aesdec		$rndkey1,$inout1
2349	movdqu	`16*4`($inp),$inout4
2350	pxor	@tweak[3],$inout3
2351	 aesdec		$rndkey1,$inout2
2352	movdqu	`16*5`($inp),$inout5
2353	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2354	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2355	pxor	@tweak[4],$inout4
2356	 aesdec		$rndkey1,$inout3
2357	$movkey	32($key_),$rndkey0
2358	lea	`16*6`($inp),$inp
2359	pxor	$twmask,$inout5
2360
2361	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2362	aesdec		$rndkey1,$inout4
2363	 pxor	$twres,@tweak[1]
2364	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
2365	aesdec		$rndkey1,$inout5
2366	$movkey		48($key_),$rndkey1
2367	 pxor	$twres,@tweak[2]
2368
2369	aesdec		$rndkey0,$inout0
2370	 pxor	$twres,@tweak[3]
2371	 movdqa	@tweak[1],`16*1`(%rsp)
2372	aesdec		$rndkey0,$inout1
2373	 pxor	$twres,@tweak[4]
2374	 movdqa	@tweak[2],`16*2`(%rsp)
2375	aesdec		$rndkey0,$inout2
2376	aesdec		$rndkey0,$inout3
2377	 pxor	$twres,$twmask
2378	 movdqa	@tweak[4],`16*4`(%rsp)
2379	aesdec		$rndkey0,$inout4
2380	aesdec		$rndkey0,$inout5
2381	$movkey		64($key_),$rndkey0
2382	 movdqa	$twmask,`16*5`(%rsp)
2383	pshufd	\$0x5f,@tweak[5],$twres
2384	jmp	.Lxts_dec_loop6
2385.align	32
2386.Lxts_dec_loop6:
	# Two rounds per iteration; the negative twisted counter in %rax
	# climbs by 32 towards zero and doubles as the key schedule index.
2387	aesdec		$rndkey1,$inout0
2388	aesdec		$rndkey1,$inout1
2389	aesdec		$rndkey1,$inout2
2390	aesdec		$rndkey1,$inout3
2391	aesdec		$rndkey1,$inout4
2392	aesdec		$rndkey1,$inout5
2393	$movkey		-64($key,%rax),$rndkey1
2394	add		\$32,%rax
2395
2396	aesdec		$rndkey0,$inout0
2397	aesdec		$rndkey0,$inout1
2398	aesdec		$rndkey0,$inout2
2399	aesdec		$rndkey0,$inout3
2400	aesdec		$rndkey0,$inout4
2401	aesdec		$rndkey0,$inout5
2402	$movkey		-80($key,%rax),$rndkey0
2403	jnz		.Lxts_dec_loop6
2404
2405	movdqa	(%r8),$twmask			# start calculating next tweak
2406	movdqa	$twres,$twtmp
2407	paddd	$twres,$twres
2408	 aesdec		$rndkey1,$inout0
2409	paddq	@tweak[5],@tweak[5]
2410	psrad	\$31,$twtmp
2411	 aesdec		$rndkey1,$inout1
2412	pand	$twmask,$twtmp
2413	$movkey	($key_),@tweak[0]		# load round[0]
2414	 aesdec		$rndkey1,$inout2
2415	 aesdec		$rndkey1,$inout3
2416	 aesdec		$rndkey1,$inout4
2417	pxor	$twtmp,@tweak[5]
2418	movaps	@tweak[0],@tweak[1]		# copy round[0]
2419	 aesdec		$rndkey1,$inout5
2420	 $movkey	-64($key),$rndkey1
2421
2422	movdqa	$twres,$twtmp
2423	 aesdec		$rndkey0,$inout0
2424	paddd	$twres,$twres
2425	pxor	@tweak[5],@tweak[0]
2426	 aesdec		$rndkey0,$inout1
2427	psrad	\$31,$twtmp
2428	paddq	@tweak[5],@tweak[5]
2429	 aesdec		$rndkey0,$inout2
2430	 aesdec		$rndkey0,$inout3
2431	pand	$twmask,$twtmp
2432	movaps	@tweak[1],@tweak[2]
2433	 aesdec		$rndkey0,$inout4
2434	pxor	$twtmp,@tweak[5]
2435	movdqa	$twres,$twtmp
2436	 aesdec		$rndkey0,$inout5
2437	 $movkey	-48($key),$rndkey0
2438
2439	paddd	$twres,$twres
2440	 aesdec		$rndkey1,$inout0
2441	pxor	@tweak[5],@tweak[1]
2442	psrad	\$31,$twtmp
2443	 aesdec		$rndkey1,$inout1
2444	paddq	@tweak[5],@tweak[5]
2445	pand	$twmask,$twtmp
2446	 aesdec		$rndkey1,$inout2
2447	 aesdec		$rndkey1,$inout3
2448	 movdqa	@tweak[3],`16*3`(%rsp)		# late store of the old tweak[3]^round[last], just before tweak[3] is reused
2449	pxor	$twtmp,@tweak[5]
2450	 aesdec		$rndkey1,$inout4
2451	movaps	@tweak[2],@tweak[3]
2452	movdqa	$twres,$twtmp
2453	 aesdec		$rndkey1,$inout5
2454	 $movkey	-32($key),$rndkey1
2455
2456	paddd	$twres,$twres
2457	 aesdec		$rndkey0,$inout0
2458	pxor	@tweak[5],@tweak[2]
2459	psrad	\$31,$twtmp
2460	 aesdec		$rndkey0,$inout1
2461	paddq	@tweak[5],@tweak[5]
2462	pand	$twmask,$twtmp
2463	 aesdec		$rndkey0,$inout2
2464	 aesdec		$rndkey0,$inout3
2465	 aesdec		$rndkey0,$inout4
2466	pxor	$twtmp,@tweak[5]
2467	movaps	@tweak[3],@tweak[4]
2468	 aesdec		$rndkey0,$inout5
2469
2470	movdqa	$twres,$rndkey0
2471	paddd	$twres,$twres
2472	 aesdec		$rndkey1,$inout0
2473	pxor	@tweak[5],@tweak[3]
2474	psrad	\$31,$rndkey0
2475	 aesdec		$rndkey1,$inout1
2476	paddq	@tweak[5],@tweak[5]
2477	pand	$twmask,$rndkey0
2478	 aesdec		$rndkey1,$inout2
2479	 aesdec		$rndkey1,$inout3
2480	pxor	$rndkey0,@tweak[5]
2481	$movkey		($key_),$rndkey0
2482	 aesdec		$rndkey1,$inout4
2483	 aesdec		$rndkey1,$inout5
2484	$movkey		16($key_),$rndkey1
2485
2486	pxor	@tweak[5],@tweak[4]
	# Last round: the key operand is the tweak^round[last] value parked
	# on the stack, so the output tweak xor is folded into the final round.
2487	 aesdeclast	`16*0`(%rsp),$inout0
2488	psrad	\$31,$twres
2489	paddq	@tweak[5],@tweak[5]
2490	 aesdeclast	`16*1`(%rsp),$inout1
2491	 aesdeclast	`16*2`(%rsp),$inout2
2492	pand	$twmask,$twres
2493	mov	%r10,%rax			# restore $rounds
2494	 aesdeclast	`16*3`(%rsp),$inout3
2495	 aesdeclast	`16*4`(%rsp),$inout4
2496	 aesdeclast	`16*5`(%rsp),$inout5
2497	pxor	$twres,@tweak[5]
2498
2499	lea	`16*6`($out),$out		# $out+=6*16
2500	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2501	movups	$inout1,`-16*5`($out)
2502	movups	$inout2,`-16*4`($out)
2503	movups	$inout3,`-16*3`($out)
2504	movups	$inout4,`-16*2`($out)
2505	movups	$inout5,`-16*1`($out)
2506	sub	\$16*6,$len
2507	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2508
	# The backup register was overwritten with the twisted count before
	# the loop (mov %rax,%r10), so 16+96 minus it gives rounds*16 again.
2509	mov	\$16+96,$rounds
2510	sub	$rnds_,$rounds
2511	mov	$key_,$key			# restore $key
2512	shr	\$4,$rounds			# restore original value
2513
2514.Lxts_dec_short:
2515	# at this point @tweak[0..5] are populated with tweak values
	# tweak[0..4] carry a round[0] pre-xor; the xors with the zero round
	# key below strip it again from the tweaks that may be used here.
2516	mov	$rounds,$rnds_			# backup $rounds
2517	pxor	$rndkey0,@tweak[0]
2518	pxor	$rndkey0,@tweak[1]
2519	add	\$16*6,$len			# restore real remaining $len
2520	jz	.Lxts_dec_done			# done if ($len==0)
2521
2522	pxor	$rndkey0,@tweak[2]
2523	cmp	\$0x20,$len
2524	jb	.Lxts_dec_one			# $len is 1*16
2525	pxor	$rndkey0,@tweak[3]
2526	je	.Lxts_dec_two			# $len is 2*16
2527
2528	pxor	$rndkey0,@tweak[4]
2529	cmp	\$0x40,$len
2530	jb	.Lxts_dec_three			# $len is 3*16
2531	je	.Lxts_dec_four			# $len is 4*16
2532
2533	movdqu	($inp),$inout0			# $len is 5*16
2534	movdqu	16*1($inp),$inout1
2535	movdqu	16*2($inp),$inout2
2536	pxor	@tweak[0],$inout0
2537	movdqu	16*3($inp),$inout3
2538	pxor	@tweak[1],$inout1
2539	movdqu	16*4($inp),$inout4
2540	lea	16*5($inp),$inp			# $inp+=5*16
2541	pxor	@tweak[2],$inout2
2542	pxor	@tweak[3],$inout3
2543	pxor	@tweak[4],$inout4
2544
2545	call	_aesni_decrypt6
2546
	# The extra-indented instructions below prepare the carry mask for one
	# more tweak doubling; it is only consumed if a partial block follows.
2547	xorps	@tweak[0],$inout0
2548	xorps	@tweak[1],$inout1
2549	xorps	@tweak[2],$inout2
2550	movdqu	$inout0,($out)			# store 5 output blocks
2551	xorps	@tweak[3],$inout3
2552	movdqu	$inout1,16*1($out)
2553	xorps	@tweak[4],$inout4
2554	movdqu	$inout2,16*2($out)
2555	 pxor		$twtmp,$twtmp
2556	movdqu	$inout3,16*3($out)
2557	 pcmpgtd	@tweak[5],$twtmp
2558	movdqu	$inout4,16*4($out)
2559	lea	16*5($out),$out			# $out+=5*16
2560	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2561	and	\$15,$len_
2562	jz	.Lxts_dec_ret
2563
	# Partial block follows: tweak[0] takes the current tweak and
	# tweak[1] becomes the one after it, as .Lxts_dec_done2 expects.
2564	movdqa	@tweak[5],@tweak[0]
2565	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2566	pand	$twmask,@tweak[1]		# isolate carry and residue
2567	pxor	@tweak[5],@tweak[1]
2568	jmp	.Lxts_dec_done2
2569
2570.align	16
2571.Lxts_dec_one:
2572	movups	($inp),$inout0
2573	lea	16*1($inp),$inp			# $inp+=1*16
2574	xorps	@tweak[0],$inout0
2575___
2576	&aesni_generate1("dec",$key,$rounds);
# Remaining short tails (1..4 blocks). Each one moves the next two unused
# tweaks down into @tweak[0] and @tweak[1] for a possible stealing step.
2577$code.=<<___;
2578	xorps	@tweak[0],$inout0
2579	movdqa	@tweak[1],@tweak[0]
2580	movups	$inout0,($out)			# store one output block
2581	movdqa	@tweak[2],@tweak[1]
2582	lea	16*1($out),$out			# $out+=1*16
2583	jmp	.Lxts_dec_done
2584
2585.align	16
2586.Lxts_dec_two:
2587	movups	($inp),$inout0
2588	movups	16($inp),$inout1
2589	lea	32($inp),$inp			# $inp+=2*16
2590	xorps	@tweak[0],$inout0
2591	xorps	@tweak[1],$inout1
2592
2593	call	_aesni_decrypt2
2594
2595	xorps	@tweak[0],$inout0
2596	movdqa	@tweak[2],@tweak[0]
2597	xorps	@tweak[1],$inout1
2598	movdqa	@tweak[3],@tweak[1]
2599	movups	$inout0,($out)			# store 2 output blocks
2600	movups	$inout1,16*1($out)
2601	lea	16*2($out),$out			# $out+=2*16
2602	jmp	.Lxts_dec_done
2603
2604.align	16
2605.Lxts_dec_three:
2606	movups	($inp),$inout0
2607	movups	16*1($inp),$inout1
2608	movups	16*2($inp),$inout2
2609	lea	16*3($inp),$inp			# $inp+=3*16
2610	xorps	@tweak[0],$inout0
2611	xorps	@tweak[1],$inout1
2612	xorps	@tweak[2],$inout2
2613
2614	call	_aesni_decrypt3
2615
2616	xorps	@tweak[0],$inout0
2617	movdqa	@tweak[3],@tweak[0]
2618	xorps	@tweak[1],$inout1
2619	movdqa	@tweak[4],@tweak[1]
2620	xorps	@tweak[2],$inout2
2621	movups	$inout0,($out)			# store 3 output blocks
2622	movups	$inout1,16*1($out)
2623	movups	$inout2,16*2($out)
2624	lea	16*3($out),$out			# $out+=3*16
2625	jmp	.Lxts_dec_done
2626
2627.align	16
2628.Lxts_dec_four:
2629	movups	($inp),$inout0
2630	movups	16*1($inp),$inout1
2631	movups	16*2($inp),$inout2
2632	xorps	@tweak[0],$inout0
2633	movups	16*3($inp),$inout3
2634	lea	16*4($inp),$inp			# $inp+=4*16
2635	xorps	@tweak[1],$inout1
2636	xorps	@tweak[2],$inout2
2637	xorps	@tweak[3],$inout3
2638
2639	call	_aesni_decrypt4
2640
2641	pxor	@tweak[0],$inout0
2642	movdqa	@tweak[4],@tweak[0]
2643	pxor	@tweak[1],$inout1
2644	movdqa	@tweak[5],@tweak[1]
2645	pxor	@tweak[2],$inout2
2646	movdqu	$inout0,($out)			# store 4 output blocks
2647	pxor	@tweak[3],$inout3
2648	movdqu	$inout1,16*1($out)
2649	movdqu	$inout2,16*2($out)
2650	movdqu	$inout3,16*3($out)
2651	lea	16*4($out),$out			# $out+=4*16
2652	jmp	.Lxts_dec_done
2653
2654.align	16
2655.Lxts_dec_done:
2656	and	\$15,$len_			# see if $len%16 is 0
2657	jz	.Lxts_dec_ret
2658.Lxts_dec_done2:
2659	mov	$len_,$len
2660	mov	$key_,$key			# restore $key
2661	mov	$rnds_,$rounds			# restore $rounds
2662
	# Stealing step 1: the held-back whole block is decrypted with the
	# later of the two remaining tweaks (tweak[1]).
2663	movups	($inp),$inout0
2664	xorps	@tweak[1],$inout0
2665___
2666	&aesni_generate1("dec",$key,$rounds);
2667$code.=<<___;
2668	xorps	@tweak[1],$inout0
2669	movups	$inout0,($out)
2670
	# Stealing step 2: byte by byte, the partial input tail (from offset
	# 16) replaces the head of the block just written, and the displaced
	# bytes become the partial output tail.
2671.Lxts_dec_steal:
2672	movzb	16($inp),%eax			# borrow $rounds ...
2673	movzb	($out),%ecx			# ... and $key
2674	lea	1($inp),$inp
2675	mov	%al,($out)
2676	mov	%cl,16($out)
2677	lea	1($out),$out
2678	sub	\$1,$len
2679	jnz	.Lxts_dec_steal
2680
2681	sub	$len_,$out			# rewind $out
2682	mov	$key_,$key			# restore $key
2683	mov	$rnds_,$rounds			# restore $rounds
2684
	# Stealing step 3: the reassembled block is decrypted in place with
	# the earlier tweak (tweak[0]).
2685	movups	($out),$inout0
2686	xorps	@tweak[0],$inout0
2687___
2688	&aesni_generate1("dec",$key,$rounds);
2689$code.=<<___;
2690	xorps	@tweak[0],$inout0
2691	movups	$inout0,($out)
2692
2693.Lxts_dec_ret:
2694	xorps	%xmm0,%xmm0			# clear register bank
2695	pxor	%xmm1,%xmm1
2696	pxor	%xmm2,%xmm2
2697	pxor	%xmm3,%xmm3
2698	pxor	%xmm4,%xmm4
2699	pxor	%xmm5,%xmm5
2700___
# Non-Win64: the remaining xmm registers are simply zeroed, interleaved
# with wiping the seven 16-byte scratch slots at the bottom of the frame.
2701$code.=<<___ if (!$win64);
2702	pxor	%xmm6,%xmm6
2703	pxor	%xmm7,%xmm7
2704	movaps	%xmm0,0x00(%rsp)		# clear stack
2705	pxor	%xmm8,%xmm8
2706	movaps	%xmm0,0x10(%rsp)
2707	pxor	%xmm9,%xmm9
2708	movaps	%xmm0,0x20(%rsp)
2709	pxor	%xmm10,%xmm10
2710	movaps	%xmm0,0x30(%rsp)
2711	pxor	%xmm11,%xmm11
2712	movaps	%xmm0,0x40(%rsp)
2713	pxor	%xmm12,%xmm12
2714	movaps	%xmm0,0x50(%rsp)
2715	pxor	%xmm13,%xmm13
2716	movaps	%xmm0,0x60(%rsp)
2717	pxor	%xmm14,%xmm14
2718	pxor	%xmm15,%xmm15
2719___
# Win64: %xmm6-%xmm15 are reloaded from their save slots, and each slot is
# overwritten with zero right after it has been read.
2720$code.=<<___ if ($win64);
2721	movaps	-0xa8(%r11),%xmm6
2722	movaps	%xmm0,-0xa8(%r11)		# clear stack
2723	movaps	-0x98(%r11),%xmm7
2724	movaps	%xmm0,-0x98(%r11)
2725	movaps	-0x88(%r11),%xmm8
2726	movaps	%xmm0,-0x88(%r11)
2727	movaps	-0x78(%r11),%xmm9
2728	movaps	%xmm0,-0x78(%r11)
2729	movaps	-0x68(%r11),%xmm10
2730	movaps	%xmm0,-0x68(%r11)
2731	movaps	-0x58(%r11),%xmm11
2732	movaps	%xmm0,-0x58(%r11)
2733	movaps	-0x48(%r11),%xmm12
2734	movaps	%xmm0,-0x48(%r11)
2735	movaps	-0x38(%r11),%xmm13
2736	movaps	%xmm0,-0x38(%r11)
2737	movaps	-0x28(%r11),%xmm14
2738	movaps	%xmm0,-0x28(%r11)
2739	movaps	-0x18(%r11),%xmm15
2740	movaps	%xmm0,-0x18(%r11)
2741	movaps	%xmm0,0x00(%rsp)
2742	movaps	%xmm0,0x10(%rsp)
2743	movaps	%xmm0,0x20(%rsp)
2744	movaps	%xmm0,0x30(%rsp)
2745	movaps	%xmm0,0x40(%rsp)
2746	movaps	%xmm0,0x50(%rsp)
2747	movaps	%xmm0,0x60(%rsp)
2748___
# Common epilogue: %rbp was pushed right below the entry stack pointer held
# in %r11, so it is reloaded from -8(%r11) before %rsp is restored.
2749$code.=<<___;
2750	mov	-8(%r11),%rbp
2751.cfi_restore	%rbp
2752	lea	(%r11),%rsp
2753.cfi_def_cfa_register	%rsp
2754.Lxts_dec_epilogue:
2755	ret
2756.cfi_endproc
2757.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2758___
2759}
2760
2761######################################################################
2762# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2763#	const AES_KEY *key, unsigned int start_block_num,
2764#	unsigned char offset_i[16], const unsigned char L_[][16],
2765#	unsigned char checksum[16]);
2766#
2767{
# Register and argument map shared by the OCB routines below.
# Running offsets offset_0 .. offset_5 live in %xmm10 .. %xmm15.
2768my @offset=map("%xmm$_",(10..15));
# %xmm8 accumulates the checksum; %xmm9 holds round[0]^round[last]
# (see the pxor right after it is loaded in aesni_ocb_encrypt).
2769my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2770my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
# The 7th and 8th arguments (L_ table, checksum pointer) arrive on the
# stack and are loaded into these two registers by the prologue.
2771my ($L_p,$checksum_p) = ("%rbx","%rbp");
# Scratch registers: ntz(block number) scaled by 16, i.e. byte offsets into
# the L_ table for the even-numbered blocks of a group.
2772my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
# Displacement of the 7th argument from the stack pointer at function entry:
# 8 (just above the return address) in the non-Win64 build, 56 under the
# Microsoft x64 convention (return address, 32 bytes of home space and the
# stack slots of arguments 5 and 6 precede it).
2773my $seventh_arg = $win64 ? 56 : 8;
# The third argument is a block count here (see the prototype comment
# above), so give the register behind $len a more fitting name.
2774my $blocks = $len;
2775
# aesni_ocb_encrypt: OCB encryption of whole 16-byte blocks; see the
# prototype comment above for the argument list.
#
# Outline of the code emitted below:
#  - five general-purpose registers are pushed (plus %xmm6-%xmm15 on Win64)
#    and the 7th/8th arguments are fetched relative to the entry stack
#    pointer kept in %rax;
#  - round[0]^round[last] and (last offset ^ round[last]) are prepared so
#    the helpers can fold the offset xors into the first and last rounds;
#  - if the starting block number is even, one block is processed on its
#    own through __ocb_encrypt1 so that the bulk code starts on an odd
#    block number (odd blocks always use L_0);
#  - .Locb_enc_grandloop handles six blocks per __ocb_encrypt6 call, and
#    .Locb_enc_short dispatches the remaining 1..5 blocks;
#  - .Locb_enc_done stores the checksum and the last offset back, wipes the
#    xmm registers and restores the saved registers.
2776$code.=<<___;
2777.globl	aesni_ocb_encrypt
2778.type	aesni_ocb_encrypt,\@function,6
2779.align	32
2780aesni_ocb_encrypt:
2781.cfi_startproc
2782	lea	(%rsp),%rax
2783	push	%rbx
2784.cfi_push	%rbx
2785	push	%rbp
2786.cfi_push	%rbp
2787	push	%r12
2788.cfi_push	%r12
2789	push	%r13
2790.cfi_push	%r13
2791	push	%r14
2792.cfi_push	%r14
2793___
# Win64 only: reserve 0xa0 bytes and spill %xmm6-%xmm15 into them.
2794$code.=<<___ if ($win64);
2795	lea	-0xa0(%rsp),%rsp
2796	movaps	%xmm6,0x00(%rsp)		# offload everything
2797	movaps	%xmm7,0x10(%rsp)
2798	movaps	%xmm8,0x20(%rsp)
2799	movaps	%xmm9,0x30(%rsp)
2800	movaps	%xmm10,0x40(%rsp)
2801	movaps	%xmm11,0x50(%rsp)
2802	movaps	%xmm12,0x60(%rsp)
2803	movaps	%xmm13,0x70(%rsp)
2804	movaps	%xmm14,0x80(%rsp)
2805	movaps	%xmm15,0x90(%rsp)
2806.Locb_enc_body:
2807___
2808$code.=<<___;
2809	mov	$seventh_arg(%rax),$L_p		# 7th argument
2810	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
2811
2812	mov	240($key),$rnds_
2813	mov	$key,$key_
2814	shl	\$4,$rnds_
2815	$movkey	($key),$rndkey0l		# round[0]
2816	$movkey	16($key,$rnds_),$rndkey1	# round[last]
2817
2818	movdqu	($offset_p),@offset[5]		# load last offset_i
2819	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
2820	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2821
2822	mov	\$16+32,$rounds
2823	lea	32($key_,$rnds_),$key
2824	$movkey	16($key_),$rndkey1		# round[1]
2825	sub	%r10,%rax			# twisted $rounds
2826	mov	%rax,%r10			# backup twisted $rounds
2827
2828	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
2829	movdqu	($checksum_p),$checksum		# load checksum
2830
2831	test	\$1,$block_num			# is first block number odd?
2832	jnz	.Locb_enc_odd
2833
	# Even starting block number: this block needs the L_ entry selected
	# by ntz(block number); handle it alone so the code below always
	# starts on an odd block number.
2834	bsf	$block_num,$i1
2835	add	\$1,$block_num
2836	shl	\$4,$i1
2837	movdqu	($L_p,$i1),$inout5		# borrow
2838	movdqu	($inp),$inout0
2839	lea	16($inp),$inp
2840
2841	call	__ocb_encrypt1
2842
2843	movdqa	$inout5,@offset[5]
2844	movups	$inout0,($out)
2845	lea	16($out),$out
2846	sub	\$1,$blocks
2847	jz	.Locb_enc_done
2848
2849.Locb_enc_odd:
2850	lea	1($block_num),$i1		# even-numbered blocks
2851	lea	3($block_num),$i3
2852	lea	5($block_num),$i5
2853	lea	6($block_num),$block_num
2854	bsf	$i1,$i1				# ntz(block)
2855	bsf	$i3,$i3
2856	bsf	$i5,$i5
2857	shl	\$4,$i1				# ntz(block) -> table offset
2858	shl	\$4,$i3
2859	shl	\$4,$i5
2860
2861	sub	\$6,$blocks
2862	jc	.Locb_enc_short
2863	jmp	.Locb_enc_grandloop
2864
2865.align	32
2866.Locb_enc_grandloop:
2867	movdqu	`16*0`($inp),$inout0		# load input
2868	movdqu	`16*1`($inp),$inout1
2869	movdqu	`16*2`($inp),$inout2
2870	movdqu	`16*3`($inp),$inout3
2871	movdqu	`16*4`($inp),$inout4
2872	movdqu	`16*5`($inp),$inout5
2873	lea	`16*6`($inp),$inp
2874
2875	call	__ocb_encrypt6
2876
2877	movups	$inout0,`16*0`($out)		# store output
2878	movups	$inout1,`16*1`($out)
2879	movups	$inout2,`16*2`($out)
2880	movups	$inout3,`16*3`($out)
2881	movups	$inout4,`16*4`($out)
2882	movups	$inout5,`16*5`($out)
2883	lea	`16*6`($out),$out
2884	sub	\$6,$blocks
2885	jnc	.Locb_enc_grandloop
2886
2887.Locb_enc_short:
	# 0..5 blocks remain. Unused lanes are zeroed before calling a wider
	# helper so that they add nothing to the checksum, and the offset of
	# the last real block is copied into offset[5] afterwards.
2888	add	\$6,$blocks
2889	jz	.Locb_enc_done
2890
2891	movdqu	`16*0`($inp),$inout0
2892	cmp	\$2,$blocks
2893	jb	.Locb_enc_one
2894	movdqu	`16*1`($inp),$inout1
2895	je	.Locb_enc_two
2896
2897	movdqu	`16*2`($inp),$inout2
2898	cmp	\$4,$blocks
2899	jb	.Locb_enc_three
2900	movdqu	`16*3`($inp),$inout3
2901	je	.Locb_enc_four
2902
2903	movdqu	`16*4`($inp),$inout4
2904	pxor	$inout5,$inout5
2905
2906	call	__ocb_encrypt6
2907
2908	movdqa	@offset[4],@offset[5]
2909	movups	$inout0,`16*0`($out)
2910	movups	$inout1,`16*1`($out)
2911	movups	$inout2,`16*2`($out)
2912	movups	$inout3,`16*3`($out)
2913	movups	$inout4,`16*4`($out)
2914
2915	jmp	.Locb_enc_done
2916
2917.align	16
2918.Locb_enc_one:
2919	movdqa	@offset[0],$inout5		# borrow
2920
2921	call	__ocb_encrypt1
2922
2923	movdqa	$inout5,@offset[5]
2924	movups	$inout0,`16*0`($out)
2925	jmp	.Locb_enc_done
2926
2927.align	16
2928.Locb_enc_two:
2929	pxor	$inout2,$inout2
2930	pxor	$inout3,$inout3
2931
2932	call	__ocb_encrypt4
2933
2934	movdqa	@offset[1],@offset[5]
2935	movups	$inout0,`16*0`($out)
2936	movups	$inout1,`16*1`($out)
2937
2938	jmp	.Locb_enc_done
2939
2940.align	16
2941.Locb_enc_three:
2942	pxor	$inout3,$inout3
2943
2944	call	__ocb_encrypt4
2945
2946	movdqa	@offset[2],@offset[5]
2947	movups	$inout0,`16*0`($out)
2948	movups	$inout1,`16*1`($out)
2949	movups	$inout2,`16*2`($out)
2950
2951	jmp	.Locb_enc_done
2952
2953.align	16
2954.Locb_enc_four:
2955	call	__ocb_encrypt4
2956
2957	movdqa	@offset[3],@offset[5]
2958	movups	$inout0,`16*0`($out)
2959	movups	$inout1,`16*1`($out)
2960	movups	$inout2,`16*2`($out)
2961	movups	$inout3,`16*3`($out)
2962
2963.Locb_enc_done:
	# NOTE(review): the round key register used in the next xor is only
	# loaded inside the __ocb_encrypt helpers, so this path assumes at
	# least one block was processed -- confirm callers never pass zero.
2964	pxor	$rndkey0,@offset[5]		# "remove" round[last]
2965	movdqu	$checksum,($checksum_p)		# store checksum
2966	movdqu	@offset[5],($offset_p)		# store last offset_i
2967
2968	xorps	%xmm0,%xmm0			# clear register bank
2969	pxor	%xmm1,%xmm1
2970	pxor	%xmm2,%xmm2
2971	pxor	%xmm3,%xmm3
2972	pxor	%xmm4,%xmm4
2973	pxor	%xmm5,%xmm5
2974___
# Non-Win64: zero the remaining xmm registers and point %rax just above the
# five pushed registers (0x28 = 5*8 bytes).
2975$code.=<<___ if (!$win64);
2976	pxor	%xmm6,%xmm6
2977	pxor	%xmm7,%xmm7
2978	pxor	%xmm8,%xmm8
2979	pxor	%xmm9,%xmm9
2980	pxor	%xmm10,%xmm10
2981	pxor	%xmm11,%xmm11
2982	pxor	%xmm12,%xmm12
2983	pxor	%xmm13,%xmm13
2984	pxor	%xmm14,%xmm14
2985	pxor	%xmm15,%xmm15
2986	lea	0x28(%rsp),%rax
2987.cfi_def_cfa	%rax,8
2988___
# Win64: reload %xmm6-%xmm15 from the save area, zeroing each slot after it
# is read, then point %rax above the save area and the pushed registers.
2989$code.=<<___ if ($win64);
2990	movaps	0x00(%rsp),%xmm6
2991	movaps	%xmm0,0x00(%rsp)		# clear stack
2992	movaps	0x10(%rsp),%xmm7
2993	movaps	%xmm0,0x10(%rsp)
2994	movaps	0x20(%rsp),%xmm8
2995	movaps	%xmm0,0x20(%rsp)
2996	movaps	0x30(%rsp),%xmm9
2997	movaps	%xmm0,0x30(%rsp)
2998	movaps	0x40(%rsp),%xmm10
2999	movaps	%xmm0,0x40(%rsp)
3000	movaps	0x50(%rsp),%xmm11
3001	movaps	%xmm0,0x50(%rsp)
3002	movaps	0x60(%rsp),%xmm12
3003	movaps	%xmm0,0x60(%rsp)
3004	movaps	0x70(%rsp),%xmm13
3005	movaps	%xmm0,0x70(%rsp)
3006	movaps	0x80(%rsp),%xmm14
3007	movaps	%xmm0,0x80(%rsp)
3008	movaps	0x90(%rsp),%xmm15
3009	movaps	%xmm0,0x90(%rsp)
3010	lea	0xa0+0x28(%rsp),%rax
3011.Locb_enc_pop:
3012___
3013$code.=<<___;
3014	mov	-40(%rax),%r14
3015.cfi_restore	%r14
3016	mov	-32(%rax),%r13
3017.cfi_restore	%r13
3018	mov	-24(%rax),%r12
3019.cfi_restore	%r12
3020	mov	-16(%rax),%rbp
3021.cfi_restore	%rbp
3022	mov	-8(%rax),%rbx
3023.cfi_restore	%rbx
3024	lea	(%rax),%rsp
3025.cfi_def_cfa_register	%rsp
3026.Locb_enc_epilogue:
3027	ret
3028.cfi_endproc
3029.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
3030
3031.type	__ocb_encrypt6,\@abi-omnipotent
3032.align	32
3033__ocb_encrypt6:
	# Internal helper: encrypts the six blocks held in the inout
	# registers and adds their plaintext to the checksum.
	# On entry: offset[5] = previous offset ^ round[last], offset[0] = L_0,
	# i1/i3/i5 = byte offsets into the L_ table for the three
	# even-numbered blocks of this group.
	# Each offset is the previous one xored with an L_ entry. It is xored
	# into the input together with round[0], and used (xored with
	# round[last]) as the key operand of the final round.
	# On exit: offset[0..5] = the six offsets ^ round[last], except that
	# offset[0] is reloaded with L_0; i1/i3/i5 and the block number are
	# advanced for the next group of six; %rax is restored from %r10.
3034	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3035	 movdqu		($L_p,$i1),@offset[1]
3036	 movdqa		@offset[0],@offset[2]
3037	 movdqu		($L_p,$i3),@offset[3]
3038	 movdqa		@offset[0],@offset[4]
3039	 pxor		@offset[5],@offset[0]
3040	 movdqu		($L_p,$i5),@offset[5]
3041	 pxor		@offset[0],@offset[1]
3042	pxor		$inout0,$checksum	# accumulate checksum
3043	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3044	 pxor		@offset[1],@offset[2]
3045	pxor		$inout1,$checksum
3046	pxor		@offset[1],$inout1
3047	 pxor		@offset[2],@offset[3]
3048	pxor		$inout2,$checksum
3049	pxor		@offset[2],$inout2
3050	 pxor		@offset[3],@offset[4]
3051	pxor		$inout3,$checksum
3052	pxor		@offset[3],$inout3
3053	 pxor		@offset[4],@offset[5]
3054	pxor		$inout4,$checksum
3055	pxor		@offset[4],$inout4
3056	pxor		$inout5,$checksum
3057	pxor		@offset[5],$inout5
3058	$movkey		32($key_),$rndkey0
3059
3060	lea		1($block_num),$i1	# even-numbered blocks
3061	lea		3($block_num),$i3
3062	lea		5($block_num),$i5
3063	add		\$6,$block_num
3064	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3065	bsf		$i1,$i1			# ntz(block)
3066	bsf		$i3,$i3
3067	bsf		$i5,$i5
3068
3069	aesenc		$rndkey1,$inout0
3070	aesenc		$rndkey1,$inout1
3071	aesenc		$rndkey1,$inout2
3072	aesenc		$rndkey1,$inout3
3073	 pxor		$rndkey0l,@offset[1]
3074	 pxor		$rndkey0l,@offset[2]
3075	aesenc		$rndkey1,$inout4
3076	 pxor		$rndkey0l,@offset[3]
3077	 pxor		$rndkey0l,@offset[4]
3078	aesenc		$rndkey1,$inout5
3079	$movkey		48($key_),$rndkey1
3080	 pxor		$rndkey0l,@offset[5]
3081
3082	aesenc		$rndkey0,$inout0
3083	aesenc		$rndkey0,$inout1
3084	aesenc		$rndkey0,$inout2
3085	aesenc		$rndkey0,$inout3
3086	aesenc		$rndkey0,$inout4
3087	aesenc		$rndkey0,$inout5
3088	$movkey		64($key_),$rndkey0
3089	shl		\$4,$i1			# ntz(block) -> table offset
3090	shl		\$4,$i3
3091	jmp		.Locb_enc_loop6
3092
3093.align	32
3094.Locb_enc_loop6:
3095	aesenc		$rndkey1,$inout0
3096	aesenc		$rndkey1,$inout1
3097	aesenc		$rndkey1,$inout2
3098	aesenc		$rndkey1,$inout3
3099	aesenc		$rndkey1,$inout4
3100	aesenc		$rndkey1,$inout5
3101	$movkey		($key,%rax),$rndkey1
3102	add		\$32,%rax
3103
3104	aesenc		$rndkey0,$inout0
3105	aesenc		$rndkey0,$inout1
3106	aesenc		$rndkey0,$inout2
3107	aesenc		$rndkey0,$inout3
3108	aesenc		$rndkey0,$inout4
3109	aesenc		$rndkey0,$inout5
3110	$movkey		-16($key,%rax),$rndkey0
3111	jnz		.Locb_enc_loop6
3112
3113	aesenc		$rndkey1,$inout0
3114	aesenc		$rndkey1,$inout1
3115	aesenc		$rndkey1,$inout2
3116	aesenc		$rndkey1,$inout3
3117	aesenc		$rndkey1,$inout4
3118	aesenc		$rndkey1,$inout5
3119	$movkey		16($key_),$rndkey1
3120	shl		\$4,$i5
3121
3122	aesenclast	@offset[0],$inout0
3123	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3124	mov		%r10,%rax		# restore twisted rounds
3125	aesenclast	@offset[1],$inout1
3126	aesenclast	@offset[2],$inout2
3127	aesenclast	@offset[3],$inout3
3128	aesenclast	@offset[4],$inout4
3129	aesenclast	@offset[5],$inout5
3130	ret
3131.size	__ocb_encrypt6,.-__ocb_encrypt6
3132
3133.type	__ocb_encrypt4,\@abi-omnipotent
3134.align	32
3135__ocb_encrypt4:
	# Internal helper: four-lane variant used for the 2..4 block tails.
	# On entry: offset[5] = previous offset ^ round[last], offset[0] = L_0,
	# i1/i3 = byte offsets into the L_ table for the two even-numbered
	# blocks.
	# On exit: offset[0..3] = the four offsets ^ round[last]; the caller
	# copies the one belonging to the last real block into offset[5].
	# Unlike the six-lane helper, this one does not advance the block
	# number or i1/i3, and does not reload L_0 into offset[0].
3136	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3137	 movdqu		($L_p,$i1),@offset[1]
3138	 movdqa		@offset[0],@offset[2]
3139	 movdqu		($L_p,$i3),@offset[3]
3140	 pxor		@offset[5],@offset[0]
3141	 pxor		@offset[0],@offset[1]
3142	pxor		$inout0,$checksum	# accumulate checksum
3143	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3144	 pxor		@offset[1],@offset[2]
3145	pxor		$inout1,$checksum
3146	pxor		@offset[1],$inout1
3147	 pxor		@offset[2],@offset[3]
3148	pxor		$inout2,$checksum
3149	pxor		@offset[2],$inout2
3150	pxor		$inout3,$checksum
3151	pxor		@offset[3],$inout3
3152	$movkey		32($key_),$rndkey0
3153
3154	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3155	 pxor		$rndkey0l,@offset[1]
3156	 pxor		$rndkey0l,@offset[2]
3157	 pxor		$rndkey0l,@offset[3]
3158
3159	aesenc		$rndkey1,$inout0
3160	aesenc		$rndkey1,$inout1
3161	aesenc		$rndkey1,$inout2
3162	aesenc		$rndkey1,$inout3
3163	$movkey		48($key_),$rndkey1
3164
3165	aesenc		$rndkey0,$inout0
3166	aesenc		$rndkey0,$inout1
3167	aesenc		$rndkey0,$inout2
3168	aesenc		$rndkey0,$inout3
3169	$movkey		64($key_),$rndkey0
3170	jmp		.Locb_enc_loop4
3171
3172.align	32
3173.Locb_enc_loop4:
3174	aesenc		$rndkey1,$inout0
3175	aesenc		$rndkey1,$inout1
3176	aesenc		$rndkey1,$inout2
3177	aesenc		$rndkey1,$inout3
3178	$movkey		($key,%rax),$rndkey1
3179	add		\$32,%rax
3180
3181	aesenc		$rndkey0,$inout0
3182	aesenc		$rndkey0,$inout1
3183	aesenc		$rndkey0,$inout2
3184	aesenc		$rndkey0,$inout3
3185	$movkey		-16($key,%rax),$rndkey0
3186	jnz		.Locb_enc_loop4
3187
3188	aesenc		$rndkey1,$inout0
3189	aesenc		$rndkey1,$inout1
3190	aesenc		$rndkey1,$inout2
3191	aesenc		$rndkey1,$inout3
3192	$movkey		16($key_),$rndkey1
3193	mov		%r10,%rax		# restore twisted rounds
3194
3195	aesenclast	@offset[0],$inout0
3196	aesenclast	@offset[1],$inout1
3197	aesenclast	@offset[2],$inout2
3198	aesenclast	@offset[3],$inout3
3199	ret
3200.size	__ocb_encrypt4,.-__ocb_encrypt4
3201
# __ocb_encrypt1 -- internal OCB helper: encrypts the single block in
# inout0. inout5 is borrowed as the offset register: on entry it holds a
# value that is XORed with offset[5], and on return it holds the value
# that was used as the aesenclast operand. The input block is XORed into
# the checksum register before whitening. %rax is reloaded from %r10
# before returning.
3202.type	__ocb_encrypt1,\@abi-omnipotent
3203.align	32
3204__ocb_encrypt1:
3205	 pxor		@offset[5],$inout5	# offset_i
3206	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3207	pxor		$inout0,$checksum	# accumulate checksum
3208	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3209	$movkey		32($key_),$rndkey0
3210
3211	aesenc		$rndkey1,$inout0
3212	$movkey		48($key_),$rndkey1
3213	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3214
3215	aesenc		$rndkey0,$inout0
3216	$movkey		64($key_),$rndkey0
3217	jmp		.Locb_enc_loop1
3218
	# Two rounds per iteration; the loop ends when the add leaves %rax
	# at zero (the jnz consumes the flags of that add).
3219.align	32
3220.Locb_enc_loop1:
3221	aesenc		$rndkey1,$inout0
3222	$movkey		($key,%rax),$rndkey1
3223	add		\$32,%rax
3224
3225	aesenc		$rndkey0,$inout0
3226	$movkey		-16($key,%rax),$rndkey0
3227	jnz		.Locb_enc_loop1
3228
3229	aesenc		$rndkey1,$inout0
3230	$movkey		16($key_),$rndkey1	# redundant in tail
3231	mov		%r10,%rax		# restore twisted rounds
3232
3233	aesenclast	$inout5,$inout0
3234	ret
3235.size	__ocb_encrypt1,.-__ocb_encrypt1
3236
# aesni_ocb_decrypt -- OCB bulk decryption entry point. It is declared
# with six register arguments; the 7th and 8th are read from the caller
# stack relative to the entry stack pointer.
# Flow: save callee-saved registers, load the running offset and the
# checksum, process one leading block if the starting block number is
# even, then run six blocks per iteration and finish 1..5 leftover
# blocks through the 1-, 4- or 6-block helpers. The checksum is
# accumulated from the helper outputs. The last offset and the checksum
# are stored back before the xmm register bank is cleared.
3237.globl	aesni_ocb_decrypt
3238.type	aesni_ocb_decrypt,\@function,6
3239.align	32
3240aesni_ocb_decrypt:
3241.cfi_startproc
	# %rax keeps the entry stack pointer; stack arguments are read
	# through it below, before it is reused as the round counter.
3242	lea	(%rsp),%rax
3243	push	%rbx
3244.cfi_push	%rbx
3245	push	%rbp
3246.cfi_push	%rbp
3247	push	%r12
3248.cfi_push	%r12
3249	push	%r13
3250.cfi_push	%r13
3251	push	%r14
3252.cfi_push	%r14
3253___
# Win64 only: reserve 0xa0 bytes and save xmm6..xmm15 there; they are
# restored (and the save area zeroed) in the epilogue.
3254$code.=<<___ if ($win64);
3255	lea	-0xa0(%rsp),%rsp
3256	movaps	%xmm6,0x00(%rsp)		# offload everything
3257	movaps	%xmm7,0x10(%rsp)
3258	movaps	%xmm8,0x20(%rsp)
3259	movaps	%xmm9,0x30(%rsp)
3260	movaps	%xmm10,0x40(%rsp)
3261	movaps	%xmm11,0x50(%rsp)
3262	movaps	%xmm12,0x60(%rsp)
3263	movaps	%xmm13,0x70(%rsp)
3264	movaps	%xmm14,0x80(%rsp)
3265	movaps	%xmm15,0x90(%rsp)
3266.Locb_dec_body:
3267___
3268$code.=<<___;
3269	mov	$seventh_arg(%rax),$L_p		# 7th argument
3270	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
3271
3272	mov	240($key),$rnds_
3273	mov	$key,$key_
3274	shl	\$4,$rnds_
3275	$movkey	($key),$rndkey0l		# round[0]
3276	$movkey	16($key,$rnds_),$rndkey1	# round[last]
3277
3278	movdqu	($offset_p),@offset[5]		# load last offset_i
3279	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
3280	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
3281
3282	mov	\$16+32,$rounds
3283	lea	32($key_,$rnds_),$key
3284	$movkey	16($key_),$rndkey1		# round[1]
3285	sub	%r10,%rax			# twisted $rounds
3286	mov	%rax,%r10			# backup twisted $rounds
3287
3288	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
3289	movdqu	($checksum_p),$checksum		# load checksum
3290
3291	test	\$1,$block_num			# is first block number odd?
3292	jnz	.Locb_dec_odd
3293
	# Even starting block number: handle one block on its own, using the
	# table entry selected by the lowest set bit of the block number.
3294	bsf	$block_num,$i1
3295	add	\$1,$block_num
3296	shl	\$4,$i1
3297	movdqu	($L_p,$i1),$inout5		# borrow
3298	movdqu	($inp),$inout0
3299	lea	16($inp),$inp
3300
3301	call	__ocb_decrypt1
3302
3303	movdqa	$inout5,@offset[5]
3304	movups	$inout0,($out)
3305	xorps	$inout0,$checksum		# accumulate checksum
3306	lea	16($out),$out
3307	sub	\$1,$blocks
3308	jz	.Locb_dec_done
3309
3310.Locb_dec_odd:
3311	lea	1($block_num),$i1		# even-numbered blocks
3312	lea	3($block_num),$i3
3313	lea	5($block_num),$i5
3314	lea	6($block_num),$block_num
3315	bsf	$i1,$i1				# ntz(block)
3316	bsf	$i3,$i3
3317	bsf	$i5,$i5
3318	shl	\$4,$i1				# ntz(block) -> table offset
3319	shl	\$4,$i3
3320	shl	\$4,$i5
3321
	# The block count is kept biased by -6 while in the grand loop; a
	# borrow means fewer than six blocks remain.
3322	sub	\$6,$blocks
3323	jc	.Locb_dec_short
3324	jmp	.Locb_dec_grandloop
3325
3326.align	32
3327.Locb_dec_grandloop:
3328	movdqu	`16*0`($inp),$inout0		# load input
3329	movdqu	`16*1`($inp),$inout1
3330	movdqu	`16*2`($inp),$inout2
3331	movdqu	`16*3`($inp),$inout3
3332	movdqu	`16*4`($inp),$inout4
3333	movdqu	`16*5`($inp),$inout5
3334	lea	`16*6`($inp),$inp
3335
3336	call	__ocb_decrypt6
3337
3338	movups	$inout0,`16*0`($out)		# store output
3339	pxor	$inout0,$checksum		# accumulate checksum
3340	movups	$inout1,`16*1`($out)
3341	pxor	$inout1,$checksum
3342	movups	$inout2,`16*2`($out)
3343	pxor	$inout2,$checksum
3344	movups	$inout3,`16*3`($out)
3345	pxor	$inout3,$checksum
3346	movups	$inout4,`16*4`($out)
3347	pxor	$inout4,$checksum
3348	movups	$inout5,`16*5`($out)
3349	pxor	$inout5,$checksum
3350	lea	`16*6`($out),$out
3351	sub	\$6,$blocks
3352	jnc	.Locb_dec_grandloop
3353
	# Undo the -6 bias and dispatch on the 0..5 blocks that are left.
	# Inputs are loaded between the compares and the branches; the
	# branches still see the compare flags because the loads do not
	# modify them.
3354.Locb_dec_short:
3355	add	\$6,$blocks
3356	jz	.Locb_dec_done
3357
3358	movdqu	`16*0`($inp),$inout0
3359	cmp	\$2,$blocks
3360	jb	.Locb_dec_one
3361	movdqu	`16*1`($inp),$inout1
3362	je	.Locb_dec_two
3363
3364	movdqu	`16*2`($inp),$inout2
3365	cmp	\$4,$blocks
3366	jb	.Locb_dec_three
3367	movdqu	`16*3`($inp),$inout3
3368	je	.Locb_dec_four
3369
	# Five blocks left: run the six-block helper with the sixth input
	# zeroed and discard its sixth output.
3370	movdqu	`16*4`($inp),$inout4
3371	pxor	$inout5,$inout5
3372
3373	call	__ocb_decrypt6
3374
	# In each tail case the offset of the last real block becomes the
	# running offset that is stored back at the end.
3375	movdqa	@offset[4],@offset[5]
3376	movups	$inout0,`16*0`($out)		# store output
3377	pxor	$inout0,$checksum		# accumulate checksum
3378	movups	$inout1,`16*1`($out)
3379	pxor	$inout1,$checksum
3380	movups	$inout2,`16*2`($out)
3381	pxor	$inout2,$checksum
3382	movups	$inout3,`16*3`($out)
3383	pxor	$inout3,$checksum
3384	movups	$inout4,`16*4`($out)
3385	pxor	$inout4,$checksum
3386
3387	jmp	.Locb_dec_done
3388
3389.align	16
3390.Locb_dec_one:
3391	movdqa	@offset[0],$inout5		# borrow
3392
3393	call	__ocb_decrypt1
3394
3395	movdqa	$inout5,@offset[5]
3396	movups	$inout0,`16*0`($out)		# store output
3397	xorps	$inout0,$checksum		# accumulate checksum
3398	jmp	.Locb_dec_done
3399
3400.align	16
3401.Locb_dec_two:
3402	pxor	$inout2,$inout2
3403	pxor	$inout3,$inout3
3404
3405	call	__ocb_decrypt4
3406
3407	movdqa	@offset[1],@offset[5]
3408	movups	$inout0,`16*0`($out)		# store output
3409	xorps	$inout0,$checksum		# accumulate checksum
3410	movups	$inout1,`16*1`($out)
3411	xorps	$inout1,$checksum
3412
3413	jmp	.Locb_dec_done
3414
3415.align	16
3416.Locb_dec_three:
3417	pxor	$inout3,$inout3
3418
3419	call	__ocb_decrypt4
3420
3421	movdqa	@offset[2],@offset[5]
3422	movups	$inout0,`16*0`($out)		# store output
3423	xorps	$inout0,$checksum		# accumulate checksum
3424	movups	$inout1,`16*1`($out)
3425	xorps	$inout1,$checksum
3426	movups	$inout2,`16*2`($out)
3427	xorps	$inout2,$checksum
3428
3429	jmp	.Locb_dec_done
3430
3431.align	16
3432.Locb_dec_four:
3433	call	__ocb_decrypt4
3434
3435	movdqa	@offset[3],@offset[5]
3436	movups	$inout0,`16*0`($out)		# store output
3437	pxor	$inout0,$checksum		# accumulate checksum
3438	movups	$inout1,`16*1`($out)
3439	pxor	$inout1,$checksum
3440	movups	$inout2,`16*2`($out)
3441	pxor	$inout2,$checksum
3442	movups	$inout3,`16*3`($out)
3443	pxor	$inout3,$checksum
3444
3445.Locb_dec_done:
3446	pxor	$rndkey0,@offset[5]		# "remove" round[last]
3447	movdqu	$checksum,($checksum_p)		# store checksum
3448	movdqu	@offset[5],($offset_p)		# store last offset_i
3449
3450	xorps	%xmm0,%xmm0			# clear register bank
3451	pxor	%xmm1,%xmm1
3452	pxor	%xmm2,%xmm2
3453	pxor	%xmm3,%xmm3
3454	pxor	%xmm4,%xmm4
3455	pxor	%xmm5,%xmm5
3456___
# Non-Win64: xmm6..xmm15 were not saved, so they are simply zeroed.
# 0x28 is the size of the five registers pushed in the prologue.
3457$code.=<<___ if (!$win64);
3458	pxor	%xmm6,%xmm6
3459	pxor	%xmm7,%xmm7
3460	pxor	%xmm8,%xmm8
3461	pxor	%xmm9,%xmm9
3462	pxor	%xmm10,%xmm10
3463	pxor	%xmm11,%xmm11
3464	pxor	%xmm12,%xmm12
3465	pxor	%xmm13,%xmm13
3466	pxor	%xmm14,%xmm14
3467	pxor	%xmm15,%xmm15
3468	lea	0x28(%rsp),%rax
3469.cfi_def_cfa	%rax,8
3470___
# Win64: restore xmm6..xmm15 and overwrite each save slot with the
# already-zeroed %xmm0; %rax then skips the 0xa0-byte save area plus the
# five pushed registers.
3471$code.=<<___ if ($win64);
3472	movaps	0x00(%rsp),%xmm6
3473	movaps	%xmm0,0x00(%rsp)		# clear stack
3474	movaps	0x10(%rsp),%xmm7
3475	movaps	%xmm0,0x10(%rsp)
3476	movaps	0x20(%rsp),%xmm8
3477	movaps	%xmm0,0x20(%rsp)
3478	movaps	0x30(%rsp),%xmm9
3479	movaps	%xmm0,0x30(%rsp)
3480	movaps	0x40(%rsp),%xmm10
3481	movaps	%xmm0,0x40(%rsp)
3482	movaps	0x50(%rsp),%xmm11
3483	movaps	%xmm0,0x50(%rsp)
3484	movaps	0x60(%rsp),%xmm12
3485	movaps	%xmm0,0x60(%rsp)
3486	movaps	0x70(%rsp),%xmm13
3487	movaps	%xmm0,0x70(%rsp)
3488	movaps	0x80(%rsp),%xmm14
3489	movaps	%xmm0,0x80(%rsp)
3490	movaps	0x90(%rsp),%xmm15
3491	movaps	%xmm0,0x90(%rsp)
3492	lea	0xa0+0x28(%rsp),%rax
3493.Locb_dec_pop:
3494___
3495$code.=<<___;
	# %rax now equals the entry stack pointer; reload the pushed
	# registers relative to it and restore the stack pointer.
3496	mov	-40(%rax),%r14
3497.cfi_restore	%r14
3498	mov	-32(%rax),%r13
3499.cfi_restore	%r13
3500	mov	-24(%rax),%r12
3501.cfi_restore	%r12
3502	mov	-16(%rax),%rbp
3503.cfi_restore	%rbp
3504	mov	-8(%rax),%rbx
3505.cfi_restore	%rbx
3506	lea	(%rax),%rsp
3507.cfi_def_cfa_register	%rsp
3508.Locb_dec_epilogue:
3509	ret
3510.cfi_endproc
3511.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
3512
# __ocb_decrypt6 -- internal OCB helper: decrypts the six blocks held in
# inout0..inout5. The six offsets are built as a running XOR chain in
# offset[0..5], seeded from the incoming offset[5], and serve as the
# aesdeclast operands. Interleaved with the AES rounds, the helper also
# advances the block number by six, recomputes the three table offsets
# for the next call and reloads offset[0] from the start of the table.
# %rax is reloaded from %r10 before returning. Checksum accumulation is
# left to the caller.
3513.type	__ocb_decrypt6,\@abi-omnipotent
3514.align	32
3515__ocb_decrypt6:
3516	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3517	 movdqu		($L_p,$i1),@offset[1]
3518	 movdqa		@offset[0],@offset[2]
3519	 movdqu		($L_p,$i3),@offset[3]
3520	 movdqa		@offset[0],@offset[4]
3521	 pxor		@offset[5],@offset[0]
3522	 movdqu		($L_p,$i5),@offset[5]
3523	 pxor		@offset[0],@offset[1]
3524	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3525	 pxor		@offset[1],@offset[2]
3526	pxor		@offset[1],$inout1
3527	 pxor		@offset[2],@offset[3]
3528	pxor		@offset[2],$inout2
3529	 pxor		@offset[3],@offset[4]
3530	pxor		@offset[3],$inout3
3531	 pxor		@offset[4],@offset[5]
3532	pxor		@offset[4],$inout4
3533	pxor		@offset[5],$inout5
3534	$movkey		32($key_),$rndkey0
3535
	# Prepare the table indices for the next group of six blocks.
3536	lea		1($block_num),$i1	# even-numbered blocks
3537	lea		3($block_num),$i3
3538	lea		5($block_num),$i5
3539	add		\$6,$block_num
3540	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3541	bsf		$i1,$i1			# ntz(block)
3542	bsf		$i3,$i3
3543	bsf		$i5,$i5
3544
3545	aesdec		$rndkey1,$inout0
3546	aesdec		$rndkey1,$inout1
3547	aesdec		$rndkey1,$inout2
3548	aesdec		$rndkey1,$inout3
3549	 pxor		$rndkey0l,@offset[1]
3550	 pxor		$rndkey0l,@offset[2]
3551	aesdec		$rndkey1,$inout4
3552	 pxor		$rndkey0l,@offset[3]
3553	 pxor		$rndkey0l,@offset[4]
3554	aesdec		$rndkey1,$inout5
3555	$movkey		48($key_),$rndkey1
3556	 pxor		$rndkey0l,@offset[5]
3557
3558	aesdec		$rndkey0,$inout0
3559	aesdec		$rndkey0,$inout1
3560	aesdec		$rndkey0,$inout2
3561	aesdec		$rndkey0,$inout3
3562	aesdec		$rndkey0,$inout4
3563	aesdec		$rndkey0,$inout5
3564	$movkey		64($key_),$rndkey0
3565	shl		\$4,$i1			# ntz(block) -> table offset
3566	shl		\$4,$i3
3567	jmp		.Locb_dec_loop6
3568
	# Two rounds per iteration; the loop ends when the add leaves %rax
	# at zero (the jnz consumes the flags of that add).
3569.align	32
3570.Locb_dec_loop6:
3571	aesdec		$rndkey1,$inout0
3572	aesdec		$rndkey1,$inout1
3573	aesdec		$rndkey1,$inout2
3574	aesdec		$rndkey1,$inout3
3575	aesdec		$rndkey1,$inout4
3576	aesdec		$rndkey1,$inout5
3577	$movkey		($key,%rax),$rndkey1
3578	add		\$32,%rax
3579
3580	aesdec		$rndkey0,$inout0
3581	aesdec		$rndkey0,$inout1
3582	aesdec		$rndkey0,$inout2
3583	aesdec		$rndkey0,$inout3
3584	aesdec		$rndkey0,$inout4
3585	aesdec		$rndkey0,$inout5
3586	$movkey		-16($key,%rax),$rndkey0
3587	jnz		.Locb_dec_loop6
3588
3589	aesdec		$rndkey1,$inout0
3590	aesdec		$rndkey1,$inout1
3591	aesdec		$rndkey1,$inout2
3592	aesdec		$rndkey1,$inout3
3593	aesdec		$rndkey1,$inout4
3594	aesdec		$rndkey1,$inout5
3595	$movkey		16($key_),$rndkey1
	# The shift of the third index is deferred until after the loop so
	# that it does not disturb the flags the jnz above depends on.
3596	shl		\$4,$i5
3597
	# offset[0] is consumed by the first aesdeclast and can then be
	# reloaded for the next call.
3598	aesdeclast	@offset[0],$inout0
3599	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3600	mov		%r10,%rax		# restore twisted rounds
3601	aesdeclast	@offset[1],$inout1
3602	aesdeclast	@offset[2],$inout2
3603	aesdeclast	@offset[3],$inout3
3604	aesdeclast	@offset[4],$inout4
3605	aesdeclast	@offset[5],$inout5
3606	ret
3607.size	__ocb_decrypt6,.-__ocb_decrypt6
3608
# __ocb_decrypt4 -- internal OCB helper: decrypts the blocks held in
# inout0..inout3. The per-block offsets are built as a running XOR chain
# in offset[0..3], seeded from offset[5], and are used as the aesdeclast
# operands. Unlike the six-block helper it does not advance the block
# number or recompute table indices. %rax is reloaded from %r10 before
# returning. Checksum accumulation is left to the caller.
3609.type	__ocb_decrypt4,\@abi-omnipotent
3610.align	32
3611__ocb_decrypt4:
3612	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3613	 movdqu		($L_p,$i1),@offset[1]
3614	 movdqa		@offset[0],@offset[2]
3615	 movdqu		($L_p,$i3),@offset[3]
3616	 pxor		@offset[5],@offset[0]
3617	 pxor		@offset[0],@offset[1]
3618	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3619	 pxor		@offset[1],@offset[2]
3620	pxor		@offset[1],$inout1
3621	 pxor		@offset[2],@offset[3]
3622	pxor		@offset[2],$inout2
3623	pxor		@offset[3],$inout3
3624	$movkey		32($key_),$rndkey0
3625
3626	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3627	 pxor		$rndkey0l,@offset[1]
3628	 pxor		$rndkey0l,@offset[2]
3629	 pxor		$rndkey0l,@offset[3]
3630
3631	aesdec		$rndkey1,$inout0
3632	aesdec		$rndkey1,$inout1
3633	aesdec		$rndkey1,$inout2
3634	aesdec		$rndkey1,$inout3
3635	$movkey		48($key_),$rndkey1
3636
3637	aesdec		$rndkey0,$inout0
3638	aesdec		$rndkey0,$inout1
3639	aesdec		$rndkey0,$inout2
3640	aesdec		$rndkey0,$inout3
3641	$movkey		64($key_),$rndkey0
3642	jmp		.Locb_dec_loop4
3643
	# Two rounds per iteration; the loop ends when the add leaves %rax
	# at zero (the jnz consumes the flags of that add).
3644.align	32
3645.Locb_dec_loop4:
3646	aesdec		$rndkey1,$inout0
3647	aesdec		$rndkey1,$inout1
3648	aesdec		$rndkey1,$inout2
3649	aesdec		$rndkey1,$inout3
3650	$movkey		($key,%rax),$rndkey1
3651	add		\$32,%rax
3652
3653	aesdec		$rndkey0,$inout0
3654	aesdec		$rndkey0,$inout1
3655	aesdec		$rndkey0,$inout2
3656	aesdec		$rndkey0,$inout3
3657	$movkey		-16($key,%rax),$rndkey0
3658	jnz		.Locb_dec_loop4
3659
3660	aesdec		$rndkey1,$inout0
3661	aesdec		$rndkey1,$inout1
3662	aesdec		$rndkey1,$inout2
3663	aesdec		$rndkey1,$inout3
3664	$movkey		16($key_),$rndkey1
3665	mov		%r10,%rax		# restore twisted rounds
3666
	# Final round: the prepared offsets act as the last-round operand.
3667	aesdeclast	@offset[0],$inout0
3668	aesdeclast	@offset[1],$inout1
3669	aesdeclast	@offset[2],$inout2
3670	aesdeclast	@offset[3],$inout3
3671	ret
3672.size	__ocb_decrypt4,.-__ocb_decrypt4
3673
# __ocb_decrypt1 -- internal OCB helper: decrypts the single block in
# inout0. inout5 is borrowed as the offset register: on entry it holds a
# value that is XORed with offset[5], and on return it holds the value
# that was used as the aesdeclast operand. %rax is reloaded from %r10
# before returning. Checksum accumulation is left to the caller.
3674.type	__ocb_decrypt1,\@abi-omnipotent
3675.align	32
3676__ocb_decrypt1:
3677	 pxor		@offset[5],$inout5	# offset_i
3678	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3679	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3680	$movkey		32($key_),$rndkey0
3681
3682	aesdec		$rndkey1,$inout0
3683	$movkey		48($key_),$rndkey1
3684	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3685
3686	aesdec		$rndkey0,$inout0
3687	$movkey		64($key_),$rndkey0
3688	jmp		.Locb_dec_loop1
3689
	# Two rounds per iteration; the loop ends when the add leaves %rax
	# at zero (the jnz consumes the flags of that add).
3690.align	32
3691.Locb_dec_loop1:
3692	aesdec		$rndkey1,$inout0
3693	$movkey		($key,%rax),$rndkey1
3694	add		\$32,%rax
3695
3696	aesdec		$rndkey0,$inout0
3697	$movkey		-16($key,%rax),$rndkey0
3698	jnz		.Locb_dec_loop1
3699
3700	aesdec		$rndkey1,$inout0
3701	$movkey		16($key_),$rndkey1	# redundant in tail
3702	mov		%r10,%rax		# restore twisted rounds
3703
3704	aesdeclast	$inout5,$inout0
3705	ret
3706.size	__ocb_decrypt1,.-__ocb_decrypt1
3707___
3708} }}
3709
3710########################################################################
3711# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3712#			    size_t length, const AES_KEY *key,
3713#			    unsigned char *ivp,const int enc);
#
# Emits the CBC entry point. The generated code returns immediately when
# the length is zero and otherwise dispatches on the 6th argument (%r9d):
# non-zero takes the encrypt path, zero the decrypt path.
#  - Encrypt processes one block per iteration; a trailing partial block
#    is copied to the output, zero-padded to 16 bytes and encrypted there.
#  - Decrypt of exactly 16 bytes is handled without a stack frame; other
#    lengths go through the bulk code (8-block or 6-block loop, then a
#    tail of up to five blocks).
# Whenever data is processed, the next IV is stored back through $ivp.
3714{
# 0x10 bytes of aligned scratch at (%rsp), plus room for xmm6..xmm15 on
# Win64 (saved at 0x10..0xa0 in the bulk decrypt prologue).
3715my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
3716my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3717
3718$code.=<<___;
3719.globl	${PREFIX}_cbc_encrypt
3720.type	${PREFIX}_cbc_encrypt,\@function,6
3721.align	16
3722${PREFIX}_cbc_encrypt:
3723.cfi_startproc
3724	test	$len,$len		# check length
3725	jz	.Lcbc_ret
3726
3727	mov	240($key),$rnds_	# key->rounds
3728	mov	$key,$key_		# backup $key
3729	test	%r9d,%r9d		# 6th argument
3730	jz	.Lcbc_decrypt
3731#--------------------------- CBC ENCRYPT ------------------------------#
3732	movups	($ivp),$inout0		# load iv as initial state
3733	mov	$rnds_,$rounds
3734	cmp	\$16,$len
3735	jb	.Lcbc_enc_tail
3736	sub	\$16,$len
3737	jmp	.Lcbc_enc_loop
3738.align	16
3739.Lcbc_enc_loop:
3740	movups	($inp),$inout1		# load input
3741	lea	16($inp),$inp
3742	#xorps	$inout1,$inout0
3743___
# One block encryption; both the running state and the freshly loaded
# input are handed to the generator (the explicit xorps above is
# commented out, so the XOR is presumably folded into the generated
# sequence -- confirm in aesni_generate1).
3744	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3745$code.=<<___;
3746	mov	$rnds_,$rounds		# restore $rounds
3747	mov	$key_,$key		# restore $key
3748	movups	$inout0,0($out)		# store output
3749	lea	16($out),$out
3750	sub	\$16,$len
3751	jnc	.Lcbc_enc_loop
3752	add	\$16,$len
3753	jnz	.Lcbc_enc_tail
3754	 pxor	$rndkey0,$rndkey0	# clear register bank
3755	 pxor	$rndkey1,$rndkey1
3756	movups	$inout0,($ivp)
3757	 pxor	$inout0,$inout0
3758	 pxor	$inout1,$inout1
3759	jmp	.Lcbc_ret
3760
	# Partial final block: copy the remaining bytes to the output, pad
	# the rest of the 16-byte block with zeros, then run the loop once
	# more with input and output both pointing at that block.
3761.Lcbc_enc_tail:
3762	mov	$len,%rcx	# zaps $key
3763	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
3764	.long	0x9066A4F3	# rep movsb
3765	mov	\$16,%ecx	# zero tail
3766	sub	$len,%rcx
3767	xor	%eax,%eax
3768	.long	0x9066AAF3	# rep stosb
3769	lea	-16(%rdi),%rdi	# rewind $out by 1 block
3770	mov	$rnds_,$rounds	# restore $rounds
3771	mov	%rdi,%rsi	# $inp and $out are the same
3772	mov	$key_,$key	# restore $key
3773	xor	$len,$len	# len=16
3774	jmp	.Lcbc_enc_loop	# one more spin
3775#--------------------------- CBC DECRYPT ------------------------------#
3776.align	16
3777.Lcbc_decrypt:
3778	cmp	\$16,$len
3779	jne	.Lcbc_decrypt_bulk
3780
3781	# handle single block without allocating stack frame,
3782	# useful in ciphertext stealing mode
3783	movdqu	($inp),$inout0		# load input
3784	movdqu	($ivp),$inout1		# load iv
3785	movdqa	$inout0,$inout2		# future iv
3786___
3787	&aesni_generate1("dec",$key,$rnds_);
3788$code.=<<___;
3789	 pxor	$rndkey0,$rndkey0	# clear register bank
3790	 pxor	$rndkey1,$rndkey1
3791	movdqu	$inout2,($ivp)		# store iv
3792	xorps	$inout1,$inout0		# ^=iv
3793	 pxor	$inout1,$inout1
3794	movups	$inout0,($out)		# store output
3795	 pxor	$inout0,$inout0
3796	jmp	.Lcbc_ret
3797.align	16
3798.Lcbc_decrypt_bulk:
3799	lea	(%rsp),%r11		# frame pointer
3800.cfi_def_cfa_register	%r11
3801	push	%rbp
3802.cfi_push	%rbp
3803	sub	\$$frame_size,%rsp
3804	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
3805___
# Win64 only: save xmm6..xmm15 above the 0x10-byte scratch slot.
3806$code.=<<___ if ($win64);
3807	movaps	%xmm6,0x10(%rsp)
3808	movaps	%xmm7,0x20(%rsp)
3809	movaps	%xmm8,0x30(%rsp)
3810	movaps	%xmm9,0x40(%rsp)
3811	movaps	%xmm10,0x50(%rsp)
3812	movaps	%xmm11,0x60(%rsp)
3813	movaps	%xmm12,0x70(%rsp)
3814	movaps	%xmm13,0x80(%rsp)
3815	movaps	%xmm14,0x90(%rsp)
3816	movaps	%xmm15,0xa0(%rsp)
3817.Lcbc_decrypt_body:
3818___
3819
# From here on both $key_ and the new $inp_ name %rbp (pushed above).
# The two uses do not overlap in the visible code: $inp_ is only touched
# inside the 8-block loop, where $key itself is adjusted by constants
# rather than restored from $key_.
3820my $inp_=$key_="%rbp";			# reassign $key_
3821
# Bulk decrypt. Inputs of at most 0x50 bytes go straight to the tail
# code. Otherwise six blocks are loaded up front (copies of the first
# five are kept in $in0..$in4 for the later CBC XOR) and
# OPENSSL_ia32cap_P+4 selects the 6-block loop (MOVBE set, XSAVE clear)
# or the 8-block loop.
# In the 8-block loop $inp_ is computed without a branch: it starts at
# -1, "adc 0" adds the carry left by "cmp 0x70,$len", "and 128" reduces
# the result to 0 or 128, and $inp is added. The look-ahead loads at
# .Lcbc_dec_done therefore read 128 bytes ahead only when the compare
# did not borrow, and re-read from the current $inp otherwise. This
# relies on the SSE/AES instructions between the cmp and the adc leaving
# the flags untouched ($movkey is presumably a plain SSE move).
3822$code.=<<___;
3823	mov	$key,$key_		# [re-]backup $key [after reassignment]
3824	movups	($ivp),$iv
3825	mov	$rnds_,$rounds
3826	cmp	\$0x50,$len
3827	jbe	.Lcbc_dec_tail
3828
3829	$movkey	($key),$rndkey0
3830	movdqu	0x00($inp),$inout0	# load input
3831	movdqu	0x10($inp),$inout1
3832	movdqa	$inout0,$in0
3833	movdqu	0x20($inp),$inout2
3834	movdqa	$inout1,$in1
3835	movdqu	0x30($inp),$inout3
3836	movdqa	$inout2,$in2
3837	movdqu	0x40($inp),$inout4
3838	movdqa	$inout3,$in3
3839	movdqu	0x50($inp),$inout5
3840	movdqa	$inout4,$in4
3841	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
3842	cmp	\$0x70,$len
3843	jbe	.Lcbc_dec_six_or_seven
3844
3845	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
3846	sub	\$0x50,$len		# $len is biased by -5*16
3847	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
3848	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
3849	sub	\$0x20,$len		# $len is biased by -7*16
3850	lea	0x70($key),$key		# size optimization
3851	jmp	.Lcbc_dec_loop8_enter
3852.align	16
3853.Lcbc_dec_loop8:
3854	movups	$inout7,($out)
3855	lea	0x10($out),$out
3856.Lcbc_dec_loop8_enter:
3857	movdqu		0x60($inp),$inout6
3858	pxor		$rndkey0,$inout0
3859	movdqu		0x70($inp),$inout7
3860	pxor		$rndkey0,$inout1
3861	$movkey		0x10-0x70($key),$rndkey1
3862	pxor		$rndkey0,$inout2
3863	mov		\$-1,$inp_
3864	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
3865	pxor		$rndkey0,$inout3
3866	pxor		$rndkey0,$inout4
3867	pxor		$rndkey0,$inout5
3868	pxor		$rndkey0,$inout6
3869
3870	aesdec		$rndkey1,$inout0
3871	pxor		$rndkey0,$inout7
3872	$movkey		0x20-0x70($key),$rndkey0
3873	aesdec		$rndkey1,$inout1
3874	aesdec		$rndkey1,$inout2
3875	aesdec		$rndkey1,$inout3
3876	aesdec		$rndkey1,$inout4
3877	aesdec		$rndkey1,$inout5
3878	aesdec		$rndkey1,$inout6
3879	adc		\$0,$inp_
3880	and		\$128,$inp_
3881	aesdec		$rndkey1,$inout7
3882	add		$inp,$inp_
3883	$movkey		0x30-0x70($key),$rndkey1
3884___
# Emit the following rounds of the 8-block loop, alternating between the
# two round-key registers. "cmp 11,$rounds" is emitted once (at $i==7);
# its flags feed the jb emitted after that round and the je emitted
# after $i==9, so schedules with fewer rounds leave for .Lcbc_dec_done
# early, while $i==11 always jumps there. The nops are emitted only for
# selected rounds, presumably as padding -- their purpose is not stated
# in the visible code.
3885for($i=1;$i<12;$i++) {
3886my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3887$code.=<<___	if ($i==7);
3888	cmp		\$11,$rounds
3889___
3890$code.=<<___;
3891	aesdec		$rndkeyx,$inout0
3892	aesdec		$rndkeyx,$inout1
3893	aesdec		$rndkeyx,$inout2
3894	aesdec		$rndkeyx,$inout3
3895	aesdec		$rndkeyx,$inout4
3896	aesdec		$rndkeyx,$inout5
3897	aesdec		$rndkeyx,$inout6
3898	aesdec		$rndkeyx,$inout7
3899	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
3900___
3901$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
3902	nop
3903___
3904$code.=<<___	if ($i==7);
3905	jb		.Lcbc_dec_done
3906___
3907$code.=<<___	if ($i==9);
3908	je		.Lcbc_dec_done
3909___
3910$code.=<<___	if ($i==11);
3911	jmp		.Lcbc_dec_done
3912___
3913}
3914$code.=<<___;
3915.align	16
	# Last two rounds of the 8-block loop. The IV and the saved
	# ciphertext copies are XORed with the last round key first, so that
	# aesdeclast applies the final AddRoundKey and the CBC XOR in one
	# step. Input for the next iteration is loaded through the
	# look-ahead pointer while the final rounds execute.
3916.Lcbc_dec_done:
3917	aesdec		$rndkey1,$inout0
3918	aesdec		$rndkey1,$inout1
3919	pxor		$rndkey0,$iv
3920	pxor		$rndkey0,$in0
3921	aesdec		$rndkey1,$inout2
3922	aesdec		$rndkey1,$inout3
3923	pxor		$rndkey0,$in1
3924	pxor		$rndkey0,$in2
3925	aesdec		$rndkey1,$inout4
3926	aesdec		$rndkey1,$inout5
3927	pxor		$rndkey0,$in3
3928	pxor		$rndkey0,$in4
3929	aesdec		$rndkey1,$inout6
3930	aesdec		$rndkey1,$inout7
3931	movdqu		0x50($inp),$rndkey1
3932
3933	aesdeclast	$iv,$inout0
3934	movdqu		0x60($inp),$iv		# borrow $iv
3935	pxor		$rndkey0,$rndkey1
3936	aesdeclast	$in0,$inout1
3937	pxor		$rndkey0,$iv
3938	movdqu		0x70($inp),$rndkey0	# next IV
3939	aesdeclast	$in1,$inout2
3940	lea		0x80($inp),$inp
3941	movdqu		0x00($inp_),$in0
3942	aesdeclast	$in2,$inout3
3943	aesdeclast	$in3,$inout4
3944	movdqu		0x10($inp_),$in1
3945	movdqu		0x20($inp_),$in2
3946	aesdeclast	$in4,$inout5
3947	aesdeclast	$rndkey1,$inout6
3948	movdqu		0x30($inp_),$in3
3949	movdqu		0x40($inp_),$in4
3950	aesdeclast	$iv,$inout7
3951	movdqa		$rndkey0,$iv		# return $iv
3952	movdqu		0x50($inp_),$rndkey1
3953	$movkey		-0x70($key),$rndkey0
3954
	# Store seven results now; the eighth is stored at the top of the
	# next iteration or by the exit code below.
3955	movups		$inout0,($out)		# store output
3956	movdqa		$in0,$inout0
3957	movups		$inout1,0x10($out)
3958	movdqa		$in1,$inout1
3959	movups		$inout2,0x20($out)
3960	movdqa		$in2,$inout2
3961	movups		$inout3,0x30($out)
3962	movdqa		$in3,$inout3
3963	movups		$inout4,0x40($out)
3964	movdqa		$in4,$inout4
3965	movups		$inout5,0x50($out)
3966	movdqa		$rndkey1,$inout5
3967	movups		$inout6,0x60($out)
3968	lea		0x70($out),$out
3969
3970	sub	\$0x80,$len
3971	ja	.Lcbc_dec_loop8
3972
	# Loop exit: undo the key pointer adjustment and the length bias.
3973	movaps	$inout7,$inout0
3974	lea	-0x70($key),$key
3975	add	\$0x70,$len
3976	jle	.Lcbc_dec_clear_tail_collected
3977	movups	$inout7,($out)
3978	lea	0x10($out),$out
3979	cmp	\$0x50,$len
3980	jbe	.Lcbc_dec_tail
3981
3982	movaps	$in0,$inout0
3983.Lcbc_dec_six_or_seven:
3984	cmp	\$0x60,$len
3985	ja	.Lcbc_dec_seven
3986
	# Six blocks: keep the raw sixth input block as the next IV.
3987	movaps	$inout5,$inout6
3988	call	_aesni_decrypt6
3989	pxor	$iv,$inout0		# ^= IV
3990	movaps	$inout6,$iv
3991	pxor	$in0,$inout1
3992	movdqu	$inout0,($out)
3993	pxor	$in1,$inout2
3994	movdqu	$inout1,0x10($out)
3995	 pxor	$inout1,$inout1		# clear register bank
3996	pxor	$in2,$inout3
3997	movdqu	$inout2,0x20($out)
3998	 pxor	$inout2,$inout2
3999	pxor	$in3,$inout4
4000	movdqu	$inout3,0x30($out)
4001	 pxor	$inout3,$inout3
4002	pxor	$in4,$inout5
4003	movdqu	$inout4,0x40($out)
4004	 pxor	$inout4,$inout4
4005	lea	0x50($out),$out
4006	movdqa	$inout5,$inout0
4007	 pxor	$inout5,$inout5
4008	jmp	.Lcbc_dec_tail_collected
4009
4010.align	16
	# Seven blocks: run the eight-block routine with the eighth input
	# zeroed; the sixth and seventh input blocks are re-read from memory
	# afterwards for the CBC XOR and the next IV.
4011.Lcbc_dec_seven:
4012	movups	0x60($inp),$inout6
4013	xorps	$inout7,$inout7
4014	call	_aesni_decrypt8
4015	movups	0x50($inp),$inout7
4016	pxor	$iv,$inout0		# ^= IV
4017	movups	0x60($inp),$iv
4018	pxor	$in0,$inout1
4019	movdqu	$inout0,($out)
4020	pxor	$in1,$inout2
4021	movdqu	$inout1,0x10($out)
4022	 pxor	$inout1,$inout1		# clear register bank
4023	pxor	$in2,$inout3
4024	movdqu	$inout2,0x20($out)
4025	 pxor	$inout2,$inout2
4026	pxor	$in3,$inout4
4027	movdqu	$inout3,0x30($out)
4028	 pxor	$inout3,$inout3
4029	pxor	$in4,$inout5
4030	movdqu	$inout4,0x40($out)
4031	 pxor	$inout4,$inout4
4032	pxor	$inout7,$inout6
4033	movdqu	$inout5,0x50($out)
4034	 pxor	$inout5,$inout5
4035	lea	0x60($out),$out
4036	movdqa	$inout6,$inout0
4037	 pxor	$inout6,$inout6
4038	 pxor	$inout7,$inout7
4039	jmp	.Lcbc_dec_tail_collected
4040
4041.align	16
	# Six-block loop: the sixth result of the previous pass is stored at
	# the top, then six new input blocks are loaded.
4042.Lcbc_dec_loop6:
4043	movups	$inout5,($out)
4044	lea	0x10($out),$out
4045	movdqu	0x00($inp),$inout0	# load input
4046	movdqu	0x10($inp),$inout1
4047	movdqa	$inout0,$in0
4048	movdqu	0x20($inp),$inout2
4049	movdqa	$inout1,$in1
4050	movdqu	0x30($inp),$inout3
4051	movdqa	$inout2,$in2
4052	movdqu	0x40($inp),$inout4
4053	movdqa	$inout3,$in3
4054	movdqu	0x50($inp),$inout5
4055	movdqa	$inout4,$in4
4056.Lcbc_dec_loop6_enter:
4057	lea	0x60($inp),$inp
4058	movdqa	$inout5,$inout6
4059
4060	call	_aesni_decrypt6
4061
4062	pxor	$iv,$inout0		# ^= IV
4063	movdqa	$inout6,$iv
4064	pxor	$in0,$inout1
4065	movdqu	$inout0,($out)
4066	pxor	$in1,$inout2
4067	movdqu	$inout1,0x10($out)
4068	pxor	$in2,$inout3
4069	movdqu	$inout2,0x20($out)
4070	pxor	$in3,$inout4
4071	mov	$key_,$key
4072	movdqu	$inout3,0x30($out)
4073	pxor	$in4,$inout5
4074	mov	$rnds_,$rounds
4075	movdqu	$inout4,0x40($out)
4076	lea	0x50($out),$out
4077	sub	\$0x60,$len
4078	ja	.Lcbc_dec_loop6
4079
4080	movdqa	$inout5,$inout0
4081	add	\$0x50,$len
4082	jle	.Lcbc_dec_clear_tail_collected
4083	movups	$inout5,($out)
4084	lea	0x10($out),$out
4085
	# Tail: the length is reduced by 16 per loaded block, and the first
	# subtraction that reaches zero or borrows selects the block count.
4086.Lcbc_dec_tail:
4087	movups	($inp),$inout0
4088	sub	\$0x10,$len
4089	jbe	.Lcbc_dec_one		# $len is 1*16 or less
4090
4091	movups	0x10($inp),$inout1
4092	movaps	$inout0,$in0
4093	sub	\$0x10,$len
4094	jbe	.Lcbc_dec_two		# $len is 2*16 or less
4095
4096	movups	0x20($inp),$inout2
4097	movaps	$inout1,$in1
4098	sub	\$0x10,$len
4099	jbe	.Lcbc_dec_three		# $len is 3*16 or less
4100
4101	movups	0x30($inp),$inout3
4102	movaps	$inout2,$in2
4103	sub	\$0x10,$len
4104	jbe	.Lcbc_dec_four		# $len is 4*16 or less
4105
4106	movups	0x40($inp),$inout4	# $len is 5*16 or less
4107	movaps	$inout3,$in3
4108	movaps	$inout4,$in4
4109	xorps	$inout5,$inout5
4110	call	_aesni_decrypt6
4111	pxor	$iv,$inout0
4112	movaps	$in4,$iv
4113	pxor	$in0,$inout1
4114	movdqu	$inout0,($out)
4115	pxor	$in1,$inout2
4116	movdqu	$inout1,0x10($out)
4117	 pxor	$inout1,$inout1		# clear register bank
4118	pxor	$in2,$inout3
4119	movdqu	$inout2,0x20($out)
4120	 pxor	$inout2,$inout2
4121	pxor	$in3,$inout4
4122	movdqu	$inout3,0x30($out)
4123	 pxor	$inout3,$inout3
4124	lea	0x40($out),$out
4125	movdqa	$inout4,$inout0
4126	 pxor	$inout4,$inout4
4127	 pxor	$inout5,$inout5
4128	sub	\$0x10,$len
4129	jmp	.Lcbc_dec_tail_collected
4130
4131.align	16
4132.Lcbc_dec_one:
4133	movaps	$inout0,$in0
4134___
4135	&aesni_generate1("dec",$key,$rounds);
4136$code.=<<___;
4137	xorps	$iv,$inout0
4138	movaps	$in0,$iv
4139	jmp	.Lcbc_dec_tail_collected
4140.align	16
4141.Lcbc_dec_two:
4142	movaps	$inout1,$in1
4143	call	_aesni_decrypt2
4144	pxor	$iv,$inout0
4145	movaps	$in1,$iv
4146	pxor	$in0,$inout1
4147	movdqu	$inout0,($out)
4148	movdqa	$inout1,$inout0
4149	 pxor	$inout1,$inout1		# clear register bank
4150	lea	0x10($out),$out
4151	jmp	.Lcbc_dec_tail_collected
4152.align	16
4153.Lcbc_dec_three:
4154	movaps	$inout2,$in2
4155	call	_aesni_decrypt3
4156	pxor	$iv,$inout0
4157	movaps	$in2,$iv
4158	pxor	$in0,$inout1
4159	movdqu	$inout0,($out)
4160	pxor	$in1,$inout2
4161	movdqu	$inout1,0x10($out)
4162	 pxor	$inout1,$inout1		# clear register bank
4163	movdqa	$inout2,$inout0
4164	 pxor	$inout2,$inout2
4165	lea	0x20($out),$out
4166	jmp	.Lcbc_dec_tail_collected
4167.align	16
4168.Lcbc_dec_four:
4169	movaps	$inout3,$in3
4170	call	_aesni_decrypt4
4171	pxor	$iv,$inout0
4172	movaps	$in3,$iv
4173	pxor	$in0,$inout1
4174	movdqu	$inout0,($out)
4175	pxor	$in1,$inout2
4176	movdqu	$inout1,0x10($out)
4177	 pxor	$inout1,$inout1		# clear register bank
4178	pxor	$in2,$inout3
4179	movdqu	$inout2,0x20($out)
4180	 pxor	$inout2,$inout2
4181	movdqa	$inout3,$inout0
4182	 pxor	$inout3,$inout3
4183	lea	0x30($out),$out
4184	jmp	.Lcbc_dec_tail_collected
4185
4186.align	16
4187.Lcbc_dec_clear_tail_collected:
4188	pxor	$inout1,$inout1		# clear register bank
4189	pxor	$inout2,$inout2
4190	pxor	$inout3,$inout3
4191___
# These four registers are zeroed only on non-Win64 targets; on Win64
# %xmm6..9 are reloaded from the stack save area at .Lcbc_dec_ret.
4192$code.=<<___ if (!$win64);
4193	pxor	$inout4,$inout4		# %xmm6..9
4194	pxor	$inout5,$inout5
4195	pxor	$inout6,$inout6
4196	pxor	$inout7,$inout7
4197___
4198$code.=<<___;
	# Common exit: the last decrypted block is still in a register. The
	# IV is stored back first; the block is then written whole when the
	# low four bits of the length are zero, and otherwise bounced
	# through the aligned scratch slot at the stack pointer and copied
	# with rep movsb, after which the slot is overwritten with zeros.
4199.Lcbc_dec_tail_collected:
4200	movups	$iv,($ivp)
4201	and	\$15,$len
4202	jnz	.Lcbc_dec_tail_partial
4203	movups	$inout0,($out)
4204	pxor	$inout0,$inout0
4205	jmp	.Lcbc_dec_ret
4206.align	16
4207.Lcbc_dec_tail_partial:
4208	movaps	$inout0,(%rsp)
4209	pxor	$inout0,$inout0
4210	mov	\$16,%rcx
4211	mov	$out,%rdi
4212	sub	$len,%rcx
4213	lea	(%rsp),%rsi
4214	.long	0x9066A4F3		# rep movsb
4215	movdqa	$inout0,(%rsp)
4216
4217.Lcbc_dec_ret:
4218	xorps	$rndkey0,$rndkey0	# %xmm0
4219	pxor	$rndkey1,$rndkey1
4220___
# Win64: restore xmm6..xmm15 and overwrite each save slot with the
# zeroed %xmm0.
4221$code.=<<___ if ($win64);
4222	movaps	0x10(%rsp),%xmm6
4223	movaps	%xmm0,0x10(%rsp)	# clear stack
4224	movaps	0x20(%rsp),%xmm7
4225	movaps	%xmm0,0x20(%rsp)
4226	movaps	0x30(%rsp),%xmm8
4227	movaps	%xmm0,0x30(%rsp)
4228	movaps	0x40(%rsp),%xmm9
4229	movaps	%xmm0,0x40(%rsp)
4230	movaps	0x50(%rsp),%xmm10
4231	movaps	%xmm0,0x50(%rsp)
4232	movaps	0x60(%rsp),%xmm11
4233	movaps	%xmm0,0x60(%rsp)
4234	movaps	0x70(%rsp),%xmm12
4235	movaps	%xmm0,0x70(%rsp)
4236	movaps	0x80(%rsp),%xmm13
4237	movaps	%xmm0,0x80(%rsp)
4238	movaps	0x90(%rsp),%xmm14
4239	movaps	%xmm0,0x90(%rsp)
4240	movaps	0xa0(%rsp),%xmm15
4241	movaps	%xmm0,0xa0(%rsp)
4242___
4243$code.=<<___;
	# %r11 still holds the stack pointer from before the frame was set
	# up; %rbp was pushed immediately below it.
4244	mov	-8(%r11),%rbp
4245.cfi_restore	%rbp
4246	lea	(%r11),%rsp
4247.cfi_def_cfa_register	%rsp
4248.Lcbc_ret:
4249	ret
4250.cfi_endproc
4251.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4252___
4253}
4254# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4255#				int bits, AES_KEY *key)
4256#
4257# input:	$inp	user-supplied key
4258#		$bits	$inp length in bits
4259#		$key	pointer to key schedule
4260# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
4261#		*$key	key schedule
4262#
# The generated code first builds the encryption schedule by calling
# __aesni_set_encrypt_key and, if %eax comes back non-zero, returns it
# unchanged. Otherwise it reverses the order of the round keys in place:
# the outermost pair is swapped as is, each inner pair is swapped with
# aesimc applied to both keys, and the key left at ($key) when the two
# pointers meet is run through aesimc and stored to ($inp).
# %xmm0 and %xmm1 are zeroed before returning on the success path.
4263{ my ($inp,$bits,$key) = @_4args;
4264  $bits =~ s/%r/%e/;
4265
4266$code.=<<___;
4267.globl	${PREFIX}_set_decrypt_key
4268.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
4269.align	16
4270${PREFIX}_set_decrypt_key:
4271.cfi_startproc
4272	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4273.cfi_adjust_cfa_offset	8
4274	call	__aesni_set_encrypt_key
4275	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
4276	test	%eax,%eax
4277	jnz	.Ldec_key_ret
4278	lea	16($key,$bits),$inp	# points at the end of key schedule
4279
4280	$movkey	($key),%xmm0		# just swap
4281	$movkey	($inp),%xmm1
4282	$movkey	%xmm0,($inp)
4283	$movkey	%xmm1,($key)
4284	lea	16($key),$key
4285	lea	-16($inp),$inp
4286
	# Both pointers are advanced before the stores, hence the +16 and
	# -16 displacements; the loop runs while the upper pointer is still
	# above the lower one.
4287.Ldec_key_inverse:
4288	$movkey	($key),%xmm0		# swap and inverse
4289	$movkey	($inp),%xmm1
4290	aesimc	%xmm0,%xmm0
4291	aesimc	%xmm1,%xmm1
4292	lea	16($key),$key
4293	lea	-16($inp),$inp
4294	$movkey	%xmm0,16($inp)
4295	$movkey	%xmm1,-16($key)
4296	cmp	$key,$inp
4297	ja	.Ldec_key_inverse
4298
4299	$movkey	($key),%xmm0		# inverse middle
4300	aesimc	%xmm0,%xmm0
4301	pxor	%xmm1,%xmm1
4302	$movkey	%xmm0,($inp)
4303	pxor	%xmm0,%xmm0
4304.Ldec_key_ret:
4305	add	\$8,%rsp
4306.cfi_adjust_cfa_offset	-8
4307	ret
4308.cfi_endproc
4309.LSEH_end_set_decrypt_key:
4310.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4311___
4312
4313# This is based on submission from Intel by
4314#	Huang Ying
4315#	Vinodh Gopal
4316#	Kahraman Akdemir
4317#
4318# Aggressively optimized with respect to aeskeygenassist's critical path
4319# and confined to %xmm0-5 to meet the Win64 ABI requirement.
4320#
4321# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4322#				int bits, AES_KEY * const key);
4323#
4324# input:	$inp	user-supplied key
4325#		$bits	$inp length in bits
4326#		$key	pointer to key schedule
4327# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
4328#		$bits	rounds-1 (used in aesni_set_decrypt_key)
4329#		*$key	key schedule
4330#		$key	pointer to key schedule (used in
4331#			aesni_set_decrypt_key)
4332#
4333# The subroutine is frame-less, which means that only volatile registers
4334# are used. Note that it's declared "abi-omnipotent", which means that
4335# the number of volatile registers is smaller on Windows.
4336#
4337$code.=<<___;
4338.globl	${PREFIX}_set_encrypt_key
4339.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
4340.align	16
4341${PREFIX}_set_encrypt_key:
4342__aesni_set_encrypt_key:
4343.cfi_startproc
4344	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4345.cfi_adjust_cfa_offset	8
4346	mov	\$-1,%rax		# preset return value: -1 = NULL argument
4347	test	$inp,$inp
4348	jz	.Lenc_key_ret		# user key pointer is NULL
4349	test	$key,$key
4350	jz	.Lenc_key_ret		# key schedule pointer is NULL
4351
4352	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
4353	movups	($inp),%xmm0		# pull first 128 bits of *userKey
4354	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
4355	and	OPENSSL_ia32cap_P+4(%rip),%r10d	# keep only those two capability bits
4356	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	# dispatch on the key length in bits; anything but 128/192/256 returns -2
4357	cmp	\$256,$bits
4358	je	.L14rounds
4359	cmp	\$192,$bits
4360	je	.L12rounds
4361	cmp	\$128,$bits
4362	jne	.Lbad_keybits
4363
4364.L10rounds:
4365	mov	\$9,$bits			# 10 rounds for 128-bit key
4366	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4367	je	.L10rounds_alt			# such CPUs take the alternative path
4368
	# Every .Lkey_expansion_128 call stores the previous round key at (%rax)
	# and advances %rax by 16; the _cold entry skips that store because
	# round 0 is written directly below.
4369	$movkey	%xmm0,($key)			# round 0
4370	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
4371	call		.Lkey_expansion_128_cold
4372	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
4373	call		.Lkey_expansion_128
4374	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
4375	call		.Lkey_expansion_128
4376	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
4377	call		.Lkey_expansion_128
4378	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
4379	call		.Lkey_expansion_128
4380	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
4381	call		.Lkey_expansion_128
4382	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
4383	call		.Lkey_expansion_128
4384	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
4385	call		.Lkey_expansion_128
4386	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
4387	call		.Lkey_expansion_128
4388	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
4389	call		.Lkey_expansion_128
4390	$movkey	%xmm0,(%rax)			# store the final round key
4391	mov	$bits,80(%rax)	# 240(%rdx)
4392	xor	%eax,%eax			# return 0 = success
4393	jmp	.Lenc_key_ret
4394
4395.align	16
	# Alternative 128-bit schedule, taken when only the AVX bit survived the
	# capability mask: eight loop iterations plus two unrolled rounds.
4396.L10rounds_alt:
4397	movdqa	.Lkey_rotate(%rip),%xmm5	# byte-shuffle mask
4398	mov	\$8,%r10d			# loop counter
4399	movdqa	.Lkey_rcon1(%rip),%xmm4		# round constant, starts at 1
4400	movdqa	%xmm0,%xmm2
4401	movdqu	%xmm0,($key)			# round 0
4402	jmp	.Loop_key128
4403
4404.align	16
4405.Loop_key128:
4406	pshufb		%xmm5,%xmm0
4407	aesenclast	%xmm4,%xmm0
4408	pslld		\$1,%xmm4		# double the round constant
4409	lea		16(%rax),%rax
4410
	# turn the previous round key into the running XOR of its own dwords:
	# k ^ k<<32 ^ k<<64 ^ k<<96
4411	movdqa		%xmm2,%xmm3
4412	pslldq		\$4,%xmm2
4413	pxor		%xmm2,%xmm3
4414	pslldq		\$4,%xmm2
4415	pxor		%xmm2,%xmm3
4416	pslldq		\$4,%xmm2
4417	pxor		%xmm3,%xmm2
4418
4419	pxor		%xmm2,%xmm0
4420	movdqu		%xmm0,-16(%rax)		# store the new round key
4421	movdqa		%xmm0,%xmm2
4422
4423	dec	%r10d
4424	jnz	.Loop_key128
4425
	# the constant is reloaded as 0x1b here and doubled once more below
	# for the last round
4426	movdqa		.Lkey_rcon1b(%rip),%xmm4
4427
4428	pshufb		%xmm5,%xmm0
4429	aesenclast	%xmm4,%xmm0
4430	pslld		\$1,%xmm4
4431
4432	movdqa		%xmm2,%xmm3
4433	pslldq		\$4,%xmm2
4434	pxor		%xmm2,%xmm3
4435	pslldq		\$4,%xmm2
4436	pxor		%xmm2,%xmm3
4437	pslldq		\$4,%xmm2
4438	pxor		%xmm3,%xmm2
4439
4440	pxor		%xmm2,%xmm0
4441	movdqu		%xmm0,(%rax)
4442
4443	movdqa		%xmm0,%xmm2
4444	pshufb		%xmm5,%xmm0
4445	aesenclast	%xmm4,%xmm0
4446
4447	movdqa		%xmm2,%xmm3
4448	pslldq		\$4,%xmm2
4449	pxor		%xmm2,%xmm3
4450	pslldq		\$4,%xmm2
4451	pxor		%xmm2,%xmm3
4452	pslldq		\$4,%xmm2
4453	pxor		%xmm3,%xmm2
4454
4455	pxor		%xmm2,%xmm0
4456	movdqu		%xmm0,16(%rax)
4457
4458	mov	$bits,96(%rax)	# 240($key)
4459	xor	%eax,%eax			# return 0 = success
4460	jmp	.Lenc_key_ret
4461
4462.align	16
	# 192-bit key: the helpers alternate between the 192a and 192b entry
	# points; 192a stores one 16-byte block per call and 192b stores two.
4463.L12rounds:
4464	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
4465	mov	\$11,$bits			# 12 rounds for 192
4466	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4467	je	.L12rounds_alt
4468
4469	$movkey	%xmm0,($key)			# round 0
4470	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
4471	call		.Lkey_expansion_192a_cold
4472	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
4473	call		.Lkey_expansion_192b
4474	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
4475	call		.Lkey_expansion_192a
4476	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
4477	call		.Lkey_expansion_192b
4478	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
4479	call		.Lkey_expansion_192a
4480	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
4481	call		.Lkey_expansion_192b
4482	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
4483	call		.Lkey_expansion_192a
4484	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
4485	call		.Lkey_expansion_192b
4486	$movkey	%xmm0,(%rax)			# store the final round key
4487	mov	$bits,48(%rax)	# 240(%rdx)
4488	xor	%rax, %rax			# return 0 = success
4489	jmp	.Lenc_key_ret
4490
4491.align	16
	# Alternative 192-bit schedule: eight iterations, each writing 24 bytes
	# (8 bytes at the old %rax, then 16 bytes just below the advanced %rax).
4492.L12rounds_alt:
4493	movdqa	.Lkey_rotate192(%rip),%xmm5	# byte-shuffle mask
4494	movdqa	.Lkey_rcon1(%rip),%xmm4		# round constant, starts at 1
4495	mov	\$8,%r10d			# loop counter
4496	movdqu	%xmm0,($key)			# first 128 bits of the user key
4497	jmp	.Loop_key192
4498
4499.align	16
4500.Loop_key192:
4501	movq		%xmm2,0(%rax)		# low 64 bits of the second register
4502	movdqa		%xmm2,%xmm1
4503	pshufb		%xmm5,%xmm2
4504	aesenclast	%xmm4,%xmm2
4505	pslld		\$1, %xmm4		# double the round constant
4506	lea		24(%rax),%rax
4507
	# running XOR of the dwords of the first register, as in the 128-bit loop
4508	movdqa		%xmm0,%xmm3
4509	pslldq		\$4,%xmm0
4510	pxor		%xmm0,%xmm3
4511	pslldq		\$4,%xmm0
4512	pxor		%xmm0,%xmm3
4513	pslldq		\$4,%xmm0
4514	pxor		%xmm3,%xmm0
4515
	# broadcast the top dword of that result and fold it into the saved
	# copy of the second register and its 32-bit left shift
4516	pshufd		\$0xff,%xmm0,%xmm3
4517	pxor		%xmm1,%xmm3
4518	pslldq		\$4,%xmm1
4519	pxor		%xmm1,%xmm3
4520
4521	pxor		%xmm2,%xmm0
4522	pxor		%xmm3,%xmm2
4523	movdqu		%xmm0,-16(%rax)
4524
4525	dec	%r10d
4526	jnz	.Loop_key192
4527
4528	mov	$bits,32(%rax)	# 240($key)
4529	xor	%eax,%eax			# return 0 = success
4530	jmp	.Lenc_key_ret
4531
4532.align	16
	# 256-bit key: both halves of the user key are stored as-is, then the
	# 256a and 256b helpers alternate, each storing the register the other
	# one updated and advancing %rax by 16.
4533.L14rounds:
4534	movups	16($inp),%xmm2			# remaining half of *userKey
4535	mov	\$13,$bits			# 14 rounds for 256
4536	lea	16(%rax),%rax			# %rax now points 32 bytes into the schedule
4537	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4538	je	.L14rounds_alt
4539
4540	$movkey	%xmm0,($key)			# round 0
4541	$movkey	%xmm2,16($key)			# round 1
4542	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
4543	call		.Lkey_expansion_256a_cold
4544	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
4545	call		.Lkey_expansion_256b
4546	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
4547	call		.Lkey_expansion_256a
4548	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
4549	call		.Lkey_expansion_256b
4550	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
4551	call		.Lkey_expansion_256a
4552	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
4553	call		.Lkey_expansion_256b
4554	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
4555	call		.Lkey_expansion_256a
4556	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
4557	call		.Lkey_expansion_256b
4558	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
4559	call		.Lkey_expansion_256a
4560	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
4561	call		.Lkey_expansion_256b
4562	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
4563	call		.Lkey_expansion_256a
4564	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
4565	call		.Lkey_expansion_256b
4566	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
4567	call		.Lkey_expansion_256a
4568	$movkey	%xmm0,(%rax)			# store the final round key
4569	mov	$bits,16(%rax)	# 240(%rdx)
4570	xor	%rax,%rax			# return 0 = success
4571	jmp	.Lenc_key_ret
4572
4573.align	16
	# Alternative 256-bit schedule.  The counter is 7: the first half of the
	# loop body runs seven times, the second half six times, because the
	# loop exits through .Ldone_key256 in the middle of the last pass.
4574.L14rounds_alt:
4575	movdqa	.Lkey_rotate(%rip),%xmm5	# byte-shuffle mask
4576	movdqa	.Lkey_rcon1(%rip),%xmm4		# round constant, starts at 1
4577	mov	\$7,%r10d
4578	movdqu	%xmm0,0($key)			# first half of the user key
4579	movdqa	%xmm2,%xmm1
4580	movdqu	%xmm2,16($key)			# second half of the user key
4581	jmp	.Loop_key256
4582
4583.align	16
4584.Loop_key256:
4585	pshufb		%xmm5,%xmm2
4586	aesenclast	%xmm4,%xmm2
4587
	# running XOR of the dwords of the first register
4588	movdqa		%xmm0,%xmm3
4589	pslldq		\$4,%xmm0
4590	pxor		%xmm0,%xmm3
4591	pslldq		\$4,%xmm0
4592	pxor		%xmm0,%xmm3
4593	pslldq		\$4,%xmm0
4594	pxor		%xmm3,%xmm0
4595	pslld		\$1,%xmm4		# double the round constant
4596
4597	pxor		%xmm2,%xmm0
4598	movdqu		%xmm0,(%rax)
4599
4600	dec	%r10d
4601	jz	.Ldone_key256
4602
	# broadcast the top dword of the key just stored and run it through the
	# last-round primitive with an all-zero round key
4603	pshufd		\$0xff,%xmm0,%xmm2
4604	pxor		%xmm3,%xmm3
4605	aesenclast	%xmm3,%xmm2
4606
	# running XOR of the dwords of the second register
4607	movdqa		%xmm1,%xmm3
4608	pslldq		\$4,%xmm1
4609	pxor		%xmm1,%xmm3
4610	pslldq		\$4,%xmm1
4611	pxor		%xmm1,%xmm3
4612	pslldq		\$4,%xmm1
4613	pxor		%xmm3,%xmm1
4614
4615	pxor		%xmm1,%xmm2
4616	movdqu		%xmm2,16(%rax)
4617	lea		32(%rax),%rax
4618	movdqa		%xmm2,%xmm1
4619
4620	jmp	.Loop_key256
4621
4622.Ldone_key256:
4623	mov	$bits,16(%rax)	# 240($key)
4624	xor	%eax,%eax			# return 0 = success
4625	jmp	.Lenc_key_ret
4626
4627.align	16
4628.Lbad_keybits:
4629	mov	\$-2,%rax		# key length was not 128, 192 or 256
	# Common exit for every path; %rax already holds the return value.
	# The six registers used above are zeroed before returning
	# (presumably so that no key-derived data is left behind in them).
4630.Lenc_key_ret:
4631	pxor	%xmm0,%xmm0
4632	pxor	%xmm1,%xmm1
4633	pxor	%xmm2,%xmm2
4634	pxor	%xmm3,%xmm3
4635	pxor	%xmm4,%xmm4
4636	pxor	%xmm5,%xmm5
4637	add	\$8,%rsp		# undo the 8-byte adjustment made on entry
4638.cfi_adjust_cfa_offset	-8
4639	ret
4640.cfi_endproc
4641.LSEH_end_set_encrypt_key:
4642
4643.align	16
	# 128-bit helper.  The main entry stores the current round key at (%rax)
	# and advances %rax by 16; the _cold entry skips the store.  Both then
	# derive the next round key in place from the key-generation assist
	# result the caller left in the second scratch register.
4644.Lkey_expansion_128:
4645	$movkey	%xmm0,(%rax)
4646	lea	16(%rax),%rax
4647.Lkey_expansion_128_cold:
4648	shufps	\$0b00010000,%xmm0,%xmm4
4649	xorps	%xmm4, %xmm0
4650	shufps	\$0b10001100,%xmm0,%xmm4
4651	xorps	%xmm4, %xmm0
4652	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4653	xorps	%xmm1,%xmm0
4654	ret
4655
4656.align 16
	# 192-bit helpers.  192a stores one 16-byte block at (%rax) and saves a
	# copy of the second key register for the following 192b call; the _cold
	# entry skips the store.  192b stores two 16-byte blocks assembled from
	# that saved copy and the current registers, then shares the update code
	# at the _warm label.
4657.Lkey_expansion_192a:
4658	$movkey	%xmm0,(%rax)
4659	lea	16(%rax),%rax
4660.Lkey_expansion_192a_cold:
4661	movaps	%xmm2, %xmm5
4662.Lkey_expansion_192b_warm:
4663	shufps	\$0b00010000,%xmm0,%xmm4
4664	movdqa	%xmm2,%xmm3
4665	xorps	%xmm4,%xmm0
4666	shufps	\$0b10001100,%xmm0,%xmm4
4667	pslldq	\$4,%xmm3
4668	xorps	%xmm4,%xmm0
4669	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
4670	pxor	%xmm3,%xmm2
4671	pxor	%xmm1,%xmm0
4672	pshufd	\$0b11111111,%xmm0,%xmm3
4673	pxor	%xmm3,%xmm2
4674	ret
4675
4676.align 16
4677.Lkey_expansion_192b:
4678	movaps	%xmm0,%xmm3
4679	shufps	\$0b01000100,%xmm0,%xmm5
4680	$movkey	%xmm5,(%rax)
4681	shufps	\$0b01001110,%xmm2,%xmm3
4682	$movkey	%xmm3,16(%rax)
4683	lea	32(%rax),%rax
4684	jmp	.Lkey_expansion_192b_warm
4685
4686.align	16
	# 256-bit helpers.  256a stores the second key register and updates the
	# first; 256b stores the first and updates the second.  Each store
	# advances %rax by 16; the _cold entry of 256a skips the store.
4687.Lkey_expansion_256a:
4688	$movkey	%xmm2,(%rax)
4689	lea	16(%rax),%rax
4690.Lkey_expansion_256a_cold:
4691	shufps	\$0b00010000,%xmm0,%xmm4
4692	xorps	%xmm4,%xmm0
4693	shufps	\$0b10001100,%xmm0,%xmm4
4694	xorps	%xmm4,%xmm0
4695	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4696	xorps	%xmm1,%xmm0
4697	ret
4698
4699.align 16
4700.Lkey_expansion_256b:
4701	$movkey	%xmm0,(%rax)
4702	lea	16(%rax),%rax
4703
4704	shufps	\$0b00010000,%xmm2,%xmm4
4705	xorps	%xmm4,%xmm2
4706	shufps	\$0b10001100,%xmm2,%xmm4
4707	xorps	%xmm4,%xmm2
4708	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
4709	xorps	%xmm1,%xmm2
4710	ret
	# the symbol sizes below are taken here, so they include the helpers
4711.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4712.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4713___
4714}
4715
# Constant pool.  Only the .Lkey_* entries are referenced by code visible in
# this part of the file; users of the other labels are defined further up.
4716$code.=<<___;
4717.align	64
4718.Lbswap_mask:
4719	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4720.Lincrement32:
4721	.long	6,6,6,0
4722.Lincrement64:
4723	.long	1,0,0,0
4724.Lxts_magic:
4725	.long	0x87,0,1,0
4726.Lincrement1:
4727	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
	# pshufb mask: every dword receives source bytes 13,14,15,12
4728.Lkey_rotate:
4729	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
	# pshufb mask: every dword receives source bytes 5,6,7,4
4730.Lkey_rotate192:
4731	.long	0x04070605,0x04070605,0x04070605,0x04070605
	# initial round constant; the key-schedule loops double it every round
4732.Lkey_rcon1:
4733	.long	1,1,1,1
	# round constant reloaded by the 128-bit loop after eight doublings
4734.Lkey_rcon1b:
4735	.long	0x1b,0x1b,0x1b,0x1b
4736
4737.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4738.align	64
4739___
4740
4741# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4742#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4743if ($win64) {
# Registers that carry the four arguments of the exception handlers below,
# in the order given by the prototype comment above.
($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");
4748
# Import slot that .Lcommon_seh_tail calls through at the end of every handler.
4749$code.=<<___;
4750.extern	__imp_RtlVirtualUnwind
4751___
# The next three handlers are emitted only for the "aesni" prefix.
4752$code.=<<___ if ($PREFIX eq "aesni");
4753.type	ecb_ccm64_se_handler,\@abi-omnipotent
4754.align	16
	# Handler for the ECB and CCM64 routines (see the .LSEH_info_ecb and
	# .LSEH_info_ccm64 records).  When Rip lies between the two HandlerData
	# labels it copies 8 quadwords from the interrupted stack into the
	# context's Xmm6 area and steps the recovered stack pointer by 0x58.
4755ecb_ccm64_se_handler:
4756	push	%rsi
4757	push	%rdi
4758	push	%rbx
4759	push	%rbp
4760	push	%r12
4761	push	%r13
4762	push	%r14
4763	push	%r15
4764	pushfq
4765	sub	\$64,%rsp		# room for the call made in .Lcommon_seh_tail
4766
4767	mov	120($context),%rax	# pull context->Rax
4768	mov	248($context),%rbx	# pull context->Rip
4769
4770	mov	8($disp),%rsi		# disp->ImageBase
4771	mov	56($disp),%r11		# disp->HandlerData
4772
4773	mov	0(%r11),%r10d		# HandlerData[0]
4774	lea	(%rsi,%r10),%r10	# prologue label
4775	cmp	%r10,%rbx		# context->Rip<prologue label
4776	jb	.Lcommon_seh_tail
4777
4778	mov	152($context),%rax	# pull context->Rsp
4779
4780	mov	4(%r11),%r10d		# HandlerData[1]
4781	lea	(%rsi,%r10),%r10	# epilogue label
4782	cmp	%r10,%rbx		# context->Rip>=epilogue label
4783	jae	.Lcommon_seh_tail
4784
4785	lea	0(%rax),%rsi		# %xmm save area
4786	lea	512($context),%rdi	# &context.Xmm6
4787	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
4788	.long	0xa548f3fc		# cld; rep movsq
4789	lea	0x58(%rax),%rax		# adjust stack pointer
4790
4791	jmp	.Lcommon_seh_tail
4792.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4793
4794.type	ctr_xts_se_handler,\@abi-omnipotent
4795.align	16
	# Handler for the CTR32 and XTS routines (see the .LSEH_info_ctr32 and
	# .LSEH_info_xts records).  Inside the function body it locates the frame
	# through context->R11, copies 20 quadwords from 0xa8 below it into the
	# context's Xmm6 area and restores %rbp from -8 relative to it.
4796ctr_xts_se_handler:
4797	push	%rsi
4798	push	%rdi
4799	push	%rbx
4800	push	%rbp
4801	push	%r12
4802	push	%r13
4803	push	%r14
4804	push	%r15
4805	pushfq
4806	sub	\$64,%rsp		# room for the call made in .Lcommon_seh_tail
4807
4808	mov	120($context),%rax	# pull context->Rax
4809	mov	248($context),%rbx	# pull context->Rip
4810
4811	mov	8($disp),%rsi		# disp->ImageBase
4812	mov	56($disp),%r11		# disp->HandlerData
4813
4814	mov	0(%r11),%r10d		# HandlerData[0]
4815	lea	(%rsi,%r10),%r10	# prologue label
4816	cmp	%r10,%rbx		# context->Rip<prologue label
4817	jb	.Lcommon_seh_tail
4818
4819	mov	152($context),%rax	# pull context->Rsp
4820
4821	mov	4(%r11),%r10d		# HandlerData[1]
4822	lea	(%rsi,%r10),%r10	# epilogue label
4823	cmp	%r10,%rbx		# context->Rip>=epilogue label
4824	jae	.Lcommon_seh_tail
4825
4826	mov	208($context),%rax	# pull context->R11
4827
4828	lea	-0xa8(%rax),%rsi	# %xmm save area
4829	lea	512($context),%rdi	# & context.Xmm6
4830	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4831	.long	0xa548f3fc		# cld; rep movsq
4832
4833	mov	-8(%rax),%rbp		# restore saved %rbp
4834	mov	%rbp,160($context)	# restore context->Rbp
4835	jmp	.Lcommon_seh_tail
4836.size	ctr_xts_se_handler,.-ctr_xts_se_handler
4837
4838.type	ocb_se_handler,\@abi-omnipotent
4839.align	16
	# Handler for the OCB routines (see the .LSEH_info_ocb records).  It uses
	# a third HandlerData entry, the "pop" label: before that label the xmm
	# save area at context->Rsp is still live and is copied back; from it
	# onwards only the five general-purpose registers are recovered.
4840ocb_se_handler:
4841	push	%rsi
4842	push	%rdi
4843	push	%rbx
4844	push	%rbp
4845	push	%r12
4846	push	%r13
4847	push	%r14
4848	push	%r15
4849	pushfq
4850	sub	\$64,%rsp		# room for the call made in .Lcommon_seh_tail
4851
4852	mov	120($context),%rax	# pull context->Rax
4853	mov	248($context),%rbx	# pull context->Rip
4854
4855	mov	8($disp),%rsi		# disp->ImageBase
4856	mov	56($disp),%r11		# disp->HandlerData
4857
4858	mov	0(%r11),%r10d		# HandlerData[0]
4859	lea	(%rsi,%r10),%r10	# prologue label
4860	cmp	%r10,%rbx		# context->Rip<prologue label
4861	jb	.Lcommon_seh_tail
4862
4863	mov	4(%r11),%r10d		# HandlerData[1]
4864	lea	(%rsi,%r10),%r10	# epilogue label
4865	cmp	%r10,%rbx		# context->Rip>=epilogue label
4866	jae	.Lcommon_seh_tail
4867
4868	mov	8(%r11),%r10d		# HandlerData[2]
4869	lea	(%rsi,%r10),%r10
4870	cmp	%r10,%rbx		# context->Rip>=pop label
4871	jae	.Locb_no_xmm
4872
4873	mov	152($context),%rax	# pull context->Rsp
4874
4875	lea	(%rax),%rsi		# %xmm save area
4876	lea	512($context),%rdi	# & context.Xmm6
4877	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4878	.long	0xa548f3fc		# cld; rep movsq
4879	lea	0xa0+0x28(%rax),%rax	# step over the xmm area and the saved registers
4880
4881.Locb_no_xmm:
	# the five saved registers sit just below the address now in %rax
4882	mov	-8(%rax),%rbx
4883	mov	-16(%rax),%rbp
4884	mov	-24(%rax),%r12
4885	mov	-32(%rax),%r13
4886	mov	-40(%rax),%r14
4887
4888	mov	%rbx,144($context)	# restore context->Rbx
4889	mov	%rbp,160($context)	# restore context->Rbp
4890	mov	%r12,216($context)	# restore context->R12
4891	mov	%r13,224($context)	# restore context->R13
4892	mov	%r14,232($context)	# restore context->R14
4893
4894	jmp	.Lcommon_seh_tail
4895.size	ocb_se_handler,.-ocb_se_handler
4896___
# cbc_se_handler is emitted for every prefix; its .Lcommon_seh_tail is also
# the jump target of the three handlers above.
4897$code.=<<___;
4898.type	cbc_se_handler,\@abi-omnipotent
4899.align	16
	# Unlike the other handlers this one compares Rip against fixed labels
	# of the CBC routine instead of reading them from HandlerData.
4900cbc_se_handler:
4901	push	%rsi
4902	push	%rdi
4903	push	%rbx
4904	push	%rbp
4905	push	%r12
4906	push	%r13
4907	push	%r14
4908	push	%r15
4909	pushfq
4910	sub	\$64,%rsp		# room for the call made in the tail below
4911
4912	mov	152($context),%rax	# pull context->Rsp
4913	mov	248($context),%rbx	# pull context->Rip
4914
4915	lea	.Lcbc_decrypt_bulk(%rip),%r10
4916	cmp	%r10,%rbx		# context->Rip<"prologue" label
4917	jb	.Lcommon_seh_tail
4918
4919	mov	120($context),%rax	# pull context->Rax
4920
4921	lea	.Lcbc_decrypt_body(%rip),%r10
4922	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
4923	jb	.Lcommon_seh_tail
4924
4925	mov	152($context),%rax	# pull context->Rsp
4926
4927	lea	.Lcbc_ret(%rip),%r10
4928	cmp	%r10,%rbx		# context->Rip>="epilogue" label
4929	jae	.Lcommon_seh_tail
4930
4931	lea	16(%rax),%rsi		# %xmm save area
4932	lea	512($context),%rdi	# &context.Xmm6
4933	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4934	.long	0xa548f3fc		# cld; rep movsq
4935
4936	mov	208($context),%rax	# pull context->R11
4937
4938	mov	-8(%rax),%rbp		# restore saved %rbp
4939	mov	%rbp,160($context)	# restore context->Rbp
4940
	# Shared tail.  On entry %rax holds the stack pointer value to record
	# in the context; the two quadwords at 8 and 16 above it are written
	# back as Rdi and Rsi.  The updated context is then copied into
	# disp->ContextRecord and RtlVirtualUnwind is called.
4941.Lcommon_seh_tail:
4942	mov	8(%rax),%rdi
4943	mov	16(%rax),%rsi
4944	mov	%rax,152($context)	# restore context->Rsp
4945	mov	%rsi,168($context)	# restore context->Rsi
4946	mov	%rdi,176($context)	# restore context->Rdi
4947
4948	mov	40($disp),%rdi		# disp->ContextRecord
4949	mov	$context,%rsi		# context
4950	mov	\$154,%ecx		# sizeof(CONTEXT)
4951	.long	0xa548f3fc		# cld; rep movsq
4952
4953	mov	$disp,%rsi
4954	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4955	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4956	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4957	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4958	mov	40(%rsi),%r10		# disp->ContextRecord
4959	lea	56(%rsi),%r11		# &disp->HandlerData
4960	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4961	mov	%r10,32(%rsp)		# arg5
4962	mov	%r11,40(%rsp)		# arg6
4963	mov	%r12,48(%rsp)		# arg7
4964	mov	%rcx,56(%rsp)		# arg8, (NULL)
4965	call	*__imp_RtlVirtualUnwind(%rip)
4966
	# undo the prologue shared by all four handlers
4967	mov	\$1,%eax		# ExceptionContinueSearch
4968	add	\$64,%rsp
4969	popfq
4970	pop	%r15
4971	pop	%r14
4972	pop	%r13
4973	pop	%r12
4974	pop	%rbp
4975	pop	%rbx
4976	pop	%rdi
4977	pop	%rsi
4978	ret
4979.size	cbc_se_handler,.-cbc_se_handler
4980
4981.section	.pdata
4982.align	4
	# one entry per function: begin address, end address, unwind info record
4983___
# Entries for routines that exist only in the "aesni" build.
4984$code.=<<___ if ($PREFIX eq "aesni");
4985	.rva	.LSEH_begin_aesni_ecb_encrypt
4986	.rva	.LSEH_end_aesni_ecb_encrypt
4987	.rva	.LSEH_info_ecb
4988
4989	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
4990	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
4991	.rva	.LSEH_info_ccm64_enc
4992
4993	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
4994	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
4995	.rva	.LSEH_info_ccm64_dec
4996
4997	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
4998	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
4999	.rva	.LSEH_info_ctr32
5000
5001	.rva	.LSEH_begin_aesni_xts_encrypt
5002	.rva	.LSEH_end_aesni_xts_encrypt
5003	.rva	.LSEH_info_xts_enc
5004
5005	.rva	.LSEH_begin_aesni_xts_decrypt
5006	.rva	.LSEH_end_aesni_xts_decrypt
5007	.rva	.LSEH_info_xts_dec
5008
5009	.rva	.LSEH_begin_aesni_ocb_encrypt
5010	.rva	.LSEH_end_aesni_ocb_encrypt
5011	.rva	.LSEH_info_ocb_enc
5012
5013	.rva	.LSEH_begin_aesni_ocb_decrypt
5014	.rva	.LSEH_end_aesni_ocb_decrypt
5015	.rva	.LSEH_info_ocb_dec
5016___
# Entries emitted for every prefix.  Both key-setup routines use their public
# symbol as the begin address and share the .LSEH_info_key record.
5017$code.=<<___;
5018	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
5019	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
5020	.rva	.LSEH_info_cbc
5021
5022	.rva	${PREFIX}_set_decrypt_key
5023	.rva	.LSEH_end_set_decrypt_key
5024	.rva	.LSEH_info_key
5025
5026	.rva	${PREFIX}_set_encrypt_key
5027	.rva	.LSEH_end_set_encrypt_key
5028	.rva	.LSEH_info_key
5029.section	.xdata
5030.align	8
5031___
# Unwind info records for the "aesni"-only routines.  Each is a 4-byte header,
# the handler address, then the label addresses the handler reads back as
# HandlerData[].
# NOTE(review): the header byte 9 looks like version 1 plus the
# exception-handler flag of the Win64 UNWIND_INFO layout - confirm.
5032$code.=<<___ if ($PREFIX eq "aesni");
5033.LSEH_info_ecb:
5034	.byte	9,0,0,0
5035	.rva	ecb_ccm64_se_handler
5036	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
5037.LSEH_info_ccm64_enc:
5038	.byte	9,0,0,0
5039	.rva	ecb_ccm64_se_handler
5040	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
5041.LSEH_info_ccm64_dec:
5042	.byte	9,0,0,0
5043	.rva	ecb_ccm64_se_handler
5044	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
5045.LSEH_info_ctr32:
5046	.byte	9,0,0,0
5047	.rva	ctr_xts_se_handler
5048	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
5049.LSEH_info_xts_enc:
5050	.byte	9,0,0,0
5051	.rva	ctr_xts_se_handler
5052	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
5053.LSEH_info_xts_dec:
5054	.byte	9,0,0,0
5055	.rva	ctr_xts_se_handler
5056	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
5057.LSEH_info_ocb_enc:
5058	.byte	9,0,0,0
5059	.rva	ocb_se_handler
5060	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
5061	.rva	.Locb_enc_pop				# HandlerData[2], the pop label
5062	.long	0
5063.LSEH_info_ocb_dec:
5064	.byte	9,0,0,0
5065	.rva	ocb_se_handler
5066	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
5067	.rva	.Locb_dec_pop				# HandlerData[2], the pop label
5068	.long	0
5069___
# Records emitted for every prefix.  The CBC record carries no HandlerData
# because cbc_se_handler uses fixed labels; the key-setup record has no
# handler entry and only describes the 8-byte stack adjustment.
5070$code.=<<___;
5071.LSEH_info_cbc:
5072	.byte	9,0,0,0
5073	.rva	cbc_se_handler
5074.LSEH_info_key:
5075	.byte	0x01,0x04,0x01,0x00
5076	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
5077___
5078}
5079
sub rex {
  # Append a REX prefix byte to an opcode array when either register
  # number is 8 or above; append nothing otherwise.
  #   $bytes - reference to the array of opcode bytes being built
  #   $reg   - register number the caller places in ModR/M bits 3-5 (adds 0x04)
  #   $rm    - register number the caller places in ModR/M bits 0-2 (adds 0x01)
  my ($bytes,$reg,$rm)=@_;
  my $prefix=0;

    $prefix|=0x04		if($reg>=8);
    $prefix|=0x01		if($rm>=8);
    push @$bytes,$prefix|0x40	if($prefix);
}
5089
sub aesni {
  # Hand-assemble one AES-NI instruction into a ".byte" directive.
  # Three operand shapes are recognised:
  #   aeskeygenassist $imm,%xmmS,%xmmD
  #   <mnemonic> %xmmS,%xmmD
  #   <mnemonic> disp(%rsp),%xmmD
  # Returns the ".byte" text, undef for an aes* mnemonic missing from the
  # table of the matched shape, or the input unchanged when nothing matches.
  my ($insn)=@_;
  my @bytes=(0x66);			# every encoding starts with 0x66

  # third opcode byte, register-to-register shape
  my %reg_form = (
	"aesimc" => 0xdb,
	"aesenc" => 0xdc,	"aesenclast" => 0xdd,
	"aesdec" => 0xde,	"aesdeclast" => 0xdf
  );
  # third opcode byte, (%rsp)-relative shape; aesimc is not offered here
  my %mem_form = (
	"aesenc" => 0xdc,	"aesenclast" => 0xdd,
	"aesdec" => 0xde,	"aesdeclast" => 0xdf
  );

  if (my ($imm,$src,$dst) =
	$insn=~/(?:aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@bytes,$dst,$src);
	push @bytes,0x0f,0x3a,0xdf;
	push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# a leading 0 means hex/octal notation, anything else is taken verbatim
	push @bytes,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@bytes);
  }

  if (my ($mnemonic,$src,$dst) =
	$insn=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my $op=$reg_form{$mnemonic};
	return undef if (!defined($op));	# unknown mnemonic
	rex(\@bytes,$dst,$src);
	push @bytes,0x0f,0x38,$op;
	push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@bytes);
  }

  if (my ($mnemonic,$disp,$dst) =
	$insn=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my $op=$mem_form{$mnemonic};
	return undef if (!defined($op));	# unknown mnemonic
	push @bytes,0x44 if ($dst>=8);		# prefix for %xmm8 and above
	push @bytes,0x0f,0x38,$op;
	push @bytes,0x44|(($dst&7)<<3),0x24;	# ModR/M
	push @bytes,($disp=~/^0/?oct($disp):$disp)&0xff;	# low 8 bits of disp
	return ".byte\t".join(',',@bytes);
  }

  return $insn;
}
5129
sub movbe {
	# Build the ".byte" directive for a movbe store whose 8-bit displacement
	# is the single argument; the caller matches "movbe %eax,N(%rsp)" and
	# passes N.  The argument is appended as text, without conversion.
	my ($disp8)=@_;
	return ".byte	0x0f,0x38,0xf1,0x44,0x24,".$disp8;
}
5133
# Post-process the accumulated assembly text, then emit it:
#  1. every `...` span is evaluated as a Perl expression and replaced by
#     its value;
#  2. from the first word starting with "aes" up to the last %xmmN on a line
#     is handed to aesni(), and the rest of that line (normally a trailing
#     comment) is dropped;
#  3. "movbe %eax,N(%rsp)" is replaced by the bytes produced by movbe().
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Output is buffered, so a failed write may only be reported when the handle
# is closed.  Ignoring the result would let a truncated assembly file pass
# for a successful run; fail loudly instead.
close STDOUT or die "error closing STDOUT: $!";
5142