xref: /freebsd/crypto/openssl/crypto/aes/asm/bsaes-x86_64.pl (revision fba3cde907930eed2adb8a320524bc250338c729)
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode]				###
5### bitsliced implementation for Intel Core 2 processors	###
6### requires support of SSE extensions up to SSSE3		###
7### Author: Emilia Käsper and Peter Schwabe			###
8### Date: 2009-03-19						###
9### Public domain						###
10###								###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12### further information.					###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22#   from 12.5KB to 2.2KB;
23# - above was possibile thanks to mixcolumns() modification that
24#   allowed to feed its output back to aesenc[last], this was
25#   achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28#   relies on conversion of "conventional" key schedule as returned
29#   by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31#   to skip one shiftrows(), reduce bit-sliced key schedule and
32#   speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38#		Emilia's	this(*)		difference
39#
40# Core 2    	9.30		8.69		+7%
41# Nehalem(**) 	7.63		6.98		+9%
42# Atom	    	17.1		17.4		-2%(***)
43#
44# (*)	Comparison is not completely fair, because "this" is ECB,
45#	i.e. no extra processing such as counter values calculation
46#	and xor-ing input as in Emilia's CTR implementation is
47#	performed. However, the CTR calculations stand for not more
48#	than 1% of total time, so comparison is *rather* fair.
49#
50# (**)	Results were collected on Westmere, which is considered to
51#	be equivalent to Nehalem for this code.
52#
53# (***)	Slowdown on Atom is rather strange per se, because original
54#	implementation has a number of 9+-bytes instructions, which
55#	are bad for Atom front-end, and which I eliminated completely.
56#	In attempt to address deterioration sbox() was tested in FP
57#	SIMD "domain" (movaps instead of movdqa, xorps instead of
58#	pxor, etc.). While it resulted in nominal 4% improvement on
59#	Atom, it hurted Westmere by more than 2x factor.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# 		conversion	conversion/8x block
68# Core 2	240		0.22
69# Nehalem	180		0.20
70# Atom		430		0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2	11.0
87# Nehalem	9.16
88# Atom		20.9
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95#						<appro@openssl.org>
96
97$flavour = shift;
98$output  = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open OUT,"| \"$^X\" $xlate $flavour $output";
109*STDOUT=*OUT;
110
111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
113my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
114
115{
116my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118sub Sbox {
119# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121my @b=@_[0..7];
122my @t=@_[8..11];
123my @s=@_[12..15];
124	&InBasisChange	(@b);
125	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
126	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
127}
128
129sub InBasisChange {
130# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132my @b=@_[0..7];
133$code.=<<___;
134	pxor	@b[6], @b[5]
135	pxor	@b[1], @b[2]
136	pxor	@b[0], @b[3]
137	pxor	@b[2], @b[6]
138	pxor 	@b[0], @b[5]
139
140	pxor	@b[3], @b[6]
141	pxor	@b[7], @b[3]
142	pxor	@b[5], @b[7]
143	pxor	@b[4], @b[3]
144	pxor	@b[5], @b[4]
145	pxor	@b[1], @b[3]
146
147	pxor	@b[7], @b[2]
148	pxor	@b[5], @b[1]
149___
150}
151
152sub OutBasisChange {
153# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155my @b=@_[0..7];
156$code.=<<___;
157	pxor	@b[6], @b[0]
158	pxor	@b[4], @b[1]
159	pxor	@b[0], @b[2]
160	pxor	@b[6], @b[4]
161	pxor	@b[1], @b[6]
162
163	pxor	@b[5], @b[1]
164	pxor	@b[3], @b[5]
165	pxor	@b[7], @b[3]
166	pxor	@b[5], @b[7]
167	pxor	@b[5], @b[2]
168
169	pxor	@b[7], @b[4]
170___
171}
172
173sub InvSbox {
174# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176my @b=@_[0..7];
177my @t=@_[8..11];
178my @s=@_[12..15];
179	&InvInBasisChange	(@b);
180	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
181	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
182}
183
184sub InvInBasisChange {		# OutBasisChange in reverse
185my @b=@_[5,1,2,6,3,7,0,4];
186$code.=<<___
187	pxor	@b[7], @b[4]
188
189	pxor	@b[5], @b[7]
190	pxor	@b[5], @b[2]
191	pxor	@b[7], @b[3]
192	pxor	@b[3], @b[5]
193	pxor	@b[5], @b[1]
194
195	pxor	@b[1], @b[6]
196	pxor	@b[0], @b[2]
197	pxor	@b[6], @b[4]
198	pxor	@b[6], @b[0]
199	pxor	@b[4], @b[1]
200___
201}
202
203sub InvOutBasisChange {		# InBasisChange in reverse
204my @b=@_[2,5,7,3,6,1,0,4];
205$code.=<<___;
206	pxor	@b[5], @b[1]
207	pxor	@b[7], @b[2]
208
209	pxor	@b[1], @b[3]
210	pxor	@b[5], @b[4]
211	pxor	@b[5], @b[7]
212	pxor	@b[4], @b[3]
213	 pxor 	@b[0], @b[5]
214	pxor	@b[7], @b[3]
215	 pxor	@b[2], @b[6]
216	 pxor	@b[1], @b[2]
217	pxor	@b[3], @b[6]
218
219	pxor	@b[0], @b[3]
220	pxor	@b[6], @b[5]
221___
222}
223
224sub Mul_GF4 {
225#;*************************************************************
226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227#;*************************************************************
228my ($x0,$x1,$y0,$y1,$t0)=@_;
229$code.=<<___;
230	movdqa	$y0, $t0
231	pxor 	$y1, $t0
232	pand	$x0, $t0
233	pxor	$x1, $x0
234	pand	$y0, $x1
235	pand	$y1, $x0
236	pxor	$x1, $x0
237	pxor	$t0, $x1
238___
239}
240
241sub Mul_GF4_N {				# not used, see next subroutine
242# multiply and scale by N
243my ($x0,$x1,$y0,$y1,$t0)=@_;
244$code.=<<___;
245	movdqa	$y0, $t0
246	pxor	$y1, $t0
247	pand	$x0, $t0
248	pxor	$x1, $x0
249	pand	$y0, $x1
250	pand	$y1, $x0
251	pxor	$x0, $x1
252	pxor	$t0, $x0
253___
254}
255
256sub Mul_GF4_N_GF4 {
257# interleaved Mul_GF4_N and Mul_GF4
258my ($x0,$x1,$y0,$y1,$t0,
259    $x2,$x3,$y2,$y3,$t1)=@_;
260$code.=<<___;
261	movdqa	$y0, $t0
262	 movdqa	$y2, $t1
263	pxor	$y1, $t0
264	 pxor 	$y3, $t1
265	pand	$x0, $t0
266	 pand	$x2, $t1
267	pxor	$x1, $x0
268	 pxor	$x3, $x2
269	pand	$y0, $x1
270	 pand	$y2, $x3
271	pand	$y1, $x0
272	 pand	$y3, $x2
273	pxor	$x0, $x1
274	 pxor	$x3, $x2
275	pxor	$t0, $x0
276	 pxor	$t1, $x3
277___
278}
279sub Mul_GF16_2 {
280my @x=@_[0..7];
281my @y=@_[8..11];
282my @t=@_[12..15];
283$code.=<<___;
284	movdqa	@x[0], @t[0]
285	movdqa	@x[1], @t[1]
286___
287	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
288$code.=<<___;
289	pxor	@x[2], @t[0]
290	pxor	@x[3], @t[1]
291	pxor	@y[2], @y[0]
292	pxor	@y[3], @y[1]
293___
294	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
295			 @x[2], @x[3], @y[2], @y[3], @t[2]);
296$code.=<<___;
297	pxor	@t[0], @x[0]
298	pxor	@t[0], @x[2]
299	pxor	@t[1], @x[1]
300	pxor	@t[1], @x[3]
301
302	movdqa	@x[4], @t[0]
303	movdqa	@x[5], @t[1]
304	pxor	@x[6], @t[0]
305	pxor	@x[7], @t[1]
306___
307	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
308			 @x[6], @x[7], @y[2], @y[3], @t[2]);
309$code.=<<___;
310	pxor	@y[2], @y[0]
311	pxor	@y[3], @y[1]
312___
313	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
314$code.=<<___;
315	pxor	@t[0], @x[4]
316	pxor	@t[0], @x[6]
317	pxor	@t[1], @x[5]
318	pxor	@t[1], @x[7]
319___
320}
321sub Inv_GF256 {
322#;********************************************************************
323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
324#;********************************************************************
325my @x=@_[0..7];
326my @t=@_[8..11];
327my @s=@_[12..15];
328# direct optimizations from hardware
329$code.=<<___;
330	movdqa	@x[4], @t[3]
331	movdqa	@x[5], @t[2]
332	movdqa	@x[1], @t[1]
333	movdqa	@x[7], @s[1]
334	movdqa	@x[0], @s[0]
335
336	pxor	@x[6], @t[3]
337	pxor	@x[7], @t[2]
338	pxor	@x[3], @t[1]
339	 movdqa	@t[3], @s[2]
340	pxor	@x[6], @s[1]
341	 movdqa	@t[2], @t[0]
342	pxor	@x[2], @s[0]
343	 movdqa	@t[3], @s[3]
344
345	por	@t[1], @t[2]
346	por	@s[0], @t[3]
347	pxor	@t[0], @s[3]
348	pand	@s[0], @s[2]
349	pxor	@t[1], @s[0]
350	pand	@t[1], @t[0]
351	pand	@s[0], @s[3]
352	movdqa	@x[3], @s[0]
353	pxor	@x[2], @s[0]
354	pand	@s[0], @s[1]
355	pxor	@s[1], @t[3]
356	pxor	@s[1], @t[2]
357	movdqa	@x[4], @s[1]
358	movdqa	@x[1], @s[0]
359	pxor	@x[5], @s[1]
360	pxor	@x[0], @s[0]
361	movdqa	@s[1], @t[1]
362	pand	@s[0], @s[1]
363	por	@s[0], @t[1]
364	pxor	@s[1], @t[0]
365	pxor	@s[3], @t[3]
366	pxor	@s[2], @t[2]
367	pxor	@s[3], @t[1]
368	movdqa	@x[7], @s[0]
369	pxor	@s[2], @t[0]
370	movdqa	@x[6], @s[1]
371	pxor	@s[2], @t[1]
372	movdqa	@x[5], @s[2]
373	pand	@x[3], @s[0]
374	movdqa	@x[4], @s[3]
375	pand	@x[2], @s[1]
376	pand	@x[1], @s[2]
377	por	@x[0], @s[3]
378	pxor	@s[0], @t[3]
379	pxor	@s[1], @t[2]
380	pxor	@s[2], @t[1]
381	pxor	@s[3], @t[0]
382
383	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385	# new smaller inversion
386
387	movdqa	@t[3], @s[0]
388	pand	@t[1], @t[3]
389	pxor	@t[2], @s[0]
390
391	movdqa	@t[0], @s[2]
392	movdqa	@s[0], @s[3]
393	pxor	@t[3], @s[2]
394	pand	@s[2], @s[3]
395
396	movdqa	@t[1], @s[1]
397	pxor	@t[2], @s[3]
398	pxor	@t[0], @s[1]
399
400	pxor	@t[2], @t[3]
401
402	pand	@t[3], @s[1]
403
404	movdqa	@s[2], @t[2]
405	pxor	@t[0], @s[1]
406
407	pxor	@s[1], @t[2]
408	pxor	@s[1], @t[1]
409
410	pand	@t[0], @t[2]
411
412	pxor	@t[2], @s[2]
413	pxor	@t[2], @t[1]
414
415	pand	@s[3], @s[2]
416
417	pxor	@s[0], @s[2]
418___
419# output in s3, s2, s1, t1
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427}
428
429# AES linear components
430
431sub ShiftRows {
432my @x=@_[0..7];
433my $mask=pop;
434$code.=<<___;
435	pxor	0x00($key),@x[0]
436	pxor	0x10($key),@x[1]
437	pshufb	$mask,@x[0]
438	pxor	0x20($key),@x[2]
439	pshufb	$mask,@x[1]
440	pxor	0x30($key),@x[3]
441	pshufb	$mask,@x[2]
442	pxor	0x40($key),@x[4]
443	pshufb	$mask,@x[3]
444	pxor	0x50($key),@x[5]
445	pshufb	$mask,@x[4]
446	pxor	0x60($key),@x[6]
447	pshufb	$mask,@x[5]
448	pxor	0x70($key),@x[7]
449	pshufb	$mask,@x[6]
450	lea	0x80($key),$key
451	pshufb	$mask,@x[7]
452___
453}
454
455sub MixColumns {
456# modified to emit output in order suitable for feeding back to aesenc[last]
457my @x=@_[0..7];
458my @t=@_[8..15];
459$code.=<<___;
460	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
461	pshufd	\$0x93, @x[1], @t[1]
462	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
463	pshufd	\$0x93, @x[2], @t[2]
464	 pxor	@t[1], @x[1]
465	pshufd	\$0x93, @x[3], @t[3]
466	 pxor	@t[2], @x[2]
467	pshufd	\$0x93, @x[4], @t[4]
468	 pxor	@t[3], @x[3]
469	pshufd	\$0x93, @x[5], @t[5]
470	 pxor	@t[4], @x[4]
471	pshufd	\$0x93, @x[6], @t[6]
472	 pxor	@t[5], @x[5]
473	pshufd	\$0x93, @x[7], @t[7]
474	 pxor	@t[6], @x[6]
475	 pxor	@t[7], @x[7]
476
477	pxor	@x[0], @t[1]
478	pxor	@x[7], @t[0]
479	pxor	@x[7], @t[1]
480	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
481	pxor	@x[1], @t[2]
482	 pshufd	\$0x4E, @x[1], @x[1]
483	pxor	@x[4], @t[5]
484	 pxor	@t[0], @x[0]
485	pxor	@x[5], @t[6]
486	 pxor	@t[1], @x[1]
487	pxor	@x[3], @t[4]
488	 pshufd	\$0x4E, @x[4], @t[0]
489	pxor	@x[6], @t[7]
490	 pshufd	\$0x4E, @x[5], @t[1]
491	pxor	@x[2], @t[3]
492	 pshufd	\$0x4E, @x[3], @x[4]
493	pxor	@x[7], @t[3]
494	 pshufd	\$0x4E, @x[7], @x[5]
495	pxor	@x[7], @t[4]
496	 pshufd	\$0x4E, @x[6], @x[3]
497	pxor	@t[4], @t[0]
498	 pshufd	\$0x4E, @x[2], @x[6]
499	pxor	@t[5], @t[1]
500
501	pxor	@t[3], @x[4]
502	pxor	@t[7], @x[5]
503	pxor	@t[6], @x[3]
504	 movdqa	@t[0], @x[2]
505	pxor	@t[2], @x[6]
506	 movdqa	@t[1], @x[7]
507___
508}
509
510sub InvMixColumns {
511my @x=@_[0..7];
512my @t=@_[8..15];
513
514$code.=<<___;
515	# multiplication by 0x0e
516	pshufd	\$0x93, @x[7], @t[7]
517	movdqa	@x[2], @t[2]
518	pxor	@x[5], @x[7]		# 7 5
519	pxor	@x[5], @x[2]		# 2 5
520	pshufd	\$0x93, @x[0], @t[0]
521	movdqa	@x[5], @t[5]
522	pxor	@x[0], @x[5]		# 5 0		[1]
523	pxor	@x[1], @x[0]		# 0 1
524	pshufd	\$0x93, @x[1], @t[1]
525	pxor	@x[2], @x[1]		# 1 25
526	pxor	@x[6], @x[0]		# 01 6		[2]
527	pxor	@x[3], @x[1]		# 125 3		[4]
528	pshufd	\$0x93, @x[3], @t[3]
529	pxor	@x[0], @x[2]		# 25 016	[3]
530	pxor	@x[7], @x[3]		# 3 75
531	pxor	@x[6], @x[7]		# 75 6		[0]
532	pshufd	\$0x93, @x[6], @t[6]
533	movdqa	@x[4], @t[4]
534	pxor	@x[4], @x[6]		# 6 4
535	pxor	@x[3], @x[4]		# 4 375		[6]
536	pxor	@x[7], @x[3]		# 375 756=36
537	pxor	@t[5], @x[6]		# 64 5		[7]
538	pxor	@t[2], @x[3]		# 36 2
539	pxor	@t[4], @x[3]		# 362 4		[5]
540	pshufd	\$0x93, @t[5], @t[5]
541___
542					my @y = @x[7,5,0,2,1,3,4,6];
543$code.=<<___;
544	# multiplication by 0x0b
545	pxor	@y[0], @y[1]
546	pxor	@t[0], @y[0]
547	pxor	@t[1], @y[1]
548	pshufd	\$0x93, @t[2], @t[2]
549	pxor	@t[5], @y[0]
550	pxor	@t[6], @y[1]
551	pxor	@t[7], @y[0]
552	pshufd	\$0x93, @t[4], @t[4]
553	pxor	@t[6], @t[7]		# clobber t[7]
554	pxor	@y[0], @y[1]
555
556	pxor	@t[0], @y[3]
557	pshufd	\$0x93, @t[0], @t[0]
558	pxor	@t[1], @y[2]
559	pxor	@t[1], @y[4]
560	pxor	@t[2], @y[2]
561	pshufd	\$0x93, @t[1], @t[1]
562	pxor	@t[2], @y[3]
563	pxor	@t[2], @y[5]
564	pxor	@t[7], @y[2]
565	pshufd	\$0x93, @t[2], @t[2]
566	pxor	@t[3], @y[3]
567	pxor	@t[3], @y[6]
568	pxor	@t[3], @y[4]
569	pshufd	\$0x93, @t[3], @t[3]
570	pxor	@t[4], @y[7]
571	pxor	@t[4], @y[5]
572	pxor	@t[7], @y[7]
573	pxor	@t[5], @y[3]
574	pxor	@t[4], @y[4]
575	pxor	@t[5], @t[7]		# clobber t[7] even more
576
577	pxor	@t[7], @y[5]
578	pshufd	\$0x93, @t[4], @t[4]
579	pxor	@t[7], @y[6]
580	pxor	@t[7], @y[4]
581
582	pxor	@t[5], @t[7]
583	pshufd	\$0x93, @t[5], @t[5]
584	pxor	@t[6], @t[7]		# restore t[7]
585
586	# multiplication by 0x0d
587	pxor	@y[7], @y[4]
588	pxor	@t[4], @y[7]
589	pshufd	\$0x93, @t[6], @t[6]
590	pxor	@t[0], @y[2]
591	pxor	@t[5], @y[7]
592	pxor	@t[2], @y[2]
593	pshufd	\$0x93, @t[7], @t[7]
594
595	pxor	@y[1], @y[3]
596	pxor	@t[1], @y[1]
597	pxor	@t[0], @y[0]
598	pxor	@t[0], @y[3]
599	pxor	@t[5], @y[1]
600	pxor	@t[5], @y[0]
601	pxor	@t[7], @y[1]
602	pshufd	\$0x93, @t[0], @t[0]
603	pxor	@t[6], @y[0]
604	pxor	@y[1], @y[3]
605	pxor	@t[1], @y[4]
606	pshufd	\$0x93, @t[1], @t[1]
607
608	pxor	@t[7], @y[7]
609	pxor	@t[2], @y[4]
610	pxor	@t[2], @y[5]
611	pshufd	\$0x93, @t[2], @t[2]
612	pxor	@t[6], @y[2]
613	pxor	@t[3], @t[6]		# clobber t[6]
614	pxor	@y[7], @y[4]
615	pxor	@t[6], @y[3]
616
617	pxor	@t[6], @y[6]
618	pxor	@t[5], @y[5]
619	pxor	@t[4], @y[6]
620	pshufd	\$0x93, @t[4], @t[4]
621	pxor	@t[6], @y[5]
622	pxor	@t[7], @y[6]
623	pxor	@t[3], @t[6]		# restore t[6]
624
625	pshufd	\$0x93, @t[5], @t[5]
626	pshufd	\$0x93, @t[6], @t[6]
627	pshufd	\$0x93, @t[7], @t[7]
628	pshufd	\$0x93, @t[3], @t[3]
629
630	# multiplication by 0x09
631	pxor	@y[1], @y[4]
632	pxor	@y[1], @t[1]		# t[1]=y[1]
633	pxor	@t[5], @t[0]		# clobber t[0]
634	pxor	@t[5], @t[1]
635	pxor	@t[0], @y[3]
636	pxor	@y[0], @t[0]		# t[0]=y[0]
637	pxor	@t[6], @t[1]
638	pxor	@t[7], @t[6]		# clobber t[6]
639	pxor	@t[1], @y[4]
640	pxor	@t[4], @y[7]
641	pxor	@y[4], @t[4]		# t[4]=y[4]
642	pxor	@t[3], @y[6]
643	pxor	@y[3], @t[3]		# t[3]=y[3]
644	pxor	@t[2], @y[5]
645	pxor	@y[2], @t[2]		# t[2]=y[2]
646	pxor	@t[7], @t[3]
647	pxor	@y[5], @t[5]		# t[5]=y[5]
648	pxor	@t[6], @t[2]
649	pxor	@t[6], @t[5]
650	pxor	@y[6], @t[6]		# t[6]=y[6]
651	pxor	@y[7], @t[7]		# t[7]=y[7]
652
653	movdqa	@t[0],@XMM[0]
654	movdqa	@t[1],@XMM[1]
655	movdqa	@t[2],@XMM[2]
656	movdqa	@t[3],@XMM[3]
657	movdqa	@t[4],@XMM[4]
658	movdqa	@t[5],@XMM[5]
659	movdqa	@t[6],@XMM[6]
660	movdqa	@t[7],@XMM[7]
661___
662}
663
664sub aesenc {				# not used
665my @b=@_[0..7];
666my @t=@_[8..15];
667$code.=<<___;
668	movdqa	0x30($const),@t[0]	# .LSR
669___
670	&ShiftRows	(@b,@t[0]);
671	&Sbox		(@b,@t);
672	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
673}
674
675sub aesenclast {			# not used
676my @b=@_[0..7];
677my @t=@_[8..15];
678$code.=<<___;
679	movdqa	0x40($const),@t[0]	# .LSRM0
680___
681	&ShiftRows	(@b,@t[0]);
682	&Sbox		(@b,@t);
683$code.=<<___
684	pxor	0x00($key),@b[0]
685	pxor	0x10($key),@b[1]
686	pxor	0x20($key),@b[4]
687	pxor	0x30($key),@b[6]
688	pxor	0x40($key),@b[3]
689	pxor	0x50($key),@b[7]
690	pxor	0x60($key),@b[2]
691	pxor	0x70($key),@b[5]
692___
693}
694
695sub swapmove {
696my ($a,$b,$n,$mask,$t)=@_;
697$code.=<<___;
698	movdqa	$b,$t
699	psrlq	\$$n,$b
700	pxor  	$a,$b
701	pand	$mask,$b
702	pxor	$b,$a
703	psllq	\$$n,$b
704	pxor	$t,$b
705___
706}
707sub swapmove2x {
708my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
709$code.=<<___;
710	movdqa	$b0,$t0
711	psrlq	\$$n,$b0
712	 movdqa	$b1,$t1
713	 psrlq	\$$n,$b1
714	pxor  	$a0,$b0
715	 pxor  	$a1,$b1
716	pand	$mask,$b0
717	 pand	$mask,$b1
718	pxor	$b0,$a0
719	psllq	\$$n,$b0
720	 pxor	$b1,$a1
721	 psllq	\$$n,$b1
722	pxor	$t0,$b0
723	 pxor	$t1,$b1
724___
725}
726
727sub bitslice {
728my @x=reverse(@_[0..7]);
729my ($t0,$t1,$t2,$t3)=@_[8..11];
730$code.=<<___;
731	movdqa	0x00($const),$t0	# .LBS0
732	movdqa	0x10($const),$t1	# .LBS1
733___
734	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
735	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
736$code.=<<___;
737	movdqa	0x20($const),$t0	# .LBS2
738___
739	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
740	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
741
742	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
743	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
744}
745
746$code.=<<___;
747.text
748
749.extern	asm_AES_encrypt
750.extern	asm_AES_decrypt
751
752.type	_bsaes_encrypt8,\@abi-omnipotent
753.align	64
754_bsaes_encrypt8:
755	lea	.LBS0(%rip), $const	# constants table
756
757	movdqa	($key), @XMM[9]		# round 0 key
758	lea	0x10($key), $key
759	movdqa	0x50($const), @XMM[8]	# .LM0SR
760	pxor	@XMM[9], @XMM[0]	# xor with round0 key
761	pxor	@XMM[9], @XMM[1]
762	 pshufb	@XMM[8], @XMM[0]
763	pxor	@XMM[9], @XMM[2]
764	 pshufb	@XMM[8], @XMM[1]
765	pxor	@XMM[9], @XMM[3]
766	 pshufb	@XMM[8], @XMM[2]
767	pxor	@XMM[9], @XMM[4]
768	 pshufb	@XMM[8], @XMM[3]
769	pxor	@XMM[9], @XMM[5]
770	 pshufb	@XMM[8], @XMM[4]
771	pxor	@XMM[9], @XMM[6]
772	 pshufb	@XMM[8], @XMM[5]
773	pxor	@XMM[9], @XMM[7]
774	 pshufb	@XMM[8], @XMM[6]
775	 pshufb	@XMM[8], @XMM[7]
776_bsaes_encrypt8_bitslice:
777___
778	&bitslice	(@XMM[0..7, 8..11]);
779$code.=<<___;
780	dec	$rounds
781	jmp	.Lenc_sbox
782.align	16
783.Lenc_loop:
784___
785	&ShiftRows	(@XMM[0..7, 8]);
786$code.=".Lenc_sbox:\n";
787	&Sbox		(@XMM[0..7, 8..15]);
788$code.=<<___;
789	dec	$rounds
790	jl	.Lenc_done
791___
792	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
793$code.=<<___;
794	movdqa	0x30($const), @XMM[8]	# .LSR
795	jnz	.Lenc_loop
796	movdqa	0x40($const), @XMM[8]	# .LSRM0
797	jmp	.Lenc_loop
798.align	16
799.Lenc_done:
800___
801	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
802	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
803$code.=<<___;
804	movdqa	($key), @XMM[8]		# last round key
805	pxor	@XMM[8], @XMM[4]
806	pxor	@XMM[8], @XMM[6]
807	pxor	@XMM[8], @XMM[3]
808	pxor	@XMM[8], @XMM[7]
809	pxor	@XMM[8], @XMM[2]
810	pxor	@XMM[8], @XMM[5]
811	pxor	@XMM[8], @XMM[0]
812	pxor	@XMM[8], @XMM[1]
813	ret
814.size	_bsaes_encrypt8,.-_bsaes_encrypt8
815
816.type	_bsaes_decrypt8,\@abi-omnipotent
817.align	64
818_bsaes_decrypt8:
819	lea	.LBS0(%rip), $const	# constants table
820
821	movdqa	($key), @XMM[9]		# round 0 key
822	lea	0x10($key), $key
823	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
824	pxor	@XMM[9], @XMM[0]	# xor with round0 key
825	pxor	@XMM[9], @XMM[1]
826	 pshufb	@XMM[8], @XMM[0]
827	pxor	@XMM[9], @XMM[2]
828	 pshufb	@XMM[8], @XMM[1]
829	pxor	@XMM[9], @XMM[3]
830	 pshufb	@XMM[8], @XMM[2]
831	pxor	@XMM[9], @XMM[4]
832	 pshufb	@XMM[8], @XMM[3]
833	pxor	@XMM[9], @XMM[5]
834	 pshufb	@XMM[8], @XMM[4]
835	pxor	@XMM[9], @XMM[6]
836	 pshufb	@XMM[8], @XMM[5]
837	pxor	@XMM[9], @XMM[7]
838	 pshufb	@XMM[8], @XMM[6]
839	 pshufb	@XMM[8], @XMM[7]
840___
841	&bitslice	(@XMM[0..7, 8..11]);
842$code.=<<___;
843	dec	$rounds
844	jmp	.Ldec_sbox
845.align	16
846.Ldec_loop:
847___
848	&ShiftRows	(@XMM[0..7, 8]);
849$code.=".Ldec_sbox:\n";
850	&InvSbox	(@XMM[0..7, 8..15]);
851$code.=<<___;
852	dec	$rounds
853	jl	.Ldec_done
854___
855	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
856$code.=<<___;
857	movdqa	-0x10($const), @XMM[8]	# .LISR
858	jnz	.Ldec_loop
859	movdqa	-0x20($const), @XMM[8]	# .LISRM0
860	jmp	.Ldec_loop
861.align	16
862.Ldec_done:
863___
864	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
865$code.=<<___;
866	movdqa	($key), @XMM[8]		# last round key
867	pxor	@XMM[8], @XMM[6]
868	pxor	@XMM[8], @XMM[4]
869	pxor	@XMM[8], @XMM[2]
870	pxor	@XMM[8], @XMM[7]
871	pxor	@XMM[8], @XMM[3]
872	pxor	@XMM[8], @XMM[5]
873	pxor	@XMM[8], @XMM[0]
874	pxor	@XMM[8], @XMM[1]
875	ret
876.size	_bsaes_decrypt8,.-_bsaes_decrypt8
877___
878}
879{
880my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
881
882sub bitslice_key {
883my @x=reverse(@_[0..7]);
884my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
885
886	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
887$code.=<<___;
888	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
889	movdqa	@x[0], @x[2]
890	movdqa	@x[1], @x[3]
891___
892	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
893
894	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
895$code.=<<___;
896	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
897	movdqa	@x[0], @x[4]
898	movdqa	@x[2], @x[6]
899	movdqa	@x[1], @x[5]
900	movdqa	@x[3], @x[7]
901___
902	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
903	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
904}
905
906$code.=<<___;
907.type	_bsaes_key_convert,\@abi-omnipotent
908.align	16
909_bsaes_key_convert:
910	lea	.Lmasks(%rip), $const
911	movdqu	($inp), %xmm7		# load round 0 key
912	lea	0x10($inp), $inp
913	movdqa	0x00($const), %xmm0	# 0x01...
914	movdqa	0x10($const), %xmm1	# 0x02...
915	movdqa	0x20($const), %xmm2	# 0x04...
916	movdqa	0x30($const), %xmm3	# 0x08...
917	movdqa	0x40($const), %xmm4	# .LM0
918	pcmpeqd	%xmm5, %xmm5		# .LNOT
919
920	movdqu	($inp), %xmm6		# load round 1 key
921	movdqa	%xmm7, ($out)		# save round 0 key
922	lea	0x10($out), $out
923	dec	$rounds
924	jmp	.Lkey_loop
925.align	16
926.Lkey_loop:
927	pshufb	%xmm4, %xmm6		# .LM0
928
929	movdqa	%xmm0,	%xmm8
930	movdqa	%xmm1,	%xmm9
931
932	pand	%xmm6,	%xmm8
933	pand	%xmm6,	%xmm9
934	movdqa	%xmm2,	%xmm10
935	pcmpeqb	%xmm0,	%xmm8
936	psllq	\$4,	%xmm0		# 0x10...
937	movdqa	%xmm3,	%xmm11
938	pcmpeqb	%xmm1,	%xmm9
939	psllq	\$4,	%xmm1		# 0x20...
940
941	pand	%xmm6,	%xmm10
942	pand	%xmm6,	%xmm11
943	movdqa	%xmm0,	%xmm12
944	pcmpeqb	%xmm2,	%xmm10
945	psllq	\$4,	%xmm2		# 0x40...
946	movdqa	%xmm1,	%xmm13
947	pcmpeqb	%xmm3,	%xmm11
948	psllq	\$4,	%xmm3		# 0x80...
949
950	movdqa	%xmm2,	%xmm14
951	movdqa	%xmm3,	%xmm15
952	 pxor	%xmm5,	%xmm8		# "pnot"
953	 pxor	%xmm5,	%xmm9
954
955	pand	%xmm6,	%xmm12
956	pand	%xmm6,	%xmm13
957	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
958	pcmpeqb	%xmm0,	%xmm12
959	psrlq	\$4,	%xmm0		# 0x01...
960	 movdqa	%xmm9, 0x10($out)
961	pcmpeqb	%xmm1,	%xmm13
962	psrlq	\$4,	%xmm1		# 0x02...
963	 lea	0x10($inp), $inp
964
965	pand	%xmm6,	%xmm14
966	pand	%xmm6,	%xmm15
967	 movdqa	%xmm10, 0x20($out)
968	pcmpeqb	%xmm2,	%xmm14
969	psrlq	\$4,	%xmm2		# 0x04...
970	 movdqa	%xmm11, 0x30($out)
971	pcmpeqb	%xmm3,	%xmm15
972	psrlq	\$4,	%xmm3		# 0x08...
973	 movdqu	($inp), %xmm6		# load next round key
974
975	pxor	%xmm5, %xmm13		# "pnot"
976	pxor	%xmm5, %xmm14
977	movdqa	%xmm12, 0x40($out)
978	movdqa	%xmm13, 0x50($out)
979	movdqa	%xmm14, 0x60($out)
980	movdqa	%xmm15, 0x70($out)
981	lea	0x80($out),$out
982	dec	$rounds
983	jnz	.Lkey_loop
984
985	movdqa	0x50($const), %xmm7	# .L63
986	#movdqa	%xmm6, ($out)		# don't save last round key
987	ret
988.size	_bsaes_key_convert,.-_bsaes_key_convert
989___
990}
991
992if (0 && !$win64) {	# following four functions are unsupported interface
993			# used for benchmarking...
994$code.=<<___;
995.globl	bsaes_enc_key_convert
996.type	bsaes_enc_key_convert,\@function,2
997.align	16
998bsaes_enc_key_convert:
999	mov	240($inp),%r10d		# pass rounds
1000	mov	$inp,%rcx		# pass key
1001	mov	$out,%rax		# pass key schedule
1002	call	_bsaes_key_convert
1003	pxor	%xmm6,%xmm7		# fix up last round key
1004	movdqa	%xmm7,(%rax)		# save last round key
1005	ret
1006.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1007
1008.globl	bsaes_encrypt_128
1009.type	bsaes_encrypt_128,\@function,4
1010.align	16
1011bsaes_encrypt_128:
1012.Lenc128_loop:
1013	movdqu	0x00($inp), @XMM[0]	# load input
1014	movdqu	0x10($inp), @XMM[1]
1015	movdqu	0x20($inp), @XMM[2]
1016	movdqu	0x30($inp), @XMM[3]
1017	movdqu	0x40($inp), @XMM[4]
1018	movdqu	0x50($inp), @XMM[5]
1019	movdqu	0x60($inp), @XMM[6]
1020	movdqu	0x70($inp), @XMM[7]
1021	mov	$key, %rax		# pass the $key
1022	lea	0x80($inp), $inp
1023	mov	\$10,%r10d
1024
1025	call	_bsaes_encrypt8
1026
1027	movdqu	@XMM[0], 0x00($out)	# write output
1028	movdqu	@XMM[1], 0x10($out)
1029	movdqu	@XMM[4], 0x20($out)
1030	movdqu	@XMM[6], 0x30($out)
1031	movdqu	@XMM[3], 0x40($out)
1032	movdqu	@XMM[7], 0x50($out)
1033	movdqu	@XMM[2], 0x60($out)
1034	movdqu	@XMM[5], 0x70($out)
1035	lea	0x80($out), $out
1036	sub	\$0x80,$len
1037	ja	.Lenc128_loop
1038	ret
1039.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1040
1041.globl	bsaes_dec_key_convert
1042.type	bsaes_dec_key_convert,\@function,2
1043.align	16
1044bsaes_dec_key_convert:
1045	mov	240($inp),%r10d		# pass rounds
1046	mov	$inp,%rcx		# pass key
1047	mov	$out,%rax		# pass key schedule
1048	call	_bsaes_key_convert
1049	pxor	($out),%xmm7		# fix up round 0 key
1050	movdqa	%xmm6,(%rax)		# save last round key
1051	movdqa	%xmm7,($out)
1052	ret
1053.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1054
1055.globl	bsaes_decrypt_128
1056.type	bsaes_decrypt_128,\@function,4
1057.align	16
1058bsaes_decrypt_128:
1059.Ldec128_loop:
1060	movdqu	0x00($inp), @XMM[0]	# load input
1061	movdqu	0x10($inp), @XMM[1]
1062	movdqu	0x20($inp), @XMM[2]
1063	movdqu	0x30($inp), @XMM[3]
1064	movdqu	0x40($inp), @XMM[4]
1065	movdqu	0x50($inp), @XMM[5]
1066	movdqu	0x60($inp), @XMM[6]
1067	movdqu	0x70($inp), @XMM[7]
1068	mov	$key, %rax		# pass the $key
1069	lea	0x80($inp), $inp
1070	mov	\$10,%r10d
1071
1072	call	_bsaes_decrypt8
1073
1074	movdqu	@XMM[0], 0x00($out)	# write output
1075	movdqu	@XMM[1], 0x10($out)
1076	movdqu	@XMM[6], 0x20($out)
1077	movdqu	@XMM[4], 0x30($out)
1078	movdqu	@XMM[2], 0x40($out)
1079	movdqu	@XMM[7], 0x50($out)
1080	movdqu	@XMM[3], 0x60($out)
1081	movdqu	@XMM[5], 0x70($out)
1082	lea	0x80($out), $out
1083	sub	\$0x80,$len
1084	ja	.Ldec128_loop
1085	ret
1086.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1087___
1088}
1089{
1090######################################################################
1091#
1092# OpenSSL interface
1093#
1094my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1095						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1096my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1097
1098if ($ecb) {
1099$code.=<<___;
1100.globl	bsaes_ecb_encrypt_blocks
1101.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1102.align	16
1103bsaes_ecb_encrypt_blocks:
1104	mov	%rsp, %rax
1105.Lecb_enc_prologue:
1106	push	%rbp
1107	push	%rbx
1108	push	%r12
1109	push	%r13
1110	push	%r14
1111	push	%r15
1112	lea	-0x48(%rsp),%rsp
1113___
1114$code.=<<___ if ($win64);
1115	lea	-0xa0(%rsp), %rsp
1116	movaps	%xmm6, 0x40(%rsp)
1117	movaps	%xmm7, 0x50(%rsp)
1118	movaps	%xmm8, 0x60(%rsp)
1119	movaps	%xmm9, 0x70(%rsp)
1120	movaps	%xmm10, 0x80(%rsp)
1121	movaps	%xmm11, 0x90(%rsp)
1122	movaps	%xmm12, 0xa0(%rsp)
1123	movaps	%xmm13, 0xb0(%rsp)
1124	movaps	%xmm14, 0xc0(%rsp)
1125	movaps	%xmm15, 0xd0(%rsp)
1126.Lecb_enc_body:
1127___
1128$code.=<<___;
1129	mov	%rsp,%rbp		# backup %rsp
1130	mov	240($arg4),%eax		# rounds
1131	mov	$arg1,$inp		# backup arguments
1132	mov	$arg2,$out
1133	mov	$arg3,$len
1134	mov	$arg4,$key
1135	cmp	\$8,$arg3
1136	jb	.Lecb_enc_short
1137
1138	mov	%eax,%ebx		# backup rounds
1139	shl	\$7,%rax		# 128 bytes per inner round key
1140	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1141	sub	%rax,%rsp
1142	mov	%rsp,%rax		# pass key schedule
1143	mov	$key,%rcx		# pass key
1144	mov	%ebx,%r10d		# pass rounds
1145	call	_bsaes_key_convert
1146	pxor	%xmm6,%xmm7		# fix up last round key
1147	movdqa	%xmm7,(%rax)		# save last round key
1148
1149	sub	\$8,$len
1150.Lecb_enc_loop:
1151	movdqu	0x00($inp), @XMM[0]	# load input
1152	movdqu	0x10($inp), @XMM[1]
1153	movdqu	0x20($inp), @XMM[2]
1154	movdqu	0x30($inp), @XMM[3]
1155	movdqu	0x40($inp), @XMM[4]
1156	movdqu	0x50($inp), @XMM[5]
1157	mov	%rsp, %rax		# pass key schedule
1158	movdqu	0x60($inp), @XMM[6]
1159	mov	%ebx,%r10d		# pass rounds
1160	movdqu	0x70($inp), @XMM[7]
1161	lea	0x80($inp), $inp
1162
1163	call	_bsaes_encrypt8
1164
1165	movdqu	@XMM[0], 0x00($out)	# write output
1166	movdqu	@XMM[1], 0x10($out)
1167	movdqu	@XMM[4], 0x20($out)
1168	movdqu	@XMM[6], 0x30($out)
1169	movdqu	@XMM[3], 0x40($out)
1170	movdqu	@XMM[7], 0x50($out)
1171	movdqu	@XMM[2], 0x60($out)
1172	movdqu	@XMM[5], 0x70($out)
1173	lea	0x80($out), $out
1174	sub	\$8,$len
1175	jnc	.Lecb_enc_loop
1176
1177	add	\$8,$len
1178	jz	.Lecb_enc_done
1179
1180	movdqu	0x00($inp), @XMM[0]	# load input
1181	mov	%rsp, %rax		# pass key schedule
1182	mov	%ebx,%r10d		# pass rounds
1183	cmp	\$2,$len
1184	jb	.Lecb_enc_one
1185	movdqu	0x10($inp), @XMM[1]
1186	je	.Lecb_enc_two
1187	movdqu	0x20($inp), @XMM[2]
1188	cmp	\$4,$len
1189	jb	.Lecb_enc_three
1190	movdqu	0x30($inp), @XMM[3]
1191	je	.Lecb_enc_four
1192	movdqu	0x40($inp), @XMM[4]
1193	cmp	\$6,$len
1194	jb	.Lecb_enc_five
1195	movdqu	0x50($inp), @XMM[5]
1196	je	.Lecb_enc_six
1197	movdqu	0x60($inp), @XMM[6]
1198	call	_bsaes_encrypt8
1199	movdqu	@XMM[0], 0x00($out)	# write output
1200	movdqu	@XMM[1], 0x10($out)
1201	movdqu	@XMM[4], 0x20($out)
1202	movdqu	@XMM[6], 0x30($out)
1203	movdqu	@XMM[3], 0x40($out)
1204	movdqu	@XMM[7], 0x50($out)
1205	movdqu	@XMM[2], 0x60($out)
1206	jmp	.Lecb_enc_done
1207.align	16
1208.Lecb_enc_six:
1209	call	_bsaes_encrypt8
1210	movdqu	@XMM[0], 0x00($out)	# write output
1211	movdqu	@XMM[1], 0x10($out)
1212	movdqu	@XMM[4], 0x20($out)
1213	movdqu	@XMM[6], 0x30($out)
1214	movdqu	@XMM[3], 0x40($out)
1215	movdqu	@XMM[7], 0x50($out)
1216	jmp	.Lecb_enc_done
1217.align	16
1218.Lecb_enc_five:
1219	call	_bsaes_encrypt8
1220	movdqu	@XMM[0], 0x00($out)	# write output
1221	movdqu	@XMM[1], 0x10($out)
1222	movdqu	@XMM[4], 0x20($out)
1223	movdqu	@XMM[6], 0x30($out)
1224	movdqu	@XMM[3], 0x40($out)
1225	jmp	.Lecb_enc_done
1226.align	16
1227.Lecb_enc_four:
1228	call	_bsaes_encrypt8
1229	movdqu	@XMM[0], 0x00($out)	# write output
1230	movdqu	@XMM[1], 0x10($out)
1231	movdqu	@XMM[4], 0x20($out)
1232	movdqu	@XMM[6], 0x30($out)
1233	jmp	.Lecb_enc_done
1234.align	16
1235.Lecb_enc_three:
1236	call	_bsaes_encrypt8
1237	movdqu	@XMM[0], 0x00($out)	# write output
1238	movdqu	@XMM[1], 0x10($out)
1239	movdqu	@XMM[4], 0x20($out)
1240	jmp	.Lecb_enc_done
1241.align	16
1242.Lecb_enc_two:
1243	call	_bsaes_encrypt8
1244	movdqu	@XMM[0], 0x00($out)	# write output
1245	movdqu	@XMM[1], 0x10($out)
1246	jmp	.Lecb_enc_done
1247.align	16
1248.Lecb_enc_one:
1249	call	_bsaes_encrypt8
1250	movdqu	@XMM[0], 0x00($out)	# write output
1251	jmp	.Lecb_enc_done
1252.align	16
1253.Lecb_enc_short:
1254	lea	($inp), $arg1
1255	lea	($out), $arg2
1256	lea	($key), $arg3
1257	call	asm_AES_encrypt
1258	lea	16($inp), $inp
1259	lea	16($out), $out
1260	dec	$len
1261	jnz	.Lecb_enc_short
1262
1263.Lecb_enc_done:
1264	lea	(%rsp),%rax
1265	pxor	%xmm0, %xmm0
1266.Lecb_enc_bzero:			# wipe key schedule [if any]
1267	movdqa	%xmm0, 0x00(%rax)
1268	movdqa	%xmm0, 0x10(%rax)
1269	lea	0x20(%rax), %rax
1270	cmp	%rax, %rbp
1271	jb	.Lecb_enc_bzero
1272
1273	lea	(%rbp),%rsp		# restore %rsp
1274___
1275$code.=<<___ if ($win64);
1276	movaps	0x40(%rbp), %xmm6
1277	movaps	0x50(%rbp), %xmm7
1278	movaps	0x60(%rbp), %xmm8
1279	movaps	0x70(%rbp), %xmm9
1280	movaps	0x80(%rbp), %xmm10
1281	movaps	0x90(%rbp), %xmm11
1282	movaps	0xa0(%rbp), %xmm12
1283	movaps	0xb0(%rbp), %xmm13
1284	movaps	0xc0(%rbp), %xmm14
1285	movaps	0xd0(%rbp), %xmm15
1286	lea	0xa0(%rbp), %rsp
1287___
1288$code.=<<___;
1289	mov	0x48(%rsp), %r15
1290	mov	0x50(%rsp), %r14
1291	mov	0x58(%rsp), %r13
1292	mov	0x60(%rsp), %r12
1293	mov	0x68(%rsp), %rbx
1294	mov	0x70(%rsp), %rax
1295	lea	0x78(%rsp), %rsp
1296	mov	%rax, %rbp
1297.Lecb_enc_epilogue:
1298	ret
1299.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1300
1301.globl	bsaes_ecb_decrypt_blocks
1302.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1303.align	16
1304bsaes_ecb_decrypt_blocks:
1305	mov	%rsp, %rax
1306.Lecb_dec_prologue:
1307	push	%rbp
1308	push	%rbx
1309	push	%r12
1310	push	%r13
1311	push	%r14
1312	push	%r15
1313	lea	-0x48(%rsp),%rsp
1314___
1315$code.=<<___ if ($win64);
1316	lea	-0xa0(%rsp), %rsp
1317	movaps	%xmm6, 0x40(%rsp)
1318	movaps	%xmm7, 0x50(%rsp)
1319	movaps	%xmm8, 0x60(%rsp)
1320	movaps	%xmm9, 0x70(%rsp)
1321	movaps	%xmm10, 0x80(%rsp)
1322	movaps	%xmm11, 0x90(%rsp)
1323	movaps	%xmm12, 0xa0(%rsp)
1324	movaps	%xmm13, 0xb0(%rsp)
1325	movaps	%xmm14, 0xc0(%rsp)
1326	movaps	%xmm15, 0xd0(%rsp)
1327.Lecb_dec_body:
1328___
1329$code.=<<___;
1330	mov	%rsp,%rbp		# backup %rsp
1331	mov	240($arg4),%eax		# rounds
1332	mov	$arg1,$inp		# backup arguments
1333	mov	$arg2,$out
1334	mov	$arg3,$len
1335	mov	$arg4,$key
1336	cmp	\$8,$arg3
1337	jb	.Lecb_dec_short
1338
1339	mov	%eax,%ebx		# backup rounds
1340	shl	\$7,%rax		# 128 bytes per inner round key
1341	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1342	sub	%rax,%rsp
1343	mov	%rsp,%rax		# pass key schedule
1344	mov	$key,%rcx		# pass key
1345	mov	%ebx,%r10d		# pass rounds
1346	call	_bsaes_key_convert
1347	pxor	(%rsp),%xmm7		# fix up 0 round key
1348	movdqa	%xmm6,(%rax)		# save last round key
1349	movdqa	%xmm7,(%rsp)
1350
1351	sub	\$8,$len
1352.Lecb_dec_loop:
1353	movdqu	0x00($inp), @XMM[0]	# load input
1354	movdqu	0x10($inp), @XMM[1]
1355	movdqu	0x20($inp), @XMM[2]
1356	movdqu	0x30($inp), @XMM[3]
1357	movdqu	0x40($inp), @XMM[4]
1358	movdqu	0x50($inp), @XMM[5]
1359	mov	%rsp, %rax		# pass key schedule
1360	movdqu	0x60($inp), @XMM[6]
1361	mov	%ebx,%r10d		# pass rounds
1362	movdqu	0x70($inp), @XMM[7]
1363	lea	0x80($inp), $inp
1364
1365	call	_bsaes_decrypt8
1366
1367	movdqu	@XMM[0], 0x00($out)	# write output
1368	movdqu	@XMM[1], 0x10($out)
1369	movdqu	@XMM[6], 0x20($out)
1370	movdqu	@XMM[4], 0x30($out)
1371	movdqu	@XMM[2], 0x40($out)
1372	movdqu	@XMM[7], 0x50($out)
1373	movdqu	@XMM[3], 0x60($out)
1374	movdqu	@XMM[5], 0x70($out)
1375	lea	0x80($out), $out
1376	sub	\$8,$len
1377	jnc	.Lecb_dec_loop
1378
1379	add	\$8,$len
1380	jz	.Lecb_dec_done
1381
1382	movdqu	0x00($inp), @XMM[0]	# load input
1383	mov	%rsp, %rax		# pass key schedule
1384	mov	%ebx,%r10d		# pass rounds
1385	cmp	\$2,$len
1386	jb	.Lecb_dec_one
1387	movdqu	0x10($inp), @XMM[1]
1388	je	.Lecb_dec_two
1389	movdqu	0x20($inp), @XMM[2]
1390	cmp	\$4,$len
1391	jb	.Lecb_dec_three
1392	movdqu	0x30($inp), @XMM[3]
1393	je	.Lecb_dec_four
1394	movdqu	0x40($inp), @XMM[4]
1395	cmp	\$6,$len
1396	jb	.Lecb_dec_five
1397	movdqu	0x50($inp), @XMM[5]
1398	je	.Lecb_dec_six
1399	movdqu	0x60($inp), @XMM[6]
1400	call	_bsaes_decrypt8
1401	movdqu	@XMM[0], 0x00($out)	# write output
1402	movdqu	@XMM[1], 0x10($out)
1403	movdqu	@XMM[6], 0x20($out)
1404	movdqu	@XMM[4], 0x30($out)
1405	movdqu	@XMM[2], 0x40($out)
1406	movdqu	@XMM[7], 0x50($out)
1407	movdqu	@XMM[3], 0x60($out)
1408	jmp	.Lecb_dec_done
1409.align	16
1410.Lecb_dec_six:
1411	call	_bsaes_decrypt8
1412	movdqu	@XMM[0], 0x00($out)	# write output
1413	movdqu	@XMM[1], 0x10($out)
1414	movdqu	@XMM[6], 0x20($out)
1415	movdqu	@XMM[4], 0x30($out)
1416	movdqu	@XMM[2], 0x40($out)
1417	movdqu	@XMM[7], 0x50($out)
1418	jmp	.Lecb_dec_done
1419.align	16
1420.Lecb_dec_five:
1421	call	_bsaes_decrypt8
1422	movdqu	@XMM[0], 0x00($out)	# write output
1423	movdqu	@XMM[1], 0x10($out)
1424	movdqu	@XMM[6], 0x20($out)
1425	movdqu	@XMM[4], 0x30($out)
1426	movdqu	@XMM[2], 0x40($out)
1427	jmp	.Lecb_dec_done
1428.align	16
1429.Lecb_dec_four:
1430	call	_bsaes_decrypt8
1431	movdqu	@XMM[0], 0x00($out)	# write output
1432	movdqu	@XMM[1], 0x10($out)
1433	movdqu	@XMM[6], 0x20($out)
1434	movdqu	@XMM[4], 0x30($out)
1435	jmp	.Lecb_dec_done
1436.align	16
1437.Lecb_dec_three:
1438	call	_bsaes_decrypt8
1439	movdqu	@XMM[0], 0x00($out)	# write output
1440	movdqu	@XMM[1], 0x10($out)
1441	movdqu	@XMM[6], 0x20($out)
1442	jmp	.Lecb_dec_done
1443.align	16
1444.Lecb_dec_two:
1445	call	_bsaes_decrypt8
1446	movdqu	@XMM[0], 0x00($out)	# write output
1447	movdqu	@XMM[1], 0x10($out)
1448	jmp	.Lecb_dec_done
1449.align	16
1450.Lecb_dec_one:
1451	call	_bsaes_decrypt8
1452	movdqu	@XMM[0], 0x00($out)	# write output
1453	jmp	.Lecb_dec_done
1454.align	16
1455.Lecb_dec_short:
1456	lea	($inp), $arg1
1457	lea	($out), $arg2
1458	lea	($key), $arg3
1459	call	asm_AES_decrypt
1460	lea	16($inp), $inp
1461	lea	16($out), $out
1462	dec	$len
1463	jnz	.Lecb_dec_short
1464
1465.Lecb_dec_done:
1466	lea	(%rsp),%rax
1467	pxor	%xmm0, %xmm0
1468.Lecb_dec_bzero:			# wipe key schedule [if any]
1469	movdqa	%xmm0, 0x00(%rax)
1470	movdqa	%xmm0, 0x10(%rax)
1471	lea	0x20(%rax), %rax
1472	cmp	%rax, %rbp
1473	jb	.Lecb_dec_bzero
1474
1475	lea	(%rbp),%rsp		# restore %rsp
1476___
1477$code.=<<___ if ($win64);
1478	movaps	0x40(%rbp), %xmm6
1479	movaps	0x50(%rbp), %xmm7
1480	movaps	0x60(%rbp), %xmm8
1481	movaps	0x70(%rbp), %xmm9
1482	movaps	0x80(%rbp), %xmm10
1483	movaps	0x90(%rbp), %xmm11
1484	movaps	0xa0(%rbp), %xmm12
1485	movaps	0xb0(%rbp), %xmm13
1486	movaps	0xc0(%rbp), %xmm14
1487	movaps	0xd0(%rbp), %xmm15
1488	lea	0xa0(%rbp), %rsp
1489___
1490$code.=<<___;
1491	mov	0x48(%rsp), %r15
1492	mov	0x50(%rsp), %r14
1493	mov	0x58(%rsp), %r13
1494	mov	0x60(%rsp), %r12
1495	mov	0x68(%rsp), %rbx
1496	mov	0x70(%rsp), %rax
1497	lea	0x78(%rsp), %rsp
1498	mov	%rax, %rbp
1499.Lecb_dec_epilogue:
1500	ret
1501.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1502___
1503}
1504$code.=<<___;
1505.extern	asm_AES_cbc_encrypt
1506.globl	bsaes_cbc_encrypt
1507.type	bsaes_cbc_encrypt,\@abi-omnipotent
1508.align	16
1509bsaes_cbc_encrypt:
1510___
1511$code.=<<___ if ($win64);
1512	mov	48(%rsp),$arg6		# pull direction flag
1513___
1514$code.=<<___;
1515	cmp	\$0,$arg6
1516	jne	asm_AES_cbc_encrypt
1517	cmp	\$128,$arg3
1518	jb	asm_AES_cbc_encrypt
1519
1520	mov	%rsp, %rax
1521.Lcbc_dec_prologue:
1522	push	%rbp
1523	push	%rbx
1524	push	%r12
1525	push	%r13
1526	push	%r14
1527	push	%r15
1528	lea	-0x48(%rsp), %rsp
1529___
1530$code.=<<___ if ($win64);
1531	mov	0xa0(%rsp),$arg5	# pull ivp
1532	lea	-0xa0(%rsp), %rsp
1533	movaps	%xmm6, 0x40(%rsp)
1534	movaps	%xmm7, 0x50(%rsp)
1535	movaps	%xmm8, 0x60(%rsp)
1536	movaps	%xmm9, 0x70(%rsp)
1537	movaps	%xmm10, 0x80(%rsp)
1538	movaps	%xmm11, 0x90(%rsp)
1539	movaps	%xmm12, 0xa0(%rsp)
1540	movaps	%xmm13, 0xb0(%rsp)
1541	movaps	%xmm14, 0xc0(%rsp)
1542	movaps	%xmm15, 0xd0(%rsp)
1543.Lcbc_dec_body:
1544___
1545$code.=<<___;
1546	mov	%rsp, %rbp		# backup %rsp
1547	mov	240($arg4), %eax	# rounds
1548	mov	$arg1, $inp		# backup arguments
1549	mov	$arg2, $out
1550	mov	$arg3, $len
1551	mov	$arg4, $key
1552	mov	$arg5, %rbx
1553	shr	\$4, $len		# bytes to blocks
1554
1555	mov	%eax, %edx		# rounds
1556	shl	\$7, %rax		# 128 bytes per inner round key
1557	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1558	sub	%rax, %rsp
1559
1560	mov	%rsp, %rax		# pass key schedule
1561	mov	$key, %rcx		# pass key
1562	mov	%edx, %r10d		# pass rounds
1563	call	_bsaes_key_convert
1564	pxor	(%rsp),%xmm7		# fix up 0 round key
1565	movdqa	%xmm6,(%rax)		# save last round key
1566	movdqa	%xmm7,(%rsp)
1567
1568	movdqu	(%rbx), @XMM[15]	# load IV
1569	sub	\$8,$len
1570.Lcbc_dec_loop:
1571	movdqu	0x00($inp), @XMM[0]	# load input
1572	movdqu	0x10($inp), @XMM[1]
1573	movdqu	0x20($inp), @XMM[2]
1574	movdqu	0x30($inp), @XMM[3]
1575	movdqu	0x40($inp), @XMM[4]
1576	movdqu	0x50($inp), @XMM[5]
1577	mov	%rsp, %rax		# pass key schedule
1578	movdqu	0x60($inp), @XMM[6]
1579	mov	%edx,%r10d		# pass rounds
1580	movdqu	0x70($inp), @XMM[7]
1581	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1582
1583	call	_bsaes_decrypt8
1584
1585	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1586	movdqu	0x00($inp), @XMM[8]	# re-load input
1587	movdqu	0x10($inp), @XMM[9]
1588	pxor	@XMM[8], @XMM[1]
1589	movdqu	0x20($inp), @XMM[10]
1590	pxor	@XMM[9], @XMM[6]
1591	movdqu	0x30($inp), @XMM[11]
1592	pxor	@XMM[10], @XMM[4]
1593	movdqu	0x40($inp), @XMM[12]
1594	pxor	@XMM[11], @XMM[2]
1595	movdqu	0x50($inp), @XMM[13]
1596	pxor	@XMM[12], @XMM[7]
1597	movdqu	0x60($inp), @XMM[14]
1598	pxor	@XMM[13], @XMM[3]
1599	movdqu	0x70($inp), @XMM[15]	# IV
1600	pxor	@XMM[14], @XMM[5]
1601	movdqu	@XMM[0], 0x00($out)	# write output
1602	lea	0x80($inp), $inp
1603	movdqu	@XMM[1], 0x10($out)
1604	movdqu	@XMM[6], 0x20($out)
1605	movdqu	@XMM[4], 0x30($out)
1606	movdqu	@XMM[2], 0x40($out)
1607	movdqu	@XMM[7], 0x50($out)
1608	movdqu	@XMM[3], 0x60($out)
1609	movdqu	@XMM[5], 0x70($out)
1610	lea	0x80($out), $out
1611	sub	\$8,$len
1612	jnc	.Lcbc_dec_loop
1613
1614	add	\$8,$len
1615	jz	.Lcbc_dec_done
1616
1617	movdqu	0x00($inp), @XMM[0]	# load input
1618	mov	%rsp, %rax		# pass key schedule
1619	mov	%edx, %r10d		# pass rounds
1620	cmp	\$2,$len
1621	jb	.Lcbc_dec_one
1622	movdqu	0x10($inp), @XMM[1]
1623	je	.Lcbc_dec_two
1624	movdqu	0x20($inp), @XMM[2]
1625	cmp	\$4,$len
1626	jb	.Lcbc_dec_three
1627	movdqu	0x30($inp), @XMM[3]
1628	je	.Lcbc_dec_four
1629	movdqu	0x40($inp), @XMM[4]
1630	cmp	\$6,$len
1631	jb	.Lcbc_dec_five
1632	movdqu	0x50($inp), @XMM[5]
1633	je	.Lcbc_dec_six
1634	movdqu	0x60($inp), @XMM[6]
1635	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1636	call	_bsaes_decrypt8
1637	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1638	movdqu	0x00($inp), @XMM[8]	# re-load input
1639	movdqu	0x10($inp), @XMM[9]
1640	pxor	@XMM[8], @XMM[1]
1641	movdqu	0x20($inp), @XMM[10]
1642	pxor	@XMM[9], @XMM[6]
1643	movdqu	0x30($inp), @XMM[11]
1644	pxor	@XMM[10], @XMM[4]
1645	movdqu	0x40($inp), @XMM[12]
1646	pxor	@XMM[11], @XMM[2]
1647	movdqu	0x50($inp), @XMM[13]
1648	pxor	@XMM[12], @XMM[7]
1649	movdqu	0x60($inp), @XMM[15]	# IV
1650	pxor	@XMM[13], @XMM[3]
1651	movdqu	@XMM[0], 0x00($out)	# write output
1652	movdqu	@XMM[1], 0x10($out)
1653	movdqu	@XMM[6], 0x20($out)
1654	movdqu	@XMM[4], 0x30($out)
1655	movdqu	@XMM[2], 0x40($out)
1656	movdqu	@XMM[7], 0x50($out)
1657	movdqu	@XMM[3], 0x60($out)
1658	jmp	.Lcbc_dec_done
1659.align	16
1660.Lcbc_dec_six:
1661	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1662	call	_bsaes_decrypt8
1663	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1664	movdqu	0x00($inp), @XMM[8]	# re-load input
1665	movdqu	0x10($inp), @XMM[9]
1666	pxor	@XMM[8], @XMM[1]
1667	movdqu	0x20($inp), @XMM[10]
1668	pxor	@XMM[9], @XMM[6]
1669	movdqu	0x30($inp), @XMM[11]
1670	pxor	@XMM[10], @XMM[4]
1671	movdqu	0x40($inp), @XMM[12]
1672	pxor	@XMM[11], @XMM[2]
1673	movdqu	0x50($inp), @XMM[15]	# IV
1674	pxor	@XMM[12], @XMM[7]
1675	movdqu	@XMM[0], 0x00($out)	# write output
1676	movdqu	@XMM[1], 0x10($out)
1677	movdqu	@XMM[6], 0x20($out)
1678	movdqu	@XMM[4], 0x30($out)
1679	movdqu	@XMM[2], 0x40($out)
1680	movdqu	@XMM[7], 0x50($out)
1681	jmp	.Lcbc_dec_done
1682.align	16
1683.Lcbc_dec_five:
1684	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1685	call	_bsaes_decrypt8
1686	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1687	movdqu	0x00($inp), @XMM[8]	# re-load input
1688	movdqu	0x10($inp), @XMM[9]
1689	pxor	@XMM[8], @XMM[1]
1690	movdqu	0x20($inp), @XMM[10]
1691	pxor	@XMM[9], @XMM[6]
1692	movdqu	0x30($inp), @XMM[11]
1693	pxor	@XMM[10], @XMM[4]
1694	movdqu	0x40($inp), @XMM[15]	# IV
1695	pxor	@XMM[11], @XMM[2]
1696	movdqu	@XMM[0], 0x00($out)	# write output
1697	movdqu	@XMM[1], 0x10($out)
1698	movdqu	@XMM[6], 0x20($out)
1699	movdqu	@XMM[4], 0x30($out)
1700	movdqu	@XMM[2], 0x40($out)
1701	jmp	.Lcbc_dec_done
1702.align	16
1703.Lcbc_dec_four:
1704	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1705	call	_bsaes_decrypt8
1706	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1707	movdqu	0x00($inp), @XMM[8]	# re-load input
1708	movdqu	0x10($inp), @XMM[9]
1709	pxor	@XMM[8], @XMM[1]
1710	movdqu	0x20($inp), @XMM[10]
1711	pxor	@XMM[9], @XMM[6]
1712	movdqu	0x30($inp), @XMM[15]	# IV
1713	pxor	@XMM[10], @XMM[4]
1714	movdqu	@XMM[0], 0x00($out)	# write output
1715	movdqu	@XMM[1], 0x10($out)
1716	movdqu	@XMM[6], 0x20($out)
1717	movdqu	@XMM[4], 0x30($out)
1718	jmp	.Lcbc_dec_done
1719.align	16
1720.Lcbc_dec_three:
1721	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1722	call	_bsaes_decrypt8
1723	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1724	movdqu	0x00($inp), @XMM[8]	# re-load input
1725	movdqu	0x10($inp), @XMM[9]
1726	pxor	@XMM[8], @XMM[1]
1727	movdqu	0x20($inp), @XMM[15]	# IV
1728	pxor	@XMM[9], @XMM[6]
1729	movdqu	@XMM[0], 0x00($out)	# write output
1730	movdqu	@XMM[1], 0x10($out)
1731	movdqu	@XMM[6], 0x20($out)
1732	jmp	.Lcbc_dec_done
1733.align	16
1734.Lcbc_dec_two:
1735	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1736	call	_bsaes_decrypt8
1737	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1738	movdqu	0x00($inp), @XMM[8]	# re-load input
1739	movdqu	0x10($inp), @XMM[15]	# IV
1740	pxor	@XMM[8], @XMM[1]
1741	movdqu	@XMM[0], 0x00($out)	# write output
1742	movdqu	@XMM[1], 0x10($out)
1743	jmp	.Lcbc_dec_done
1744.align	16
1745.Lcbc_dec_one:
1746	lea	($inp), $arg1
1747	lea	0x20(%rbp), $arg2	# buffer output
1748	lea	($key), $arg3
1749	call	asm_AES_decrypt		# doesn't touch %xmm
1750	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1751	movdqu	@XMM[15], ($out)	# write output
1752	movdqa	@XMM[0], @XMM[15]	# IV
1753
1754.Lcbc_dec_done:
1755	movdqu	@XMM[15], (%rbx)	# return IV
1756	lea	(%rsp), %rax
1757	pxor	%xmm0, %xmm0
1758.Lcbc_dec_bzero:			# wipe key schedule [if any]
1759	movdqa	%xmm0, 0x00(%rax)
1760	movdqa	%xmm0, 0x10(%rax)
1761	lea	0x20(%rax), %rax
1762	cmp	%rax, %rbp
1763	ja	.Lcbc_dec_bzero
1764
1765	lea	(%rbp),%rsp		# restore %rsp
1766___
1767$code.=<<___ if ($win64);
1768	movaps	0x40(%rbp), %xmm6
1769	movaps	0x50(%rbp), %xmm7
1770	movaps	0x60(%rbp), %xmm8
1771	movaps	0x70(%rbp), %xmm9
1772	movaps	0x80(%rbp), %xmm10
1773	movaps	0x90(%rbp), %xmm11
1774	movaps	0xa0(%rbp), %xmm12
1775	movaps	0xb0(%rbp), %xmm13
1776	movaps	0xc0(%rbp), %xmm14
1777	movaps	0xd0(%rbp), %xmm15
1778	lea	0xa0(%rbp), %rsp
1779___
1780$code.=<<___;
1781	mov	0x48(%rsp), %r15
1782	mov	0x50(%rsp), %r14
1783	mov	0x58(%rsp), %r13
1784	mov	0x60(%rsp), %r12
1785	mov	0x68(%rsp), %rbx
1786	mov	0x70(%rsp), %rax
1787	lea	0x78(%rsp), %rsp
1788	mov	%rax, %rbp
1789.Lcbc_dec_epilogue:
1790	ret
1791.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1792
1793.globl	bsaes_ctr32_encrypt_blocks
1794.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1795.align	16
1796bsaes_ctr32_encrypt_blocks:
1797	mov	%rsp, %rax
1798.Lctr_enc_prologue:
1799	push	%rbp
1800	push	%rbx
1801	push	%r12
1802	push	%r13
1803	push	%r14
1804	push	%r15
1805	lea	-0x48(%rsp), %rsp
1806___
1807$code.=<<___ if ($win64);
1808	mov	0xa0(%rsp),$arg5	# pull ivp
1809	lea	-0xa0(%rsp), %rsp
1810	movaps	%xmm6, 0x40(%rsp)
1811	movaps	%xmm7, 0x50(%rsp)
1812	movaps	%xmm8, 0x60(%rsp)
1813	movaps	%xmm9, 0x70(%rsp)
1814	movaps	%xmm10, 0x80(%rsp)
1815	movaps	%xmm11, 0x90(%rsp)
1816	movaps	%xmm12, 0xa0(%rsp)
1817	movaps	%xmm13, 0xb0(%rsp)
1818	movaps	%xmm14, 0xc0(%rsp)
1819	movaps	%xmm15, 0xd0(%rsp)
1820.Lctr_enc_body:
1821___
1822$code.=<<___;
1823	mov	%rsp, %rbp		# backup %rsp
1824	movdqu	($arg5), %xmm0		# load counter
1825	mov	240($arg4), %eax	# rounds
1826	mov	$arg1, $inp		# backup arguments
1827	mov	$arg2, $out
1828	mov	$arg3, $len
1829	mov	$arg4, $key
1830	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1831	cmp	\$8, $arg3
1832	jb	.Lctr_enc_short
1833
1834	mov	%eax, %ebx		# rounds
1835	shl	\$7, %rax		# 128 bytes per inner round key
1836	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1837	sub	%rax, %rsp
1838
1839	mov	%rsp, %rax		# pass key schedule
1840	mov	$key, %rcx		# pass key
1841	mov	%ebx, %r10d		# pass rounds
1842	call	_bsaes_key_convert
1843	pxor	%xmm6,%xmm7		# fix up last round key
1844	movdqa	%xmm7,(%rax)		# save last round key
1845
1846	movdqa	(%rsp), @XMM[9]		# load round0 key
1847	lea	.LADD1(%rip), %r11
1848	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1849	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1850	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1851	pshufb	@XMM[8], @XMM[0]
1852	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1853	jmp	.Lctr_enc_loop
1854.align	16
1855.Lctr_enc_loop:
1856	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1857	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1858	movdqa	@XMM[0], @XMM[2]
1859	paddd	0x00(%r11), @XMM[1]	# .LADD1
1860	movdqa	@XMM[0], @XMM[3]
1861	paddd	0x10(%r11), @XMM[2]	# .LADD2
1862	movdqa	@XMM[0], @XMM[4]
1863	paddd	0x20(%r11), @XMM[3]	# .LADD3
1864	movdqa	@XMM[0], @XMM[5]
1865	paddd	0x30(%r11), @XMM[4]	# .LADD4
1866	movdqa	@XMM[0], @XMM[6]
1867	paddd	0x40(%r11), @XMM[5]	# .LADD5
1868	movdqa	@XMM[0], @XMM[7]
1869	paddd	0x50(%r11), @XMM[6]	# .LADD6
1870	paddd	0x60(%r11), @XMM[7]	# .LADD7
1871
1872	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1873	# to flip byte order in 32-bit counter
1874	movdqa	(%rsp), @XMM[9]		# round 0 key
1875	lea	0x10(%rsp), %rax	# pass key schedule
1876	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1877	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1878	pxor	@XMM[9], @XMM[1]
1879	 pshufb	@XMM[8], @XMM[0]
1880	pxor	@XMM[9], @XMM[2]
1881	 pshufb	@XMM[8], @XMM[1]
1882	pxor	@XMM[9], @XMM[3]
1883	 pshufb	@XMM[8], @XMM[2]
1884	pxor	@XMM[9], @XMM[4]
1885	 pshufb	@XMM[8], @XMM[3]
1886	pxor	@XMM[9], @XMM[5]
1887	 pshufb	@XMM[8], @XMM[4]
1888	pxor	@XMM[9], @XMM[6]
1889	 pshufb	@XMM[8], @XMM[5]
1890	pxor	@XMM[9], @XMM[7]
1891	 pshufb	@XMM[8], @XMM[6]
1892	lea	.LBS0(%rip), %r11	# constants table
1893	 pshufb	@XMM[8], @XMM[7]
1894	mov	%ebx,%r10d		# pass rounds
1895
1896	call	_bsaes_encrypt8_bitslice
1897
1898	sub	\$8,$len
1899	jc	.Lctr_enc_loop_done
1900
1901	movdqu	0x00($inp), @XMM[8]	# load input
1902	movdqu	0x10($inp), @XMM[9]
1903	movdqu	0x20($inp), @XMM[10]
1904	movdqu	0x30($inp), @XMM[11]
1905	movdqu	0x40($inp), @XMM[12]
1906	movdqu	0x50($inp), @XMM[13]
1907	movdqu	0x60($inp), @XMM[14]
1908	movdqu	0x70($inp), @XMM[15]
1909	lea	0x80($inp),$inp
1910	pxor	@XMM[0], @XMM[8]
1911	movdqa	0x20(%rbp), @XMM[0]	# load counter
1912	pxor	@XMM[9], @XMM[1]
1913	movdqu	@XMM[8], 0x00($out)	# write output
1914	pxor	@XMM[10], @XMM[4]
1915	movdqu	@XMM[1], 0x10($out)
1916	pxor	@XMM[11], @XMM[6]
1917	movdqu	@XMM[4], 0x20($out)
1918	pxor	@XMM[12], @XMM[3]
1919	movdqu	@XMM[6], 0x30($out)
1920	pxor	@XMM[13], @XMM[7]
1921	movdqu	@XMM[3], 0x40($out)
1922	pxor	@XMM[14], @XMM[2]
1923	movdqu	@XMM[7], 0x50($out)
1924	pxor	@XMM[15], @XMM[5]
1925	movdqu	@XMM[2], 0x60($out)
1926	lea	.LADD1(%rip), %r11
1927	movdqu	@XMM[5], 0x70($out)
1928	lea	0x80($out), $out
1929	paddd	0x70(%r11), @XMM[0]	# .LADD8
1930	jnz	.Lctr_enc_loop
1931
1932	jmp	.Lctr_enc_done
1933.align	16
1934.Lctr_enc_loop_done:
1935	add	\$8, $len
1936	movdqu	0x00($inp), @XMM[8]	# load input
1937	pxor	@XMM[8], @XMM[0]
1938	movdqu	@XMM[0], 0x00($out)	# write output
1939	cmp	\$2,$len
1940	jb	.Lctr_enc_done
1941	movdqu	0x10($inp), @XMM[9]
1942	pxor	@XMM[9], @XMM[1]
1943	movdqu	@XMM[1], 0x10($out)
1944	je	.Lctr_enc_done
1945	movdqu	0x20($inp), @XMM[10]
1946	pxor	@XMM[10], @XMM[4]
1947	movdqu	@XMM[4], 0x20($out)
1948	cmp	\$4,$len
1949	jb	.Lctr_enc_done
1950	movdqu	0x30($inp), @XMM[11]
1951	pxor	@XMM[11], @XMM[6]
1952	movdqu	@XMM[6], 0x30($out)
1953	je	.Lctr_enc_done
1954	movdqu	0x40($inp), @XMM[12]
1955	pxor	@XMM[12], @XMM[3]
1956	movdqu	@XMM[3], 0x40($out)
1957	cmp	\$6,$len
1958	jb	.Lctr_enc_done
1959	movdqu	0x50($inp), @XMM[13]
1960	pxor	@XMM[13], @XMM[7]
1961	movdqu	@XMM[7], 0x50($out)
1962	je	.Lctr_enc_done
1963	movdqu	0x60($inp), @XMM[14]
1964	pxor	@XMM[14], @XMM[2]
1965	movdqu	@XMM[2], 0x60($out)
1966	jmp	.Lctr_enc_done
1967
1968.align	16
1969.Lctr_enc_short:
1970	lea	0x20(%rbp), $arg1
1971	lea	0x30(%rbp), $arg2
1972	lea	($key), $arg3
1973	call	asm_AES_encrypt
1974	movdqu	($inp), @XMM[1]
1975	lea	16($inp), $inp
1976	mov	0x2c(%rbp), %eax	# load 32-bit counter
1977	bswap	%eax
1978	pxor	0x30(%rbp), @XMM[1]
1979	inc	%eax			# increment
1980	movdqu	@XMM[1], ($out)
1981	bswap	%eax
1982	lea	16($out), $out
1983	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
1984	dec	$len
1985	jnz	.Lctr_enc_short
1986
1987.Lctr_enc_done:
1988	lea	(%rsp), %rax
1989	pxor	%xmm0, %xmm0
1990.Lctr_enc_bzero:			# wipe key schedule [if any]
1991	movdqa	%xmm0, 0x00(%rax)
1992	movdqa	%xmm0, 0x10(%rax)
1993	lea	0x20(%rax), %rax
1994	cmp	%rax, %rbp
1995	ja	.Lctr_enc_bzero
1996
1997	lea	(%rbp),%rsp		# restore %rsp
1998___
1999$code.=<<___ if ($win64);
2000	movaps	0x40(%rbp), %xmm6
2001	movaps	0x50(%rbp), %xmm7
2002	movaps	0x60(%rbp), %xmm8
2003	movaps	0x70(%rbp), %xmm9
2004	movaps	0x80(%rbp), %xmm10
2005	movaps	0x90(%rbp), %xmm11
2006	movaps	0xa0(%rbp), %xmm12
2007	movaps	0xb0(%rbp), %xmm13
2008	movaps	0xc0(%rbp), %xmm14
2009	movaps	0xd0(%rbp), %xmm15
2010	lea	0xa0(%rbp), %rsp
2011___
2012$code.=<<___;
2013	mov	0x48(%rsp), %r15
2014	mov	0x50(%rsp), %r14
2015	mov	0x58(%rsp), %r13
2016	mov	0x60(%rsp), %r12
2017	mov	0x68(%rsp), %rbx
2018	mov	0x70(%rsp), %rax
2019	lea	0x78(%rsp), %rsp
2020	mov	%rax, %rbp
2021.Lctr_enc_epilogue:
2022	ret
2023.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2024___
2025######################################################################
2026# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2027#	const AES_KEY *key1, const AES_KEY *key2,
2028#	const unsigned char iv[16]);
2029#
2030my ($twmask,$twres,$twtmp)=@XMM[13..15];
2031$code.=<<___;
2032.globl	bsaes_xts_encrypt
2033.type	bsaes_xts_encrypt,\@abi-omnipotent
2034.align	16
2035bsaes_xts_encrypt:
2036	mov	%rsp, %rax
2037.Lxts_enc_prologue:
2038	push	%rbp
2039	push	%rbx
2040	push	%r12
2041	push	%r13
2042	push	%r14
2043	push	%r15
2044	lea	-0x48(%rsp), %rsp
2045___
2046$code.=<<___ if ($win64);
2047	mov	0xa0(%rsp),$arg5	# pull key2
2048	mov	0xa8(%rsp),$arg6	# pull ivp
2049	lea	-0xa0(%rsp), %rsp
2050	movaps	%xmm6, 0x40(%rsp)
2051	movaps	%xmm7, 0x50(%rsp)
2052	movaps	%xmm8, 0x60(%rsp)
2053	movaps	%xmm9, 0x70(%rsp)
2054	movaps	%xmm10, 0x80(%rsp)
2055	movaps	%xmm11, 0x90(%rsp)
2056	movaps	%xmm12, 0xa0(%rsp)
2057	movaps	%xmm13, 0xb0(%rsp)
2058	movaps	%xmm14, 0xc0(%rsp)
2059	movaps	%xmm15, 0xd0(%rsp)
2060.Lxts_enc_body:
2061___
2062$code.=<<___;
2063	mov	%rsp, %rbp		# backup %rsp
2064	mov	$arg1, $inp		# backup arguments
2065	mov	$arg2, $out
2066	mov	$arg3, $len
2067	mov	$arg4, $key
2068
2069	lea	($arg6), $arg1
2070	lea	0x20(%rbp), $arg2
2071	lea	($arg5), $arg3
2072	call	asm_AES_encrypt		# generate initial tweak
2073
2074	mov	240($key), %eax		# rounds
2075	mov	$len, %rbx		# backup $len
2076
2077	mov	%eax, %edx		# rounds
2078	shl	\$7, %rax		# 128 bytes per inner round key
2079	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2080	sub	%rax, %rsp
2081
2082	mov	%rsp, %rax		# pass key schedule
2083	mov	$key, %rcx		# pass key
2084	mov	%edx, %r10d		# pass rounds
2085	call	_bsaes_key_convert
2086	pxor	%xmm6, %xmm7		# fix up last round key
2087	movdqa	%xmm7, (%rax)		# save last round key
2088
2089	and	\$-16, $len
2090	sub	\$0x80, %rsp		# place for tweak[8]
2091	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2092
2093	pxor	$twtmp, $twtmp
2094	movdqa	.Lxts_magic(%rip), $twmask
2095	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2096
2097	sub	\$0x80, $len
2098	jc	.Lxts_enc_short
2099	jmp	.Lxts_enc_loop
2100
2101.align	16
2102.Lxts_enc_loop:
2103___
2104    for ($i=0;$i<7;$i++) {
2105    $code.=<<___;
2106	pshufd	\$0x13, $twtmp, $twres
2107	pxor	$twtmp, $twtmp
2108	movdqa	@XMM[7], @XMM[$i]
2109	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2110	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2111	pand	$twmask, $twres		# isolate carry and residue
2112	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2113	pxor	$twres, @XMM[7]
2114___
2115    $code.=<<___ if ($i>=1);
2116	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2117___
2118    $code.=<<___ if ($i>=2);
2119	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2120___
2121    }
2122$code.=<<___;
2123	movdqu	0x60($inp), @XMM[8+6]
2124	pxor	@XMM[8+5], @XMM[5]
2125	movdqu	0x70($inp), @XMM[8+7]
2126	lea	0x80($inp), $inp
2127	movdqa	@XMM[7], 0x70(%rsp)
2128	pxor	@XMM[8+6], @XMM[6]
2129	lea	0x80(%rsp), %rax	# pass key schedule
2130	pxor	@XMM[8+7], @XMM[7]
2131	mov	%edx, %r10d		# pass rounds
2132
2133	call	_bsaes_encrypt8
2134
2135	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2136	pxor	0x10(%rsp), @XMM[1]
2137	movdqu	@XMM[0], 0x00($out)	# write output
2138	pxor	0x20(%rsp), @XMM[4]
2139	movdqu	@XMM[1], 0x10($out)
2140	pxor	0x30(%rsp), @XMM[6]
2141	movdqu	@XMM[4], 0x20($out)
2142	pxor	0x40(%rsp), @XMM[3]
2143	movdqu	@XMM[6], 0x30($out)
2144	pxor	0x50(%rsp), @XMM[7]
2145	movdqu	@XMM[3], 0x40($out)
2146	pxor	0x60(%rsp), @XMM[2]
2147	movdqu	@XMM[7], 0x50($out)
2148	pxor	0x70(%rsp), @XMM[5]
2149	movdqu	@XMM[2], 0x60($out)
2150	movdqu	@XMM[5], 0x70($out)
2151	lea	0x80($out), $out
2152
2153	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2154	pxor	$twtmp, $twtmp
2155	movdqa	.Lxts_magic(%rip), $twmask
2156	pcmpgtd	@XMM[7], $twtmp
2157	pshufd	\$0x13, $twtmp, $twres
2158	pxor	$twtmp, $twtmp
2159	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2160	pand	$twmask, $twres		# isolate carry and residue
2161	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2162	pxor	$twres, @XMM[7]
2163
2164	sub	\$0x80,$len
2165	jnc	.Lxts_enc_loop
2166
2167.Lxts_enc_short:
2168	add	\$0x80, $len
2169	jz	.Lxts_enc_done
2170___
2171    for ($i=0;$i<7;$i++) {
2172    $code.=<<___;
2173	pshufd	\$0x13, $twtmp, $twres
2174	pxor	$twtmp, $twtmp
2175	movdqa	@XMM[7], @XMM[$i]
2176	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2177	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2178	pand	$twmask, $twres		# isolate carry and residue
2179	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2180	pxor	$twres, @XMM[7]
2181___
2182    $code.=<<___ if ($i>=1);
2183	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2184	cmp	\$`0x10*$i`,$len
2185	je	.Lxts_enc_$i
2186___
2187    $code.=<<___ if ($i>=2);
2188	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2189___
2190    }
2191$code.=<<___;
2192	movdqu	0x60($inp), @XMM[8+6]
2193	pxor	@XMM[8+5], @XMM[5]
2194	movdqa	@XMM[7], 0x70(%rsp)
2195	lea	0x70($inp), $inp
2196	pxor	@XMM[8+6], @XMM[6]
2197	lea	0x80(%rsp), %rax	# pass key schedule
2198	mov	%edx, %r10d		# pass rounds
2199
2200	call	_bsaes_encrypt8
2201
2202	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2203	pxor	0x10(%rsp), @XMM[1]
2204	movdqu	@XMM[0], 0x00($out)	# write output
2205	pxor	0x20(%rsp), @XMM[4]
2206	movdqu	@XMM[1], 0x10($out)
2207	pxor	0x30(%rsp), @XMM[6]
2208	movdqu	@XMM[4], 0x20($out)
2209	pxor	0x40(%rsp), @XMM[3]
2210	movdqu	@XMM[6], 0x30($out)
2211	pxor	0x50(%rsp), @XMM[7]
2212	movdqu	@XMM[3], 0x40($out)
2213	pxor	0x60(%rsp), @XMM[2]
2214	movdqu	@XMM[7], 0x50($out)
2215	movdqu	@XMM[2], 0x60($out)
2216	lea	0x70($out), $out
2217
2218	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2219	jmp	.Lxts_enc_done
2220.align	16
2221.Lxts_enc_6:
2222	pxor	@XMM[8+4], @XMM[4]
2223	lea	0x60($inp), $inp
2224	pxor	@XMM[8+5], @XMM[5]
2225	lea	0x80(%rsp), %rax	# pass key schedule
2226	mov	%edx, %r10d		# pass rounds
2227
2228	call	_bsaes_encrypt8
2229
2230	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2231	pxor	0x10(%rsp), @XMM[1]
2232	movdqu	@XMM[0], 0x00($out)	# write output
2233	pxor	0x20(%rsp), @XMM[4]
2234	movdqu	@XMM[1], 0x10($out)
2235	pxor	0x30(%rsp), @XMM[6]
2236	movdqu	@XMM[4], 0x20($out)
2237	pxor	0x40(%rsp), @XMM[3]
2238	movdqu	@XMM[6], 0x30($out)
2239	pxor	0x50(%rsp), @XMM[7]
2240	movdqu	@XMM[3], 0x40($out)
2241	movdqu	@XMM[7], 0x50($out)
2242	lea	0x60($out), $out
2243
2244	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2245	jmp	.Lxts_enc_done
2246.align	16
2247.Lxts_enc_5:
2248	pxor	@XMM[8+3], @XMM[3]
2249	lea	0x50($inp), $inp
2250	pxor	@XMM[8+4], @XMM[4]
2251	lea	0x80(%rsp), %rax	# pass key schedule
2252	mov	%edx, %r10d		# pass rounds
2253
2254	call	_bsaes_encrypt8
2255
2256	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2257	pxor	0x10(%rsp), @XMM[1]
2258	movdqu	@XMM[0], 0x00($out)	# write output
2259	pxor	0x20(%rsp), @XMM[4]
2260	movdqu	@XMM[1], 0x10($out)
2261	pxor	0x30(%rsp), @XMM[6]
2262	movdqu	@XMM[4], 0x20($out)
2263	pxor	0x40(%rsp), @XMM[3]
2264	movdqu	@XMM[6], 0x30($out)
2265	movdqu	@XMM[3], 0x40($out)
2266	lea	0x50($out), $out
2267
2268	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2269	jmp	.Lxts_enc_done
2270.align	16
2271.Lxts_enc_4:
2272	pxor	@XMM[8+2], @XMM[2]
2273	lea	0x40($inp), $inp
2274	pxor	@XMM[8+3], @XMM[3]
2275	lea	0x80(%rsp), %rax	# pass key schedule
2276	mov	%edx, %r10d		# pass rounds
2277
2278	call	_bsaes_encrypt8
2279
2280	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2281	pxor	0x10(%rsp), @XMM[1]
2282	movdqu	@XMM[0], 0x00($out)	# write output
2283	pxor	0x20(%rsp), @XMM[4]
2284	movdqu	@XMM[1], 0x10($out)
2285	pxor	0x30(%rsp), @XMM[6]
2286	movdqu	@XMM[4], 0x20($out)
2287	movdqu	@XMM[6], 0x30($out)
2288	lea	0x40($out), $out
2289
2290	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2291	jmp	.Lxts_enc_done
2292.align	16
2293.Lxts_enc_3:
2294	pxor	@XMM[8+1], @XMM[1]
2295	lea	0x30($inp), $inp
2296	pxor	@XMM[8+2], @XMM[2]
2297	lea	0x80(%rsp), %rax	# pass key schedule
2298	mov	%edx, %r10d		# pass rounds
2299
2300	call	_bsaes_encrypt8
2301
2302	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2303	pxor	0x10(%rsp), @XMM[1]
2304	movdqu	@XMM[0], 0x00($out)	# write output
2305	pxor	0x20(%rsp), @XMM[4]
2306	movdqu	@XMM[1], 0x10($out)
2307	movdqu	@XMM[4], 0x20($out)
2308	lea	0x30($out), $out
2309
2310	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2311	jmp	.Lxts_enc_done
2312.align	16
2313.Lxts_enc_2:
2314	pxor	@XMM[8+0], @XMM[0]
2315	lea	0x20($inp), $inp
2316	pxor	@XMM[8+1], @XMM[1]
2317	lea	0x80(%rsp), %rax	# pass key schedule
2318	mov	%edx, %r10d		# pass rounds
2319
2320	call	_bsaes_encrypt8
2321
2322	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2323	pxor	0x10(%rsp), @XMM[1]
2324	movdqu	@XMM[0], 0x00($out)	# write output
2325	movdqu	@XMM[1], 0x10($out)
2326	lea	0x20($out), $out
2327
2328	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2329	jmp	.Lxts_enc_done
2330.align	16
2331.Lxts_enc_1:
2332	pxor	@XMM[0], @XMM[8]
2333	lea	0x10($inp), $inp
2334	movdqa	@XMM[8], 0x20(%rbp)
2335	lea	0x20(%rbp), $arg1
2336	lea	0x20(%rbp), $arg2
2337	lea	($key), $arg3
2338	call	asm_AES_encrypt		# doesn't touch %xmm
2339	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2340	#pxor	@XMM[8], @XMM[0]
2341	#lea	0x80(%rsp), %rax	# pass key schedule
2342	#mov	%edx, %r10d		# pass rounds
2343	#call	_bsaes_encrypt8
2344	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2345	movdqu	@XMM[0], 0x00($out)	# write output
2346	lea	0x10($out), $out
2347
2348	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2349
2350.Lxts_enc_done:
2351	and	\$15, %ebx
2352	jz	.Lxts_enc_ret
2353	mov	$out, %rdx
2354
2355.Lxts_enc_steal:
2356	movzb	($inp), %eax
2357	movzb	-16(%rdx), %ecx
2358	lea	1($inp), $inp
2359	mov	%al, -16(%rdx)
2360	mov	%cl, 0(%rdx)
2361	lea	1(%rdx), %rdx
2362	sub	\$1,%ebx
2363	jnz	.Lxts_enc_steal
2364
2365	movdqu	-16($out), @XMM[0]
2366	lea	0x20(%rbp), $arg1
2367	pxor	@XMM[7], @XMM[0]
2368	lea	0x20(%rbp), $arg2
2369	movdqa	@XMM[0], 0x20(%rbp)
2370	lea	($key), $arg3
2371	call	asm_AES_encrypt		# doesn't touch %xmm
2372	pxor	0x20(%rbp), @XMM[7]
2373	movdqu	@XMM[7], -16($out)
2374
2375.Lxts_enc_ret:
2376	lea	(%rsp), %rax
2377	pxor	%xmm0, %xmm0
2378.Lxts_enc_bzero:			# wipe key schedule [if any]
2379	movdqa	%xmm0, 0x00(%rax)
2380	movdqa	%xmm0, 0x10(%rax)
2381	lea	0x20(%rax), %rax
2382	cmp	%rax, %rbp
2383	ja	.Lxts_enc_bzero
2384
2385	lea	(%rbp),%rsp		# restore %rsp
2386___
2387$code.=<<___ if ($win64);
2388	movaps	0x40(%rbp), %xmm6
2389	movaps	0x50(%rbp), %xmm7
2390	movaps	0x60(%rbp), %xmm8
2391	movaps	0x70(%rbp), %xmm9
2392	movaps	0x80(%rbp), %xmm10
2393	movaps	0x90(%rbp), %xmm11
2394	movaps	0xa0(%rbp), %xmm12
2395	movaps	0xb0(%rbp), %xmm13
2396	movaps	0xc0(%rbp), %xmm14
2397	movaps	0xd0(%rbp), %xmm15
2398	lea	0xa0(%rbp), %rsp
2399___
2400$code.=<<___;
2401	mov	0x48(%rsp), %r15
2402	mov	0x50(%rsp), %r14
2403	mov	0x58(%rsp), %r13
2404	mov	0x60(%rsp), %r12
2405	mov	0x68(%rsp), %rbx
2406	mov	0x70(%rsp), %rax
2407	lea	0x78(%rsp), %rsp
2408	mov	%rax, %rbp
2409.Lxts_enc_epilogue:
2410	ret
2411.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2412
2413.globl	bsaes_xts_decrypt
2414.type	bsaes_xts_decrypt,\@abi-omnipotent
2415.align	16
2416bsaes_xts_decrypt:
2417	mov	%rsp, %rax
2418.Lxts_dec_prologue:
2419	push	%rbp
2420	push	%rbx
2421	push	%r12
2422	push	%r13
2423	push	%r14
2424	push	%r15
2425	lea	-0x48(%rsp), %rsp
2426___
2427$code.=<<___ if ($win64);
2428	mov	0xa0(%rsp),$arg5	# pull key2
2429	mov	0xa8(%rsp),$arg6	# pull ivp
2430	lea	-0xa0(%rsp), %rsp
2431	movaps	%xmm6, 0x40(%rsp)
2432	movaps	%xmm7, 0x50(%rsp)
2433	movaps	%xmm8, 0x60(%rsp)
2434	movaps	%xmm9, 0x70(%rsp)
2435	movaps	%xmm10, 0x80(%rsp)
2436	movaps	%xmm11, 0x90(%rsp)
2437	movaps	%xmm12, 0xa0(%rsp)
2438	movaps	%xmm13, 0xb0(%rsp)
2439	movaps	%xmm14, 0xc0(%rsp)
2440	movaps	%xmm15, 0xd0(%rsp)
2441.Lxts_dec_body:
2442___
2443$code.=<<___;
2444	mov	%rsp, %rbp		# backup %rsp
2445	mov	$arg1, $inp		# backup arguments
2446	mov	$arg2, $out
2447	mov	$arg3, $len
2448	mov	$arg4, $key
2449
2450	lea	($arg6), $arg1
2451	lea	0x20(%rbp), $arg2
2452	lea	($arg5), $arg3
2453	call	asm_AES_encrypt		# generate initial tweak
2454
2455	mov	240($key), %eax		# rounds
2456	mov	$len, %rbx		# backup $len
2457
2458	mov	%eax, %edx		# rounds
2459	shl	\$7, %rax		# 128 bytes per inner round key
2460	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2461	sub	%rax, %rsp
2462
2463	mov	%rsp, %rax		# pass key schedule
2464	mov	$key, %rcx		# pass key
2465	mov	%edx, %r10d		# pass rounds
2466	call	_bsaes_key_convert
2467	pxor	(%rsp), %xmm7		# fix up round 0 key
2468	movdqa	%xmm6, (%rax)		# save last round key
2469	movdqa	%xmm7, (%rsp)
2470
2471	xor	%eax, %eax		# if ($len%16) len-=16;
2472	and	\$-16, $len
2473	test	\$15, %ebx
2474	setnz	%al
2475	shl	\$4, %rax
2476	sub	%rax, $len
2477
2478	sub	\$0x80, %rsp		# place for tweak[8]
2479	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2480
2481	pxor	$twtmp, $twtmp
2482	movdqa	.Lxts_magic(%rip), $twmask
2483	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2484
2485	sub	\$0x80, $len
2486	jc	.Lxts_dec_short
2487	jmp	.Lxts_dec_loop
2488
2489.align	16
2490.Lxts_dec_loop:
2491___
2492    for ($i=0;$i<7;$i++) {
2493    $code.=<<___;
2494	pshufd	\$0x13, $twtmp, $twres
2495	pxor	$twtmp, $twtmp
2496	movdqa	@XMM[7], @XMM[$i]
2497	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2498	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2499	pand	$twmask, $twres		# isolate carry and residue
2500	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2501	pxor	$twres, @XMM[7]
2502___
2503    $code.=<<___ if ($i>=1);
2504	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2505___
2506    $code.=<<___ if ($i>=2);
2507	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2508___
2509    }
2510$code.=<<___;
2511	movdqu	0x60($inp), @XMM[8+6]
2512	pxor	@XMM[8+5], @XMM[5]
2513	movdqu	0x70($inp), @XMM[8+7]
2514	lea	0x80($inp), $inp
2515	movdqa	@XMM[7], 0x70(%rsp)
2516	pxor	@XMM[8+6], @XMM[6]
2517	lea	0x80(%rsp), %rax	# pass key schedule
2518	pxor	@XMM[8+7], @XMM[7]
2519	mov	%edx, %r10d		# pass rounds
2520
2521	call	_bsaes_decrypt8
2522
2523	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2524	pxor	0x10(%rsp), @XMM[1]
2525	movdqu	@XMM[0], 0x00($out)	# write output
2526	pxor	0x20(%rsp), @XMM[6]
2527	movdqu	@XMM[1], 0x10($out)
2528	pxor	0x30(%rsp), @XMM[4]
2529	movdqu	@XMM[6], 0x20($out)
2530	pxor	0x40(%rsp), @XMM[2]
2531	movdqu	@XMM[4], 0x30($out)
2532	pxor	0x50(%rsp), @XMM[7]
2533	movdqu	@XMM[2], 0x40($out)
2534	pxor	0x60(%rsp), @XMM[3]
2535	movdqu	@XMM[7], 0x50($out)
2536	pxor	0x70(%rsp), @XMM[5]
2537	movdqu	@XMM[3], 0x60($out)
2538	movdqu	@XMM[5], 0x70($out)
2539	lea	0x80($out), $out
2540
2541	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2542	pxor	$twtmp, $twtmp
2543	movdqa	.Lxts_magic(%rip), $twmask
2544	pcmpgtd	@XMM[7], $twtmp
2545	pshufd	\$0x13, $twtmp, $twres
2546	pxor	$twtmp, $twtmp
2547	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2548	pand	$twmask, $twres		# isolate carry and residue
2549	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2550	pxor	$twres, @XMM[7]
2551
2552	sub	\$0x80,$len
2553	jnc	.Lxts_dec_loop
2554
2555.Lxts_dec_short:
2556	add	\$0x80, $len
2557	jz	.Lxts_dec_done
2558___
2559    for ($i=0;$i<7;$i++) {
2560    $code.=<<___;
2561	pshufd	\$0x13, $twtmp, $twres
2562	pxor	$twtmp, $twtmp
2563	movdqa	@XMM[7], @XMM[$i]
2564	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2565	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2566	pand	$twmask, $twres		# isolate carry and residue
2567	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2568	pxor	$twres, @XMM[7]
2569___
2570    $code.=<<___ if ($i>=1);
2571	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2572	cmp	\$`0x10*$i`,$len
2573	je	.Lxts_dec_$i
2574___
2575    $code.=<<___ if ($i>=2);
2576	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2577___
2578    }
2579$code.=<<___;
2580	movdqu	0x60($inp), @XMM[8+6]
2581	pxor	@XMM[8+5], @XMM[5]
2582	movdqa	@XMM[7], 0x70(%rsp)
2583	lea	0x70($inp), $inp
2584	pxor	@XMM[8+6], @XMM[6]
2585	lea	0x80(%rsp), %rax	# pass key schedule
2586	mov	%edx, %r10d		# pass rounds
2587
2588	call	_bsaes_decrypt8
2589
2590	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2591	pxor	0x10(%rsp), @XMM[1]
2592	movdqu	@XMM[0], 0x00($out)	# write output
2593	pxor	0x20(%rsp), @XMM[6]
2594	movdqu	@XMM[1], 0x10($out)
2595	pxor	0x30(%rsp), @XMM[4]
2596	movdqu	@XMM[6], 0x20($out)
2597	pxor	0x40(%rsp), @XMM[2]
2598	movdqu	@XMM[4], 0x30($out)
2599	pxor	0x50(%rsp), @XMM[7]
2600	movdqu	@XMM[2], 0x40($out)
2601	pxor	0x60(%rsp), @XMM[3]
2602	movdqu	@XMM[7], 0x50($out)
2603	movdqu	@XMM[3], 0x60($out)
2604	lea	0x70($out), $out
2605
2606	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2607	jmp	.Lxts_dec_done
2608.align	16
2609.Lxts_dec_6:
2610	pxor	@XMM[8+4], @XMM[4]
2611	lea	0x60($inp), $inp
2612	pxor	@XMM[8+5], @XMM[5]
2613	lea	0x80(%rsp), %rax	# pass key schedule
2614	mov	%edx, %r10d		# pass rounds
2615
2616	call	_bsaes_decrypt8
2617
2618	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2619	pxor	0x10(%rsp), @XMM[1]
2620	movdqu	@XMM[0], 0x00($out)	# write output
2621	pxor	0x20(%rsp), @XMM[6]
2622	movdqu	@XMM[1], 0x10($out)
2623	pxor	0x30(%rsp), @XMM[4]
2624	movdqu	@XMM[6], 0x20($out)
2625	pxor	0x40(%rsp), @XMM[2]
2626	movdqu	@XMM[4], 0x30($out)
2627	pxor	0x50(%rsp), @XMM[7]
2628	movdqu	@XMM[2], 0x40($out)
2629	movdqu	@XMM[7], 0x50($out)
2630	lea	0x60($out), $out
2631
2632	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2633	jmp	.Lxts_dec_done
2634.align	16
2635.Lxts_dec_5:
2636	pxor	@XMM[8+3], @XMM[3]
2637	lea	0x50($inp), $inp
2638	pxor	@XMM[8+4], @XMM[4]
2639	lea	0x80(%rsp), %rax	# pass key schedule
2640	mov	%edx, %r10d		# pass rounds
2641
2642	call	_bsaes_decrypt8
2643
2644	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2645	pxor	0x10(%rsp), @XMM[1]
2646	movdqu	@XMM[0], 0x00($out)	# write output
2647	pxor	0x20(%rsp), @XMM[6]
2648	movdqu	@XMM[1], 0x10($out)
2649	pxor	0x30(%rsp), @XMM[4]
2650	movdqu	@XMM[6], 0x20($out)
2651	pxor	0x40(%rsp), @XMM[2]
2652	movdqu	@XMM[4], 0x30($out)
2653	movdqu	@XMM[2], 0x40($out)
2654	lea	0x50($out), $out
2655
2656	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2657	jmp	.Lxts_dec_done
2658.align	16
2659.Lxts_dec_4:
2660	pxor	@XMM[8+2], @XMM[2]
2661	lea	0x40($inp), $inp
2662	pxor	@XMM[8+3], @XMM[3]
2663	lea	0x80(%rsp), %rax	# pass key schedule
2664	mov	%edx, %r10d		# pass rounds
2665
2666	call	_bsaes_decrypt8
2667
2668	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2669	pxor	0x10(%rsp), @XMM[1]
2670	movdqu	@XMM[0], 0x00($out)	# write output
2671	pxor	0x20(%rsp), @XMM[6]
2672	movdqu	@XMM[1], 0x10($out)
2673	pxor	0x30(%rsp), @XMM[4]
2674	movdqu	@XMM[6], 0x20($out)
2675	movdqu	@XMM[4], 0x30($out)
2676	lea	0x40($out), $out
2677
2678	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2679	jmp	.Lxts_dec_done
2680.align	16
2681.Lxts_dec_3:
2682	pxor	@XMM[8+1], @XMM[1]
2683	lea	0x30($inp), $inp
2684	pxor	@XMM[8+2], @XMM[2]
2685	lea	0x80(%rsp), %rax	# pass key schedule
2686	mov	%edx, %r10d		# pass rounds
2687
2688	call	_bsaes_decrypt8
2689
2690	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2691	pxor	0x10(%rsp), @XMM[1]
2692	movdqu	@XMM[0], 0x00($out)	# write output
2693	pxor	0x20(%rsp), @XMM[6]
2694	movdqu	@XMM[1], 0x10($out)
2695	movdqu	@XMM[6], 0x20($out)
2696	lea	0x30($out), $out
2697
2698	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2699	jmp	.Lxts_dec_done
2700.align	16
2701.Lxts_dec_2:
2702	pxor	@XMM[8+0], @XMM[0]
2703	lea	0x20($inp), $inp
2704	pxor	@XMM[8+1], @XMM[1]
2705	lea	0x80(%rsp), %rax	# pass key schedule
2706	mov	%edx, %r10d		# pass rounds
2707
2708	call	_bsaes_decrypt8
2709
2710	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2711	pxor	0x10(%rsp), @XMM[1]
2712	movdqu	@XMM[0], 0x00($out)	# write output
2713	movdqu	@XMM[1], 0x10($out)
2714	lea	0x20($out), $out
2715
2716	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2717	jmp	.Lxts_dec_done
2718.align	16
2719.Lxts_dec_1:
2720	pxor	@XMM[0], @XMM[8]
2721	lea	0x10($inp), $inp
2722	movdqa	@XMM[8], 0x20(%rbp)
2723	lea	0x20(%rbp), $arg1
2724	lea	0x20(%rbp), $arg2
2725	lea	($key), $arg3
2726	call	asm_AES_decrypt		# doesn't touch %xmm
2727	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2728	#pxor	@XMM[8], @XMM[0]
2729	#lea	0x80(%rsp), %rax	# pass key schedule
2730	#mov	%edx, %r10d		# pass rounds
2731	#call	_bsaes_decrypt8
2732	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2733	movdqu	@XMM[0], 0x00($out)	# write output
2734	lea	0x10($out), $out
2735
2736	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2737
2738.Lxts_dec_done:
2739	and	\$15, %ebx
2740	jz	.Lxts_dec_ret
2741
2742	pxor	$twtmp, $twtmp
2743	movdqa	.Lxts_magic(%rip), $twmask
2744	pcmpgtd	@XMM[7], $twtmp
2745	pshufd	\$0x13, $twtmp, $twres
2746	movdqa	@XMM[7], @XMM[6]
2747	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2748	pand	$twmask, $twres		# isolate carry and residue
2749	movdqu	($inp), @XMM[0]
2750	pxor	$twres, @XMM[7]
2751
2752	lea	0x20(%rbp), $arg1
2753	pxor	@XMM[7], @XMM[0]
2754	lea	0x20(%rbp), $arg2
2755	movdqa	@XMM[0], 0x20(%rbp)
2756	lea	($key), $arg3
2757	call	asm_AES_decrypt		# doesn't touch %xmm
2758	pxor	0x20(%rbp), @XMM[7]
2759	mov	$out, %rdx
2760	movdqu	@XMM[7], ($out)
2761
2762.Lxts_dec_steal:
2763	movzb	16($inp), %eax
2764	movzb	(%rdx), %ecx
2765	lea	1($inp), $inp
2766	mov	%al, (%rdx)
2767	mov	%cl, 16(%rdx)
2768	lea	1(%rdx), %rdx
2769	sub	\$1,%ebx
2770	jnz	.Lxts_dec_steal
2771
2772	movdqu	($out), @XMM[0]
2773	lea	0x20(%rbp), $arg1
2774	pxor	@XMM[6], @XMM[0]
2775	lea	0x20(%rbp), $arg2
2776	movdqa	@XMM[0], 0x20(%rbp)
2777	lea	($key), $arg3
2778	call	asm_AES_decrypt		# doesn't touch %xmm
2779	pxor	0x20(%rbp), @XMM[6]
2780	movdqu	@XMM[6], ($out)
2781
2782.Lxts_dec_ret:
2783	lea	(%rsp), %rax
2784	pxor	%xmm0, %xmm0
2785.Lxts_dec_bzero:			# wipe key schedule [if any]
2786	movdqa	%xmm0, 0x00(%rax)
2787	movdqa	%xmm0, 0x10(%rax)
2788	lea	0x20(%rax), %rax
2789	cmp	%rax, %rbp
2790	ja	.Lxts_dec_bzero
2791
2792	lea	(%rbp),%rsp		# restore %rsp
2793___
2794$code.=<<___ if ($win64);
2795	movaps	0x40(%rbp), %xmm6
2796	movaps	0x50(%rbp), %xmm7
2797	movaps	0x60(%rbp), %xmm8
2798	movaps	0x70(%rbp), %xmm9
2799	movaps	0x80(%rbp), %xmm10
2800	movaps	0x90(%rbp), %xmm11
2801	movaps	0xa0(%rbp), %xmm12
2802	movaps	0xb0(%rbp), %xmm13
2803	movaps	0xc0(%rbp), %xmm14
2804	movaps	0xd0(%rbp), %xmm15
2805	lea	0xa0(%rbp), %rsp
2806___
2807$code.=<<___;
2808	mov	0x48(%rsp), %r15
2809	mov	0x50(%rsp), %r14
2810	mov	0x58(%rsp), %r13
2811	mov	0x60(%rsp), %r12
2812	mov	0x68(%rsp), %rbx
2813	mov	0x70(%rsp), %rax
2814	lea	0x78(%rsp), %rsp
2815	mov	%rax, %rbp
2816.Lxts_dec_epilogue:
2817	ret
2818.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2819___
2820}
2821$code.=<<___;
2822.type	_bsaes_const,\@object
2823.align	64
2824_bsaes_const:
2825.LM0ISR:	# InvShiftRows constants
2826	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2827.LISRM0:
2828	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2829.LISR:
2830	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2831.LBS0:		# bit-slice constants
2832	.quad	0x5555555555555555, 0x5555555555555555
2833.LBS1:
2834	.quad	0x3333333333333333, 0x3333333333333333
2835.LBS2:
2836	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2837.LSR:		# shiftrows constants
2838	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2839.LSRM0:
2840	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2841.LM0SR:
2842	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2843.LSWPUP:	# byte-swap upper dword
2844	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2845.LSWPUPM0SR:
2846	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2847.LADD1:		# counter increment constants
2848	.quad	0x0000000000000000, 0x0000000100000000
2849.LADD2:
2850	.quad	0x0000000000000000, 0x0000000200000000
2851.LADD3:
2852	.quad	0x0000000000000000, 0x0000000300000000
2853.LADD4:
2854	.quad	0x0000000000000000, 0x0000000400000000
2855.LADD5:
2856	.quad	0x0000000000000000, 0x0000000500000000
2857.LADD6:
2858	.quad	0x0000000000000000, 0x0000000600000000
2859.LADD7:
2860	.quad	0x0000000000000000, 0x0000000700000000
2861.LADD8:
2862	.quad	0x0000000000000000, 0x0000000800000000
2863.Lxts_magic:
2864	.long	0x87,0,1,0
2865.Lmasks:
2866	.quad	0x0101010101010101, 0x0101010101010101
2867	.quad	0x0202020202020202, 0x0202020202020202
2868	.quad	0x0404040404040404, 0x0404040404040404
2869	.quad	0x0808080808080808, 0x0808080808080808
2870.LM0:
2871	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2872.L63:
2873	.quad	0x6363636363636363, 0x6363636363636363
2874.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2875.align	64
2876.size	_bsaes_const,.-_bsaes_const
2877___
2878
2879# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2880#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2881if ($win64) {
2882$rec="%rcx";
2883$frame="%rdx";
2884$context="%r8";
2885$disp="%r9";
2886
2887$code.=<<___;
2888.extern	__imp_RtlVirtualUnwind
2889.type	se_handler,\@abi-omnipotent
2890.align	16
2891se_handler:
2892	push	%rsi
2893	push	%rdi
2894	push	%rbx
2895	push	%rbp
2896	push	%r12
2897	push	%r13
2898	push	%r14
2899	push	%r15
2900	pushfq
2901	sub	\$64,%rsp
2902
2903	mov	120($context),%rax	# pull context->Rax
2904	mov	248($context),%rbx	# pull context->Rip
2905
2906	mov	8($disp),%rsi		# disp->ImageBase
2907	mov	56($disp),%r11		# disp->HandlerData
2908
2909	mov	0(%r11),%r10d		# HandlerData[0]
2910	lea	(%rsi,%r10),%r10	# prologue label
2911	cmp	%r10,%rbx		# context->Rip<prologue label
2912	jb	.Lin_prologue
2913
2914	mov	152($context),%rax	# pull context->Rsp
2915
2916	mov	4(%r11),%r10d		# HandlerData[1]
2917	lea	(%rsi,%r10),%r10	# epilogue label
2918	cmp	%r10,%rbx		# context->Rip>=epilogue label
2919	jae	.Lin_prologue
2920
2921	mov	160($context),%rax	# pull context->Rbp
2922
2923	lea	0x40(%rax),%rsi		# %xmm save area
2924	lea	512($context),%rdi	# &context.Xmm6
2925	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2926	.long	0xa548f3fc		# cld; rep movsq
2927	lea	0xa0(%rax),%rax		# adjust stack pointer
2928
2929	mov	0x70(%rax),%rbp
2930	mov	0x68(%rax),%rbx
2931	mov	0x60(%rax),%r12
2932	mov	0x58(%rax),%r13
2933	mov	0x50(%rax),%r14
2934	mov	0x48(%rax),%r15
2935	lea	0x78(%rax),%rax		# adjust stack pointer
2936	mov	%rbx,144($context)	# restore context->Rbx
2937	mov	%rbp,160($context)	# restore context->Rbp
2938	mov	%r12,216($context)	# restore context->R12
2939	mov	%r13,224($context)	# restore context->R13
2940	mov	%r14,232($context)	# restore context->R14
2941	mov	%r15,240($context)	# restore context->R15
2942
2943.Lin_prologue:
2944	mov	%rax,152($context)	# restore context->Rsp
2945
2946	mov	40($disp),%rdi		# disp->ContextRecord
2947	mov	$context,%rsi		# context
2948	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
2949	.long	0xa548f3fc		# cld; rep movsq
2950
2951	mov	$disp,%rsi
2952	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2953	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2954	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2955	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2956	mov	40(%rsi),%r10		# disp->ContextRecord
2957	lea	56(%rsi),%r11		# &disp->HandlerData
2958	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2959	mov	%r10,32(%rsp)		# arg5
2960	mov	%r11,40(%rsp)		# arg6
2961	mov	%r12,48(%rsp)		# arg7
2962	mov	%rcx,56(%rsp)		# arg8, (NULL)
2963	call	*__imp_RtlVirtualUnwind(%rip)
2964
2965	mov	\$1,%eax		# ExceptionContinueSearch
2966	add	\$64,%rsp
2967	popfq
2968	pop	%r15
2969	pop	%r14
2970	pop	%r13
2971	pop	%r12
2972	pop	%rbp
2973	pop	%rbx
2974	pop	%rdi
2975	pop	%rsi
2976	ret
2977.size	se_handler,.-se_handler
2978
2979.section	.pdata
2980.align	4
2981___
2982$code.=<<___ if ($ecb);
2983	.rva	.Lecb_enc_prologue
2984	.rva	.Lecb_enc_epilogue
2985	.rva	.Lecb_enc_info
2986
2987	.rva	.Lecb_dec_prologue
2988	.rva	.Lecb_dec_epilogue
2989	.rva	.Lecb_dec_info
2990___
2991$code.=<<___;
2992	.rva	.Lcbc_dec_prologue
2993	.rva	.Lcbc_dec_epilogue
2994	.rva	.Lcbc_dec_info
2995
2996	.rva	.Lctr_enc_prologue
2997	.rva	.Lctr_enc_epilogue
2998	.rva	.Lctr_enc_info
2999
3000	.rva	.Lxts_enc_prologue
3001	.rva	.Lxts_enc_epilogue
3002	.rva	.Lxts_enc_info
3003
3004	.rva	.Lxts_dec_prologue
3005	.rva	.Lxts_dec_epilogue
3006	.rva	.Lxts_dec_info
3007
3008.section	.xdata
3009.align	8
3010___
3011$code.=<<___ if ($ecb);
3012.Lecb_enc_info:
3013	.byte	9,0,0,0
3014	.rva	se_handler
3015	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3016.Lecb_dec_info:
3017	.byte	9,0,0,0
3018	.rva	se_handler
3019	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3020___
3021$code.=<<___;
3022.Lcbc_dec_info:
3023	.byte	9,0,0,0
3024	.rva	se_handler
3025	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3026.Lctr_enc_info:
3027	.byte	9,0,0,0
3028	.rva	se_handler
3029	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3030.Lxts_enc_info:
3031	.byte	9,0,0,0
3032	.rva	se_handler
3033	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3034.Lxts_dec_info:
3035	.byte	9,0,0,0
3036	.rva	se_handler
3037	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3038___
3039}
3040
3041$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3042
3043print $code;
3044
3045close STDOUT;
3046