xref: /freebsd/crypto/openssl/crypto/aes/asm/bsaes-x86_64.pl (revision 31d62a73c2e6ac0ff413a7a17700ffc7dce254ef)
1#! /usr/bin/env perl
2# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10###################################################################
11### AES-128 [originally in CTR mode]				###
12### bitsliced implementation for Intel Core 2 processors	###
13### requires support of SSE extensions up to SSSE3		###
14### Author: Emilia Käsper and Peter Schwabe			###
15### Date: 2009-03-19						###
16### Public domain						###
17###								###
18### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
19### further information.					###
20###################################################################
21#
22# September 2011.
23#
24# Started as a transliteration to "perlasm"; the original code has
25# since undergone the following changes:
26#
27# - code was made position-independent;
28# - rounds were folded into a loop resulting in >5x size reduction
29#   from 12.5KB to 2.2KB;
30# - above was possible thanks to mixcolumns() modification that
31#   allowed to feed its output back to aesenc[last], this was
32#   achieved at cost of two additional inter-registers moves;
33# - some instruction reordering and interleaving;
34# - this module doesn't implement key setup subroutine, instead it
35#   relies on conversion of "conventional" key schedule as returned
36#   by AES_set_encrypt_key (see discussion below);
37# - first and last round keys are treated differently, which allowed
38#   to skip one shiftrows(), reduce bit-sliced key schedule and
39#   speed-up conversion by 22%;
40# - support for 192- and 256-bit keys was added;
41#
42# Resulting performance in CPU cycles spent to encrypt one byte out
43# of 4096-byte buffer with 128-bit key is:
44#
45#		Emilia's	this(*)		difference
46#
47# Core 2    	9.30		8.69		+7%
48# Nehalem(**) 	7.63		6.88		+11%
49# Atom	    	17.1		16.4		+4%
50# Silvermont	-		12.9
51# Goldmont	-		8.85
52#
53# (*)	Comparison is not completely fair, because "this" is ECB,
54#	i.e. no extra processing such as counter values calculation
55#	and xor-ing input as in Emilia's CTR implementation is
56#	performed. However, the CTR calculations stand for not more
57#	than 1% of total time, so comparison is *rather* fair.
58#
59# (**)	Results were collected on Westmere, which is considered to
60#	be equivalent to Nehalem for this code.
61#
62# As for key schedule conversion subroutine. Interface to OpenSSL
63# relies on per-invocation on-the-fly conversion. This naturally
64# has impact on performance, especially for short inputs. Conversion
65# time in CPU cycles and its ratio to CPU cycles spent in 8x block
66# function is:
67#
68# 		conversion	conversion/8x block
69# Core 2	240		0.22
70# Nehalem	180		0.20
71# Atom		430		0.20
72#
73# The ratio values mean that 128-byte blocks will be processed
74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75# etc. Then keep in mind that input sizes not divisible by 128 are
76# *effectively* slower, especially shortest ones, e.g. consecutive
77# 144-byte blocks are processed 44% slower than one would expect,
78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79# it's still faster than ["hyper-threading-safe" code path in]
80# aes-x86_64.pl on all lengths above 64 bytes...
81#
82# October 2011.
83#
84# Add decryption procedure. Performance in CPU cycles spent to decrypt
85# one byte out of 4096-byte buffer with 128-bit key is:
86#
87# Core 2	9.98
88# Nehalem	7.80
89# Atom		17.9
90# Silvermont	14.0
91# Goldmont	10.2
92#
93# November 2011.
94#
95# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
96# suboptimal, but XTS is meant to be used with larger blocks...
97#
98#						<appro@openssl.org>
99
# Command line: [flavour] output.  If the first argument contains a dot it is
# taken to be the output file name and no flavour was given.
100$flavour = shift;
101$output  = shift;
102if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
103
# Win64 code paths are selected for nasm/masm/mingw64 flavours or for a
# ".asm" output file.
104$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
105
# Locate the x86_64-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it.
106$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109die "can't locate x86_64-xlate.pl";
110
# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the translator cannot be started instead of silently
# producing no output.
111open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
112*STDOUT=*OUT;
113
# File-scope register aliases.  Five names are declared but only four values
# are supplied, so $ivp is left undefined by this statement.
114my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
# @XMM[0] is %xmm15 and @XMM[1..15] are %xmm0..%xmm14.
115my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
116my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
117
# Inner scope for the code generators below; $key is shadowed here and
# refers to %rax until the matching closing brace.
118{
119my ($key,$rounds,$const)=("%rax","%r10d","%r11");
120
# Sbox(@b[0..7], @t[0..3], @s[0..3])
# Appends the bit-sliced S-box to $code by chaining three generators:
# InBasisChange, Inv_GF256 and OutBasisChange.  Arguments 0..7 are the eight
# state registers, 8..11 and 12..15 are temporaries handed to Inv_GF256.
# The register permutations passed between the stages follow the bit orders
# stated in the comments of the individual generators.
121sub Sbox {
122# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
124my @b=@_[0..7];
125my @t=@_[8..11];
126my @s=@_[12..15];
127	&InBasisChange	(@b);
128	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
129	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
130}
131
# InBasisChange(@b[0..7])
# Appends a fixed sequence of 13 pxor instructions over the eight given
# registers to $code.  No temporaries are used; the registers are modified
# in place.
132sub InBasisChange {
133# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
134# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
135my @b=@_[0..7];
136$code.=<<___;
137	pxor	@b[6], @b[5]
138	pxor	@b[1], @b[2]
139	pxor	@b[0], @b[3]
140	pxor	@b[2], @b[6]
141	pxor 	@b[0], @b[5]
142
143	pxor	@b[3], @b[6]
144	pxor	@b[7], @b[3]
145	pxor	@b[5], @b[7]
146	pxor	@b[4], @b[3]
147	pxor	@b[5], @b[4]
148	pxor	@b[1], @b[3]
149
150	pxor	@b[7], @b[2]
151	pxor	@b[5], @b[1]
152___
153}
154
# OutBasisChange(@b[0..7])
# Appends a fixed sequence of 11 pxor instructions over the eight given
# registers to $code.  No temporaries are used; the registers are modified
# in place.
155sub OutBasisChange {
156# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
157# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
158my @b=@_[0..7];
159$code.=<<___;
160	pxor	@b[6], @b[0]
161	pxor	@b[4], @b[1]
162	pxor	@b[0], @b[2]
163	pxor	@b[6], @b[4]
164	pxor	@b[1], @b[6]
165
166	pxor	@b[5], @b[1]
167	pxor	@b[3], @b[5]
168	pxor	@b[7], @b[3]
169	pxor	@b[5], @b[7]
170	pxor	@b[5], @b[2]
171
172	pxor	@b[7], @b[4]
173___
174}
175
# InvSbox(@b[0..7], @t[0..3], @s[0..3])
# Decryption counterpart of Sbox: chains InvInBasisChange, the shared
# Inv_GF256 generator and InvOutBasisChange.  Arguments 0..7 are the eight
# state registers, 8..11 and 12..15 are temporaries handed to Inv_GF256.
176sub InvSbox {
177# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
178# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
179my @b=@_[0..7];
180my @t=@_[8..11];
181my @s=@_[12..15];
182	&InvInBasisChange	(@b);
183	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
184	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
185}
186
# InvInBasisChange(@b[0..7])
# The incoming registers are first permuted with the index list
# [5,1,2,6,3,7,0,4]; the 11 pxor instructions are then emitted over the
# permuted names.  The heredoc statement has no trailing semicolon, which is
# legal because it is the last statement in the sub body.
187sub InvInBasisChange {		# OutBasisChange in reverse
188my @b=@_[5,1,2,6,3,7,0,4];
189$code.=<<___
190	pxor	@b[7], @b[4]
191
192	pxor	@b[5], @b[7]
193	pxor	@b[5], @b[2]
194	pxor	@b[7], @b[3]
195	pxor	@b[3], @b[5]
196	pxor	@b[5], @b[1]
197
198	pxor	@b[1], @b[6]
199	pxor	@b[0], @b[2]
200	pxor	@b[6], @b[4]
201	pxor	@b[6], @b[0]
202	pxor	@b[4], @b[1]
203___
204}
205
# InvOutBasisChange(@b[0..7])
# The incoming registers are first permuted with the index list
# [2,5,7,3,6,1,0,4]; the 13 pxor instructions are then emitted over the
# permuted names.  Registers are modified in place, no temporaries.
206sub InvOutBasisChange {		# InBasisChange in reverse
207my @b=@_[2,5,7,3,6,1,0,4];
208$code.=<<___;
209	pxor	@b[5], @b[1]
210	pxor	@b[7], @b[2]
211
212	pxor	@b[1], @b[3]
213	pxor	@b[5], @b[4]
214	pxor	@b[5], @b[7]
215	pxor	@b[4], @b[3]
216	 pxor 	@b[0], @b[5]
217	pxor	@b[7], @b[3]
218	 pxor	@b[2], @b[6]
219	 pxor	@b[1], @b[2]
220	pxor	@b[3], @b[6]
221
222	pxor	@b[0], @b[3]
223	pxor	@b[6], @b[5]
224___
225}
226
# Mul_GF4($x0,$x1,$y0,$y1,$t0)
# Emits 8 instructions (movdqa/pxor/pand).  The result replaces $x0/$x1,
# $t0 is clobbered, and $y0/$y1 are only read.
227sub Mul_GF4 {
228#;*************************************************************
229#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
230#;*************************************************************
231my ($x0,$x1,$y0,$y1,$t0)=@_;
232$code.=<<___;
233	movdqa	$y0, $t0
234	pxor 	$y1, $t0
235	pand	$x0, $t0
236	pxor	$x1, $x0
237	pand	$y0, $x1
238	pand	$y1, $x0
239	pxor	$x1, $x0
240	pxor	$t0, $x1
241___
242}
243
# Mul_GF4_N($x0,$x1,$y0,$y1,$t0)
# Same first six instructions as Mul_GF4; only the final two pxor differ
# (the roles of $x0 and $x1 as destination are swapped).  The result
# replaces $x0/$x1, $t0 is clobbered, $y0/$y1 are only read.
244sub Mul_GF4_N {				# not used, see next subroutine
245# multiply and scale by N
246my ($x0,$x1,$y0,$y1,$t0)=@_;
247$code.=<<___;
248	movdqa	$y0, $t0
249	pxor	$y1, $t0
250	pand	$x0, $t0
251	pxor	$x1, $x0
252	pand	$y0, $x1
253	pand	$y1, $x0
254	pxor	$x0, $x1
255	pxor	$t0, $x0
256___
257}
258
# Mul_GF4_N_GF4($x0,$x1,$y0,$y1,$t0, $x2,$x3,$y2,$y3,$t1)
# Emits two independent instruction streams interleaved line by line:
#   - the unindented stream over ($x0,$x1,$y0,$y1,$t0) is the Mul_GF4_N
#     sequence;
#   - the stream indented by one space over ($x2,$x3,$y2,$y3,$t1) is the
#     Mul_GF4 sequence.
# Results replace $x0/$x1 and $x2/$x3; $t0 and $t1 are clobbered.
259sub Mul_GF4_N_GF4 {
260# interleaved Mul_GF4_N and Mul_GF4
261my ($x0,$x1,$y0,$y1,$t0,
262    $x2,$x3,$y2,$y3,$t1)=@_;
263$code.=<<___;
264	movdqa	$y0, $t0
265	 movdqa	$y2, $t1
266	pxor	$y1, $t0
267	 pxor 	$y3, $t1
268	pand	$x0, $t0
269	 pand	$x2, $t1
270	pxor	$x1, $x0
271	 pxor	$x3, $x2
272	pand	$y0, $x1
273	 pand	$y2, $x3
274	pand	$y1, $x0
275	 pand	$y3, $x2
276	pxor	$x0, $x1
277	 pxor	$x3, $x2
278	pxor	$t0, $x0
279	 pxor	$t1, $x3
280___
281}
# Mul_GF16_2(@x[0..7], @y[0..3], @t[0..3])
# Processes the two register groups @x[0..3] and @x[4..7] against the same
# four @y registers, using Mul_GF4 and Mul_GF4_N_GF4 as building blocks.
# Results replace @x; @t is scratch.  @y[0] and @y[1] are xored with
# @y[2]/@y[3] before the middle multiplications and xored with them again
# afterwards, which undoes the first xor.
282sub Mul_GF16_2 {
283my @x=@_[0..7];
284my @y=@_[8..11];
285my @t=@_[12..15];
# keep copies of x0/x1 for the cross term of the first group
286$code.=<<___;
287	movdqa	@x[0], @t[0]
288	movdqa	@x[1], @t[1]
289___
290	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
291$code.=<<___;
292	pxor	@x[2], @t[0]
293	pxor	@x[3], @t[1]
294	pxor	@y[2], @y[0]
295	pxor	@y[3], @y[1]
296___
297	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
298			 @x[2], @x[3], @y[2], @y[3], @t[2]);
# fold the cross term into the first group, then set up the second group
299$code.=<<___;
300	pxor	@t[0], @x[0]
301	pxor	@t[0], @x[2]
302	pxor	@t[1], @x[1]
303	pxor	@t[1], @x[3]
304
305	movdqa	@x[4], @t[0]
306	movdqa	@x[5], @t[1]
307	pxor	@x[6], @t[0]
308	pxor	@x[7], @t[1]
309___
310	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
311			 @x[6], @x[7], @y[2], @y[3], @t[2]);
# second xor of y2/y3 into y0/y1 cancels the one emitted above
312$code.=<<___;
313	pxor	@y[2], @y[0]
314	pxor	@y[3], @y[1]
315___
316	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
317$code.=<<___;
318	pxor	@t[0], @x[4]
319	pxor	@t[0], @x[6]
320	pxor	@t[1], @x[5]
321	pxor	@t[1], @x[7]
322___
323}
# Inv_GF256(@x[0..7], @t[0..3], @s[0..3])
# Shared by Sbox and InvSbox.  The single heredoc below first derives four
# intermediate values in @t from @x (using @s as scratch), then runs the
# "new smaller inversion" on them, leaving its outputs in s3, s2, s1, t1.
# Finally Mul_GF16_2 multiplies both halves of @x by those four outputs,
# with s0, t0, t2, t3 as its scratch registers.  Results replace @x.
324sub Inv_GF256 {
325#;********************************************************************
326#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
327#;********************************************************************
328my @x=@_[0..7];
329my @t=@_[8..11];
330my @s=@_[12..15];
331# direct optimizations from hardware
332$code.=<<___;
333	movdqa	@x[4], @t[3]
334	movdqa	@x[5], @t[2]
335	movdqa	@x[1], @t[1]
336	movdqa	@x[7], @s[1]
337	movdqa	@x[0], @s[0]
338
339	pxor	@x[6], @t[3]
340	pxor	@x[7], @t[2]
341	pxor	@x[3], @t[1]
342	 movdqa	@t[3], @s[2]
343	pxor	@x[6], @s[1]
344	 movdqa	@t[2], @t[0]
345	pxor	@x[2], @s[0]
346	 movdqa	@t[3], @s[3]
347
348	por	@t[1], @t[2]
349	por	@s[0], @t[3]
350	pxor	@t[0], @s[3]
351	pand	@s[0], @s[2]
352	pxor	@t[1], @s[0]
353	pand	@t[1], @t[0]
354	pand	@s[0], @s[3]
355	movdqa	@x[3], @s[0]
356	pxor	@x[2], @s[0]
357	pand	@s[0], @s[1]
358	pxor	@s[1], @t[3]
359	pxor	@s[1], @t[2]
360	movdqa	@x[4], @s[1]
361	movdqa	@x[1], @s[0]
362	pxor	@x[5], @s[1]
363	pxor	@x[0], @s[0]
364	movdqa	@s[1], @t[1]
365	pand	@s[0], @s[1]
366	por	@s[0], @t[1]
367	pxor	@s[1], @t[0]
368	pxor	@s[3], @t[3]
369	pxor	@s[2], @t[2]
370	pxor	@s[3], @t[1]
371	movdqa	@x[7], @s[0]
372	pxor	@s[2], @t[0]
373	movdqa	@x[6], @s[1]
374	pxor	@s[2], @t[1]
375	movdqa	@x[5], @s[2]
376	pand	@x[3], @s[0]
377	movdqa	@x[4], @s[3]
378	pand	@x[2], @s[1]
379	pand	@x[1], @s[2]
380	por	@x[0], @s[3]
381	pxor	@s[0], @t[3]
382	pxor	@s[1], @t[2]
383	pxor	@s[2], @t[1]
384	pxor	@s[3], @t[0]
385
386	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
387
388	# new smaller inversion
389
390	movdqa	@t[3], @s[0]
391	pand	@t[1], @t[3]
392	pxor	@t[2], @s[0]
393
394	movdqa	@t[0], @s[2]
395	movdqa	@s[0], @s[3]
396	pxor	@t[3], @s[2]
397	pand	@s[2], @s[3]
398
399	movdqa	@t[1], @s[1]
400	pxor	@t[2], @s[3]
401	pxor	@t[0], @s[1]
402
403	pxor	@t[2], @t[3]
404
405	pand	@t[3], @s[1]
406
407	movdqa	@s[2], @t[2]
408	pxor	@t[0], @s[1]
409
410	pxor	@s[1], @t[2]
411	pxor	@s[1], @t[1]
412
413	pand	@t[0], @t[2]
414
415	pxor	@t[2], @s[2]
416	pxor	@t[2], @t[1]
417
418	pand	@s[3], @s[2]
419
420	pxor	@s[0], @s[2]
421___
422# output in s3, s2, s1, t1
423
424# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
425
426# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
427	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
428
429### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
430}
431
432# AES linear components
433
# ShiftRows(@x[0..7], $mask)
# Despite the name this emits more than a byte shuffle: each of the eight
# registers is first xored with a 16-byte slice of the round key at
# 0x00..0x70($key), then shuffled with pshufb using $mask, and finally
# $key is advanced by 0x80.  $mask is taken as the LAST argument (pop), so
# it need not be at index 8.  $key is the enclosing scope's register alias.
434sub ShiftRows {
435my @x=@_[0..7];
436my $mask=pop;
437$code.=<<___;
438	pxor	0x00($key),@x[0]
439	pxor	0x10($key),@x[1]
440	pxor	0x20($key),@x[2]
441	pxor	0x30($key),@x[3]
442	pshufb	$mask,@x[0]
443	pshufb	$mask,@x[1]
444	pxor	0x40($key),@x[4]
445	pxor	0x50($key),@x[5]
446	pshufb	$mask,@x[2]
447	pshufb	$mask,@x[3]
448	pxor	0x60($key),@x[6]
449	pxor	0x70($key),@x[7]
450	pshufb	$mask,@x[4]
451	pshufb	$mask,@x[5]
452	pshufb	$mask,@x[6]
453	pshufb	$mask,@x[7]
454	lea	0x80($key),$key
455___
456}
457
# MixColumns(@x[0..7], @t[0..7] [, $inv])
# Emits a common instruction sequence followed by one of two tails chosen
# by the optional 17th argument: the tail guarded by !$inv is used for
# plain calls, the tail guarded by $inv is used when InvMixColumns calls
# this sub with a trailing 1.  Results replace @x; @t is scratch.
# Only pshufd/pxor/movdqa are emitted.
458sub MixColumns {
459# modified to emit output in order suitable for feeding back to aesenc[last]
460my @x=@_[0..7];
461my @t=@_[8..15];
# one-element slice; yields the same value as $_[16] (undef when omitted)
462my $inv=@_[16];	# optional
463$code.=<<___;
464	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
465	pshufd	\$0x93, @x[1], @t[1]
466	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
467	pshufd	\$0x93, @x[2], @t[2]
468	 pxor	@t[1], @x[1]
469	pshufd	\$0x93, @x[3], @t[3]
470	 pxor	@t[2], @x[2]
471	pshufd	\$0x93, @x[4], @t[4]
472	 pxor	@t[3], @x[3]
473	pshufd	\$0x93, @x[5], @t[5]
474	 pxor	@t[4], @x[4]
475	pshufd	\$0x93, @x[6], @t[6]
476	 pxor	@t[5], @x[5]
477	pshufd	\$0x93, @x[7], @t[7]
478	 pxor	@t[6], @x[6]
479	 pxor	@t[7], @x[7]
480
481	pxor	@x[0], @t[1]
482	pxor	@x[7], @t[0]
483	pxor	@x[7], @t[1]
484	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
485	pxor	@x[1], @t[2]
486	 pshufd	\$0x4E, @x[1], @x[1]
487	pxor	@x[4], @t[5]
488	 pxor	@t[0], @x[0]
489	pxor	@x[5], @t[6]
490	 pxor	@t[1], @x[1]
491	pxor	@x[3], @t[4]
492	 pshufd	\$0x4E, @x[4], @t[0]
493	pxor	@x[6], @t[7]
494	 pshufd	\$0x4E, @x[5], @t[1]
495	pxor	@x[2], @t[3]
496	 pshufd	\$0x4E, @x[3], @x[4]
497	pxor	@x[7], @t[3]
498	 pshufd	\$0x4E, @x[7], @x[5]
499	pxor	@x[7], @t[4]
500	 pshufd	\$0x4E, @x[6], @x[3]
501	pxor	@t[4], @t[0]
502	 pshufd	\$0x4E, @x[2], @x[6]
503	pxor	@t[5], @t[1]
504___
# tail for the plain (encryption) call
505$code.=<<___ if (!$inv);
506	pxor	@t[3], @x[4]
507	pxor	@t[7], @x[5]
508	pxor	@t[6], @x[3]
509	 movdqa	@t[0], @x[2]
510	pxor	@t[2], @x[6]
511	 movdqa	@t[1], @x[7]
512___
# tail for the call made from InvMixColumns; places results in different
# @x slots than the plain tail
513$code.=<<___ if ($inv);
514	pxor	@x[4], @t[3]
515	pxor	@t[7], @x[5]
516	pxor	@x[3], @t[6]
517	 movdqa	@t[0], @x[3]
518	pxor	@t[2], @x[6]
519	 movdqa	@t[6], @x[2]
520	 movdqa	@t[1], @x[7]
521	 movdqa	@x[6], @x[4]
522	 movdqa	@t[3], @x[6]
523___
524}
525
# InvMixColumns_orig(@x[0..7], @t[0..7])
# Direct inverse MixColumns built from four stages, labelled in the emitted
# assembly as multiplication by 0x0e, 0x0b, 0x0d and 0x09.  After the first
# stage the working registers are renamed through @y (a permutation of @x).
# The final results are accumulated in @t and then copied to @XMM[0..7]
# regardless of which registers were passed as @x.
# NOTE(review): no call to this sub is visible in this part of the file;
# InvMixColumns appears to be the variant in use -- confirm before removing.
526sub InvMixColumns_orig {
527my @x=@_[0..7];
528my @t=@_[8..15];
529
530$code.=<<___;
531	# multiplication by 0x0e
532	pshufd	\$0x93, @x[7], @t[7]
533	movdqa	@x[2], @t[2]
534	pxor	@x[5], @x[7]		# 7 5
535	pxor	@x[5], @x[2]		# 2 5
536	pshufd	\$0x93, @x[0], @t[0]
537	movdqa	@x[5], @t[5]
538	pxor	@x[0], @x[5]		# 5 0		[1]
539	pxor	@x[1], @x[0]		# 0 1
540	pshufd	\$0x93, @x[1], @t[1]
541	pxor	@x[2], @x[1]		# 1 25
542	pxor	@x[6], @x[0]		# 01 6		[2]
543	pxor	@x[3], @x[1]		# 125 3		[4]
544	pshufd	\$0x93, @x[3], @t[3]
545	pxor	@x[0], @x[2]		# 25 016	[3]
546	pxor	@x[7], @x[3]		# 3 75
547	pxor	@x[6], @x[7]		# 75 6		[0]
548	pshufd	\$0x93, @x[6], @t[6]
549	movdqa	@x[4], @t[4]
550	pxor	@x[4], @x[6]		# 6 4
551	pxor	@x[3], @x[4]		# 4 375		[6]
552	pxor	@x[7], @x[3]		# 375 756=36
553	pxor	@t[5], @x[6]		# 64 5		[7]
554	pxor	@t[2], @x[3]		# 36 2
555	pxor	@t[4], @x[3]		# 362 4		[5]
556	pshufd	\$0x93, @t[5], @t[5]
557___
# rename registers so that @y[i] is the one tagged [i] in the stage above
558					my @y = @x[7,5,0,2,1,3,4,6];
559$code.=<<___;
560	# multiplication by 0x0b
561	pxor	@y[0], @y[1]
562	pxor	@t[0], @y[0]
563	pxor	@t[1], @y[1]
564	pshufd	\$0x93, @t[2], @t[2]
565	pxor	@t[5], @y[0]
566	pxor	@t[6], @y[1]
567	pxor	@t[7], @y[0]
568	pshufd	\$0x93, @t[4], @t[4]
569	pxor	@t[6], @t[7]		# clobber t[7]
570	pxor	@y[0], @y[1]
571
572	pxor	@t[0], @y[3]
573	pshufd	\$0x93, @t[0], @t[0]
574	pxor	@t[1], @y[2]
575	pxor	@t[1], @y[4]
576	pxor	@t[2], @y[2]
577	pshufd	\$0x93, @t[1], @t[1]
578	pxor	@t[2], @y[3]
579	pxor	@t[2], @y[5]
580	pxor	@t[7], @y[2]
581	pshufd	\$0x93, @t[2], @t[2]
582	pxor	@t[3], @y[3]
583	pxor	@t[3], @y[6]
584	pxor	@t[3], @y[4]
585	pshufd	\$0x93, @t[3], @t[3]
586	pxor	@t[4], @y[7]
587	pxor	@t[4], @y[5]
588	pxor	@t[7], @y[7]
589	pxor	@t[5], @y[3]
590	pxor	@t[4], @y[4]
591	pxor	@t[5], @t[7]		# clobber t[7] even more
592
593	pxor	@t[7], @y[5]
594	pshufd	\$0x93, @t[4], @t[4]
595	pxor	@t[7], @y[6]
596	pxor	@t[7], @y[4]
597
598	pxor	@t[5], @t[7]
599	pshufd	\$0x93, @t[5], @t[5]
600	pxor	@t[6], @t[7]		# restore t[7]
601
602	# multiplication by 0x0d
603	pxor	@y[7], @y[4]
604	pxor	@t[4], @y[7]
605	pshufd	\$0x93, @t[6], @t[6]
606	pxor	@t[0], @y[2]
607	pxor	@t[5], @y[7]
608	pxor	@t[2], @y[2]
609	pshufd	\$0x93, @t[7], @t[7]
610
611	pxor	@y[1], @y[3]
612	pxor	@t[1], @y[1]
613	pxor	@t[0], @y[0]
614	pxor	@t[0], @y[3]
615	pxor	@t[5], @y[1]
616	pxor	@t[5], @y[0]
617	pxor	@t[7], @y[1]
618	pshufd	\$0x93, @t[0], @t[0]
619	pxor	@t[6], @y[0]
620	pxor	@y[1], @y[3]
621	pxor	@t[1], @y[4]
622	pshufd	\$0x93, @t[1], @t[1]
623
624	pxor	@t[7], @y[7]
625	pxor	@t[2], @y[4]
626	pxor	@t[2], @y[5]
627	pshufd	\$0x93, @t[2], @t[2]
628	pxor	@t[6], @y[2]
629	pxor	@t[3], @t[6]		# clobber t[6]
630	pxor	@y[7], @y[4]
631	pxor	@t[6], @y[3]
632
633	pxor	@t[6], @y[6]
634	pxor	@t[5], @y[5]
635	pxor	@t[4], @y[6]
636	pshufd	\$0x93, @t[4], @t[4]
637	pxor	@t[6], @y[5]
638	pxor	@t[7], @y[6]
639	pxor	@t[3], @t[6]		# restore t[6]
640
641	pshufd	\$0x93, @t[5], @t[5]
642	pshufd	\$0x93, @t[6], @t[6]
643	pshufd	\$0x93, @t[7], @t[7]
644	pshufd	\$0x93, @t[3], @t[3]
645
646	# multiplication by 0x09
647	pxor	@y[1], @y[4]
648	pxor	@y[1], @t[1]		# t[1]=y[1]
649	pxor	@t[5], @t[0]		# clobber t[0]
650	pxor	@t[5], @t[1]
651	pxor	@t[0], @y[3]
652	pxor	@y[0], @t[0]		# t[0]=y[0]
653	pxor	@t[6], @t[1]
654	pxor	@t[7], @t[6]		# clobber t[6]
655	pxor	@t[1], @y[4]
656	pxor	@t[4], @y[7]
657	pxor	@y[4], @t[4]		# t[4]=y[4]
658	pxor	@t[3], @y[6]
659	pxor	@y[3], @t[3]		# t[3]=y[3]
660	pxor	@t[2], @y[5]
661	pxor	@y[2], @t[2]		# t[2]=y[2]
662	pxor	@t[7], @t[3]
663	pxor	@y[5], @t[5]		# t[5]=y[5]
664	pxor	@t[6], @t[2]
665	pxor	@t[6], @t[5]
666	pxor	@y[6], @t[6]		# t[6]=y[6]
667	pxor	@y[7], @t[7]		# t[7]=y[7]
668
669	movdqa	@t[0],@XMM[0]
670	movdqa	@t[1],@XMM[1]
671	movdqa	@t[2],@XMM[2]
672	movdqa	@t[3],@XMM[3]
673	movdqa	@t[4],@XMM[4]
674	movdqa	@t[5],@XMM[5]
675	movdqa	@t[6],@XMM[6]
676	movdqa	@t[7],@XMM[7]
677___
678}
679
# InvMixColumns(@x[0..7], @t[0..7])
# Implements inverse MixColumns as the matrix factorisation shown in the
# comment below: this sub emits the multiplication by the 05-00-04-00
# circulant, then delegates the remaining 02-03-01-01 factor to MixColumns,
# passing a trailing 1 to select its inverse-specific tail.
# Results replace @x; @t is scratch.
680sub InvMixColumns {
681my @x=@_[0..7];
682my @t=@_[8..15];
683
684# Thanks to Jussi Kivilinna for providing pointer to
685#
686# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
687# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
688# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
689# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
690
691$code.=<<___;
692	# multiplication by 0x05-0x00-0x04-0x00
693	pshufd	\$0x4E, @x[0], @t[0]
694	pshufd	\$0x4E, @x[6], @t[6]
695	pxor	@x[0], @t[0]
696	pshufd	\$0x4E, @x[7], @t[7]
697	pxor	@x[6], @t[6]
698	pshufd	\$0x4E, @x[1], @t[1]
699	pxor	@x[7], @t[7]
700	pshufd	\$0x4E, @x[2], @t[2]
701	pxor	@x[1], @t[1]
702	pshufd	\$0x4E, @x[3], @t[3]
703	pxor	@x[2], @t[2]
704	 pxor	@t[6], @x[0]
705	 pxor	@t[6], @x[1]
706	pshufd	\$0x4E, @x[4], @t[4]
707	pxor	@x[3], @t[3]
708	 pxor	@t[0], @x[2]
709	 pxor	@t[1], @x[3]
710	pshufd	\$0x4E, @x[5], @t[5]
711	pxor	@x[4], @t[4]
712	 pxor	@t[7], @x[1]
713	 pxor	@t[2], @x[4]
714	pxor	@x[5], @t[5]
715
716	 pxor	@t[7], @x[2]
717	 pxor	@t[6], @x[3]
718	 pxor	@t[6], @x[4]
719	 pxor	@t[3], @x[5]
720	 pxor	@t[4], @x[6]
721	 pxor	@t[7], @x[4]
722	 pxor	@t[7], @x[5]
723	 pxor	@t[5], @x[7]
724___
725	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
726}
727
# aesenc(@b[0..7], @t[0..7])
# Reference composition of one full round: load the shuffle mask from
# 0x30($const) into @t[0], then ShiftRows (which also xors in the round key
# and advances $key), Sbox and MixColumns.  MixColumns receives @b permuted
# to match the Sbox output order [b0,b1,b4,b6,b3,b7,b2,b5].
728sub aesenc {				# not used
729my @b=@_[0..7];
730my @t=@_[8..15];
731$code.=<<___;
732	movdqa	0x30($const),@t[0]	# .LSR
733___
734	&ShiftRows	(@b,@t[0]);
735	&Sbox		(@b,@t);
736	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
737}
738
# aesenclast(@b[0..7], @t[0..7])
# Reference composition of the final round: load the shuffle mask from
# 0x40($const), ShiftRows (with round-key xor), Sbox, then xor eight
# 16-byte key slices at 0x00..0x70($key) into the registers in Sbox output
# order [b0,b1,b4,b6,b3,b7,b2,b5].  No MixColumns is emitted.
# The final heredoc statement has no trailing semicolon, which is legal
# because it is the last statement in the sub body.
739sub aesenclast {			# not used
740my @b=@_[0..7];
741my @t=@_[8..15];
742$code.=<<___;
743	movdqa	0x40($const),@t[0]	# .LSRM0
744___
745	&ShiftRows	(@b,@t[0]);
746	&Sbox		(@b,@t);
747$code.=<<___
748	pxor	0x00($key),@b[0]
749	pxor	0x10($key),@b[1]
750	pxor	0x20($key),@b[4]
751	pxor	0x30($key),@b[6]
752	pxor	0x40($key),@b[3]
753	pxor	0x50($key),@b[7]
754	pxor	0x60($key),@b[2]
755	pxor	0x70($key),@b[5]
756___
757}
758
# swapmove($a,$b,$n,$mask,$t)
# Emits the sequence
#   t = b; b >>= n; b ^= a; b &= mask; a ^= b; b <<= n; b ^= t
# (64-bit lane shifts), i.e. it exchanges the bits of $a selected by $mask
# with the bits of $b located $n positions higher.  $t is clobbered; $mask
# is only read.  $n is an immediate shift count.
759sub swapmove {
760my ($a,$b,$n,$mask,$t)=@_;
761$code.=<<___;
762	movdqa	$b,$t
763	psrlq	\$$n,$b
764	pxor  	$a,$b
765	pand	$mask,$b
766	pxor	$b,$a
767	psllq	\$$n,$b
768	pxor	$t,$b
769___
770}
# swapmove2x($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)
# Two swapmove sequences interleaved: one over ($a0,$b0) with temporary
# $t0, one over ($a1,$b1) with temporary $t1.  Both share the same shift
# count $n and the same $mask register.
771sub swapmove2x {
772my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
773$code.=<<___;
774	movdqa	$b0,$t0
775	psrlq	\$$n,$b0
776	 movdqa	$b1,$t1
777	 psrlq	\$$n,$b1
778	pxor  	$a0,$b0
779	 pxor  	$a1,$b1
780	pand	$mask,$b0
781	 pand	$mask,$b1
782	pxor	$b0,$a0
783	psllq	\$$n,$b0
784	 pxor	$b1,$a1
785	 psllq	\$$n,$b1
786	pxor	$t0,$b0
787	 pxor	$t1,$b1
788___
789}
790
# bitslice(@x[0..7], $t0, $t1, $t2, $t3)
# Emits three passes of swapmove2x over the eight registers with shift
# counts 1, 2 and 4.  The register list is reversed before use.
# $t0/$t1 hold the masks loaded from 0x00/0x10($const); $t0 is reloaded
# from 0x20($const) after the shift-by-1 pass, so the passes use the masks
# .LBS0, .LBS1 and .LBS2 in that order.  $t2/$t3 are swapmove temporaries.
# The same sub is called both before and after the rounds by the code that
# follows.
791sub bitslice {
792my @x=reverse(@_[0..7]);
793my ($t0,$t1,$t2,$t3)=@_[8..11];
794$code.=<<___;
795	movdqa	0x00($const),$t0	# .LBS0
796	movdqa	0x10($const),$t1	# .LBS1
797___
798	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
799	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
800$code.=<<___;
801	movdqa	0x20($const),$t0	# .LBS2
802___
803	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
804	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
805
806	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
807	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
808}
809
# Emission of the two internal 8-block primitives, _bsaes_encrypt8 and
# _bsaes_decrypt8.  Register contract as used by the emitted code:
#   $key (%rax)     in:  pointer to the converted key schedule
#   $rounds (%r10d) in:  number of rounds
#   @XMM[0..7]      in:  eight input blocks; out: eight result blocks,
#                        in a permuted register order (see below)
#   $const (%r11), @XMM[8..15]  clobbered
# Both routines xor in the round 0 key, shuffle with an initial mask,
# bit-slice the state, run the round loop, undo the bit-slicing and xor in
# the last round key.
810$code.=<<___;
811.text
812
813.extern	asm_AES_encrypt
814.extern	asm_AES_decrypt
815
816.type	_bsaes_encrypt8,\@abi-omnipotent
817.align	64
818_bsaes_encrypt8:
819	lea	.LBS0(%rip), $const	# constants table
820
821	movdqa	($key), @XMM[9]		# round 0 key
822	lea	0x10($key), $key
823	movdqa	0x50($const), @XMM[8]	# .LM0SR
824	pxor	@XMM[9], @XMM[0]	# xor with round0 key
825	pxor	@XMM[9], @XMM[1]
826	pxor	@XMM[9], @XMM[2]
827	pxor	@XMM[9], @XMM[3]
828	 pshufb	@XMM[8], @XMM[0]
829	 pshufb	@XMM[8], @XMM[1]
830	pxor	@XMM[9], @XMM[4]
831	pxor	@XMM[9], @XMM[5]
832	 pshufb	@XMM[8], @XMM[2]
833	 pshufb	@XMM[8], @XMM[3]
834	pxor	@XMM[9], @XMM[6]
835	pxor	@XMM[9], @XMM[7]
836	 pshufb	@XMM[8], @XMM[4]
837	 pshufb	@XMM[8], @XMM[5]
838	 pshufb	@XMM[8], @XMM[6]
839	 pshufb	@XMM[8], @XMM[7]
840_bsaes_encrypt8_bitslice:
841___
842	&bitslice	(@XMM[0..7, 8..11]);
# Round loop.  The first pass enters at .Lenc_sbox, skipping ShiftRows
# (round 0 key and initial shuffle were applied above).
843$code.=<<___;
844	dec	$rounds
845	jmp	.Lenc_sbox
846.align	16
847.Lenc_loop:
848___
849	&ShiftRows	(@XMM[0..7, 8]);
850$code.=".Lenc_sbox:\n";
851	&Sbox		(@XMM[0..7, 8..15]);
# After the last Sbox the counter goes negative and the loop exits without
# a MixColumns.
852$code.=<<___;
853	dec	$rounds
854	jl	.Lenc_done
855___
856	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# The jnz below still tests the flags set by the "dec" above: the code
# emitted by MixColumns and the movdqa consist only of SSE instructions.
# When the counter has reached zero the mask for the final pass (.LSRM0)
# is loaded instead of .LSR.
857$code.=<<___;
858	movdqa	0x30($const), @XMM[8]	# .LSR
859	jnz	.Lenc_loop
860	movdqa	0x40($const), @XMM[8]	# .LSRM0
861	jmp	.Lenc_loop
862.align	16
863.Lenc_done:
864___
865	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
866	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
# Tail of _bsaes_encrypt8 (last round key xor) and head of _bsaes_decrypt8.
# The decryption shuffle masks are addressed at negative offsets from .LBS0.
867$code.=<<___;
868	movdqa	($key), @XMM[8]		# last round key
869	pxor	@XMM[8], @XMM[4]
870	pxor	@XMM[8], @XMM[6]
871	pxor	@XMM[8], @XMM[3]
872	pxor	@XMM[8], @XMM[7]
873	pxor	@XMM[8], @XMM[2]
874	pxor	@XMM[8], @XMM[5]
875	pxor	@XMM[8], @XMM[0]
876	pxor	@XMM[8], @XMM[1]
877	ret
878.size	_bsaes_encrypt8,.-_bsaes_encrypt8
879
880.type	_bsaes_decrypt8,\@abi-omnipotent
881.align	64
882_bsaes_decrypt8:
883	lea	.LBS0(%rip), $const	# constants table
884
885	movdqa	($key), @XMM[9]		# round 0 key
886	lea	0x10($key), $key
887	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
888	pxor	@XMM[9], @XMM[0]	# xor with round0 key
889	pxor	@XMM[9], @XMM[1]
890	pxor	@XMM[9], @XMM[2]
891	pxor	@XMM[9], @XMM[3]
892	 pshufb	@XMM[8], @XMM[0]
893	 pshufb	@XMM[8], @XMM[1]
894	pxor	@XMM[9], @XMM[4]
895	pxor	@XMM[9], @XMM[5]
896	 pshufb	@XMM[8], @XMM[2]
897	 pshufb	@XMM[8], @XMM[3]
898	pxor	@XMM[9], @XMM[6]
899	pxor	@XMM[9], @XMM[7]
900	 pshufb	@XMM[8], @XMM[4]
901	 pshufb	@XMM[8], @XMM[5]
902	 pshufb	@XMM[8], @XMM[6]
903	 pshufb	@XMM[8], @XMM[7]
904___
905	&bitslice	(@XMM[0..7, 8..11]);
# Decryption round loop; same shape as the encryption loop above, with
# InvSbox/InvMixColumns and the register order [0,1,6,4,2,7,3,5].
906$code.=<<___;
907	dec	$rounds
908	jmp	.Ldec_sbox
909.align	16
910.Ldec_loop:
911___
912	&ShiftRows	(@XMM[0..7, 8]);
913$code.=".Ldec_sbox:\n";
914	&InvSbox	(@XMM[0..7, 8..15]);
915$code.=<<___;
916	dec	$rounds
917	jl	.Ldec_done
918___
919	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
920$code.=<<___;
921	movdqa	-0x10($const), @XMM[8]	# .LISR
922	jnz	.Ldec_loop
923	movdqa	-0x20($const), @XMM[8]	# .LISRM0
924	jmp	.Ldec_loop
925.align	16
926.Ldec_done:
927___
928	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
929$code.=<<___;
930	movdqa	($key), @XMM[8]		# last round key
931	pxor	@XMM[8], @XMM[6]
932	pxor	@XMM[8], @XMM[4]
933	pxor	@XMM[8], @XMM[2]
934	pxor	@XMM[8], @XMM[7]
935	pxor	@XMM[8], @XMM[3]
936	pxor	@XMM[8], @XMM[5]
937	pxor	@XMM[8], @XMM[0]
938	pxor	@XMM[8], @XMM[1]
939	ret
940.size	_bsaes_decrypt8,.-_bsaes_decrypt8
941___
942}
943{
# Register aliases for the key-conversion scope: note that $out is %rax and
# $inp is %rcx here, unlike the file-scope aliases.
944my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
945
# bitslice_key(@x[0..7], $bs0, $bs1, $bs2, $t2, $t3)
# Variant of bitslice in which some swapmove steps are replaced by plain
# register copies (the replaced calls are kept as comments).  The masks are
# passed in as registers $bs0/$bs1/$bs2 instead of being loaded from
# $const.  $t2/$t3 are swapmove temporaries.
# NOTE(review): no call to this sub is visible in this part of the file;
# the _bsaes_key_convert code that follows uses a different technique --
# confirm whether it is still referenced.
946sub bitslice_key {
947my @x=reverse(@_[0..7]);
948my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
949
950	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
951$code.=<<___;
952	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
953	movdqa	@x[0], @x[2]
954	movdqa	@x[1], @x[3]
955___
956	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
957
958	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
959$code.=<<___;
960	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
961	movdqa	@x[0], @x[4]
962	movdqa	@x[2], @x[6]
963	movdqa	@x[1], @x[5]
964	movdqa	@x[3], @x[7]
965___
966	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
967	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
968}
969
# Emission of _bsaes_key_convert.  Register contract of the emitted code:
#   $inp (%rcx)     in:  conventional key schedule (read with movdqu)
#   $out (%rax)     in:  destination buffer (written with movdqa);
#                   out: points just past the last written round key
#   $rounds (%r10d) in:  number of rounds
#   %xmm6           out: last round key as loaded, NOT stored
#   %xmm7           out: the constant at 0x50 from .Lmasks (.L63)
# The round 0 key is stored unmodified (16 bytes).  The counter is
# decremented once before the loop, so the loop body runs rounds-1 times;
# each pass shuffles one round key with .LM0 and writes eight 16-byte masks
# (0x80 bytes), one per bit position, produced with pand/pcmpeqb against
# the single-bit constants in %xmm0..%xmm3 (shifted left by 4 and back for
# the upper four bits).  The masks for bit positions 0, 1, 5 and 6 are
# inverted with %xmm5 (all ones) -- these are the set bits of 0x63.
# Callers are expected to fix up and store the last round key themselves,
# as the callers visible in this file do.
970$code.=<<___;
971.type	_bsaes_key_convert,\@abi-omnipotent
972.align	16
973_bsaes_key_convert:
974	lea	.Lmasks(%rip), $const
975	movdqu	($inp), %xmm7		# load round 0 key
976	lea	0x10($inp), $inp
977	movdqa	0x00($const), %xmm0	# 0x01...
978	movdqa	0x10($const), %xmm1	# 0x02...
979	movdqa	0x20($const), %xmm2	# 0x04...
980	movdqa	0x30($const), %xmm3	# 0x08...
981	movdqa	0x40($const), %xmm4	# .LM0
982	pcmpeqd	%xmm5, %xmm5		# .LNOT
983
984	movdqu	($inp), %xmm6		# load round 1 key
985	movdqa	%xmm7, ($out)		# save round 0 key
986	lea	0x10($out), $out
987	dec	$rounds
988	jmp	.Lkey_loop
989.align	16
990.Lkey_loop:
991	pshufb	%xmm4, %xmm6		# .LM0
992
993	movdqa	%xmm0,	%xmm8
994	movdqa	%xmm1,	%xmm9
995
996	pand	%xmm6,	%xmm8
997	pand	%xmm6,	%xmm9
998	movdqa	%xmm2,	%xmm10
999	pcmpeqb	%xmm0,	%xmm8
1000	psllq	\$4,	%xmm0		# 0x10...
1001	movdqa	%xmm3,	%xmm11
1002	pcmpeqb	%xmm1,	%xmm9
1003	psllq	\$4,	%xmm1		# 0x20...
1004
1005	pand	%xmm6,	%xmm10
1006	pand	%xmm6,	%xmm11
1007	movdqa	%xmm0,	%xmm12
1008	pcmpeqb	%xmm2,	%xmm10
1009	psllq	\$4,	%xmm2		# 0x40...
1010	movdqa	%xmm1,	%xmm13
1011	pcmpeqb	%xmm3,	%xmm11
1012	psllq	\$4,	%xmm3		# 0x80...
1013
1014	movdqa	%xmm2,	%xmm14
1015	movdqa	%xmm3,	%xmm15
1016	 pxor	%xmm5,	%xmm8		# "pnot"
1017	 pxor	%xmm5,	%xmm9
1018
1019	pand	%xmm6,	%xmm12
1020	pand	%xmm6,	%xmm13
1021	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
1022	pcmpeqb	%xmm0,	%xmm12
1023	psrlq	\$4,	%xmm0		# 0x01...
1024	 movdqa	%xmm9, 0x10($out)
1025	pcmpeqb	%xmm1,	%xmm13
1026	psrlq	\$4,	%xmm1		# 0x02...
1027	 lea	0x10($inp), $inp
1028
1029	pand	%xmm6,	%xmm14
1030	pand	%xmm6,	%xmm15
1031	 movdqa	%xmm10, 0x20($out)
1032	pcmpeqb	%xmm2,	%xmm14
1033	psrlq	\$4,	%xmm2		# 0x04...
1034	 movdqa	%xmm11, 0x30($out)
1035	pcmpeqb	%xmm3,	%xmm15
1036	psrlq	\$4,	%xmm3		# 0x08...
1037	 movdqu	($inp), %xmm6		# load next round key
1038
1039	pxor	%xmm5, %xmm13		# "pnot"
1040	pxor	%xmm5, %xmm14
1041	movdqa	%xmm12, 0x40($out)
1042	movdqa	%xmm13, 0x50($out)
1043	movdqa	%xmm14, 0x60($out)
1044	movdqa	%xmm15, 0x70($out)
1045	lea	0x80($out),$out
1046	dec	$rounds
1047	jnz	.Lkey_loop
1048
1049	movdqa	0x50($const), %xmm7	# .L63
1050	#movdqa	%xmm6, ($out)		# don't save last round key
1051	ret
1052.size	_bsaes_key_convert,.-_bsaes_key_convert
1053___
1054}
1054}
1055
# Disabled code: the leading constant 0 makes this condition always false,
# so nothing below is ever appended to $code.  Kept for reference.
# Inside this block $inp/$out/$len/$key are the file-scope aliases
# (%rdi/%rsi/%rdx/%rcx), i.e. the first four integer argument registers on
# non-Win64 targets.
#   bsaes_enc_key_convert / bsaes_dec_key_convert: wrap _bsaes_key_convert
#     and apply the last-round-key (enc) or round-0-key (dec) fix-up.
#   bsaes_encrypt_128 / bsaes_decrypt_128: process 0x80 bytes (8 blocks)
#     per pass with the round count hard-coded to 10, looping while the
#     remaining length is above zero after subtracting 0x80.
1056if (0 && !$win64) {	# following four functions are unsupported interface
1057			# used for benchmarking...
1058$code.=<<___;
1059.globl	bsaes_enc_key_convert
1060.type	bsaes_enc_key_convert,\@function,2
1061.align	16
1062bsaes_enc_key_convert:
1063	mov	240($inp),%r10d		# pass rounds
1064	mov	$inp,%rcx		# pass key
1065	mov	$out,%rax		# pass key schedule
1066	call	_bsaes_key_convert
1067	pxor	%xmm6,%xmm7		# fix up last round key
1068	movdqa	%xmm7,(%rax)		# save last round key
1069	ret
1070.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1071
1072.globl	bsaes_encrypt_128
1073.type	bsaes_encrypt_128,\@function,4
1074.align	16
1075bsaes_encrypt_128:
1076.Lenc128_loop:
1077	movdqu	0x00($inp), @XMM[0]	# load input
1078	movdqu	0x10($inp), @XMM[1]
1079	movdqu	0x20($inp), @XMM[2]
1080	movdqu	0x30($inp), @XMM[3]
1081	movdqu	0x40($inp), @XMM[4]
1082	movdqu	0x50($inp), @XMM[5]
1083	movdqu	0x60($inp), @XMM[6]
1084	movdqu	0x70($inp), @XMM[7]
1085	mov	$key, %rax		# pass the $key
1086	lea	0x80($inp), $inp
1087	mov	\$10,%r10d
1088
1089	call	_bsaes_encrypt8
1090
1091	movdqu	@XMM[0], 0x00($out)	# write output
1092	movdqu	@XMM[1], 0x10($out)
1093	movdqu	@XMM[4], 0x20($out)
1094	movdqu	@XMM[6], 0x30($out)
1095	movdqu	@XMM[3], 0x40($out)
1096	movdqu	@XMM[7], 0x50($out)
1097	movdqu	@XMM[2], 0x60($out)
1098	movdqu	@XMM[5], 0x70($out)
1099	lea	0x80($out), $out
1100	sub	\$0x80,$len
1101	ja	.Lenc128_loop
1102	ret
1103.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1104
1105.globl	bsaes_dec_key_convert
1106.type	bsaes_dec_key_convert,\@function,2
1107.align	16
1108bsaes_dec_key_convert:
1109	mov	240($inp),%r10d		# pass rounds
1110	mov	$inp,%rcx		# pass key
1111	mov	$out,%rax		# pass key schedule
1112	call	_bsaes_key_convert
1113	pxor	($out),%xmm7		# fix up round 0 key
1114	movdqa	%xmm6,(%rax)		# save last round key
1115	movdqa	%xmm7,($out)
1116	ret
1117.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1118
1119.globl	bsaes_decrypt_128
1120.type	bsaes_decrypt_128,\@function,4
1121.align	16
1122bsaes_decrypt_128:
1123.Ldec128_loop:
1124	movdqu	0x00($inp), @XMM[0]	# load input
1125	movdqu	0x10($inp), @XMM[1]
1126	movdqu	0x20($inp), @XMM[2]
1127	movdqu	0x30($inp), @XMM[3]
1128	movdqu	0x40($inp), @XMM[4]
1129	movdqu	0x50($inp), @XMM[5]
1130	movdqu	0x60($inp), @XMM[6]
1131	movdqu	0x70($inp), @XMM[7]
1132	mov	$key, %rax		# pass the $key
1133	lea	0x80($inp), $inp
1134	mov	\$10,%r10d
1135
1136	call	_bsaes_decrypt8
1137
1138	movdqu	@XMM[0], 0x00($out)	# write output
1139	movdqu	@XMM[1], 0x10($out)
1140	movdqu	@XMM[6], 0x20($out)
1141	movdqu	@XMM[4], 0x30($out)
1142	movdqu	@XMM[2], 0x40($out)
1143	movdqu	@XMM[7], 0x50($out)
1144	movdqu	@XMM[3], 0x60($out)
1145	movdqu	@XMM[5], 0x70($out)
1146	lea	0x80($out), $out
1147	sub	\$0x80,$len
1148	ja	.Ldec128_loop
1149	ret
1150.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1151___
1152}
1153{
1154######################################################################
1155#
1156# OpenSSL interface
1157#
# Incoming argument registers, selected by target ABI.  $arg6 is given as a
# 32-bit register name in both variants.
# NOTE(review): on Win64 only the first four entries are argument registers
# under that calling convention; %r10/%r11d are presumably loaded from the
# stack by code outside this view -- confirm.
1158my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1159						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
# Working copies of the arguments live in %r12..%r15 within this scope,
# shadowing the file-scope $inp/$out/$len/$key aliases.
1160my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1161
# bsaes_ecb_encrypt_blocks(inp, out, blocks, key) -- only emitted when $ecb
# is true.
#
#   $arg1  input pointer
#   $arg2  output pointer
#   $arg3  number of 16-byte blocks
#   $arg4  AES key schedule; the round count is read from offset 240
#
# With fewer than 8 blocks each block is handed to asm_AES_encrypt on its
# own.  Otherwise a bit-sliced key schedule of rounds*128-96 bytes is built
# below the frame by _bsaes_key_convert and _bsaes_encrypt8 is run on groups
# of 8 blocks; a final group of 1..7 blocks loads only the inputs that exist
# and stores only the matching outputs.  Outputs are stored from registers
# 0,1,4,6,3,7,2,5 -- presumably the order in which _bsaes_encrypt8 leaves the
# blocks (that routine is outside this section).
#
# Before returning, the stack between %rsp and %rbp is zeroed.  The loop
# branch used to be "jb", which is inverted for "cmp %rax,%rbp" (flags are
# set for %rbp-%rax): the long path stopped after 32 bytes and left the key
# schedule behind, and the short path (%rsp == %rbp) never terminated and
# wiped upwards through the saved registers.  It is now "ja", matching the
# CBC and CTR routines.
1162if ($ecb) {
1163$code.=<<___;
1164.globl	bsaes_ecb_encrypt_blocks
1165.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1166.align	16
1167bsaes_ecb_encrypt_blocks:
1168.cfi_startproc
1169	mov	%rsp, %rax
1170.Lecb_enc_prologue:
1171	push	%rbp
1172.cfi_push	%rbp
1173	push	%rbx
1174.cfi_push	%rbx
1175	push	%r12
1176.cfi_push	%r12
1177	push	%r13
1178.cfi_push	%r13
1179	push	%r14
1180.cfi_push	%r14
1181	push	%r15
1182.cfi_push	%r15
1183	lea	-0x48(%rsp),%rsp
1184.cfi_adjust_cfa_offset	0x48
1185___
# Win64 only: extend the frame by 0xa0 bytes and save %xmm6-%xmm15 there.
1186$code.=<<___ if ($win64);
1187	lea	-0xa0(%rsp), %rsp
1188	movaps	%xmm6, 0x40(%rsp)
1189	movaps	%xmm7, 0x50(%rsp)
1190	movaps	%xmm8, 0x60(%rsp)
1191	movaps	%xmm9, 0x70(%rsp)
1192	movaps	%xmm10, 0x80(%rsp)
1193	movaps	%xmm11, 0x90(%rsp)
1194	movaps	%xmm12, 0xa0(%rsp)
1195	movaps	%xmm13, 0xb0(%rsp)
1196	movaps	%xmm14, 0xc0(%rsp)
1197	movaps	%xmm15, 0xd0(%rsp)
1198.Lecb_enc_body:
1199___
1200$code.=<<___;
1201	mov	%rsp,%rbp		# backup %rsp
1202.cfi_def_cfa_register	%rbp
1203	mov	240($arg4),%eax		# rounds
1204	mov	$arg1,$inp		# backup arguments
1205	mov	$arg2,$out
1206	mov	$arg3,$len
1207	mov	$arg4,$key
1208	cmp	\$8,$arg3
1209	jb	.Lecb_enc_short
1210
1211	mov	%eax,%ebx		# backup rounds
1212	shl	\$7,%rax		# 128 bytes per inner round key
1213	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1214	sub	%rax,%rsp
1215	mov	%rsp,%rax		# pass key schedule
1216	mov	$key,%rcx		# pass key
1217	mov	%ebx,%r10d		# pass rounds
1218	call	_bsaes_key_convert
1219	pxor	%xmm6,%xmm7		# fix up last round key
1220	movdqa	%xmm7,(%rax)		# save last round key
1221
1222	sub	\$8,$len
1223.Lecb_enc_loop:
1224	movdqu	0x00($inp), @XMM[0]	# load input
1225	movdqu	0x10($inp), @XMM[1]
1226	movdqu	0x20($inp), @XMM[2]
1227	movdqu	0x30($inp), @XMM[3]
1228	movdqu	0x40($inp), @XMM[4]
1229	movdqu	0x50($inp), @XMM[5]
1230	mov	%rsp, %rax		# pass key schedule
1231	movdqu	0x60($inp), @XMM[6]
1232	mov	%ebx,%r10d		# pass rounds
1233	movdqu	0x70($inp), @XMM[7]
1234	lea	0x80($inp), $inp
1235
1236	call	_bsaes_encrypt8
1237
1238	movdqu	@XMM[0], 0x00($out)	# write output
1239	movdqu	@XMM[1], 0x10($out)
1240	movdqu	@XMM[4], 0x20($out)
1241	movdqu	@XMM[6], 0x30($out)
1242	movdqu	@XMM[3], 0x40($out)
1243	movdqu	@XMM[7], 0x50($out)
1244	movdqu	@XMM[2], 0x60($out)
1245	movdqu	@XMM[5], 0x70($out)
1246	lea	0x80($out), $out
1247	sub	\$8,$len
1248	jnc	.Lecb_enc_loop
1249
1250	add	\$8,$len
1251	jz	.Lecb_enc_done
1252
	# 1..7 blocks remain.  Each cmp below feeds both the jb and the
	# following je; the movdqu in between does not alter the flags.
1253	movdqu	0x00($inp), @XMM[0]	# load input
1254	mov	%rsp, %rax		# pass key schedule
1255	mov	%ebx,%r10d		# pass rounds
1256	cmp	\$2,$len
1257	jb	.Lecb_enc_one
1258	movdqu	0x10($inp), @XMM[1]
1259	je	.Lecb_enc_two
1260	movdqu	0x20($inp), @XMM[2]
1261	cmp	\$4,$len
1262	jb	.Lecb_enc_three
1263	movdqu	0x30($inp), @XMM[3]
1264	je	.Lecb_enc_four
1265	movdqu	0x40($inp), @XMM[4]
1266	cmp	\$6,$len
1267	jb	.Lecb_enc_five
1268	movdqu	0x50($inp), @XMM[5]
1269	je	.Lecb_enc_six
1270	movdqu	0x60($inp), @XMM[6]
1271	call	_bsaes_encrypt8
1272	movdqu	@XMM[0], 0x00($out)	# write output
1273	movdqu	@XMM[1], 0x10($out)
1274	movdqu	@XMM[4], 0x20($out)
1275	movdqu	@XMM[6], 0x30($out)
1276	movdqu	@XMM[3], 0x40($out)
1277	movdqu	@XMM[7], 0x50($out)
1278	movdqu	@XMM[2], 0x60($out)
1279	jmp	.Lecb_enc_done
1280.align	16
1281.Lecb_enc_six:
1282	call	_bsaes_encrypt8
1283	movdqu	@XMM[0], 0x00($out)	# write output
1284	movdqu	@XMM[1], 0x10($out)
1285	movdqu	@XMM[4], 0x20($out)
1286	movdqu	@XMM[6], 0x30($out)
1287	movdqu	@XMM[3], 0x40($out)
1288	movdqu	@XMM[7], 0x50($out)
1289	jmp	.Lecb_enc_done
1290.align	16
1291.Lecb_enc_five:
1292	call	_bsaes_encrypt8
1293	movdqu	@XMM[0], 0x00($out)	# write output
1294	movdqu	@XMM[1], 0x10($out)
1295	movdqu	@XMM[4], 0x20($out)
1296	movdqu	@XMM[6], 0x30($out)
1297	movdqu	@XMM[3], 0x40($out)
1298	jmp	.Lecb_enc_done
1299.align	16
1300.Lecb_enc_four:
1301	call	_bsaes_encrypt8
1302	movdqu	@XMM[0], 0x00($out)	# write output
1303	movdqu	@XMM[1], 0x10($out)
1304	movdqu	@XMM[4], 0x20($out)
1305	movdqu	@XMM[6], 0x30($out)
1306	jmp	.Lecb_enc_done
1307.align	16
1308.Lecb_enc_three:
1309	call	_bsaes_encrypt8
1310	movdqu	@XMM[0], 0x00($out)	# write output
1311	movdqu	@XMM[1], 0x10($out)
1312	movdqu	@XMM[4], 0x20($out)
1313	jmp	.Lecb_enc_done
1314.align	16
1315.Lecb_enc_two:
1316	call	_bsaes_encrypt8
1317	movdqu	@XMM[0], 0x00($out)	# write output
1318	movdqu	@XMM[1], 0x10($out)
1319	jmp	.Lecb_enc_done
1320.align	16
1321.Lecb_enc_one:
1322	call	_bsaes_encrypt8
1323	movdqu	@XMM[0], 0x00($out)	# write output
1324	jmp	.Lecb_enc_done
1325.align	16
1326.Lecb_enc_short:
1327	lea	($inp), $arg1
1328	lea	($out), $arg2
1329	lea	($key), $arg3
1330	call	asm_AES_encrypt
1331	lea	16($inp), $inp
1332	lea	16($out), $out
1333	dec	$len
1334	jnz	.Lecb_enc_short
1335
1336.Lecb_enc_done:
	# Zero the stack from %rsp up to %rbp in 32-byte steps.  On the
	# short path %rsp equals %rbp, so exactly one step is taken and
	# it stays inside the scratch area below the saved registers.
1337	lea	(%rsp),%rax
1338	pxor	%xmm0, %xmm0
1339.Lecb_enc_bzero:			# wipe key schedule [if any]
1340	movdqa	%xmm0, 0x00(%rax)
1341	movdqa	%xmm0, 0x10(%rax)
1342	lea	0x20(%rax), %rax
1343	cmp	%rax, %rbp
1344	ja	.Lecb_enc_bzero		# continue while %rax is below %rbp
1345
1346	lea	0x78(%rbp),%rax
1347.cfi_def_cfa	%rax,8
1348___
# Win64 only: reload %xmm6-%xmm15 and step %rax over their save area.
1349$code.=<<___ if ($win64);
1350	movaps	0x40(%rbp), %xmm6
1351	movaps	0x50(%rbp), %xmm7
1352	movaps	0x60(%rbp), %xmm8
1353	movaps	0x70(%rbp), %xmm9
1354	movaps	0x80(%rbp), %xmm10
1355	movaps	0x90(%rbp), %xmm11
1356	movaps	0xa0(%rbp), %xmm12
1357	movaps	0xb0(%rbp), %xmm13
1358	movaps	0xc0(%rbp), %xmm14
1359	movaps	0xd0(%rbp), %xmm15
1360	lea	0xa0(%rax), %rax
1361.Lecb_enc_tail:
1362___
1363$code.=<<___;
1364	mov	-48(%rax), %r15
1365.cfi_restore	%r15
1366	mov	-40(%rax), %r14
1367.cfi_restore	%r14
1368	mov	-32(%rax), %r13
1369.cfi_restore	%r13
1370	mov	-24(%rax), %r12
1371.cfi_restore	%r12
1372	mov	-16(%rax), %rbx
1373.cfi_restore	%rbx
1374	mov	-8(%rax), %rbp
1375.cfi_restore	%rbp
1376	lea	(%rax), %rsp		# restore %rsp
1377.cfi_def_cfa_register	%rsp
1378.Lecb_enc_epilogue:
1379	ret
1380.cfi_endproc
1381.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1382
# bsaes_ecb_decrypt_blocks(inp, out, blocks, key)
#
# Arguments: input pointer, output pointer, number of 16-byte blocks,
# AES key schedule (round count read from offset 240).
#
# Fewer than 8 blocks: one asm_AES_decrypt call per block.  Otherwise a
# bit-sliced key schedule is built on the stack by _bsaes_key_convert and
# _bsaes_decrypt8 is run on groups of 8 blocks, with a final group of
# 1..7 blocks loading and storing only the blocks that exist.  Outputs
# are stored from registers 0,1,6,4,2,7,3,5.
#
# The stack wipe loop before the epilogue branches with ja.  It used to
# be jb, which is inverted for the cmp operand order used here: the key
# schedule was left on the stack on the long path and the short path
# never left the loop.
1383.globl	bsaes_ecb_decrypt_blocks
1384.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1385.align	16
1386bsaes_ecb_decrypt_blocks:
1387.cfi_startproc
1388	mov	%rsp, %rax
1389.Lecb_dec_prologue:
1390	push	%rbp
1391.cfi_push	%rbp
1392	push	%rbx
1393.cfi_push	%rbx
1394	push	%r12
1395.cfi_push	%r12
1396	push	%r13
1397.cfi_push	%r13
1398	push	%r14
1399.cfi_push	%r14
1400	push	%r15
1401.cfi_push	%r15
1402	lea	-0x48(%rsp),%rsp
1403.cfi_adjust_cfa_offset	0x48
1404___
# Win64 only: extend the frame by 0xa0 bytes and save %xmm6-%xmm15 there.
1405$code.=<<___ if ($win64);
1406	lea	-0xa0(%rsp), %rsp
1407	movaps	%xmm6, 0x40(%rsp)
1408	movaps	%xmm7, 0x50(%rsp)
1409	movaps	%xmm8, 0x60(%rsp)
1410	movaps	%xmm9, 0x70(%rsp)
1411	movaps	%xmm10, 0x80(%rsp)
1412	movaps	%xmm11, 0x90(%rsp)
1413	movaps	%xmm12, 0xa0(%rsp)
1414	movaps	%xmm13, 0xb0(%rsp)
1415	movaps	%xmm14, 0xc0(%rsp)
1416	movaps	%xmm15, 0xd0(%rsp)
1417.Lecb_dec_body:
1418___
1419$code.=<<___;
1420	mov	%rsp,%rbp		# backup %rsp
1421.cfi_def_cfa_register	%rbp
1422	mov	240($arg4),%eax		# rounds
1423	mov	$arg1,$inp		# backup arguments
1424	mov	$arg2,$out
1425	mov	$arg3,$len
1426	mov	$arg4,$key
1427	cmp	\$8,$arg3
1428	jb	.Lecb_dec_short
1429
1430	mov	%eax,%ebx		# backup rounds
1431	shl	\$7,%rax		# 128 bytes per inner round key
1432	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1433	sub	%rax,%rsp
1434	mov	%rsp,%rax		# pass key schedule
1435	mov	$key,%rcx		# pass key
1436	mov	%ebx,%r10d		# pass rounds
1437	call	_bsaes_key_convert
1438	pxor	(%rsp),%xmm7		# fix up 0 round key
1439	movdqa	%xmm6,(%rax)		# save last round key
1440	movdqa	%xmm7,(%rsp)
1441
1442	sub	\$8,$len
1443.Lecb_dec_loop:
1444	movdqu	0x00($inp), @XMM[0]	# load input
1445	movdqu	0x10($inp), @XMM[1]
1446	movdqu	0x20($inp), @XMM[2]
1447	movdqu	0x30($inp), @XMM[3]
1448	movdqu	0x40($inp), @XMM[4]
1449	movdqu	0x50($inp), @XMM[5]
1450	mov	%rsp, %rax		# pass key schedule
1451	movdqu	0x60($inp), @XMM[6]
1452	mov	%ebx,%r10d		# pass rounds
1453	movdqu	0x70($inp), @XMM[7]
1454	lea	0x80($inp), $inp
1455
1456	call	_bsaes_decrypt8
1457
1458	movdqu	@XMM[0], 0x00($out)	# write output
1459	movdqu	@XMM[1], 0x10($out)
1460	movdqu	@XMM[6], 0x20($out)
1461	movdqu	@XMM[4], 0x30($out)
1462	movdqu	@XMM[2], 0x40($out)
1463	movdqu	@XMM[7], 0x50($out)
1464	movdqu	@XMM[3], 0x60($out)
1465	movdqu	@XMM[5], 0x70($out)
1466	lea	0x80($out), $out
1467	sub	\$8,$len
1468	jnc	.Lecb_dec_loop
1469
1470	add	\$8,$len
1471	jz	.Lecb_dec_done
1472
	# 1..7 blocks remain.  Each cmp below feeds both the jb and the
	# following je; the movdqu in between does not alter the flags.
1473	movdqu	0x00($inp), @XMM[0]	# load input
1474	mov	%rsp, %rax		# pass key schedule
1475	mov	%ebx,%r10d		# pass rounds
1476	cmp	\$2,$len
1477	jb	.Lecb_dec_one
1478	movdqu	0x10($inp), @XMM[1]
1479	je	.Lecb_dec_two
1480	movdqu	0x20($inp), @XMM[2]
1481	cmp	\$4,$len
1482	jb	.Lecb_dec_three
1483	movdqu	0x30($inp), @XMM[3]
1484	je	.Lecb_dec_four
1485	movdqu	0x40($inp), @XMM[4]
1486	cmp	\$6,$len
1487	jb	.Lecb_dec_five
1488	movdqu	0x50($inp), @XMM[5]
1489	je	.Lecb_dec_six
1490	movdqu	0x60($inp), @XMM[6]
1491	call	_bsaes_decrypt8
1492	movdqu	@XMM[0], 0x00($out)	# write output
1493	movdqu	@XMM[1], 0x10($out)
1494	movdqu	@XMM[6], 0x20($out)
1495	movdqu	@XMM[4], 0x30($out)
1496	movdqu	@XMM[2], 0x40($out)
1497	movdqu	@XMM[7], 0x50($out)
1498	movdqu	@XMM[3], 0x60($out)
1499	jmp	.Lecb_dec_done
1500.align	16
1501.Lecb_dec_six:
1502	call	_bsaes_decrypt8
1503	movdqu	@XMM[0], 0x00($out)	# write output
1504	movdqu	@XMM[1], 0x10($out)
1505	movdqu	@XMM[6], 0x20($out)
1506	movdqu	@XMM[4], 0x30($out)
1507	movdqu	@XMM[2], 0x40($out)
1508	movdqu	@XMM[7], 0x50($out)
1509	jmp	.Lecb_dec_done
1510.align	16
1511.Lecb_dec_five:
1512	call	_bsaes_decrypt8
1513	movdqu	@XMM[0], 0x00($out)	# write output
1514	movdqu	@XMM[1], 0x10($out)
1515	movdqu	@XMM[6], 0x20($out)
1516	movdqu	@XMM[4], 0x30($out)
1517	movdqu	@XMM[2], 0x40($out)
1518	jmp	.Lecb_dec_done
1519.align	16
1520.Lecb_dec_four:
1521	call	_bsaes_decrypt8
1522	movdqu	@XMM[0], 0x00($out)	# write output
1523	movdqu	@XMM[1], 0x10($out)
1524	movdqu	@XMM[6], 0x20($out)
1525	movdqu	@XMM[4], 0x30($out)
1526	jmp	.Lecb_dec_done
1527.align	16
1528.Lecb_dec_three:
1529	call	_bsaes_decrypt8
1530	movdqu	@XMM[0], 0x00($out)	# write output
1531	movdqu	@XMM[1], 0x10($out)
1532	movdqu	@XMM[6], 0x20($out)
1533	jmp	.Lecb_dec_done
1534.align	16
1535.Lecb_dec_two:
1536	call	_bsaes_decrypt8
1537	movdqu	@XMM[0], 0x00($out)	# write output
1538	movdqu	@XMM[1], 0x10($out)
1539	jmp	.Lecb_dec_done
1540.align	16
1541.Lecb_dec_one:
1542	call	_bsaes_decrypt8
1543	movdqu	@XMM[0], 0x00($out)	# write output
1544	jmp	.Lecb_dec_done
1545.align	16
1546.Lecb_dec_short:
1547	lea	($inp), $arg1
1548	lea	($out), $arg2
1549	lea	($key), $arg3
1550	call	asm_AES_decrypt
1551	lea	16($inp), $inp
1552	lea	16($out), $out
1553	dec	$len
1554	jnz	.Lecb_dec_short
1555
1556.Lecb_dec_done:
	# Zero the stack from %rsp up to %rbp in 32-byte steps.  On the
	# short path %rsp equals %rbp, so exactly one step is taken and
	# it stays inside the scratch area below the saved registers.
1557	lea	(%rsp),%rax
1558	pxor	%xmm0, %xmm0
1559.Lecb_dec_bzero:			# wipe key schedule [if any]
1560	movdqa	%xmm0, 0x00(%rax)
1561	movdqa	%xmm0, 0x10(%rax)
1562	lea	0x20(%rax), %rax
1563	cmp	%rax, %rbp
1564	ja	.Lecb_dec_bzero		# continue while %rax is below %rbp
1565
1566	lea	0x78(%rbp),%rax
1567.cfi_def_cfa	%rax,8
1568___
# Win64 only: reload %xmm6-%xmm15 and step %rax over their save area.
1569$code.=<<___ if ($win64);
1570	movaps	0x40(%rbp), %xmm6
1571	movaps	0x50(%rbp), %xmm7
1572	movaps	0x60(%rbp), %xmm8
1573	movaps	0x70(%rbp), %xmm9
1574	movaps	0x80(%rbp), %xmm10
1575	movaps	0x90(%rbp), %xmm11
1576	movaps	0xa0(%rbp), %xmm12
1577	movaps	0xb0(%rbp), %xmm13
1578	movaps	0xc0(%rbp), %xmm14
1579	movaps	0xd0(%rbp), %xmm15
1580	lea	0xa0(%rax), %rax
1581.Lecb_dec_tail:
1582___
1583$code.=<<___;
1584	mov	-48(%rax), %r15
1585.cfi_restore	%r15
1586	mov	-40(%rax), %r14
1587.cfi_restore	%r14
1588	mov	-32(%rax), %r13
1589.cfi_restore	%r13
1590	mov	-24(%rax), %r12
1591.cfi_restore	%r12
1592	mov	-16(%rax), %rbx
1593.cfi_restore	%rbx
1594	mov	-8(%rax), %rbp
1595.cfi_restore	%rbp
1596	lea	(%rax), %rsp		# restore %rsp
1597.cfi_def_cfa_register	%rsp
1598.Lecb_dec_epilogue:
1599	ret
1600.cfi_endproc
1601.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1602___
1603}
# bsaes_cbc_encrypt(inp, out, length, key, ivp, enc)
#
#   $arg1  input pointer
#   $arg2  output pointer
#   $arg3  length in bytes (shifted right by 4 to get a block count)
#   $arg4  AES key schedule; the round count is read from offset 240
#   $arg5  pointer to the 16-byte IV, read on entry and rewritten on exit
#   $arg6  direction flag
#
# Only decryption of at least 128 bytes is implemented here.  If the flag
# is non-zero, or the length is below 128, the routine jumps straight to
# asm_AES_cbc_encrypt; this happens before anything is pushed, so the
# callee sees the caller's frame and arguments unchanged.
1604$code.=<<___;
1605.extern	asm_AES_cbc_encrypt
1606.globl	bsaes_cbc_encrypt
1607.type	bsaes_cbc_encrypt,\@abi-omnipotent
1608.align	16
1609bsaes_cbc_encrypt:
1610.cfi_startproc
1611___
# Win64 only: the sixth argument is on the caller's stack; load it into the
# scratch register that $arg6 names.
1612$code.=<<___ if ($win64);
1613	mov	48(%rsp),$arg6		# pull direction flag
1614___
# Dispatch to asm_AES_cbc_encrypt where applicable, otherwise push the six
# registers used below and reserve 0x48 bytes of scratch space.
1615$code.=<<___;
1616	cmp	\$0,$arg6
1617	jne	asm_AES_cbc_encrypt
1618	cmp	\$128,$arg3
1619	jb	asm_AES_cbc_encrypt
1620
1621	mov	%rsp, %rax
1622.Lcbc_dec_prologue:
1623	push	%rbp
1624.cfi_push	%rbp
1625	push	%rbx
1626.cfi_push	%rbx
1627	push	%r12
1628.cfi_push	%r12
1629	push	%r13
1630.cfi_push	%r13
1631	push	%r14
1632.cfi_push	%r14
1633	push	%r15
1634.cfi_push	%r15
1635	lea	-0x48(%rsp), %rsp
1636.cfi_adjust_cfa_offset	0x48
1637___
# Win64 only: fetch the fifth argument (IV pointer) from the caller's stack,
# then extend the frame by 0xa0 bytes and save %xmm6-%xmm15 there.
1638$code.=<<___ if ($win64);
1639	mov	0xa0(%rsp),$arg5	# pull ivp
1640	lea	-0xa0(%rsp), %rsp
1641	movaps	%xmm6, 0x40(%rsp)
1642	movaps	%xmm7, 0x50(%rsp)
1643	movaps	%xmm8, 0x60(%rsp)
1644	movaps	%xmm9, 0x70(%rsp)
1645	movaps	%xmm10, 0x80(%rsp)
1646	movaps	%xmm11, 0x90(%rsp)
1647	movaps	%xmm12, 0xa0(%rsp)
1648	movaps	%xmm13, 0xb0(%rsp)
1649	movaps	%xmm14, 0xc0(%rsp)
1650	movaps	%xmm15, 0xd0(%rsp)
1651.Lcbc_dec_body:
1652___
# Main body.
#
# Register use: %rbp = frame base, %rbx = IV pointer, %edx = round count,
# @XMM[15] = current IV (the previous ciphertext block).
#
# Setup: rounds*128-96 bytes are carved out below %rsp for the bit-sliced
# key schedule, which _bsaes_key_convert fills (%rax = destination,
# %rcx = key, %r10d = rounds); the round-0 and last round keys are then
# patched in place.
#
# .Lcbc_dec_loop: eight ciphertext blocks are loaded and the IV is parked at
# 0x20(%rbp) across the call to _bsaes_decrypt8.  Afterwards the ciphertext
# is read from memory a second time to perform the CBC xor (block 0 with the
# parked IV, block i with ciphertext i-1), and ciphertext 7 becomes the next
# IV.  All of these re-loads are issued before the first store to the
# output.  Results are taken from registers 0,1,6,4,2,7,3,5.
#
# Tail: 2..7 leftover blocks follow the same pattern through
# .Lcbc_dec_two ... .Lcbc_dec_six and the fall-through 7-block case.  A
# single leftover block is decrypted by asm_AES_decrypt into 0x20(%rbp)
# instead; @XMM[0] still holds that block's ciphertext from the tail's first
# load and is used as the next IV.
#
# .Lcbc_dec_done: the IV is written back through %rbx and the stack between
# %rsp and %rbp (the key schedule) is zeroed in 32-byte steps.
1653$code.=<<___;
1654	mov	%rsp, %rbp		# backup %rsp
1655.cfi_def_cfa_register	%rbp
1656	mov	240($arg4), %eax	# rounds
1657	mov	$arg1, $inp		# backup arguments
1658	mov	$arg2, $out
1659	mov	$arg3, $len
1660	mov	$arg4, $key
1661	mov	$arg5, %rbx
1662	shr	\$4, $len		# bytes to blocks
1663
1664	mov	%eax, %edx		# rounds
1665	shl	\$7, %rax		# 128 bytes per inner round key
1666	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1667	sub	%rax, %rsp
1668
1669	mov	%rsp, %rax		# pass key schedule
1670	mov	$key, %rcx		# pass key
1671	mov	%edx, %r10d		# pass rounds
1672	call	_bsaes_key_convert
1673	pxor	(%rsp),%xmm7		# fix up 0 round key
1674	movdqa	%xmm6,(%rax)		# save last round key
1675	movdqa	%xmm7,(%rsp)
1676
1677	movdqu	(%rbx), @XMM[15]	# load IV
1678	sub	\$8,$len
1679.Lcbc_dec_loop:
1680	movdqu	0x00($inp), @XMM[0]	# load input
1681	movdqu	0x10($inp), @XMM[1]
1682	movdqu	0x20($inp), @XMM[2]
1683	movdqu	0x30($inp), @XMM[3]
1684	movdqu	0x40($inp), @XMM[4]
1685	movdqu	0x50($inp), @XMM[5]
1686	mov	%rsp, %rax		# pass key schedule
1687	movdqu	0x60($inp), @XMM[6]
1688	mov	%edx,%r10d		# pass rounds
1689	movdqu	0x70($inp), @XMM[7]
1690	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1691
1692	call	_bsaes_decrypt8
1693
1694	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1695	movdqu	0x00($inp), @XMM[8]	# re-load input
1696	movdqu	0x10($inp), @XMM[9]
1697	pxor	@XMM[8], @XMM[1]
1698	movdqu	0x20($inp), @XMM[10]
1699	pxor	@XMM[9], @XMM[6]
1700	movdqu	0x30($inp), @XMM[11]
1701	pxor	@XMM[10], @XMM[4]
1702	movdqu	0x40($inp), @XMM[12]
1703	pxor	@XMM[11], @XMM[2]
1704	movdqu	0x50($inp), @XMM[13]
1705	pxor	@XMM[12], @XMM[7]
1706	movdqu	0x60($inp), @XMM[14]
1707	pxor	@XMM[13], @XMM[3]
1708	movdqu	0x70($inp), @XMM[15]	# IV
1709	pxor	@XMM[14], @XMM[5]
1710	movdqu	@XMM[0], 0x00($out)	# write output
1711	lea	0x80($inp), $inp
1712	movdqu	@XMM[1], 0x10($out)
1713	movdqu	@XMM[6], 0x20($out)
1714	movdqu	@XMM[4], 0x30($out)
1715	movdqu	@XMM[2], 0x40($out)
1716	movdqu	@XMM[7], 0x50($out)
1717	movdqu	@XMM[3], 0x60($out)
1718	movdqu	@XMM[5], 0x70($out)
1719	lea	0x80($out), $out
1720	sub	\$8,$len
1721	jnc	.Lcbc_dec_loop
1722
1723	add	\$8,$len
1724	jz	.Lcbc_dec_done
1725
1726	movdqu	0x00($inp), @XMM[0]	# load input
1727	mov	%rsp, %rax		# pass key schedule
1728	mov	%edx, %r10d		# pass rounds
1729	cmp	\$2,$len
1730	jb	.Lcbc_dec_one
1731	movdqu	0x10($inp), @XMM[1]
1732	je	.Lcbc_dec_two
1733	movdqu	0x20($inp), @XMM[2]
1734	cmp	\$4,$len
1735	jb	.Lcbc_dec_three
1736	movdqu	0x30($inp), @XMM[3]
1737	je	.Lcbc_dec_four
1738	movdqu	0x40($inp), @XMM[4]
1739	cmp	\$6,$len
1740	jb	.Lcbc_dec_five
1741	movdqu	0x50($inp), @XMM[5]
1742	je	.Lcbc_dec_six
1743	movdqu	0x60($inp), @XMM[6]
1744	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1745	call	_bsaes_decrypt8
1746	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1747	movdqu	0x00($inp), @XMM[8]	# re-load input
1748	movdqu	0x10($inp), @XMM[9]
1749	pxor	@XMM[8], @XMM[1]
1750	movdqu	0x20($inp), @XMM[10]
1751	pxor	@XMM[9], @XMM[6]
1752	movdqu	0x30($inp), @XMM[11]
1753	pxor	@XMM[10], @XMM[4]
1754	movdqu	0x40($inp), @XMM[12]
1755	pxor	@XMM[11], @XMM[2]
1756	movdqu	0x50($inp), @XMM[13]
1757	pxor	@XMM[12], @XMM[7]
1758	movdqu	0x60($inp), @XMM[15]	# IV
1759	pxor	@XMM[13], @XMM[3]
1760	movdqu	@XMM[0], 0x00($out)	# write output
1761	movdqu	@XMM[1], 0x10($out)
1762	movdqu	@XMM[6], 0x20($out)
1763	movdqu	@XMM[4], 0x30($out)
1764	movdqu	@XMM[2], 0x40($out)
1765	movdqu	@XMM[7], 0x50($out)
1766	movdqu	@XMM[3], 0x60($out)
1767	jmp	.Lcbc_dec_done
1768.align	16
1769.Lcbc_dec_six:
1770	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1771	call	_bsaes_decrypt8
1772	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1773	movdqu	0x00($inp), @XMM[8]	# re-load input
1774	movdqu	0x10($inp), @XMM[9]
1775	pxor	@XMM[8], @XMM[1]
1776	movdqu	0x20($inp), @XMM[10]
1777	pxor	@XMM[9], @XMM[6]
1778	movdqu	0x30($inp), @XMM[11]
1779	pxor	@XMM[10], @XMM[4]
1780	movdqu	0x40($inp), @XMM[12]
1781	pxor	@XMM[11], @XMM[2]
1782	movdqu	0x50($inp), @XMM[15]	# IV
1783	pxor	@XMM[12], @XMM[7]
1784	movdqu	@XMM[0], 0x00($out)	# write output
1785	movdqu	@XMM[1], 0x10($out)
1786	movdqu	@XMM[6], 0x20($out)
1787	movdqu	@XMM[4], 0x30($out)
1788	movdqu	@XMM[2], 0x40($out)
1789	movdqu	@XMM[7], 0x50($out)
1790	jmp	.Lcbc_dec_done
1791.align	16
1792.Lcbc_dec_five:
1793	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1794	call	_bsaes_decrypt8
1795	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1796	movdqu	0x00($inp), @XMM[8]	# re-load input
1797	movdqu	0x10($inp), @XMM[9]
1798	pxor	@XMM[8], @XMM[1]
1799	movdqu	0x20($inp), @XMM[10]
1800	pxor	@XMM[9], @XMM[6]
1801	movdqu	0x30($inp), @XMM[11]
1802	pxor	@XMM[10], @XMM[4]
1803	movdqu	0x40($inp), @XMM[15]	# IV
1804	pxor	@XMM[11], @XMM[2]
1805	movdqu	@XMM[0], 0x00($out)	# write output
1806	movdqu	@XMM[1], 0x10($out)
1807	movdqu	@XMM[6], 0x20($out)
1808	movdqu	@XMM[4], 0x30($out)
1809	movdqu	@XMM[2], 0x40($out)
1810	jmp	.Lcbc_dec_done
1811.align	16
1812.Lcbc_dec_four:
1813	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1814	call	_bsaes_decrypt8
1815	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1816	movdqu	0x00($inp), @XMM[8]	# re-load input
1817	movdqu	0x10($inp), @XMM[9]
1818	pxor	@XMM[8], @XMM[1]
1819	movdqu	0x20($inp), @XMM[10]
1820	pxor	@XMM[9], @XMM[6]
1821	movdqu	0x30($inp), @XMM[15]	# IV
1822	pxor	@XMM[10], @XMM[4]
1823	movdqu	@XMM[0], 0x00($out)	# write output
1824	movdqu	@XMM[1], 0x10($out)
1825	movdqu	@XMM[6], 0x20($out)
1826	movdqu	@XMM[4], 0x30($out)
1827	jmp	.Lcbc_dec_done
1828.align	16
1829.Lcbc_dec_three:
1830	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1831	call	_bsaes_decrypt8
1832	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1833	movdqu	0x00($inp), @XMM[8]	# re-load input
1834	movdqu	0x10($inp), @XMM[9]
1835	pxor	@XMM[8], @XMM[1]
1836	movdqu	0x20($inp), @XMM[15]	# IV
1837	pxor	@XMM[9], @XMM[6]
1838	movdqu	@XMM[0], 0x00($out)	# write output
1839	movdqu	@XMM[1], 0x10($out)
1840	movdqu	@XMM[6], 0x20($out)
1841	jmp	.Lcbc_dec_done
1842.align	16
1843.Lcbc_dec_two:
1844	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1845	call	_bsaes_decrypt8
1846	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1847	movdqu	0x00($inp), @XMM[8]	# re-load input
1848	movdqu	0x10($inp), @XMM[15]	# IV
1849	pxor	@XMM[8], @XMM[1]
1850	movdqu	@XMM[0], 0x00($out)	# write output
1851	movdqu	@XMM[1], 0x10($out)
1852	jmp	.Lcbc_dec_done
1853.align	16
1854.Lcbc_dec_one:
1855	lea	($inp), $arg1
1856	lea	0x20(%rbp), $arg2	# buffer output
1857	lea	($key), $arg3
1858	call	asm_AES_decrypt		# doesn't touch %xmm
1859	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1860	movdqu	@XMM[15], ($out)	# write output
1861	movdqa	@XMM[0], @XMM[15]	# IV
1862
1863.Lcbc_dec_done:
1864	movdqu	@XMM[15], (%rbx)	# return IV
1865	lea	(%rsp), %rax
1866	pxor	%xmm0, %xmm0
1867.Lcbc_dec_bzero:			# wipe key schedule [if any]
1868	movdqa	%xmm0, 0x00(%rax)
1869	movdqa	%xmm0, 0x10(%rax)
1870	lea	0x20(%rax), %rax
1871	cmp	%rax, %rbp
1872	ja	.Lcbc_dec_bzero
1873
1874	lea	0x78(%rbp),%rax
1875.cfi_def_cfa	%rax,8
1876___
# Win64 only: reload %xmm6-%xmm15 and step %rax over their save area.
1877$code.=<<___ if ($win64);
1878	movaps	0x40(%rbp), %xmm6
1879	movaps	0x50(%rbp), %xmm7
1880	movaps	0x60(%rbp), %xmm8
1881	movaps	0x70(%rbp), %xmm9
1882	movaps	0x80(%rbp), %xmm10
1883	movaps	0x90(%rbp), %xmm11
1884	movaps	0xa0(%rbp), %xmm12
1885	movaps	0xb0(%rbp), %xmm13
1886	movaps	0xc0(%rbp), %xmm14
1887	movaps	0xd0(%rbp), %xmm15
1888	lea	0xa0(%rax), %rax
1889.Lcbc_dec_tail:
1890___
# Epilogue: %rax now points just above the six pushed registers; reload
# them from negative offsets and make %rax the stack pointer again.
1891$code.=<<___;
1892	mov	-48(%rax), %r15
1893.cfi_restore	%r15
1894	mov	-40(%rax), %r14
1895.cfi_restore	%r14
1896	mov	-32(%rax), %r13
1897.cfi_restore	%r13
1898	mov	-24(%rax), %r12
1899.cfi_restore	%r12
1900	mov	-16(%rax), %rbx
1901.cfi_restore	%rbx
1902	mov	-8(%rax), %rbp
1903.cfi_restore	%rbp
1904	lea	(%rax), %rsp		# restore %rsp
1905.cfi_def_cfa_register	%rsp
1906.Lcbc_dec_epilogue:
1907	ret
1908.cfi_endproc
1909.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1910
1911.globl	bsaes_ctr32_encrypt_blocks
1912.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1913.align	16
1914bsaes_ctr32_encrypt_blocks:
1915.cfi_startproc
1916	mov	%rsp, %rax
1917.Lctr_enc_prologue:
1918	push	%rbp
1919.cfi_push	%rbp
1920	push	%rbx
1921.cfi_push	%rbx
1922	push	%r12
1923.cfi_push	%r12
1924	push	%r13
1925.cfi_push	%r13
1926	push	%r14
1927.cfi_push	%r14
1928	push	%r15
1929.cfi_push	%r15
1930	lea	-0x48(%rsp), %rsp
1931.cfi_adjust_cfa_offset	0x48
1932___
# bsaes_ctr32_encrypt_blocks (prologue emitted just above) takes five
# arguments:
#
#   $arg1  input pointer
#   $arg2  output pointer
#   $arg3  number of 16-byte blocks
#   $arg4  AES key schedule; the round count is read from offset 240
#   $arg5  pointer to the 16-byte counter block (only read here; nothing in
#          this routine stores an updated counter back through it)
#
# Win64 only: fetch the fifth argument from the caller's stack, then extend
# the frame by 0xa0 bytes and save %xmm6-%xmm15 there.
1933$code.=<<___ if ($win64);
1934	mov	0xa0(%rsp),$arg5	# pull ivp
1935	lea	-0xa0(%rsp), %rsp
1936	movaps	%xmm6, 0x40(%rsp)
1937	movaps	%xmm7, 0x50(%rsp)
1938	movaps	%xmm8, 0x60(%rsp)
1939	movaps	%xmm9, 0x70(%rsp)
1940	movaps	%xmm10, 0x80(%rsp)
1941	movaps	%xmm11, 0x90(%rsp)
1942	movaps	%xmm12, 0xa0(%rsp)
1943	movaps	%xmm13, 0xb0(%rsp)
1944	movaps	%xmm14, 0xc0(%rsp)
1945	movaps	%xmm15, 0xd0(%rsp)
1946.Lctr_enc_body:
1947___
# Main body.
#
# The counter block is copied to 0x20(%rbp).
#
# Fewer than 8 blocks (.Lctr_enc_short): per block, asm_AES_encrypt turns
# the counter copy at 0x20(%rbp) into a keystream block at 0x30(%rbp), which
# is xored with the input.  The 32-bit word at offset 0x2c (the last four
# bytes of the counter copy) is byte-swapped, incremented and swapped back.
# It is stored through 0x2c(%rsp); %rsp has not been moved away from %rbp on
# this path, so that is the same location it was loaded from.
#
# 8 or more blocks: the bit-sliced key schedule is built below %rsp, and the
# round-0 key and the counter are shuffled once with the mask at
# .LADD1-0x20.  Each .Lctr_enc_loop pass saves the counter, derives counter
# values +1..+7 with paddd against the table at .LADD1, xors all eight with
# the round-0 key, applies the shuffle mask at .LADD1-0x10, and calls
# _bsaes_encrypt8_bitslice with %rax pointing 0x10 past the round-0 key.
# Keystream comes back in registers 0,1,4,6,3,7,2,5.
#
# "sub 8,len" decides what happens to that keystream.  On borrow only 1..7
# blocks remain and .Lctr_enc_loop_done xors and stores just those.
# Otherwise eight blocks are processed, the saved counter is advanced by 8,
# and the closing jnz still tests the flags of that sub: the moves, xors,
# lea and paddd in between do not write the flags.
#
# .Lctr_enc_done: the stack between %rsp and %rbp is zeroed in 32-byte steps.
1948$code.=<<___;
1949	mov	%rsp, %rbp		# backup %rsp
1950.cfi_def_cfa_register	%rbp
1951	movdqu	($arg5), %xmm0		# load counter
1952	mov	240($arg4), %eax	# rounds
1953	mov	$arg1, $inp		# backup arguments
1954	mov	$arg2, $out
1955	mov	$arg3, $len
1956	mov	$arg4, $key
1957	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1958	cmp	\$8, $arg3
1959	jb	.Lctr_enc_short
1960
1961	mov	%eax, %ebx		# rounds
1962	shl	\$7, %rax		# 128 bytes per inner round key
1963	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1964	sub	%rax, %rsp
1965
1966	mov	%rsp, %rax		# pass key schedule
1967	mov	$key, %rcx		# pass key
1968	mov	%ebx, %r10d		# pass rounds
1969	call	_bsaes_key_convert
1970	pxor	%xmm6,%xmm7		# fix up last round key
1971	movdqa	%xmm7,(%rax)		# save last round key
1972
1973	movdqa	(%rsp), @XMM[9]		# load round0 key
1974	lea	.LADD1(%rip), %r11
1975	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1976	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1977	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1978	pshufb	@XMM[8], @XMM[0]
1979	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1980	jmp	.Lctr_enc_loop
1981.align	16
1982.Lctr_enc_loop:
1983	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1984	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1985	movdqa	@XMM[0], @XMM[2]
1986	paddd	0x00(%r11), @XMM[1]	# .LADD1
1987	movdqa	@XMM[0], @XMM[3]
1988	paddd	0x10(%r11), @XMM[2]	# .LADD2
1989	movdqa	@XMM[0], @XMM[4]
1990	paddd	0x20(%r11), @XMM[3]	# .LADD3
1991	movdqa	@XMM[0], @XMM[5]
1992	paddd	0x30(%r11), @XMM[4]	# .LADD4
1993	movdqa	@XMM[0], @XMM[6]
1994	paddd	0x40(%r11), @XMM[5]	# .LADD5
1995	movdqa	@XMM[0], @XMM[7]
1996	paddd	0x50(%r11), @XMM[6]	# .LADD6
1997	paddd	0x60(%r11), @XMM[7]	# .LADD7
1998
1999	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
2000	# to flip byte order in 32-bit counter
2001	movdqa	(%rsp), @XMM[9]		# round 0 key
2002	lea	0x10(%rsp), %rax	# pass key schedule
2003	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
2004	pxor	@XMM[9], @XMM[0]	# xor with round0 key
2005	pxor	@XMM[9], @XMM[1]
2006	pxor	@XMM[9], @XMM[2]
2007	pxor	@XMM[9], @XMM[3]
2008	 pshufb	@XMM[8], @XMM[0]
2009	 pshufb	@XMM[8], @XMM[1]
2010	pxor	@XMM[9], @XMM[4]
2011	pxor	@XMM[9], @XMM[5]
2012	 pshufb	@XMM[8], @XMM[2]
2013	 pshufb	@XMM[8], @XMM[3]
2014	pxor	@XMM[9], @XMM[6]
2015	pxor	@XMM[9], @XMM[7]
2016	 pshufb	@XMM[8], @XMM[4]
2017	 pshufb	@XMM[8], @XMM[5]
2018	 pshufb	@XMM[8], @XMM[6]
2019	 pshufb	@XMM[8], @XMM[7]
2020	lea	.LBS0(%rip), %r11	# constants table
2021	mov	%ebx,%r10d		# pass rounds
2022
2023	call	_bsaes_encrypt8_bitslice
2024
2025	sub	\$8,$len
2026	jc	.Lctr_enc_loop_done
2027
2028	movdqu	0x00($inp), @XMM[8]	# load input
2029	movdqu	0x10($inp), @XMM[9]
2030	movdqu	0x20($inp), @XMM[10]
2031	movdqu	0x30($inp), @XMM[11]
2032	movdqu	0x40($inp), @XMM[12]
2033	movdqu	0x50($inp), @XMM[13]
2034	movdqu	0x60($inp), @XMM[14]
2035	movdqu	0x70($inp), @XMM[15]
2036	lea	0x80($inp),$inp
2037	pxor	@XMM[0], @XMM[8]
2038	movdqa	0x20(%rbp), @XMM[0]	# load counter
2039	pxor	@XMM[9], @XMM[1]
2040	movdqu	@XMM[8], 0x00($out)	# write output
2041	pxor	@XMM[10], @XMM[4]
2042	movdqu	@XMM[1], 0x10($out)
2043	pxor	@XMM[11], @XMM[6]
2044	movdqu	@XMM[4], 0x20($out)
2045	pxor	@XMM[12], @XMM[3]
2046	movdqu	@XMM[6], 0x30($out)
2047	pxor	@XMM[13], @XMM[7]
2048	movdqu	@XMM[3], 0x40($out)
2049	pxor	@XMM[14], @XMM[2]
2050	movdqu	@XMM[7], 0x50($out)
2051	pxor	@XMM[15], @XMM[5]
2052	movdqu	@XMM[2], 0x60($out)
2053	lea	.LADD1(%rip), %r11
2054	movdqu	@XMM[5], 0x70($out)
2055	lea	0x80($out), $out
2056	paddd	0x70(%r11), @XMM[0]	# .LADD8
2057	jnz	.Lctr_enc_loop
2058
2059	jmp	.Lctr_enc_done
2060.align	16
2061.Lctr_enc_loop_done:
2062	add	\$8, $len
2063	movdqu	0x00($inp), @XMM[8]	# load input
2064	pxor	@XMM[8], @XMM[0]
2065	movdqu	@XMM[0], 0x00($out)	# write output
2066	cmp	\$2,$len
2067	jb	.Lctr_enc_done
2068	movdqu	0x10($inp), @XMM[9]
2069	pxor	@XMM[9], @XMM[1]
2070	movdqu	@XMM[1], 0x10($out)
2071	je	.Lctr_enc_done
2072	movdqu	0x20($inp), @XMM[10]
2073	pxor	@XMM[10], @XMM[4]
2074	movdqu	@XMM[4], 0x20($out)
2075	cmp	\$4,$len
2076	jb	.Lctr_enc_done
2077	movdqu	0x30($inp), @XMM[11]
2078	pxor	@XMM[11], @XMM[6]
2079	movdqu	@XMM[6], 0x30($out)
2080	je	.Lctr_enc_done
2081	movdqu	0x40($inp), @XMM[12]
2082	pxor	@XMM[12], @XMM[3]
2083	movdqu	@XMM[3], 0x40($out)
2084	cmp	\$6,$len
2085	jb	.Lctr_enc_done
2086	movdqu	0x50($inp), @XMM[13]
2087	pxor	@XMM[13], @XMM[7]
2088	movdqu	@XMM[7], 0x50($out)
2089	je	.Lctr_enc_done
2090	movdqu	0x60($inp), @XMM[14]
2091	pxor	@XMM[14], @XMM[2]
2092	movdqu	@XMM[2], 0x60($out)
2093	jmp	.Lctr_enc_done
2094
2095.align	16
2096.Lctr_enc_short:
2097	lea	0x20(%rbp), $arg1
2098	lea	0x30(%rbp), $arg2
2099	lea	($key), $arg3
2100	call	asm_AES_encrypt
2101	movdqu	($inp), @XMM[1]
2102	lea	16($inp), $inp
2103	mov	0x2c(%rbp), %eax	# load 32-bit counter
2104	bswap	%eax
2105	pxor	0x30(%rbp), @XMM[1]
2106	inc	%eax			# increment
2107	movdqu	@XMM[1], ($out)
2108	bswap	%eax
2109	lea	16($out), $out
2110	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
2111	dec	$len
2112	jnz	.Lctr_enc_short
2113
2114.Lctr_enc_done:
2115	lea	(%rsp), %rax
2116	pxor	%xmm0, %xmm0
2117.Lctr_enc_bzero:			# wipe key schedule [if any]
2118	movdqa	%xmm0, 0x00(%rax)
2119	movdqa	%xmm0, 0x10(%rax)
2120	lea	0x20(%rax), %rax
2121	cmp	%rax, %rbp
2122	ja	.Lctr_enc_bzero
2123
2124	lea	0x78(%rbp),%rax
2125.cfi_def_cfa	%rax,8
2126___
# Win64 only: reload %xmm6-%xmm15 and step %rax over their save area.
2127$code.=<<___ if ($win64);
2128	movaps	0x40(%rbp), %xmm6
2129	movaps	0x50(%rbp), %xmm7
2130	movaps	0x60(%rbp), %xmm8
2131	movaps	0x70(%rbp), %xmm9
2132	movaps	0x80(%rbp), %xmm10
2133	movaps	0x90(%rbp), %xmm11
2134	movaps	0xa0(%rbp), %xmm12
2135	movaps	0xb0(%rbp), %xmm13
2136	movaps	0xc0(%rbp), %xmm14
2137	movaps	0xd0(%rbp), %xmm15
2138	lea	0xa0(%rax), %rax
2139.Lctr_enc_tail:
2140___
# Epilogue: %rax now points just above the six pushed registers; reload
# them from negative offsets and make %rax the stack pointer again.
2141$code.=<<___;
2142	mov	-48(%rax), %r15
2143.cfi_restore	%r15
2144	mov	-40(%rax), %r14
2145.cfi_restore	%r14
2146	mov	-32(%rax), %r13
2147.cfi_restore	%r13
2148	mov	-24(%rax), %r12
2149.cfi_restore	%r12
2150	mov	-16(%rax), %rbx
2151.cfi_restore	%rbx
2152	mov	-8(%rax), %rbp
2153.cfi_restore	%rbp
2154	lea	(%rax), %rsp		# restore %rsp
2155.cfi_def_cfa_register	%rsp
2156.Lctr_enc_epilogue:
2157	ret
2158.cfi_endproc
2159.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2160___
2161######################################################################
2162# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2163#	const AES_KEY *key1, const AES_KEY *key2,
2164#	const unsigned char iv[16]);
2165#
# Names for the three highest @XMM registers as used by the XTS tweak
# arithmetic below: the .Lxts_magic constant, the masked carry/residue, and
# the compare result ("broadcast upper bits").
2166my ($twmask,$twres,$twtmp)=@XMM[13..15];
# $arg6 was declared with a trailing "d" (32-bit register name).  The XTS
# entry points use it as a pointer (the ivp argument), so drop the suffix to
# get the full-width register.
2167$arg6=~s/d$//;
2168
2169$code.=<<___;
2170.globl	bsaes_xts_encrypt
2171.type	bsaes_xts_encrypt,\@abi-omnipotent
2172.align	16
2173bsaes_xts_encrypt:
2174.cfi_startproc
2175	mov	%rsp, %rax
2176.Lxts_enc_prologue:
2177	push	%rbp
2178.cfi_push	%rbp
2179	push	%rbx
2180.cfi_push	%rbx
2181	push	%r12
2182.cfi_push	%r12
2183	push	%r13
2184.cfi_push	%r13
2185	push	%r14
2186.cfi_push	%r14
2187	push	%r15
2188.cfi_push	%r15
2189	lea	-0x48(%rsp), %rsp
2190.cfi_adjust_cfa_offset	0x48
2191___
2192$code.=<<___ if ($win64);
2193	mov	0xa0(%rsp),$arg5	# pull key2
2194	mov	0xa8(%rsp),$arg6	# pull ivp
2195	lea	-0xa0(%rsp), %rsp
2196	movaps	%xmm6, 0x40(%rsp)
2197	movaps	%xmm7, 0x50(%rsp)
2198	movaps	%xmm8, 0x60(%rsp)
2199	movaps	%xmm9, 0x70(%rsp)
2200	movaps	%xmm10, 0x80(%rsp)
2201	movaps	%xmm11, 0x90(%rsp)
2202	movaps	%xmm12, 0xa0(%rsp)
2203	movaps	%xmm13, 0xb0(%rsp)
2204	movaps	%xmm14, 0xc0(%rsp)
2205	movaps	%xmm15, 0xd0(%rsp)
2206.Lxts_enc_body:
2207___
2208$code.=<<___;
2209	mov	%rsp, %rbp		# backup %rsp
2210.cfi_def_cfa_register	%rbp
2211	mov	$arg1, $inp		# backup arguments
2212	mov	$arg2, $out
2213	mov	$arg3, $len
2214	mov	$arg4, $key
2215
2216	lea	($arg6), $arg1
2217	lea	0x20(%rbp), $arg2
2218	lea	($arg5), $arg3
2219	call	asm_AES_encrypt		# generate initial tweak
2220
2221	mov	240($key), %eax		# rounds
2222	mov	$len, %rbx		# backup $len
2223
2224	mov	%eax, %edx		# rounds
2225	shl	\$7, %rax		# 128 bytes per inner round key
2226	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2227	sub	%rax, %rsp
2228
2229	mov	%rsp, %rax		# pass key schedule
2230	mov	$key, %rcx		# pass key
2231	mov	%edx, %r10d		# pass rounds
2232	call	_bsaes_key_convert
2233	pxor	%xmm6, %xmm7		# fix up last round key
2234	movdqa	%xmm7, (%rax)		# save last round key
2235
2236	and	\$-16, $len
2237	sub	\$0x80, %rsp		# place for tweak[8]
2238	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2239
2240	pxor	$twtmp, $twtmp
2241	movdqa	.Lxts_magic(%rip), $twmask
2242	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2243
2244	sub	\$0x80, $len
2245	jc	.Lxts_enc_short
2246	jmp	.Lxts_enc_loop
2247
2248.align	16
2249.Lxts_enc_loop:
2250___
2251    for ($i=0;$i<7;$i++) {
2252    $code.=<<___;
2253	pshufd	\$0x13, $twtmp, $twres
2254	pxor	$twtmp, $twtmp
2255	movdqa	@XMM[7], @XMM[$i]
2256	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2257	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2258	pand	$twmask, $twres		# isolate carry and residue
2259	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2260	pxor	$twres, @XMM[7]
2261___
2262    $code.=<<___ if ($i>=1);
2263	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2264___
2265    $code.=<<___ if ($i>=2);
2266	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2267___
2268    }
2269$code.=<<___;
2270	movdqu	0x60($inp), @XMM[8+6]
2271	pxor	@XMM[8+5], @XMM[5]
2272	movdqu	0x70($inp), @XMM[8+7]
2273	lea	0x80($inp), $inp
2274	movdqa	@XMM[7], 0x70(%rsp)
2275	pxor	@XMM[8+6], @XMM[6]
2276	lea	0x80(%rsp), %rax	# pass key schedule
2277	pxor	@XMM[8+7], @XMM[7]
2278	mov	%edx, %r10d		# pass rounds
2279
2280	call	_bsaes_encrypt8
2281
2282	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2283	pxor	0x10(%rsp), @XMM[1]
2284	movdqu	@XMM[0], 0x00($out)	# write output
2285	pxor	0x20(%rsp), @XMM[4]
2286	movdqu	@XMM[1], 0x10($out)
2287	pxor	0x30(%rsp), @XMM[6]
2288	movdqu	@XMM[4], 0x20($out)
2289	pxor	0x40(%rsp), @XMM[3]
2290	movdqu	@XMM[6], 0x30($out)
2291	pxor	0x50(%rsp), @XMM[7]
2292	movdqu	@XMM[3], 0x40($out)
2293	pxor	0x60(%rsp), @XMM[2]
2294	movdqu	@XMM[7], 0x50($out)
2295	pxor	0x70(%rsp), @XMM[5]
2296	movdqu	@XMM[2], 0x60($out)
2297	movdqu	@XMM[5], 0x70($out)
2298	lea	0x80($out), $out
2299
2300	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2301	pxor	$twtmp, $twtmp
2302	movdqa	.Lxts_magic(%rip), $twmask
2303	pcmpgtd	@XMM[7], $twtmp
2304	pshufd	\$0x13, $twtmp, $twres
2305	pxor	$twtmp, $twtmp
2306	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2307	pand	$twmask, $twres		# isolate carry and residue
2308	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2309	pxor	$twres, @XMM[7]
2310
2311	sub	\$0x80,$len
2312	jnc	.Lxts_enc_loop
2313
2314.Lxts_enc_short:
2315	add	\$0x80, $len
2316	jz	.Lxts_enc_done
2317___
2318    for ($i=0;$i<7;$i++) {
2319    $code.=<<___;
2320	pshufd	\$0x13, $twtmp, $twres
2321	pxor	$twtmp, $twtmp
2322	movdqa	@XMM[7], @XMM[$i]
2323	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2324	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2325	pand	$twmask, $twres		# isolate carry and residue
2326	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2327	pxor	$twres, @XMM[7]
2328___
2329    $code.=<<___ if ($i>=1);
2330	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2331	cmp	\$`0x10*$i`,$len
2332	je	.Lxts_enc_$i
2333___
2334    $code.=<<___ if ($i>=2);
2335	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2336___
2337    }
2338$code.=<<___;
2339	movdqu	0x60($inp), @XMM[8+6]
2340	pxor	@XMM[8+5], @XMM[5]
2341	movdqa	@XMM[7], 0x70(%rsp)
2342	lea	0x70($inp), $inp
2343	pxor	@XMM[8+6], @XMM[6]
2344	lea	0x80(%rsp), %rax	# pass key schedule
2345	mov	%edx, %r10d		# pass rounds
2346
2347	call	_bsaes_encrypt8
2348
2349	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2350	pxor	0x10(%rsp), @XMM[1]
2351	movdqu	@XMM[0], 0x00($out)	# write output
2352	pxor	0x20(%rsp), @XMM[4]
2353	movdqu	@XMM[1], 0x10($out)
2354	pxor	0x30(%rsp), @XMM[6]
2355	movdqu	@XMM[4], 0x20($out)
2356	pxor	0x40(%rsp), @XMM[3]
2357	movdqu	@XMM[6], 0x30($out)
2358	pxor	0x50(%rsp), @XMM[7]
2359	movdqu	@XMM[3], 0x40($out)
2360	pxor	0x60(%rsp), @XMM[2]
2361	movdqu	@XMM[7], 0x50($out)
2362	movdqu	@XMM[2], 0x60($out)
2363	lea	0x70($out), $out
2364
2365	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2366	jmp	.Lxts_enc_done
2367.align	16
2368.Lxts_enc_6:
2369	pxor	@XMM[8+4], @XMM[4]
2370	lea	0x60($inp), $inp
2371	pxor	@XMM[8+5], @XMM[5]
2372	lea	0x80(%rsp), %rax	# pass key schedule
2373	mov	%edx, %r10d		# pass rounds
2374
2375	call	_bsaes_encrypt8
2376
2377	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2378	pxor	0x10(%rsp), @XMM[1]
2379	movdqu	@XMM[0], 0x00($out)	# write output
2380	pxor	0x20(%rsp), @XMM[4]
2381	movdqu	@XMM[1], 0x10($out)
2382	pxor	0x30(%rsp), @XMM[6]
2383	movdqu	@XMM[4], 0x20($out)
2384	pxor	0x40(%rsp), @XMM[3]
2385	movdqu	@XMM[6], 0x30($out)
2386	pxor	0x50(%rsp), @XMM[7]
2387	movdqu	@XMM[3], 0x40($out)
2388	movdqu	@XMM[7], 0x50($out)
2389	lea	0x60($out), $out
2390
2391	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2392	jmp	.Lxts_enc_done
2393.align	16
2394.Lxts_enc_5:
2395	pxor	@XMM[8+3], @XMM[3]
2396	lea	0x50($inp), $inp
2397	pxor	@XMM[8+4], @XMM[4]
2398	lea	0x80(%rsp), %rax	# pass key schedule
2399	mov	%edx, %r10d		# pass rounds
2400
2401	call	_bsaes_encrypt8
2402
2403	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2404	pxor	0x10(%rsp), @XMM[1]
2405	movdqu	@XMM[0], 0x00($out)	# write output
2406	pxor	0x20(%rsp), @XMM[4]
2407	movdqu	@XMM[1], 0x10($out)
2408	pxor	0x30(%rsp), @XMM[6]
2409	movdqu	@XMM[4], 0x20($out)
2410	pxor	0x40(%rsp), @XMM[3]
2411	movdqu	@XMM[6], 0x30($out)
2412	movdqu	@XMM[3], 0x40($out)
2413	lea	0x50($out), $out
2414
2415	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2416	jmp	.Lxts_enc_done
2417.align	16
2418.Lxts_enc_4:
2419	pxor	@XMM[8+2], @XMM[2]
2420	lea	0x40($inp), $inp
2421	pxor	@XMM[8+3], @XMM[3]
2422	lea	0x80(%rsp), %rax	# pass key schedule
2423	mov	%edx, %r10d		# pass rounds
2424
2425	call	_bsaes_encrypt8
2426
2427	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2428	pxor	0x10(%rsp), @XMM[1]
2429	movdqu	@XMM[0], 0x00($out)	# write output
2430	pxor	0x20(%rsp), @XMM[4]
2431	movdqu	@XMM[1], 0x10($out)
2432	pxor	0x30(%rsp), @XMM[6]
2433	movdqu	@XMM[4], 0x20($out)
2434	movdqu	@XMM[6], 0x30($out)
2435	lea	0x40($out), $out
2436
2437	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2438	jmp	.Lxts_enc_done
2439.align	16
2440.Lxts_enc_3:
2441	pxor	@XMM[8+1], @XMM[1]
2442	lea	0x30($inp), $inp
2443	pxor	@XMM[8+2], @XMM[2]
2444	lea	0x80(%rsp), %rax	# pass key schedule
2445	mov	%edx, %r10d		# pass rounds
2446
2447	call	_bsaes_encrypt8
2448
2449	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2450	pxor	0x10(%rsp), @XMM[1]
2451	movdqu	@XMM[0], 0x00($out)	# write output
2452	pxor	0x20(%rsp), @XMM[4]
2453	movdqu	@XMM[1], 0x10($out)
2454	movdqu	@XMM[4], 0x20($out)
2455	lea	0x30($out), $out
2456
2457	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2458	jmp	.Lxts_enc_done
2459.align	16
2460.Lxts_enc_2:
2461	pxor	@XMM[8+0], @XMM[0]
2462	lea	0x20($inp), $inp
2463	pxor	@XMM[8+1], @XMM[1]
2464	lea	0x80(%rsp), %rax	# pass key schedule
2465	mov	%edx, %r10d		# pass rounds
2466
2467	call	_bsaes_encrypt8
2468
2469	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2470	pxor	0x10(%rsp), @XMM[1]
2471	movdqu	@XMM[0], 0x00($out)	# write output
2472	movdqu	@XMM[1], 0x10($out)
2473	lea	0x20($out), $out
2474
2475	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2476	jmp	.Lxts_enc_done
2477.align	16
2478.Lxts_enc_1:
2479	pxor	@XMM[0], @XMM[8]
2480	lea	0x10($inp), $inp
2481	movdqa	@XMM[8], 0x20(%rbp)
2482	lea	0x20(%rbp), $arg1
2483	lea	0x20(%rbp), $arg2
2484	lea	($key), $arg3
2485	call	asm_AES_encrypt		# doesn't touch %xmm
2486	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2487	#pxor	@XMM[8], @XMM[0]
2488	#lea	0x80(%rsp), %rax	# pass key schedule
2489	#mov	%edx, %r10d		# pass rounds
2490	#call	_bsaes_encrypt8
2491	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2492	movdqu	@XMM[0], 0x00($out)	# write output
2493	lea	0x10($out), $out
2494
2495	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2496
2497.Lxts_enc_done:
2498	and	\$15, %ebx
2499	jz	.Lxts_enc_ret
2500	mov	$out, %rdx
2501
2502.Lxts_enc_steal:
2503	movzb	($inp), %eax
2504	movzb	-16(%rdx), %ecx
2505	lea	1($inp), $inp
2506	mov	%al, -16(%rdx)
2507	mov	%cl, 0(%rdx)
2508	lea	1(%rdx), %rdx
2509	sub	\$1,%ebx
2510	jnz	.Lxts_enc_steal
2511
2512	movdqu	-16($out), @XMM[0]
2513	lea	0x20(%rbp), $arg1
2514	pxor	@XMM[7], @XMM[0]
2515	lea	0x20(%rbp), $arg2
2516	movdqa	@XMM[0], 0x20(%rbp)
2517	lea	($key), $arg3
2518	call	asm_AES_encrypt		# doesn't touch %xmm
2519	pxor	0x20(%rbp), @XMM[7]
2520	movdqu	@XMM[7], -16($out)
2521
2522.Lxts_enc_ret:
2523	lea	(%rsp), %rax
2524	pxor	%xmm0, %xmm0
2525.Lxts_enc_bzero:			# wipe key schedule [if any]
2526	movdqa	%xmm0, 0x00(%rax)
2527	movdqa	%xmm0, 0x10(%rax)
2528	lea	0x20(%rax), %rax
2529	cmp	%rax, %rbp
2530	ja	.Lxts_enc_bzero
2531
2532	lea	0x78(%rbp),%rax
2533.cfi_def_cfa	%rax,8
2534___
2535$code.=<<___ if ($win64);
2536	movaps	0x40(%rbp), %xmm6
2537	movaps	0x50(%rbp), %xmm7
2538	movaps	0x60(%rbp), %xmm8
2539	movaps	0x70(%rbp), %xmm9
2540	movaps	0x80(%rbp), %xmm10
2541	movaps	0x90(%rbp), %xmm11
2542	movaps	0xa0(%rbp), %xmm12
2543	movaps	0xb0(%rbp), %xmm13
2544	movaps	0xc0(%rbp), %xmm14
2545	movaps	0xd0(%rbp), %xmm15
2546	lea	0xa0(%rax), %rax
2547.Lxts_enc_tail:
2548___
2549$code.=<<___;
2550	mov	-48(%rax), %r15
2551.cfi_restore	%r15
2552	mov	-40(%rax), %r14
2553.cfi_restore	%r14
2554	mov	-32(%rax), %r13
2555.cfi_restore	%r13
2556	mov	-24(%rax), %r12
2557.cfi_restore	%r12
2558	mov	-16(%rax), %rbx
2559.cfi_restore	%rbx
2560	mov	-8(%rax), %rbp
2561.cfi_restore	%rbp
2562	lea	(%rax), %rsp		# restore %rsp
2563.cfi_def_cfa_register	%rsp
2564.Lxts_enc_epilogue:
2565	ret
2566.cfi_endproc
2567.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2568
2569.globl	bsaes_xts_decrypt
2570.type	bsaes_xts_decrypt,\@abi-omnipotent
2571.align	16
2572bsaes_xts_decrypt:
2573.cfi_startproc
2574	mov	%rsp, %rax
2575.Lxts_dec_prologue:
2576	push	%rbp
2577.cfi_push	%rbp
2578	push	%rbx
2579.cfi_push	%rbx
2580	push	%r12
2581.cfi_push	%r12
2582	push	%r13
2583.cfi_push	%r13
2584	push	%r14
2585.cfi_push	%r14
2586	push	%r15
2587.cfi_push	%r15
2588	lea	-0x48(%rsp), %rsp
2589.cfi_adjust_cfa_offset	0x48
2590___
# bsaes_xts_decrypt is emitted by the heredocs from here down to its .size
# directive.  The prologue above keeps the entry %rsp in %rax, pushes %rbp,
# %rbx and %r12-%r15 and lowers %rsp by 0x48 bytes.
#
# Win64 only: fetch the two stack-passed arguments (key2 and ivp), lower
# %rsp by a further 0xa0 bytes and store %xmm6-%xmm15 at 0x40..0xd0(%rsp).
# .Lxts_dec_body is the label the .xdata entry passes as HandlerData[0].
$code.=<<___ if ($win64);
2592	mov	0xa0(%rsp),$arg5	# pull key2
2593	mov	0xa8(%rsp),$arg6	# pull ivp
2594	lea	-0xa0(%rsp), %rsp
2595	movaps	%xmm6, 0x40(%rsp)
2596	movaps	%xmm7, 0x50(%rsp)
2597	movaps	%xmm8, 0x60(%rsp)
2598	movaps	%xmm9, 0x70(%rsp)
2599	movaps	%xmm10, 0x80(%rsp)
2600	movaps	%xmm11, 0x90(%rsp)
2601	movaps	%xmm12, 0xa0(%rsp)
2602	movaps	%xmm13, 0xb0(%rsp)
2603	movaps	%xmm14, 0xc0(%rsp)
2604	movaps	%xmm15, 0xd0(%rsp)
2605.Lxts_dec_body:
2606___
# Common setup: %rbp becomes the frame pointer and the four register
# arguments are copied to $inp/$out/$len/$key.  asm_AES_encrypt is called
# with ($arg6, 0x20(%rbp), $arg5) to produce the initial tweak.  The key
# schedule at $key is converted by _bsaes_key_convert into stack space whose
# size is derived from the round count read at 240($key).  $len is rounded
# down to whole 16-byte blocks, minus one more block when the original
# length (kept in %rbx) is not a multiple of 16.  Finally 0x80 bytes are
# reserved for tweak[8] and control enters the 8-block loop or, when fewer
# than 0x80 bytes remain, the short path.
$code.=<<___;
2608	mov	%rsp, %rbp		# backup %rsp
2609	mov	$arg1, $inp		# backup arguments
2610	mov	$arg2, $out
2611	mov	$arg3, $len
2612	mov	$arg4, $key
2613
2614	lea	($arg6), $arg1
2615	lea	0x20(%rbp), $arg2
2616	lea	($arg5), $arg3
2617	call	asm_AES_encrypt		# generate initial tweak
2618
2619	mov	240($key), %eax		# rounds
2620	mov	$len, %rbx		# backup $len
2621
2622	mov	%eax, %edx		# rounds
2623	shl	\$7, %rax		# 128 bytes per inner round key
2624	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2625	sub	%rax, %rsp
2626
2627	mov	%rsp, %rax		# pass key schedule
2628	mov	$key, %rcx		# pass key
2629	mov	%edx, %r10d		# pass rounds
2630	call	_bsaes_key_convert
2631	pxor	(%rsp), %xmm7		# fix up round 0 key
2632	movdqa	%xmm6, (%rax)		# save last round key
2633	movdqa	%xmm7, (%rsp)
2634
2635	xor	%eax, %eax		# if ($len%16) len-=16;
2636	and	\$-16, $len
2637	test	\$15, %ebx
2638	setnz	%al
2639	shl	\$4, %rax
2640	sub	%rax, $len
2641
2642	sub	\$0x80, %rsp		# place for tweak[8]
2643	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2644
2645	pxor	$twtmp, $twtmp
2646	movdqa	.Lxts_magic(%rip), $twmask
2647	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2648
2649	sub	\$0x80, $len
2650	jc	.Lxts_dec_short
2651	jmp	.Lxts_dec_loop
2652
2653.align	16
2654.Lxts_dec_loop:
2655___
    # Main loop, 0x80 bytes of input per pass.  Each of the seven unrolled
    # steps copies the running tweak from @XMM[7] into @XMM[$i] and into the
    # stack slot tweak[$i], then advances @XMM[7] to the next tweak: paddq
    # doubles both halves and the carry bits selected through $twmask are
    # xored back in.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
2658	pshufd	\$0x13, $twtmp, $twres
2659	pxor	$twtmp, $twtmp
2660	movdqa	@XMM[7], @XMM[$i]
2661	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2662	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2663	pand	$twmask, $twres		# isolate carry and residue
2664	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2665	pxor	$twres, @XMM[7]
2666___
    # From the second step on, load input block $i-1 alongside the tweak work.
    $code.=<<___ if ($i>=1);
2668	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2669___
    # From the third step on, xor input block $i-2 into its tweak register.
    $code.=<<___ if ($i>=2);
2671	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2672___
    }
# Rest of the loop body: load the last two input blocks, store tweak[7],
# finish the input^tweak xors and call _bsaes_decrypt8.  Each result is then
# xored with its saved tweak and written out; the results are read from
# registers 0,1,6,4,2,7,3,5 in that order.  The next tweak is derived from
# tweak[7] and the loop repeats until the subtraction from $len borrows.
# .Lxts_dec_short adds the 0x80 back and goes straight to .Lxts_dec_done
# when no whole block is left.
$code.=<<___;
2675	movdqu	0x60($inp), @XMM[8+6]
2676	pxor	@XMM[8+5], @XMM[5]
2677	movdqu	0x70($inp), @XMM[8+7]
2678	lea	0x80($inp), $inp
2679	movdqa	@XMM[7], 0x70(%rsp)
2680	pxor	@XMM[8+6], @XMM[6]
2681	lea	0x80(%rsp), %rax	# pass key schedule
2682	pxor	@XMM[8+7], @XMM[7]
2683	mov	%edx, %r10d		# pass rounds
2684
2685	call	_bsaes_decrypt8
2686
2687	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2688	pxor	0x10(%rsp), @XMM[1]
2689	movdqu	@XMM[0], 0x00($out)	# write output
2690	pxor	0x20(%rsp), @XMM[6]
2691	movdqu	@XMM[1], 0x10($out)
2692	pxor	0x30(%rsp), @XMM[4]
2693	movdqu	@XMM[6], 0x20($out)
2694	pxor	0x40(%rsp), @XMM[2]
2695	movdqu	@XMM[4], 0x30($out)
2696	pxor	0x50(%rsp), @XMM[7]
2697	movdqu	@XMM[2], 0x40($out)
2698	pxor	0x60(%rsp), @XMM[3]
2699	movdqu	@XMM[7], 0x50($out)
2700	pxor	0x70(%rsp), @XMM[5]
2701	movdqu	@XMM[3], 0x60($out)
2702	movdqu	@XMM[5], 0x70($out)
2703	lea	0x80($out), $out
2704
2705	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2706	pxor	$twtmp, $twtmp
2707	movdqa	.Lxts_magic(%rip), $twmask
2708	pcmpgtd	@XMM[7], $twtmp
2709	pshufd	\$0x13, $twtmp, $twres
2710	pxor	$twtmp, $twtmp
2711	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2712	pand	$twmask, $twres		# isolate carry and residue
2713	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2714	pxor	$twres, @XMM[7]
2715
2716	sub	\$0x80,$len
2717	jnc	.Lxts_dec_loop
2718
2719.Lxts_dec_short:
2720	add	\$0x80, $len
2721	jz	.Lxts_dec_done
2722___
    # Short path for one to seven remaining whole blocks.  The tweak
    # schedule is the same as in the main loop, but after each input load
    # $len is compared with the number of bytes loaded so far and control
    # leaves for the matching .Lxts_dec_$i tail.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
2725	pshufd	\$0x13, $twtmp, $twres
2726	pxor	$twtmp, $twtmp
2727	movdqa	@XMM[7], @XMM[$i]
2728	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2729	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2730	pand	$twmask, $twres		# isolate carry and residue
2731	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2732	pxor	$twres, @XMM[7]
2733___
    # Load input block $i-1 and dispatch when exactly $i blocks remain.
    $code.=<<___ if ($i>=1);
2735	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2736	cmp	\$`0x10*$i`,$len
2737	je	.Lxts_dec_$i
2738___
    # Still more input: xor block $i-2 into its tweak register.
    $code.=<<___ if ($i>=2);
2740	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2741___
    }
# Tails.  The fall-through code handles seven blocks; .Lxts_dec_6 down to
# .Lxts_dec_2 finish the two pending input^tweak xors for their block count
# and also go through _bsaes_decrypt8, writing only as many results as there
# are blocks.  .Lxts_dec_1 decrypts its single block with asm_AES_decrypt,
# using 0x20(%rbp) as scratch.  Every tail reloads the tweak that follows
# its last block into @XMM[7] before reaching .Lxts_dec_done.
#
# .Lxts_dec_done returns immediately when the original length (%ebx) is a
# multiple of 16.  Otherwise it keeps the current tweak in @XMM[6], advances
# @XMM[7] once more, decrypts the next whole input block under that later
# tweak and stores it at ($out).  .Lxts_dec_steal then swaps the leading
# bytes of that block with the trailing partial input, and the reassembled
# block at ($out) is decrypted under the tweak kept in @XMM[6].
#
# .Lxts_dec_ret zeroes the stack between %rsp and %rbp in 0x20-byte steps
# and points %rax at 0x78(%rbp) for the register restores that follow.
$code.=<<___;
2744	movdqu	0x60($inp), @XMM[8+6]
2745	pxor	@XMM[8+5], @XMM[5]
2746	movdqa	@XMM[7], 0x70(%rsp)
2747	lea	0x70($inp), $inp
2748	pxor	@XMM[8+6], @XMM[6]
2749	lea	0x80(%rsp), %rax	# pass key schedule
2750	mov	%edx, %r10d		# pass rounds
2751
2752	call	_bsaes_decrypt8
2753
2754	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2755	pxor	0x10(%rsp), @XMM[1]
2756	movdqu	@XMM[0], 0x00($out)	# write output
2757	pxor	0x20(%rsp), @XMM[6]
2758	movdqu	@XMM[1], 0x10($out)
2759	pxor	0x30(%rsp), @XMM[4]
2760	movdqu	@XMM[6], 0x20($out)
2761	pxor	0x40(%rsp), @XMM[2]
2762	movdqu	@XMM[4], 0x30($out)
2763	pxor	0x50(%rsp), @XMM[7]
2764	movdqu	@XMM[2], 0x40($out)
2765	pxor	0x60(%rsp), @XMM[3]
2766	movdqu	@XMM[7], 0x50($out)
2767	movdqu	@XMM[3], 0x60($out)
2768	lea	0x70($out), $out
2769
2770	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2771	jmp	.Lxts_dec_done
2772.align	16
2773.Lxts_dec_6:
2774	pxor	@XMM[8+4], @XMM[4]
2775	lea	0x60($inp), $inp
2776	pxor	@XMM[8+5], @XMM[5]
2777	lea	0x80(%rsp), %rax	# pass key schedule
2778	mov	%edx, %r10d		# pass rounds
2779
2780	call	_bsaes_decrypt8
2781
2782	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2783	pxor	0x10(%rsp), @XMM[1]
2784	movdqu	@XMM[0], 0x00($out)	# write output
2785	pxor	0x20(%rsp), @XMM[6]
2786	movdqu	@XMM[1], 0x10($out)
2787	pxor	0x30(%rsp), @XMM[4]
2788	movdqu	@XMM[6], 0x20($out)
2789	pxor	0x40(%rsp), @XMM[2]
2790	movdqu	@XMM[4], 0x30($out)
2791	pxor	0x50(%rsp), @XMM[7]
2792	movdqu	@XMM[2], 0x40($out)
2793	movdqu	@XMM[7], 0x50($out)
2794	lea	0x60($out), $out
2795
2796	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2797	jmp	.Lxts_dec_done
2798.align	16
2799.Lxts_dec_5:
2800	pxor	@XMM[8+3], @XMM[3]
2801	lea	0x50($inp), $inp
2802	pxor	@XMM[8+4], @XMM[4]
2803	lea	0x80(%rsp), %rax	# pass key schedule
2804	mov	%edx, %r10d		# pass rounds
2805
2806	call	_bsaes_decrypt8
2807
2808	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2809	pxor	0x10(%rsp), @XMM[1]
2810	movdqu	@XMM[0], 0x00($out)	# write output
2811	pxor	0x20(%rsp), @XMM[6]
2812	movdqu	@XMM[1], 0x10($out)
2813	pxor	0x30(%rsp), @XMM[4]
2814	movdqu	@XMM[6], 0x20($out)
2815	pxor	0x40(%rsp), @XMM[2]
2816	movdqu	@XMM[4], 0x30($out)
2817	movdqu	@XMM[2], 0x40($out)
2818	lea	0x50($out), $out
2819
2820	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2821	jmp	.Lxts_dec_done
2822.align	16
2823.Lxts_dec_4:
2824	pxor	@XMM[8+2], @XMM[2]
2825	lea	0x40($inp), $inp
2826	pxor	@XMM[8+3], @XMM[3]
2827	lea	0x80(%rsp), %rax	# pass key schedule
2828	mov	%edx, %r10d		# pass rounds
2829
2830	call	_bsaes_decrypt8
2831
2832	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2833	pxor	0x10(%rsp), @XMM[1]
2834	movdqu	@XMM[0], 0x00($out)	# write output
2835	pxor	0x20(%rsp), @XMM[6]
2836	movdqu	@XMM[1], 0x10($out)
2837	pxor	0x30(%rsp), @XMM[4]
2838	movdqu	@XMM[6], 0x20($out)
2839	movdqu	@XMM[4], 0x30($out)
2840	lea	0x40($out), $out
2841
2842	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2843	jmp	.Lxts_dec_done
2844.align	16
2845.Lxts_dec_3:
2846	pxor	@XMM[8+1], @XMM[1]
2847	lea	0x30($inp), $inp
2848	pxor	@XMM[8+2], @XMM[2]
2849	lea	0x80(%rsp), %rax	# pass key schedule
2850	mov	%edx, %r10d		# pass rounds
2851
2852	call	_bsaes_decrypt8
2853
2854	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2855	pxor	0x10(%rsp), @XMM[1]
2856	movdqu	@XMM[0], 0x00($out)	# write output
2857	pxor	0x20(%rsp), @XMM[6]
2858	movdqu	@XMM[1], 0x10($out)
2859	movdqu	@XMM[6], 0x20($out)
2860	lea	0x30($out), $out
2861
2862	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2863	jmp	.Lxts_dec_done
2864.align	16
2865.Lxts_dec_2:
2866	pxor	@XMM[8+0], @XMM[0]
2867	lea	0x20($inp), $inp
2868	pxor	@XMM[8+1], @XMM[1]
2869	lea	0x80(%rsp), %rax	# pass key schedule
2870	mov	%edx, %r10d		# pass rounds
2871
2872	call	_bsaes_decrypt8
2873
2874	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2875	pxor	0x10(%rsp), @XMM[1]
2876	movdqu	@XMM[0], 0x00($out)	# write output
2877	movdqu	@XMM[1], 0x10($out)
2878	lea	0x20($out), $out
2879
2880	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2881	jmp	.Lxts_dec_done
2882.align	16
2883.Lxts_dec_1:
2884	pxor	@XMM[0], @XMM[8]
2885	lea	0x10($inp), $inp
2886	movdqa	@XMM[8], 0x20(%rbp)
2887	lea	0x20(%rbp), $arg1
2888	lea	0x20(%rbp), $arg2
2889	lea	($key), $arg3
2890	call	asm_AES_decrypt		# doesn't touch %xmm
2891	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2892	#pxor	@XMM[8], @XMM[0]
2893	#lea	0x80(%rsp), %rax	# pass key schedule
2894	#mov	%edx, %r10d		# pass rounds
2895	#call	_bsaes_decrypt8
2896	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2897	movdqu	@XMM[0], 0x00($out)	# write output
2898	lea	0x10($out), $out
2899
2900	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2901
2902.Lxts_dec_done:
2903	and	\$15, %ebx
2904	jz	.Lxts_dec_ret
2905
2906	pxor	$twtmp, $twtmp
2907	movdqa	.Lxts_magic(%rip), $twmask
2908	pcmpgtd	@XMM[7], $twtmp
2909	pshufd	\$0x13, $twtmp, $twres
2910	movdqa	@XMM[7], @XMM[6]
2911	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2912	pand	$twmask, $twres		# isolate carry and residue
2913	movdqu	($inp), @XMM[0]
2914	pxor	$twres, @XMM[7]
2915
2916	lea	0x20(%rbp), $arg1
2917	pxor	@XMM[7], @XMM[0]
2918	lea	0x20(%rbp), $arg2
2919	movdqa	@XMM[0], 0x20(%rbp)
2920	lea	($key), $arg3
2921	call	asm_AES_decrypt		# doesn't touch %xmm
2922	pxor	0x20(%rbp), @XMM[7]
2923	mov	$out, %rdx
2924	movdqu	@XMM[7], ($out)
2925
2926.Lxts_dec_steal:
2927	movzb	16($inp), %eax
2928	movzb	(%rdx), %ecx
2929	lea	1($inp), $inp
2930	mov	%al, (%rdx)
2931	mov	%cl, 16(%rdx)
2932	lea	1(%rdx), %rdx
2933	sub	\$1,%ebx
2934	jnz	.Lxts_dec_steal
2935
2936	movdqu	($out), @XMM[0]
2937	lea	0x20(%rbp), $arg1
2938	pxor	@XMM[6], @XMM[0]
2939	lea	0x20(%rbp), $arg2
2940	movdqa	@XMM[0], 0x20(%rbp)
2941	lea	($key), $arg3
2942	call	asm_AES_decrypt		# doesn't touch %xmm
2943	pxor	0x20(%rbp), @XMM[6]
2944	movdqu	@XMM[6], ($out)
2945
2946.Lxts_dec_ret:
2947	lea	(%rsp), %rax
2948	pxor	%xmm0, %xmm0
2949.Lxts_dec_bzero:			# wipe key schedule [if any]
2950	movdqa	%xmm0, 0x00(%rax)
2951	movdqa	%xmm0, 0x10(%rax)
2952	lea	0x20(%rax), %rax
2953	cmp	%rax, %rbp
2954	ja	.Lxts_dec_bzero
2955
2956	lea	0x78(%rbp),%rax
2957.cfi_def_cfa	%rax,8
2958___
# Win64 only: reload %xmm6-%xmm15 from the frame and step %rax over the
# extra 0xa0 bytes.  .Lxts_dec_tail is the label the .xdata entry passes as
# HandlerData[2].
$code.=<<___ if ($win64);
2960	movaps	0x40(%rbp), %xmm6
2961	movaps	0x50(%rbp), %xmm7
2962	movaps	0x60(%rbp), %xmm8
2963	movaps	0x70(%rbp), %xmm9
2964	movaps	0x80(%rbp), %xmm10
2965	movaps	0x90(%rbp), %xmm11
2966	movaps	0xa0(%rbp), %xmm12
2967	movaps	0xb0(%rbp), %xmm13
2968	movaps	0xc0(%rbp), %xmm14
2969	movaps	0xd0(%rbp), %xmm15
2970	lea	0xa0(%rax), %rax
2971.Lxts_dec_tail:
2972___
# Epilogue: %rax now equals the value %rsp had on entry, so the six pushed
# registers are reloaded from -48(%rax) (%r15) up to -8(%rax) (%rbp) before
# %rsp is restored and the routine returns.
$code.=<<___;
2974	mov	-48(%rax), %r15
2975.cfi_restore	%r15
2976	mov	-40(%rax), %r14
2977.cfi_restore	%r14
2978	mov	-32(%rax), %r13
2979.cfi_restore	%r13
2980	mov	-24(%rax), %r12
2981.cfi_restore	%r12
2982	mov	-16(%rax), %rbx
2983.cfi_restore	%rbx
2984	mov	-8(%rax), %rbp
2985.cfi_restore	%rbp
2986	lea	(%rax), %rsp		# restore %rsp
2987.cfi_def_cfa_register	%rsp
2988.Lxts_dec_epilogue:
2989	ret
2990.cfi_endproc
2991.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2992___
2993}
# Constant pool, emitted as the 64-byte-aligned object _bsaes_const.  Every
# labelled entry is 16 bytes (.Lmasks is four such rows): shuffle masks for
# ShiftRows / InvShiftRows, the three bit-slice masks, a byte-swap mask,
# the .LADD1-.LADD8 counter increments, and .Lxts_magic, which the XTS code
# loads into $twmask when advancing the tweak.  The table ends with an
# identification string and padding to the next 64-byte boundary.
2994$code.=<<___;
2995.type	_bsaes_const,\@object
2996.align	64
2997_bsaes_const:
2998.LM0ISR:	# InvShiftRows constants
2999	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
3000.LISRM0:
3001	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
3002.LISR:
3003	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
3004.LBS0:		# bit-slice constants
3005	.quad	0x5555555555555555, 0x5555555555555555
3006.LBS1:
3007	.quad	0x3333333333333333, 0x3333333333333333
3008.LBS2:
3009	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
3010.LSR:		# shiftrows constants
3011	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
3012.LSRM0:
3013	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
3014.LM0SR:
3015	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
3016.LSWPUP:	# byte-swap upper dword
3017	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
3018.LSWPUPM0SR:
3019	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
3020.LADD1:		# counter increment constants
3021	.quad	0x0000000000000000, 0x0000000100000000
3022.LADD2:
3023	.quad	0x0000000000000000, 0x0000000200000000
3024.LADD3:
3025	.quad	0x0000000000000000, 0x0000000300000000
3026.LADD4:
3027	.quad	0x0000000000000000, 0x0000000400000000
3028.LADD5:
3029	.quad	0x0000000000000000, 0x0000000500000000
3030.LADD6:
3031	.quad	0x0000000000000000, 0x0000000600000000
3032.LADD7:
3033	.quad	0x0000000000000000, 0x0000000700000000
3034.LADD8:
3035	.quad	0x0000000000000000, 0x0000000800000000
3036.Lxts_magic:
3037	.long	0x87,0,1,0
3038.Lmasks:
3039	.quad	0x0101010101010101, 0x0101010101010101
3040	.quad	0x0202020202020202, 0x0202020202020202
3041	.quad	0x0404040404040404, 0x0404040404040404
3042	.quad	0x0808080808080808, 0x0808080808080808
3043.LM0:
3044	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
3045.L63:
3046	.quad	0x6363636363636363, 0x6363636363636363
3047.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
3048.align	64
3049.size	_bsaes_const,.-_bsaes_const
3050___
3051
3052# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3053#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception support: one language-specific handler shared
# by every routine listed in .pdata below, plus the .pdata/.xdata records
# that register it.  Each .xdata record supplies three HandlerData[] RVAs:
# the end-of-prologue (body) label, the epilogue label and the tail label.
#
# The handler rebuilds the caller's context according to where Rip is:
#   Rip <= body label or Rip >= epilogue label:
#       context->Rax is used as the caller's stack pointer and nothing else
#       is restored;
#   Rip >= tail label:
#       context->Rax is again the caller's stack pointer; the six pushed
#       general-purpose registers are reloaded from below it;
#   otherwise (function body):
#       context->Rbp is the frame pointer; %xmm6-%xmm15 are copied from
#       0x40(frame) into the context and the caller's stack pointer is
#       frame+0xa0+0x78 before the general-purpose registers are reloaded.
# It then copies the context into disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.
#
# The general-purpose registers are read in the order the prologues push
# them (%rbp, %rbx, %r12, %r13, %r14, %r15), i.e. %rbp at -8 and %r15 at -48
# from the caller's stack pointer.  This is the same layout the XTS
# epilogues use when they reload -48(%rax) into %r15 ... -8(%rax) into %rbp.
# NOTE(review): the other routines registered below are assumed to share
# that prologue layout - confirm against their prologues.
3054if ($win64) {
3055$rec="%rcx";
3056$frame="%rdx";
3057$context="%r8";
3058$disp="%r9";
3059
3060$code.=<<___;
3061.extern	__imp_RtlVirtualUnwind
3062.type	se_handler,\@abi-omnipotent
3063.align	16
3064se_handler:
3065	push	%rsi
3066	push	%rdi
3067	push	%rbx
3068	push	%rbp
3069	push	%r12
3070	push	%r13
3071	push	%r14
3072	push	%r15
3073	pushfq
3074	sub	\$64,%rsp
3075
3076	mov	120($context),%rax	# pull context->Rax
3077	mov	248($context),%rbx	# pull context->Rip
3078
3079	mov	8($disp),%rsi		# disp->ImageBase
3080	mov	56($disp),%r11		# disp->HandlerData
3081
3082	mov	0(%r11),%r10d		# HandlerData[0]
3083	lea	(%rsi,%r10),%r10	# prologue label
3084	cmp	%r10,%rbx		# context->Rip<=prologue label
3085	jbe	.Lin_prologue
3086
3087	mov	4(%r11),%r10d		# HandlerData[1]
3088	lea	(%rsi,%r10),%r10	# epilogue label
3089	cmp	%r10,%rbx		# context->Rip>=epilogue label
3090	jae	.Lin_prologue
3091
3092	mov	8(%r11),%r10d		# HandlerData[2]
3093	lea	(%rsi,%r10),%r10	# tail label
3094	cmp	%r10,%rbx		# context->Rip>=tail label
3095	jae	.Lin_tail
3096
3097	mov	160($context),%rax	# pull context->Rbp
3098
3099	lea	0x40(%rax),%rsi		# %xmm save area
3100	lea	512($context),%rdi	# &context.Xmm6
3101	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
3102	.long	0xa548f3fc		# cld; rep movsq
3103	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer
3104
3105.Lin_tail:
3106	mov	-8(%rax),%rbp		# pushed first, so nearest the return address
3107	mov	-16(%rax),%rbx
3108	mov	-24(%rax),%r12
3109	mov	-32(%rax),%r13
3110	mov	-40(%rax),%r14
3111	mov	-48(%rax),%r15		# pushed last
3112	mov	%rbx,144($context)	# restore context->Rbx
3113	mov	%rbp,160($context)	# restore context->Rbp
3114	mov	%r12,216($context)	# restore context->R12
3115	mov	%r13,224($context)	# restore context->R13
3116	mov	%r14,232($context)	# restore context->R14
3117	mov	%r15,240($context)	# restore context->R15
3118
3119.Lin_prologue:
3120	mov	%rax,152($context)	# restore context->Rsp
3121
3122	mov	40($disp),%rdi		# disp->ContextRecord
3123	mov	$context,%rsi		# context
3124	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3125	.long	0xa548f3fc		# cld; rep movsq
3126
3127	mov	$disp,%rsi
3128	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3129	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3130	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3131	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3132	mov	40(%rsi),%r10		# disp->ContextRecord
3133	lea	56(%rsi),%r11		# &disp->HandlerData
3134	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3135	mov	%r10,32(%rsp)		# arg5
3136	mov	%r11,40(%rsp)		# arg6
3137	mov	%r12,48(%rsp)		# arg7
3138	mov	%rcx,56(%rsp)		# arg8, (NULL)
3139	call	*__imp_RtlVirtualUnwind(%rip)
3140
3141	mov	\$1,%eax		# ExceptionContinueSearch
3142	add	\$64,%rsp
3143	popfq
3144	pop	%r15
3145	pop	%r14
3146	pop	%r13
3147	pop	%r12
3148	pop	%rbp
3149	pop	%rbx
3150	pop	%rdi
3151	pop	%rsi
3152	ret
3153.size	se_handler,.-se_handler
3154
3155.section	.pdata
3156.align	4
3157___
# .pdata: one (begin, end, unwind-info) RVA triple per routine.  The ECB
# entries are emitted only when the ECB routines themselves are built.
$code.=<<___ if ($ecb);
3159	.rva	.Lecb_enc_prologue
3160	.rva	.Lecb_enc_epilogue
3161	.rva	.Lecb_enc_info
3162
3163	.rva	.Lecb_dec_prologue
3164	.rva	.Lecb_dec_epilogue
3165	.rva	.Lecb_dec_info
3166___
$code.=<<___;
3168	.rva	.Lcbc_dec_prologue
3169	.rva	.Lcbc_dec_epilogue
3170	.rva	.Lcbc_dec_info
3171
3172	.rva	.Lctr_enc_prologue
3173	.rva	.Lctr_enc_epilogue
3174	.rva	.Lctr_enc_info
3175
3176	.rva	.Lxts_enc_prologue
3177	.rva	.Lxts_enc_epilogue
3178	.rva	.Lxts_enc_info
3179
3180	.rva	.Lxts_dec_prologue
3181	.rva	.Lxts_dec_epilogue
3182	.rva	.Lxts_dec_info
3183
3184.section	.xdata
3185.align	8
3186___
# .xdata: each record names se_handler and carries the body, epilogue and
# tail labels that the handler reads back as HandlerData[0..2].
$code.=<<___ if ($ecb);
3188.Lecb_enc_info:
3189	.byte	9,0,0,0
3190	.rva	se_handler
3191	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3192	.rva	.Lecb_enc_tail
3193	.long	0
3194.Lecb_dec_info:
3195	.byte	9,0,0,0
3196	.rva	se_handler
3197	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3198	.rva	.Lecb_dec_tail
3199	.long	0
3200___
$code.=<<___;
3202.Lcbc_dec_info:
3203	.byte	9,0,0,0
3204	.rva	se_handler
3205	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3206	.rva	.Lcbc_dec_tail
3207	.long	0
3208.Lctr_enc_info:
3209	.byte	9,0,0,0
3210	.rva	se_handler
3211	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3212	.rva	.Lctr_enc_tail
3213	.long	0
3214.Lxts_enc_info:
3215	.byte	9,0,0,0
3216	.rva	se_handler
3217	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3218	.rva	.Lxts_enc_tail
3219	.long	0
3220.Lxts_dec_info:
3221	.byte	9,0,0,0
3222	.rva	se_handler
3223	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3224	.rva	.Lxts_dec_tail
3225	.long	0
3226___
3227}
3228
# Final pass: every `...` span left in the accumulated text is replaced by
# the result of evaluating its contents as a Perl expression (this is what
# turns e.g. `0x10*3` into a number), then the text is written to STDOUT.
3229$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3230
3231print $code;
3232
# Buffered write errors only surface at close; fail loudly instead of
# leaving a silently truncated output file behind.
3233close STDOUT or die "error closing STDOUT: $!";
3234