xref: /freebsd/crypto/openssl/crypto/aes/asm/bsaes-x86_64.pl (revision cddbc3b40812213ff00041f79174cac0be360a2a)
1#! /usr/bin/env perl
2# Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10###################################################################
11### AES-128 [originally in CTR mode]				###
12### bitsliced implementation for Intel Core 2 processors	###
13### requires support of SSE extensions up to SSSE3		###
14### Author: Emilia Käsper and Peter Schwabe			###
15### Date: 2009-03-19						###
16### Public domain						###
17###								###
18### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
19### further information.					###
20###################################################################
21#
22# September 2011.
23#
24# Started as transliteration to "perlasm" the original code has
25# undergone following changes:
26#
27# - code was made position-independent;
28# - rounds were folded into a loop resulting in >5x size reduction
29#   from 12.5KB to 2.2KB;
30# - above was possibile thanks to mixcolumns() modification that
31#   allowed to feed its output back to aesenc[last], this was
32#   achieved at cost of two additional inter-registers moves;
33# - some instruction reordering and interleaving;
34# - this module doesn't implement key setup subroutine, instead it
35#   relies on conversion of "conventional" key schedule as returned
36#   by AES_set_encrypt_key (see discussion below);
37# - first and last round keys are treated differently, which allowed
38#   to skip one shiftrows(), reduce bit-sliced key schedule and
39#   speed-up conversion by 22%;
40# - support for 192- and 256-bit keys was added;
41#
42# Resulting performance in CPU cycles spent to encrypt one byte out
43# of 4096-byte buffer with 128-bit key is:
44#
45#		Emilia's	this(*)		difference
46#
47# Core 2    	9.30		8.69		+7%
48# Nehalem(**) 	7.63		6.88		+11%
49# Atom	    	17.1		16.4		+4%
50# Silvermont	-		12.9
51# Goldmont	-		8.85
52#
53# (*)	Comparison is not completely fair, because "this" is ECB,
54#	i.e. no extra processing such as counter values calculation
55#	and xor-ing input as in Emilia's CTR implementation is
56#	performed. However, the CTR calculations stand for not more
57#	than 1% of total time, so comparison is *rather* fair.
58#
59# (**)	Results were collected on Westmere, which is considered to
60#	be equivalent to Nehalem for this code.
61#
62# As for key schedule conversion subroutine. Interface to OpenSSL
63# relies on per-invocation on-the-fly conversion. This naturally
64# has impact on performance, especially for short inputs. Conversion
65# time in CPU cycles and its ratio to CPU cycles spent in 8x block
66# function is:
67#
68# 		conversion	conversion/8x block
69# Core 2	240		0.22
70# Nehalem	180		0.20
71# Atom		430		0.20
72#
73# The ratio values mean that 128-byte blocks will be processed
74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75# etc. Then keep in mind that input sizes not divisible by 128 are
76# *effectively* slower, especially shortest ones, e.g. consecutive
77# 144-byte blocks are processed 44% slower than one would expect,
78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79# it's still faster than ["hyper-threading-safe" code path in]
80# aes-x86_64.pl on all lengths above 64 bytes...
81#
82# October 2011.
83#
84# Add decryption procedure. Performance in CPU cycles spent to decrypt
85# one byte out of 4096-byte buffer with 128-bit key is:
86#
87# Core 2	9.98
88# Nehalem	7.80
89# Atom		17.9
90# Silvermont	14.0
91# Goldmont	10.2
92#
93# November 2011.
94#
95# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
96# suboptimal, but XTS is meant to be used with larger blocks...
97#
98#						<appro@openssl.org>
99
100$flavour = shift;
101$output  = shift;
102if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
103
104$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
105
106$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109die "can't locate x86_64-xlate.pl";
110
111open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
112*STDOUT=*OUT;
113
114my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
115my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
116my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
117
118{
119my ($key,$rounds,$const)=("%rax","%r10d","%r11");
120
121sub Sbox {
122# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
124my @b=@_[0..7];
125my @t=@_[8..11];
126my @s=@_[12..15];
127	&InBasisChange	(@b);
128	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
129	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
130}
131
132sub InBasisChange {
133# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
134# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
135my @b=@_[0..7];
136$code.=<<___;
137	pxor	@b[6], @b[5]
138	pxor	@b[1], @b[2]
139	pxor	@b[0], @b[3]
140	pxor	@b[2], @b[6]
141	pxor 	@b[0], @b[5]
142
143	pxor	@b[3], @b[6]
144	pxor	@b[7], @b[3]
145	pxor	@b[5], @b[7]
146	pxor	@b[4], @b[3]
147	pxor	@b[5], @b[4]
148	pxor	@b[1], @b[3]
149
150	pxor	@b[7], @b[2]
151	pxor	@b[5], @b[1]
152___
153}
154
155sub OutBasisChange {
156# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
157# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
158my @b=@_[0..7];
159$code.=<<___;
160	pxor	@b[6], @b[0]
161	pxor	@b[4], @b[1]
162	pxor	@b[0], @b[2]
163	pxor	@b[6], @b[4]
164	pxor	@b[1], @b[6]
165
166	pxor	@b[5], @b[1]
167	pxor	@b[3], @b[5]
168	pxor	@b[7], @b[3]
169	pxor	@b[5], @b[7]
170	pxor	@b[5], @b[2]
171
172	pxor	@b[7], @b[4]
173___
174}
175
176sub InvSbox {
177# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
178# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
179my @b=@_[0..7];
180my @t=@_[8..11];
181my @s=@_[12..15];
182	&InvInBasisChange	(@b);
183	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
184	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
185}
186
187sub InvInBasisChange {		# OutBasisChange in reverse
188my @b=@_[5,1,2,6,3,7,0,4];
189$code.=<<___
190	pxor	@b[7], @b[4]
191
192	pxor	@b[5], @b[7]
193	pxor	@b[5], @b[2]
194	pxor	@b[7], @b[3]
195	pxor	@b[3], @b[5]
196	pxor	@b[5], @b[1]
197
198	pxor	@b[1], @b[6]
199	pxor	@b[0], @b[2]
200	pxor	@b[6], @b[4]
201	pxor	@b[6], @b[0]
202	pxor	@b[4], @b[1]
203___
204}
205
206sub InvOutBasisChange {		# InBasisChange in reverse
207my @b=@_[2,5,7,3,6,1,0,4];
208$code.=<<___;
209	pxor	@b[5], @b[1]
210	pxor	@b[7], @b[2]
211
212	pxor	@b[1], @b[3]
213	pxor	@b[5], @b[4]
214	pxor	@b[5], @b[7]
215	pxor	@b[4], @b[3]
216	 pxor 	@b[0], @b[5]
217	pxor	@b[7], @b[3]
218	 pxor	@b[2], @b[6]
219	 pxor	@b[1], @b[2]
220	pxor	@b[3], @b[6]
221
222	pxor	@b[0], @b[3]
223	pxor	@b[6], @b[5]
224___
225}
226
227sub Mul_GF4 {
228#;*************************************************************
229#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
230#;*************************************************************
231my ($x0,$x1,$y0,$y1,$t0)=@_;
232$code.=<<___;
233	movdqa	$y0, $t0
234	pxor 	$y1, $t0
235	pand	$x0, $t0
236	pxor	$x1, $x0
237	pand	$y0, $x1
238	pand	$y1, $x0
239	pxor	$x1, $x0
240	pxor	$t0, $x1
241___
242}
243
244sub Mul_GF4_N {				# not used, see next subroutine
245# multiply and scale by N
246my ($x0,$x1,$y0,$y1,$t0)=@_;
247$code.=<<___;
248	movdqa	$y0, $t0
249	pxor	$y1, $t0
250	pand	$x0, $t0
251	pxor	$x1, $x0
252	pand	$y0, $x1
253	pand	$y1, $x0
254	pxor	$x0, $x1
255	pxor	$t0, $x0
256___
257}
258
259sub Mul_GF4_N_GF4 {
260# interleaved Mul_GF4_N and Mul_GF4
261my ($x0,$x1,$y0,$y1,$t0,
262    $x2,$x3,$y2,$y3,$t1)=@_;
263$code.=<<___;
264	movdqa	$y0, $t0
265	 movdqa	$y2, $t1
266	pxor	$y1, $t0
267	 pxor 	$y3, $t1
268	pand	$x0, $t0
269	 pand	$x2, $t1
270	pxor	$x1, $x0
271	 pxor	$x3, $x2
272	pand	$y0, $x1
273	 pand	$y2, $x3
274	pand	$y1, $x0
275	 pand	$y3, $x2
276	pxor	$x0, $x1
277	 pxor	$x3, $x2
278	pxor	$t0, $x0
279	 pxor	$t1, $x3
280___
281}
282sub Mul_GF16_2 {
283my @x=@_[0..7];
284my @y=@_[8..11];
285my @t=@_[12..15];
286$code.=<<___;
287	movdqa	@x[0], @t[0]
288	movdqa	@x[1], @t[1]
289___
290	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
291$code.=<<___;
292	pxor	@x[2], @t[0]
293	pxor	@x[3], @t[1]
294	pxor	@y[2], @y[0]
295	pxor	@y[3], @y[1]
296___
297	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
298			 @x[2], @x[3], @y[2], @y[3], @t[2]);
299$code.=<<___;
300	pxor	@t[0], @x[0]
301	pxor	@t[0], @x[2]
302	pxor	@t[1], @x[1]
303	pxor	@t[1], @x[3]
304
305	movdqa	@x[4], @t[0]
306	movdqa	@x[5], @t[1]
307	pxor	@x[6], @t[0]
308	pxor	@x[7], @t[1]
309___
310	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
311			 @x[6], @x[7], @y[2], @y[3], @t[2]);
312$code.=<<___;
313	pxor	@y[2], @y[0]
314	pxor	@y[3], @y[1]
315___
316	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
317$code.=<<___;
318	pxor	@t[0], @x[4]
319	pxor	@t[0], @x[6]
320	pxor	@t[1], @x[5]
321	pxor	@t[1], @x[7]
322___
323}
324sub Inv_GF256 {
325#;********************************************************************
326#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
327#;********************************************************************
328my @x=@_[0..7];
329my @t=@_[8..11];
330my @s=@_[12..15];
331# direct optimizations from hardware
332$code.=<<___;
333	movdqa	@x[4], @t[3]
334	movdqa	@x[5], @t[2]
335	movdqa	@x[1], @t[1]
336	movdqa	@x[7], @s[1]
337	movdqa	@x[0], @s[0]
338
339	pxor	@x[6], @t[3]
340	pxor	@x[7], @t[2]
341	pxor	@x[3], @t[1]
342	 movdqa	@t[3], @s[2]
343	pxor	@x[6], @s[1]
344	 movdqa	@t[2], @t[0]
345	pxor	@x[2], @s[0]
346	 movdqa	@t[3], @s[3]
347
348	por	@t[1], @t[2]
349	por	@s[0], @t[3]
350	pxor	@t[0], @s[3]
351	pand	@s[0], @s[2]
352	pxor	@t[1], @s[0]
353	pand	@t[1], @t[0]
354	pand	@s[0], @s[3]
355	movdqa	@x[3], @s[0]
356	pxor	@x[2], @s[0]
357	pand	@s[0], @s[1]
358	pxor	@s[1], @t[3]
359	pxor	@s[1], @t[2]
360	movdqa	@x[4], @s[1]
361	movdqa	@x[1], @s[0]
362	pxor	@x[5], @s[1]
363	pxor	@x[0], @s[0]
364	movdqa	@s[1], @t[1]
365	pand	@s[0], @s[1]
366	por	@s[0], @t[1]
367	pxor	@s[1], @t[0]
368	pxor	@s[3], @t[3]
369	pxor	@s[2], @t[2]
370	pxor	@s[3], @t[1]
371	movdqa	@x[7], @s[0]
372	pxor	@s[2], @t[0]
373	movdqa	@x[6], @s[1]
374	pxor	@s[2], @t[1]
375	movdqa	@x[5], @s[2]
376	pand	@x[3], @s[0]
377	movdqa	@x[4], @s[3]
378	pand	@x[2], @s[1]
379	pand	@x[1], @s[2]
380	por	@x[0], @s[3]
381	pxor	@s[0], @t[3]
382	pxor	@s[1], @t[2]
383	pxor	@s[2], @t[1]
384	pxor	@s[3], @t[0]
385
386	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
387
388	# new smaller inversion
389
390	movdqa	@t[3], @s[0]
391	pand	@t[1], @t[3]
392	pxor	@t[2], @s[0]
393
394	movdqa	@t[0], @s[2]
395	movdqa	@s[0], @s[3]
396	pxor	@t[3], @s[2]
397	pand	@s[2], @s[3]
398
399	movdqa	@t[1], @s[1]
400	pxor	@t[2], @s[3]
401	pxor	@t[0], @s[1]
402
403	pxor	@t[2], @t[3]
404
405	pand	@t[3], @s[1]
406
407	movdqa	@s[2], @t[2]
408	pxor	@t[0], @s[1]
409
410	pxor	@s[1], @t[2]
411	pxor	@s[1], @t[1]
412
413	pand	@t[0], @t[2]
414
415	pxor	@t[2], @s[2]
416	pxor	@t[2], @t[1]
417
418	pand	@s[3], @s[2]
419
420	pxor	@s[0], @s[2]
421___
422# output in s3, s2, s1, t1
423
424# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
425
426# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
427	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
428
429### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
430}
431
432# AES linear components
433
434sub ShiftRows {
435my @x=@_[0..7];
436my $mask=pop;
437$code.=<<___;
438	pxor	0x00($key),@x[0]
439	pxor	0x10($key),@x[1]
440	pxor	0x20($key),@x[2]
441	pxor	0x30($key),@x[3]
442	pshufb	$mask,@x[0]
443	pshufb	$mask,@x[1]
444	pxor	0x40($key),@x[4]
445	pxor	0x50($key),@x[5]
446	pshufb	$mask,@x[2]
447	pshufb	$mask,@x[3]
448	pxor	0x60($key),@x[6]
449	pxor	0x70($key),@x[7]
450	pshufb	$mask,@x[4]
451	pshufb	$mask,@x[5]
452	pshufb	$mask,@x[6]
453	pshufb	$mask,@x[7]
454	lea	0x80($key),$key
455___
456}
457
458sub MixColumns {
459# modified to emit output in order suitable for feeding back to aesenc[last]
460my @x=@_[0..7];
461my @t=@_[8..15];
462my $inv=@_[16];	# optional
463$code.=<<___;
464	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
465	pshufd	\$0x93, @x[1], @t[1]
466	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
467	pshufd	\$0x93, @x[2], @t[2]
468	 pxor	@t[1], @x[1]
469	pshufd	\$0x93, @x[3], @t[3]
470	 pxor	@t[2], @x[2]
471	pshufd	\$0x93, @x[4], @t[4]
472	 pxor	@t[3], @x[3]
473	pshufd	\$0x93, @x[5], @t[5]
474	 pxor	@t[4], @x[4]
475	pshufd	\$0x93, @x[6], @t[6]
476	 pxor	@t[5], @x[5]
477	pshufd	\$0x93, @x[7], @t[7]
478	 pxor	@t[6], @x[6]
479	 pxor	@t[7], @x[7]
480
481	pxor	@x[0], @t[1]
482	pxor	@x[7], @t[0]
483	pxor	@x[7], @t[1]
484	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
485	pxor	@x[1], @t[2]
486	 pshufd	\$0x4E, @x[1], @x[1]
487	pxor	@x[4], @t[5]
488	 pxor	@t[0], @x[0]
489	pxor	@x[5], @t[6]
490	 pxor	@t[1], @x[1]
491	pxor	@x[3], @t[4]
492	 pshufd	\$0x4E, @x[4], @t[0]
493	pxor	@x[6], @t[7]
494	 pshufd	\$0x4E, @x[5], @t[1]
495	pxor	@x[2], @t[3]
496	 pshufd	\$0x4E, @x[3], @x[4]
497	pxor	@x[7], @t[3]
498	 pshufd	\$0x4E, @x[7], @x[5]
499	pxor	@x[7], @t[4]
500	 pshufd	\$0x4E, @x[6], @x[3]
501	pxor	@t[4], @t[0]
502	 pshufd	\$0x4E, @x[2], @x[6]
503	pxor	@t[5], @t[1]
504___
505$code.=<<___ if (!$inv);
506	pxor	@t[3], @x[4]
507	pxor	@t[7], @x[5]
508	pxor	@t[6], @x[3]
509	 movdqa	@t[0], @x[2]
510	pxor	@t[2], @x[6]
511	 movdqa	@t[1], @x[7]
512___
513$code.=<<___ if ($inv);
514	pxor	@x[4], @t[3]
515	pxor	@t[7], @x[5]
516	pxor	@x[3], @t[6]
517	 movdqa	@t[0], @x[3]
518	pxor	@t[2], @x[6]
519	 movdqa	@t[6], @x[2]
520	 movdqa	@t[1], @x[7]
521	 movdqa	@x[6], @x[4]
522	 movdqa	@t[3], @x[6]
523___
524}
525
526sub InvMixColumns_orig {
527my @x=@_[0..7];
528my @t=@_[8..15];
529
530$code.=<<___;
531	# multiplication by 0x0e
532	pshufd	\$0x93, @x[7], @t[7]
533	movdqa	@x[2], @t[2]
534	pxor	@x[5], @x[7]		# 7 5
535	pxor	@x[5], @x[2]		# 2 5
536	pshufd	\$0x93, @x[0], @t[0]
537	movdqa	@x[5], @t[5]
538	pxor	@x[0], @x[5]		# 5 0		[1]
539	pxor	@x[1], @x[0]		# 0 1
540	pshufd	\$0x93, @x[1], @t[1]
541	pxor	@x[2], @x[1]		# 1 25
542	pxor	@x[6], @x[0]		# 01 6		[2]
543	pxor	@x[3], @x[1]		# 125 3		[4]
544	pshufd	\$0x93, @x[3], @t[3]
545	pxor	@x[0], @x[2]		# 25 016	[3]
546	pxor	@x[7], @x[3]		# 3 75
547	pxor	@x[6], @x[7]		# 75 6		[0]
548	pshufd	\$0x93, @x[6], @t[6]
549	movdqa	@x[4], @t[4]
550	pxor	@x[4], @x[6]		# 6 4
551	pxor	@x[3], @x[4]		# 4 375		[6]
552	pxor	@x[7], @x[3]		# 375 756=36
553	pxor	@t[5], @x[6]		# 64 5		[7]
554	pxor	@t[2], @x[3]		# 36 2
555	pxor	@t[4], @x[3]		# 362 4		[5]
556	pshufd	\$0x93, @t[5], @t[5]
557___
558					my @y = @x[7,5,0,2,1,3,4,6];
559$code.=<<___;
560	# multiplication by 0x0b
561	pxor	@y[0], @y[1]
562	pxor	@t[0], @y[0]
563	pxor	@t[1], @y[1]
564	pshufd	\$0x93, @t[2], @t[2]
565	pxor	@t[5], @y[0]
566	pxor	@t[6], @y[1]
567	pxor	@t[7], @y[0]
568	pshufd	\$0x93, @t[4], @t[4]
569	pxor	@t[6], @t[7]		# clobber t[7]
570	pxor	@y[0], @y[1]
571
572	pxor	@t[0], @y[3]
573	pshufd	\$0x93, @t[0], @t[0]
574	pxor	@t[1], @y[2]
575	pxor	@t[1], @y[4]
576	pxor	@t[2], @y[2]
577	pshufd	\$0x93, @t[1], @t[1]
578	pxor	@t[2], @y[3]
579	pxor	@t[2], @y[5]
580	pxor	@t[7], @y[2]
581	pshufd	\$0x93, @t[2], @t[2]
582	pxor	@t[3], @y[3]
583	pxor	@t[3], @y[6]
584	pxor	@t[3], @y[4]
585	pshufd	\$0x93, @t[3], @t[3]
586	pxor	@t[4], @y[7]
587	pxor	@t[4], @y[5]
588	pxor	@t[7], @y[7]
589	pxor	@t[5], @y[3]
590	pxor	@t[4], @y[4]
591	pxor	@t[5], @t[7]		# clobber t[7] even more
592
593	pxor	@t[7], @y[5]
594	pshufd	\$0x93, @t[4], @t[4]
595	pxor	@t[7], @y[6]
596	pxor	@t[7], @y[4]
597
598	pxor	@t[5], @t[7]
599	pshufd	\$0x93, @t[5], @t[5]
600	pxor	@t[6], @t[7]		# restore t[7]
601
602	# multiplication by 0x0d
603	pxor	@y[7], @y[4]
604	pxor	@t[4], @y[7]
605	pshufd	\$0x93, @t[6], @t[6]
606	pxor	@t[0], @y[2]
607	pxor	@t[5], @y[7]
608	pxor	@t[2], @y[2]
609	pshufd	\$0x93, @t[7], @t[7]
610
611	pxor	@y[1], @y[3]
612	pxor	@t[1], @y[1]
613	pxor	@t[0], @y[0]
614	pxor	@t[0], @y[3]
615	pxor	@t[5], @y[1]
616	pxor	@t[5], @y[0]
617	pxor	@t[7], @y[1]
618	pshufd	\$0x93, @t[0], @t[0]
619	pxor	@t[6], @y[0]
620	pxor	@y[1], @y[3]
621	pxor	@t[1], @y[4]
622	pshufd	\$0x93, @t[1], @t[1]
623
624	pxor	@t[7], @y[7]
625	pxor	@t[2], @y[4]
626	pxor	@t[2], @y[5]
627	pshufd	\$0x93, @t[2], @t[2]
628	pxor	@t[6], @y[2]
629	pxor	@t[3], @t[6]		# clobber t[6]
630	pxor	@y[7], @y[4]
631	pxor	@t[6], @y[3]
632
633	pxor	@t[6], @y[6]
634	pxor	@t[5], @y[5]
635	pxor	@t[4], @y[6]
636	pshufd	\$0x93, @t[4], @t[4]
637	pxor	@t[6], @y[5]
638	pxor	@t[7], @y[6]
639	pxor	@t[3], @t[6]		# restore t[6]
640
641	pshufd	\$0x93, @t[5], @t[5]
642	pshufd	\$0x93, @t[6], @t[6]
643	pshufd	\$0x93, @t[7], @t[7]
644	pshufd	\$0x93, @t[3], @t[3]
645
646	# multiplication by 0x09
647	pxor	@y[1], @y[4]
648	pxor	@y[1], @t[1]		# t[1]=y[1]
649	pxor	@t[5], @t[0]		# clobber t[0]
650	pxor	@t[5], @t[1]
651	pxor	@t[0], @y[3]
652	pxor	@y[0], @t[0]		# t[0]=y[0]
653	pxor	@t[6], @t[1]
654	pxor	@t[7], @t[6]		# clobber t[6]
655	pxor	@t[1], @y[4]
656	pxor	@t[4], @y[7]
657	pxor	@y[4], @t[4]		# t[4]=y[4]
658	pxor	@t[3], @y[6]
659	pxor	@y[3], @t[3]		# t[3]=y[3]
660	pxor	@t[2], @y[5]
661	pxor	@y[2], @t[2]		# t[2]=y[2]
662	pxor	@t[7], @t[3]
663	pxor	@y[5], @t[5]		# t[5]=y[5]
664	pxor	@t[6], @t[2]
665	pxor	@t[6], @t[5]
666	pxor	@y[6], @t[6]		# t[6]=y[6]
667	pxor	@y[7], @t[7]		# t[7]=y[7]
668
669	movdqa	@t[0],@XMM[0]
670	movdqa	@t[1],@XMM[1]
671	movdqa	@t[2],@XMM[2]
672	movdqa	@t[3],@XMM[3]
673	movdqa	@t[4],@XMM[4]
674	movdqa	@t[5],@XMM[5]
675	movdqa	@t[6],@XMM[6]
676	movdqa	@t[7],@XMM[7]
677___
678}
679
680sub InvMixColumns {
681my @x=@_[0..7];
682my @t=@_[8..15];
683
684# Thanks to Jussi Kivilinna for providing pointer to
685#
686# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
687# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
688# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
689# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
690
691$code.=<<___;
692	# multiplication by 0x05-0x00-0x04-0x00
693	pshufd	\$0x4E, @x[0], @t[0]
694	pshufd	\$0x4E, @x[6], @t[6]
695	pxor	@x[0], @t[0]
696	pshufd	\$0x4E, @x[7], @t[7]
697	pxor	@x[6], @t[6]
698	pshufd	\$0x4E, @x[1], @t[1]
699	pxor	@x[7], @t[7]
700	pshufd	\$0x4E, @x[2], @t[2]
701	pxor	@x[1], @t[1]
702	pshufd	\$0x4E, @x[3], @t[3]
703	pxor	@x[2], @t[2]
704	 pxor	@t[6], @x[0]
705	 pxor	@t[6], @x[1]
706	pshufd	\$0x4E, @x[4], @t[4]
707	pxor	@x[3], @t[3]
708	 pxor	@t[0], @x[2]
709	 pxor	@t[1], @x[3]
710	pshufd	\$0x4E, @x[5], @t[5]
711	pxor	@x[4], @t[4]
712	 pxor	@t[7], @x[1]
713	 pxor	@t[2], @x[4]
714	pxor	@x[5], @t[5]
715
716	 pxor	@t[7], @x[2]
717	 pxor	@t[6], @x[3]
718	 pxor	@t[6], @x[4]
719	 pxor	@t[3], @x[5]
720	 pxor	@t[4], @x[6]
721	 pxor	@t[7], @x[4]
722	 pxor	@t[7], @x[5]
723	 pxor	@t[5], @x[7]
724___
725	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
726}
727
728sub aesenc {				# not used
729my @b=@_[0..7];
730my @t=@_[8..15];
731$code.=<<___;
732	movdqa	0x30($const),@t[0]	# .LSR
733___
734	&ShiftRows	(@b,@t[0]);
735	&Sbox		(@b,@t);
736	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
737}
738
739sub aesenclast {			# not used
740my @b=@_[0..7];
741my @t=@_[8..15];
742$code.=<<___;
743	movdqa	0x40($const),@t[0]	# .LSRM0
744___
745	&ShiftRows	(@b,@t[0]);
746	&Sbox		(@b,@t);
747$code.=<<___
748	pxor	0x00($key),@b[0]
749	pxor	0x10($key),@b[1]
750	pxor	0x20($key),@b[4]
751	pxor	0x30($key),@b[6]
752	pxor	0x40($key),@b[3]
753	pxor	0x50($key),@b[7]
754	pxor	0x60($key),@b[2]
755	pxor	0x70($key),@b[5]
756___
757}
758
759sub swapmove {
760my ($a,$b,$n,$mask,$t)=@_;
761$code.=<<___;
762	movdqa	$b,$t
763	psrlq	\$$n,$b
764	pxor  	$a,$b
765	pand	$mask,$b
766	pxor	$b,$a
767	psllq	\$$n,$b
768	pxor	$t,$b
769___
770}
771sub swapmove2x {
772my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
773$code.=<<___;
774	movdqa	$b0,$t0
775	psrlq	\$$n,$b0
776	 movdqa	$b1,$t1
777	 psrlq	\$$n,$b1
778	pxor  	$a0,$b0
779	 pxor  	$a1,$b1
780	pand	$mask,$b0
781	 pand	$mask,$b1
782	pxor	$b0,$a0
783	psllq	\$$n,$b0
784	 pxor	$b1,$a1
785	 psllq	\$$n,$b1
786	pxor	$t0,$b0
787	 pxor	$t1,$b1
788___
789}
790
791sub bitslice {
792my @x=reverse(@_[0..7]);
793my ($t0,$t1,$t2,$t3)=@_[8..11];
794$code.=<<___;
795	movdqa	0x00($const),$t0	# .LBS0
796	movdqa	0x10($const),$t1	# .LBS1
797___
798	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
799	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
800$code.=<<___;
801	movdqa	0x20($const),$t0	# .LBS2
802___
803	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
804	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
805
806	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
807	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
808}
809
810$code.=<<___;
811.text
812
813.extern	asm_AES_encrypt
814.extern	asm_AES_decrypt
815
816.type	_bsaes_encrypt8,\@abi-omnipotent
817.align	64
818_bsaes_encrypt8:
819.cfi_startproc
820	lea	.LBS0(%rip), $const	# constants table
821
822	movdqa	($key), @XMM[9]		# round 0 key
823	lea	0x10($key), $key
824	movdqa	0x50($const), @XMM[8]	# .LM0SR
825	pxor	@XMM[9], @XMM[0]	# xor with round0 key
826	pxor	@XMM[9], @XMM[1]
827	pxor	@XMM[9], @XMM[2]
828	pxor	@XMM[9], @XMM[3]
829	 pshufb	@XMM[8], @XMM[0]
830	 pshufb	@XMM[8], @XMM[1]
831	pxor	@XMM[9], @XMM[4]
832	pxor	@XMM[9], @XMM[5]
833	 pshufb	@XMM[8], @XMM[2]
834	 pshufb	@XMM[8], @XMM[3]
835	pxor	@XMM[9], @XMM[6]
836	pxor	@XMM[9], @XMM[7]
837	 pshufb	@XMM[8], @XMM[4]
838	 pshufb	@XMM[8], @XMM[5]
839	 pshufb	@XMM[8], @XMM[6]
840	 pshufb	@XMM[8], @XMM[7]
841_bsaes_encrypt8_bitslice:
842___
843	&bitslice	(@XMM[0..7, 8..11]);
844$code.=<<___;
845	dec	$rounds
846	jmp	.Lenc_sbox
847.align	16
848.Lenc_loop:
849___
850	&ShiftRows	(@XMM[0..7, 8]);
851$code.=".Lenc_sbox:\n";
852	&Sbox		(@XMM[0..7, 8..15]);
853$code.=<<___;
854	dec	$rounds
855	jl	.Lenc_done
856___
857	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
858$code.=<<___;
859	movdqa	0x30($const), @XMM[8]	# .LSR
860	jnz	.Lenc_loop
861	movdqa	0x40($const), @XMM[8]	# .LSRM0
862	jmp	.Lenc_loop
863.align	16
864.Lenc_done:
865___
866	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
867	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
868$code.=<<___;
869	movdqa	($key), @XMM[8]		# last round key
870	pxor	@XMM[8], @XMM[4]
871	pxor	@XMM[8], @XMM[6]
872	pxor	@XMM[8], @XMM[3]
873	pxor	@XMM[8], @XMM[7]
874	pxor	@XMM[8], @XMM[2]
875	pxor	@XMM[8], @XMM[5]
876	pxor	@XMM[8], @XMM[0]
877	pxor	@XMM[8], @XMM[1]
878	ret
879.cfi_endproc
880.size	_bsaes_encrypt8,.-_bsaes_encrypt8
881
882.type	_bsaes_decrypt8,\@abi-omnipotent
883.align	64
884_bsaes_decrypt8:
885.cfi_startproc
886	lea	.LBS0(%rip), $const	# constants table
887
888	movdqa	($key), @XMM[9]		# round 0 key
889	lea	0x10($key), $key
890	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
891	pxor	@XMM[9], @XMM[0]	# xor with round0 key
892	pxor	@XMM[9], @XMM[1]
893	pxor	@XMM[9], @XMM[2]
894	pxor	@XMM[9], @XMM[3]
895	 pshufb	@XMM[8], @XMM[0]
896	 pshufb	@XMM[8], @XMM[1]
897	pxor	@XMM[9], @XMM[4]
898	pxor	@XMM[9], @XMM[5]
899	 pshufb	@XMM[8], @XMM[2]
900	 pshufb	@XMM[8], @XMM[3]
901	pxor	@XMM[9], @XMM[6]
902	pxor	@XMM[9], @XMM[7]
903	 pshufb	@XMM[8], @XMM[4]
904	 pshufb	@XMM[8], @XMM[5]
905	 pshufb	@XMM[8], @XMM[6]
906	 pshufb	@XMM[8], @XMM[7]
907___
908	&bitslice	(@XMM[0..7, 8..11]);
909$code.=<<___;
910	dec	$rounds
911	jmp	.Ldec_sbox
912.align	16
913.Ldec_loop:
914___
915	&ShiftRows	(@XMM[0..7, 8]);
916$code.=".Ldec_sbox:\n";
917	&InvSbox	(@XMM[0..7, 8..15]);
918$code.=<<___;
919	dec	$rounds
920	jl	.Ldec_done
921___
922	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
923$code.=<<___;
924	movdqa	-0x10($const), @XMM[8]	# .LISR
925	jnz	.Ldec_loop
926	movdqa	-0x20($const), @XMM[8]	# .LISRM0
927	jmp	.Ldec_loop
928.align	16
929.Ldec_done:
930___
931	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
932$code.=<<___;
933	movdqa	($key), @XMM[8]		# last round key
934	pxor	@XMM[8], @XMM[6]
935	pxor	@XMM[8], @XMM[4]
936	pxor	@XMM[8], @XMM[2]
937	pxor	@XMM[8], @XMM[7]
938	pxor	@XMM[8], @XMM[3]
939	pxor	@XMM[8], @XMM[5]
940	pxor	@XMM[8], @XMM[0]
941	pxor	@XMM[8], @XMM[1]
942	ret
943.cfi_endproc
944.size	_bsaes_decrypt8,.-_bsaes_decrypt8
945___
946}
947{
948my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
949
950sub bitslice_key {
951my @x=reverse(@_[0..7]);
952my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
953
954	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
955$code.=<<___;
956	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
957	movdqa	@x[0], @x[2]
958	movdqa	@x[1], @x[3]
959___
960	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
961
962	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
963$code.=<<___;
964	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
965	movdqa	@x[0], @x[4]
966	movdqa	@x[2], @x[6]
967	movdqa	@x[1], @x[5]
968	movdqa	@x[3], @x[7]
969___
970	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
971	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
972}
973
974$code.=<<___;
975.type	_bsaes_key_convert,\@abi-omnipotent
976.align	16
977_bsaes_key_convert:
978.cfi_startproc
979	lea	.Lmasks(%rip), $const
980	movdqu	($inp), %xmm7		# load round 0 key
981	lea	0x10($inp), $inp
982	movdqa	0x00($const), %xmm0	# 0x01...
983	movdqa	0x10($const), %xmm1	# 0x02...
984	movdqa	0x20($const), %xmm2	# 0x04...
985	movdqa	0x30($const), %xmm3	# 0x08...
986	movdqa	0x40($const), %xmm4	# .LM0
987	pcmpeqd	%xmm5, %xmm5		# .LNOT
988
989	movdqu	($inp), %xmm6		# load round 1 key
990	movdqa	%xmm7, ($out)		# save round 0 key
991	lea	0x10($out), $out
992	dec	$rounds
993	jmp	.Lkey_loop
994.align	16
995.Lkey_loop:
996	pshufb	%xmm4, %xmm6		# .LM0
997
998	movdqa	%xmm0,	%xmm8
999	movdqa	%xmm1,	%xmm9
1000
1001	pand	%xmm6,	%xmm8
1002	pand	%xmm6,	%xmm9
1003	movdqa	%xmm2,	%xmm10
1004	pcmpeqb	%xmm0,	%xmm8
1005	psllq	\$4,	%xmm0		# 0x10...
1006	movdqa	%xmm3,	%xmm11
1007	pcmpeqb	%xmm1,	%xmm9
1008	psllq	\$4,	%xmm1		# 0x20...
1009
1010	pand	%xmm6,	%xmm10
1011	pand	%xmm6,	%xmm11
1012	movdqa	%xmm0,	%xmm12
1013	pcmpeqb	%xmm2,	%xmm10
1014	psllq	\$4,	%xmm2		# 0x40...
1015	movdqa	%xmm1,	%xmm13
1016	pcmpeqb	%xmm3,	%xmm11
1017	psllq	\$4,	%xmm3		# 0x80...
1018
1019	movdqa	%xmm2,	%xmm14
1020	movdqa	%xmm3,	%xmm15
1021	 pxor	%xmm5,	%xmm8		# "pnot"
1022	 pxor	%xmm5,	%xmm9
1023
1024	pand	%xmm6,	%xmm12
1025	pand	%xmm6,	%xmm13
1026	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
1027	pcmpeqb	%xmm0,	%xmm12
1028	psrlq	\$4,	%xmm0		# 0x01...
1029	 movdqa	%xmm9, 0x10($out)
1030	pcmpeqb	%xmm1,	%xmm13
1031	psrlq	\$4,	%xmm1		# 0x02...
1032	 lea	0x10($inp), $inp
1033
1034	pand	%xmm6,	%xmm14
1035	pand	%xmm6,	%xmm15
1036	 movdqa	%xmm10, 0x20($out)
1037	pcmpeqb	%xmm2,	%xmm14
1038	psrlq	\$4,	%xmm2		# 0x04...
1039	 movdqa	%xmm11, 0x30($out)
1040	pcmpeqb	%xmm3,	%xmm15
1041	psrlq	\$4,	%xmm3		# 0x08...
1042	 movdqu	($inp), %xmm6		# load next round key
1043
1044	pxor	%xmm5, %xmm13		# "pnot"
1045	pxor	%xmm5, %xmm14
1046	movdqa	%xmm12, 0x40($out)
1047	movdqa	%xmm13, 0x50($out)
1048	movdqa	%xmm14, 0x60($out)
1049	movdqa	%xmm15, 0x70($out)
1050	lea	0x80($out),$out
1051	dec	$rounds
1052	jnz	.Lkey_loop
1053
1054	movdqa	0x50($const), %xmm7	# .L63
1055	#movdqa	%xmm6, ($out)		# don't save last round key
1056	ret
1057.cfi_endproc
1058.size	_bsaes_key_convert,.-_bsaes_key_convert
1059___
1060}
1061
1062if (0 && !$win64) {	# following four functions are unsupported interface
1063			# used for benchmarking...
1064$code.=<<___;
1065.globl	bsaes_enc_key_convert
1066.type	bsaes_enc_key_convert,\@function,2
1067.align	16
1068bsaes_enc_key_convert:
1069	mov	240($inp),%r10d		# pass rounds
1070	mov	$inp,%rcx		# pass key
1071	mov	$out,%rax		# pass key schedule
1072	call	_bsaes_key_convert
1073	pxor	%xmm6,%xmm7		# fix up last round key
1074	movdqa	%xmm7,(%rax)		# save last round key
1075	ret
1076.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1077
1078.globl	bsaes_encrypt_128
1079.type	bsaes_encrypt_128,\@function,4
1080.align	16
1081bsaes_encrypt_128:
1082.Lenc128_loop:
1083	movdqu	0x00($inp), @XMM[0]	# load input
1084	movdqu	0x10($inp), @XMM[1]
1085	movdqu	0x20($inp), @XMM[2]
1086	movdqu	0x30($inp), @XMM[3]
1087	movdqu	0x40($inp), @XMM[4]
1088	movdqu	0x50($inp), @XMM[5]
1089	movdqu	0x60($inp), @XMM[6]
1090	movdqu	0x70($inp), @XMM[7]
1091	mov	$key, %rax		# pass the $key
1092	lea	0x80($inp), $inp
1093	mov	\$10,%r10d
1094
1095	call	_bsaes_encrypt8
1096
1097	movdqu	@XMM[0], 0x00($out)	# write output
1098	movdqu	@XMM[1], 0x10($out)
1099	movdqu	@XMM[4], 0x20($out)
1100	movdqu	@XMM[6], 0x30($out)
1101	movdqu	@XMM[3], 0x40($out)
1102	movdqu	@XMM[7], 0x50($out)
1103	movdqu	@XMM[2], 0x60($out)
1104	movdqu	@XMM[5], 0x70($out)
1105	lea	0x80($out), $out
1106	sub	\$0x80,$len
1107	ja	.Lenc128_loop
1108	ret
1109.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1110
1111.globl	bsaes_dec_key_convert
1112.type	bsaes_dec_key_convert,\@function,2
1113.align	16
1114bsaes_dec_key_convert:
1115	mov	240($inp),%r10d		# pass rounds
1116	mov	$inp,%rcx		# pass key
1117	mov	$out,%rax		# pass key schedule
1118	call	_bsaes_key_convert
1119	pxor	($out),%xmm7		# fix up round 0 key
1120	movdqa	%xmm6,(%rax)		# save last round key
1121	movdqa	%xmm7,($out)
1122	ret
1123.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1124
1125.globl	bsaes_decrypt_128
1126.type	bsaes_decrypt_128,\@function,4
1127.align	16
1128bsaes_decrypt_128:
1129.Ldec128_loop:
1130	movdqu	0x00($inp), @XMM[0]	# load input
1131	movdqu	0x10($inp), @XMM[1]
1132	movdqu	0x20($inp), @XMM[2]
1133	movdqu	0x30($inp), @XMM[3]
1134	movdqu	0x40($inp), @XMM[4]
1135	movdqu	0x50($inp), @XMM[5]
1136	movdqu	0x60($inp), @XMM[6]
1137	movdqu	0x70($inp), @XMM[7]
1138	mov	$key, %rax		# pass the $key
1139	lea	0x80($inp), $inp
1140	mov	\$10,%r10d
1141
1142	call	_bsaes_decrypt8
1143
1144	movdqu	@XMM[0], 0x00($out)	# write output
1145	movdqu	@XMM[1], 0x10($out)
1146	movdqu	@XMM[6], 0x20($out)
1147	movdqu	@XMM[4], 0x30($out)
1148	movdqu	@XMM[2], 0x40($out)
1149	movdqu	@XMM[7], 0x50($out)
1150	movdqu	@XMM[3], 0x60($out)
1151	movdqu	@XMM[5], 0x70($out)
1152	lea	0x80($out), $out
1153	sub	\$0x80,$len
1154	ja	.Ldec128_loop
1155	ret
1156.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1157___
1158}
1159{
1160######################################################################
1161#
1162# OpenSSL interface
1163#
1164my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1165						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1166my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1167
1168if ($ecb) {
1169$code.=<<___;
1170.globl	bsaes_ecb_encrypt_blocks
1171.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1172.align	16
1173bsaes_ecb_encrypt_blocks:
1174.cfi_startproc
1175	mov	%rsp, %rax
1176.Lecb_enc_prologue:
1177	push	%rbp
1178.cfi_push	%rbp
1179	push	%rbx
1180.cfi_push	%rbx
1181	push	%r12
1182.cfi_push	%r12
1183	push	%r13
1184.cfi_push	%r13
1185	push	%r14
1186.cfi_push	%r14
1187	push	%r15
1188.cfi_push	%r15
1189	lea	-0x48(%rsp),%rsp
1190.cfi_adjust_cfa_offset	0x48
1191___
1192$code.=<<___ if ($win64);
1193	lea	-0xa0(%rsp), %rsp
1194	movaps	%xmm6, 0x40(%rsp)
1195	movaps	%xmm7, 0x50(%rsp)
1196	movaps	%xmm8, 0x60(%rsp)
1197	movaps	%xmm9, 0x70(%rsp)
1198	movaps	%xmm10, 0x80(%rsp)
1199	movaps	%xmm11, 0x90(%rsp)
1200	movaps	%xmm12, 0xa0(%rsp)
1201	movaps	%xmm13, 0xb0(%rsp)
1202	movaps	%xmm14, 0xc0(%rsp)
1203	movaps	%xmm15, 0xd0(%rsp)
1204.Lecb_enc_body:
1205___
1206$code.=<<___;
1207	mov	%rsp,%rbp		# backup %rsp
1208.cfi_def_cfa_register	%rbp
1209	mov	240($arg4),%eax		# rounds
1210	mov	$arg1,$inp		# backup arguments
1211	mov	$arg2,$out
1212	mov	$arg3,$len
1213	mov	$arg4,$key
1214	cmp	\$8,$arg3
1215	jb	.Lecb_enc_short
1216
1217	mov	%eax,%ebx		# backup rounds
1218	shl	\$7,%rax		# 128 bytes per inner round key
1219	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1220	sub	%rax,%rsp
1221	mov	%rsp,%rax		# pass key schedule
1222	mov	$key,%rcx		# pass key
1223	mov	%ebx,%r10d		# pass rounds
1224	call	_bsaes_key_convert
1225	pxor	%xmm6,%xmm7		# fix up last round key
1226	movdqa	%xmm7,(%rax)		# save last round key
1227
1228	sub	\$8,$len
1229.Lecb_enc_loop:
1230	movdqu	0x00($inp), @XMM[0]	# load input
1231	movdqu	0x10($inp), @XMM[1]
1232	movdqu	0x20($inp), @XMM[2]
1233	movdqu	0x30($inp), @XMM[3]
1234	movdqu	0x40($inp), @XMM[4]
1235	movdqu	0x50($inp), @XMM[5]
1236	mov	%rsp, %rax		# pass key schedule
1237	movdqu	0x60($inp), @XMM[6]
1238	mov	%ebx,%r10d		# pass rounds
1239	movdqu	0x70($inp), @XMM[7]
1240	lea	0x80($inp), $inp
1241
1242	call	_bsaes_encrypt8
1243
1244	movdqu	@XMM[0], 0x00($out)	# write output
1245	movdqu	@XMM[1], 0x10($out)
1246	movdqu	@XMM[4], 0x20($out)
1247	movdqu	@XMM[6], 0x30($out)
1248	movdqu	@XMM[3], 0x40($out)
1249	movdqu	@XMM[7], 0x50($out)
1250	movdqu	@XMM[2], 0x60($out)
1251	movdqu	@XMM[5], 0x70($out)
1252	lea	0x80($out), $out
1253	sub	\$8,$len
1254	jnc	.Lecb_enc_loop
1255
1256	add	\$8,$len
1257	jz	.Lecb_enc_done
1258
1259	movdqu	0x00($inp), @XMM[0]	# load input
1260	mov	%rsp, %rax		# pass key schedule
1261	mov	%ebx,%r10d		# pass rounds
1262	cmp	\$2,$len
1263	jb	.Lecb_enc_one
1264	movdqu	0x10($inp), @XMM[1]
1265	je	.Lecb_enc_two
1266	movdqu	0x20($inp), @XMM[2]
1267	cmp	\$4,$len
1268	jb	.Lecb_enc_three
1269	movdqu	0x30($inp), @XMM[3]
1270	je	.Lecb_enc_four
1271	movdqu	0x40($inp), @XMM[4]
1272	cmp	\$6,$len
1273	jb	.Lecb_enc_five
1274	movdqu	0x50($inp), @XMM[5]
1275	je	.Lecb_enc_six
1276	movdqu	0x60($inp), @XMM[6]
1277	call	_bsaes_encrypt8
1278	movdqu	@XMM[0], 0x00($out)	# write output
1279	movdqu	@XMM[1], 0x10($out)
1280	movdqu	@XMM[4], 0x20($out)
1281	movdqu	@XMM[6], 0x30($out)
1282	movdqu	@XMM[3], 0x40($out)
1283	movdqu	@XMM[7], 0x50($out)
1284	movdqu	@XMM[2], 0x60($out)
1285	jmp	.Lecb_enc_done
1286.align	16
1287.Lecb_enc_six:
1288	call	_bsaes_encrypt8
1289	movdqu	@XMM[0], 0x00($out)	# write output
1290	movdqu	@XMM[1], 0x10($out)
1291	movdqu	@XMM[4], 0x20($out)
1292	movdqu	@XMM[6], 0x30($out)
1293	movdqu	@XMM[3], 0x40($out)
1294	movdqu	@XMM[7], 0x50($out)
1295	jmp	.Lecb_enc_done
1296.align	16
1297.Lecb_enc_five:
1298	call	_bsaes_encrypt8
1299	movdqu	@XMM[0], 0x00($out)	# write output
1300	movdqu	@XMM[1], 0x10($out)
1301	movdqu	@XMM[4], 0x20($out)
1302	movdqu	@XMM[6], 0x30($out)
1303	movdqu	@XMM[3], 0x40($out)
1304	jmp	.Lecb_enc_done
1305.align	16
1306.Lecb_enc_four:
1307	call	_bsaes_encrypt8
1308	movdqu	@XMM[0], 0x00($out)	# write output
1309	movdqu	@XMM[1], 0x10($out)
1310	movdqu	@XMM[4], 0x20($out)
1311	movdqu	@XMM[6], 0x30($out)
1312	jmp	.Lecb_enc_done
1313.align	16
1314.Lecb_enc_three:
1315	call	_bsaes_encrypt8
1316	movdqu	@XMM[0], 0x00($out)	# write output
1317	movdqu	@XMM[1], 0x10($out)
1318	movdqu	@XMM[4], 0x20($out)
1319	jmp	.Lecb_enc_done
1320.align	16
1321.Lecb_enc_two:
1322	call	_bsaes_encrypt8
1323	movdqu	@XMM[0], 0x00($out)	# write output
1324	movdqu	@XMM[1], 0x10($out)
1325	jmp	.Lecb_enc_done
1326.align	16
1327.Lecb_enc_one:
1328	call	_bsaes_encrypt8
1329	movdqu	@XMM[0], 0x00($out)	# write output
1330	jmp	.Lecb_enc_done
1331.align	16
1332.Lecb_enc_short:
1333	lea	($inp), $arg1
1334	lea	($out), $arg2
1335	lea	($key), $arg3
1336	call	asm_AES_encrypt
1337	lea	16($inp), $inp
1338	lea	16($out), $out
1339	dec	$len
1340	jnz	.Lecb_enc_short
1341
1342.Lecb_enc_done:
1343	lea	(%rsp),%rax
1344	pxor	%xmm0, %xmm0
1345.Lecb_enc_bzero:			# wipe key schedule [if any]
1346	movdqa	%xmm0, 0x00(%rax)
1347	movdqa	%xmm0, 0x10(%rax)
1348	lea	0x20(%rax), %rax
1349	cmp	%rax, %rbp
1350	jb	.Lecb_enc_bzero
1351
1352	lea	0x78(%rbp),%rax
1353.cfi_def_cfa	%rax,8
1354___
1355$code.=<<___ if ($win64);
1356	movaps	0x40(%rbp), %xmm6
1357	movaps	0x50(%rbp), %xmm7
1358	movaps	0x60(%rbp), %xmm8
1359	movaps	0x70(%rbp), %xmm9
1360	movaps	0x80(%rbp), %xmm10
1361	movaps	0x90(%rbp), %xmm11
1362	movaps	0xa0(%rbp), %xmm12
1363	movaps	0xb0(%rbp), %xmm13
1364	movaps	0xc0(%rbp), %xmm14
1365	movaps	0xd0(%rbp), %xmm15
1366	lea	0xa0(%rax), %rax
1367.Lecb_enc_tail:
1368___
1369$code.=<<___;
1370	mov	-48(%rax), %r15
1371.cfi_restore	%r15
1372	mov	-40(%rax), %r14
1373.cfi_restore	%r14
1374	mov	-32(%rax), %r13
1375.cfi_restore	%r13
1376	mov	-24(%rax), %r12
1377.cfi_restore	%r12
1378	mov	-16(%rax), %rbx
1379.cfi_restore	%rbx
1380	mov	-8(%rax), %rbp
1381.cfi_restore	%rbp
1382	lea	(%rax), %rsp		# restore %rsp
1383.cfi_def_cfa_register	%rsp
1384.Lecb_enc_epilogue:
1385	ret
1386.cfi_endproc
1387.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1388
1389.globl	bsaes_ecb_decrypt_blocks
1390.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1391.align	16
1392bsaes_ecb_decrypt_blocks:
1393.cfi_startproc
1394	mov	%rsp, %rax
1395.Lecb_dec_prologue:
1396	push	%rbp
1397.cfi_push	%rbp
1398	push	%rbx
1399.cfi_push	%rbx
1400	push	%r12
1401.cfi_push	%r12
1402	push	%r13
1403.cfi_push	%r13
1404	push	%r14
1405.cfi_push	%r14
1406	push	%r15
1407.cfi_push	%r15
1408	lea	-0x48(%rsp),%rsp
1409.cfi_adjust_cfa_offset	0x48
1410___
1411$code.=<<___ if ($win64);
1412	lea	-0xa0(%rsp), %rsp
1413	movaps	%xmm6, 0x40(%rsp)
1414	movaps	%xmm7, 0x50(%rsp)
1415	movaps	%xmm8, 0x60(%rsp)
1416	movaps	%xmm9, 0x70(%rsp)
1417	movaps	%xmm10, 0x80(%rsp)
1418	movaps	%xmm11, 0x90(%rsp)
1419	movaps	%xmm12, 0xa0(%rsp)
1420	movaps	%xmm13, 0xb0(%rsp)
1421	movaps	%xmm14, 0xc0(%rsp)
1422	movaps	%xmm15, 0xd0(%rsp)
1423.Lecb_dec_body:
1424___
1425$code.=<<___;
1426	mov	%rsp,%rbp		# backup %rsp
1427.cfi_def_cfa_register	%rbp
1428	mov	240($arg4),%eax		# rounds
1429	mov	$arg1,$inp		# backup arguments
1430	mov	$arg2,$out
1431	mov	$arg3,$len
1432	mov	$arg4,$key
1433	cmp	\$8,$arg3
1434	jb	.Lecb_dec_short
1435
1436	mov	%eax,%ebx		# backup rounds
1437	shl	\$7,%rax		# 128 bytes per inner round key
1438	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1439	sub	%rax,%rsp
1440	mov	%rsp,%rax		# pass key schedule
1441	mov	$key,%rcx		# pass key
1442	mov	%ebx,%r10d		# pass rounds
1443	call	_bsaes_key_convert
1444	pxor	(%rsp),%xmm7		# fix up 0 round key
1445	movdqa	%xmm6,(%rax)		# save last round key
1446	movdqa	%xmm7,(%rsp)
1447
1448	sub	\$8,$len
1449.Lecb_dec_loop:
1450	movdqu	0x00($inp), @XMM[0]	# load input
1451	movdqu	0x10($inp), @XMM[1]
1452	movdqu	0x20($inp), @XMM[2]
1453	movdqu	0x30($inp), @XMM[3]
1454	movdqu	0x40($inp), @XMM[4]
1455	movdqu	0x50($inp), @XMM[5]
1456	mov	%rsp, %rax		# pass key schedule
1457	movdqu	0x60($inp), @XMM[6]
1458	mov	%ebx,%r10d		# pass rounds
1459	movdqu	0x70($inp), @XMM[7]
1460	lea	0x80($inp), $inp
1461
1462	call	_bsaes_decrypt8
1463
1464	movdqu	@XMM[0], 0x00($out)	# write output
1465	movdqu	@XMM[1], 0x10($out)
1466	movdqu	@XMM[6], 0x20($out)
1467	movdqu	@XMM[4], 0x30($out)
1468	movdqu	@XMM[2], 0x40($out)
1469	movdqu	@XMM[7], 0x50($out)
1470	movdqu	@XMM[3], 0x60($out)
1471	movdqu	@XMM[5], 0x70($out)
1472	lea	0x80($out), $out
1473	sub	\$8,$len
1474	jnc	.Lecb_dec_loop
1475
1476	add	\$8,$len
1477	jz	.Lecb_dec_done
1478
1479	movdqu	0x00($inp), @XMM[0]	# load input
1480	mov	%rsp, %rax		# pass key schedule
1481	mov	%ebx,%r10d		# pass rounds
1482	cmp	\$2,$len
1483	jb	.Lecb_dec_one
1484	movdqu	0x10($inp), @XMM[1]
1485	je	.Lecb_dec_two
1486	movdqu	0x20($inp), @XMM[2]
1487	cmp	\$4,$len
1488	jb	.Lecb_dec_three
1489	movdqu	0x30($inp), @XMM[3]
1490	je	.Lecb_dec_four
1491	movdqu	0x40($inp), @XMM[4]
1492	cmp	\$6,$len
1493	jb	.Lecb_dec_five
1494	movdqu	0x50($inp), @XMM[5]
1495	je	.Lecb_dec_six
1496	movdqu	0x60($inp), @XMM[6]
1497	call	_bsaes_decrypt8
1498	movdqu	@XMM[0], 0x00($out)	# write output
1499	movdqu	@XMM[1], 0x10($out)
1500	movdqu	@XMM[6], 0x20($out)
1501	movdqu	@XMM[4], 0x30($out)
1502	movdqu	@XMM[2], 0x40($out)
1503	movdqu	@XMM[7], 0x50($out)
1504	movdqu	@XMM[3], 0x60($out)
1505	jmp	.Lecb_dec_done
1506.align	16
1507.Lecb_dec_six:
1508	call	_bsaes_decrypt8
1509	movdqu	@XMM[0], 0x00($out)	# write output
1510	movdqu	@XMM[1], 0x10($out)
1511	movdqu	@XMM[6], 0x20($out)
1512	movdqu	@XMM[4], 0x30($out)
1513	movdqu	@XMM[2], 0x40($out)
1514	movdqu	@XMM[7], 0x50($out)
1515	jmp	.Lecb_dec_done
1516.align	16
1517.Lecb_dec_five:
1518	call	_bsaes_decrypt8
1519	movdqu	@XMM[0], 0x00($out)	# write output
1520	movdqu	@XMM[1], 0x10($out)
1521	movdqu	@XMM[6], 0x20($out)
1522	movdqu	@XMM[4], 0x30($out)
1523	movdqu	@XMM[2], 0x40($out)
1524	jmp	.Lecb_dec_done
1525.align	16
1526.Lecb_dec_four:
1527	call	_bsaes_decrypt8
1528	movdqu	@XMM[0], 0x00($out)	# write output
1529	movdqu	@XMM[1], 0x10($out)
1530	movdqu	@XMM[6], 0x20($out)
1531	movdqu	@XMM[4], 0x30($out)
1532	jmp	.Lecb_dec_done
1533.align	16
1534.Lecb_dec_three:
1535	call	_bsaes_decrypt8
1536	movdqu	@XMM[0], 0x00($out)	# write output
1537	movdqu	@XMM[1], 0x10($out)
1538	movdqu	@XMM[6], 0x20($out)
1539	jmp	.Lecb_dec_done
1540.align	16
1541.Lecb_dec_two:
1542	call	_bsaes_decrypt8
1543	movdqu	@XMM[0], 0x00($out)	# write output
1544	movdqu	@XMM[1], 0x10($out)
1545	jmp	.Lecb_dec_done
1546.align	16
1547.Lecb_dec_one:
1548	call	_bsaes_decrypt8
1549	movdqu	@XMM[0], 0x00($out)	# write output
1550	jmp	.Lecb_dec_done
1551.align	16
1552.Lecb_dec_short:
1553	lea	($inp), $arg1
1554	lea	($out), $arg2
1555	lea	($key), $arg3
1556	call	asm_AES_decrypt
1557	lea	16($inp), $inp
1558	lea	16($out), $out
1559	dec	$len
1560	jnz	.Lecb_dec_short
1561
1562.Lecb_dec_done:
1563	lea	(%rsp),%rax
1564	pxor	%xmm0, %xmm0
1565.Lecb_dec_bzero:			# wipe key schedule [if any]
1566	movdqa	%xmm0, 0x00(%rax)
1567	movdqa	%xmm0, 0x10(%rax)
1568	lea	0x20(%rax), %rax
1569	cmp	%rax, %rbp
1570	jb	.Lecb_dec_bzero
1571
1572	lea	0x78(%rbp),%rax
1573.cfi_def_cfa	%rax,8
1574___
1575$code.=<<___ if ($win64);
1576	movaps	0x40(%rbp), %xmm6
1577	movaps	0x50(%rbp), %xmm7
1578	movaps	0x60(%rbp), %xmm8
1579	movaps	0x70(%rbp), %xmm9
1580	movaps	0x80(%rbp), %xmm10
1581	movaps	0x90(%rbp), %xmm11
1582	movaps	0xa0(%rbp), %xmm12
1583	movaps	0xb0(%rbp), %xmm13
1584	movaps	0xc0(%rbp), %xmm14
1585	movaps	0xd0(%rbp), %xmm15
1586	lea	0xa0(%rax), %rax
1587.Lecb_dec_tail:
1588___
1589$code.=<<___;
1590	mov	-48(%rax), %r15
1591.cfi_restore	%r15
1592	mov	-40(%rax), %r14
1593.cfi_restore	%r14
1594	mov	-32(%rax), %r13
1595.cfi_restore	%r13
1596	mov	-24(%rax), %r12
1597.cfi_restore	%r12
1598	mov	-16(%rax), %rbx
1599.cfi_restore	%rbx
1600	mov	-8(%rax), %rbp
1601.cfi_restore	%rbp
1602	lea	(%rax), %rsp		# restore %rsp
1603.cfi_def_cfa_register	%rsp
1604.Lecb_dec_epilogue:
1605	ret
1606.cfi_endproc
1607.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1608___
1609}
1610$code.=<<___;
1611.extern	asm_AES_cbc_encrypt
1612.globl	bsaes_cbc_encrypt
1613.type	bsaes_cbc_encrypt,\@abi-omnipotent
1614.align	16
1615bsaes_cbc_encrypt:
1616.cfi_startproc
1617___
1618$code.=<<___ if ($win64);
1619	mov	48(%rsp),$arg6		# pull direction flag
1620___
1621$code.=<<___;
1622	cmp	\$0,$arg6
1623	jne	asm_AES_cbc_encrypt
1624	cmp	\$128,$arg3
1625	jb	asm_AES_cbc_encrypt
1626
1627	mov	%rsp, %rax
1628.Lcbc_dec_prologue:
1629	push	%rbp
1630.cfi_push	%rbp
1631	push	%rbx
1632.cfi_push	%rbx
1633	push	%r12
1634.cfi_push	%r12
1635	push	%r13
1636.cfi_push	%r13
1637	push	%r14
1638.cfi_push	%r14
1639	push	%r15
1640.cfi_push	%r15
1641	lea	-0x48(%rsp), %rsp
1642.cfi_adjust_cfa_offset	0x48
1643___
1644$code.=<<___ if ($win64);
1645	mov	0xa0(%rsp),$arg5	# pull ivp
1646	lea	-0xa0(%rsp), %rsp
1647	movaps	%xmm6, 0x40(%rsp)
1648	movaps	%xmm7, 0x50(%rsp)
1649	movaps	%xmm8, 0x60(%rsp)
1650	movaps	%xmm9, 0x70(%rsp)
1651	movaps	%xmm10, 0x80(%rsp)
1652	movaps	%xmm11, 0x90(%rsp)
1653	movaps	%xmm12, 0xa0(%rsp)
1654	movaps	%xmm13, 0xb0(%rsp)
1655	movaps	%xmm14, 0xc0(%rsp)
1656	movaps	%xmm15, 0xd0(%rsp)
1657.Lcbc_dec_body:
1658___
1659$code.=<<___;
1660	mov	%rsp, %rbp		# backup %rsp
1661.cfi_def_cfa_register	%rbp
1662	mov	240($arg4), %eax	# rounds
1663	mov	$arg1, $inp		# backup arguments
1664	mov	$arg2, $out
1665	mov	$arg3, $len
1666	mov	$arg4, $key
1667	mov	$arg5, %rbx
1668	shr	\$4, $len		# bytes to blocks
1669
1670	mov	%eax, %edx		# rounds
1671	shl	\$7, %rax		# 128 bytes per inner round key
1672	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1673	sub	%rax, %rsp
1674
1675	mov	%rsp, %rax		# pass key schedule
1676	mov	$key, %rcx		# pass key
1677	mov	%edx, %r10d		# pass rounds
1678	call	_bsaes_key_convert
1679	pxor	(%rsp),%xmm7		# fix up 0 round key
1680	movdqa	%xmm6,(%rax)		# save last round key
1681	movdqa	%xmm7,(%rsp)
1682
1683	movdqu	(%rbx), @XMM[15]	# load IV
1684	sub	\$8,$len
1685.Lcbc_dec_loop:
1686	movdqu	0x00($inp), @XMM[0]	# load input
1687	movdqu	0x10($inp), @XMM[1]
1688	movdqu	0x20($inp), @XMM[2]
1689	movdqu	0x30($inp), @XMM[3]
1690	movdqu	0x40($inp), @XMM[4]
1691	movdqu	0x50($inp), @XMM[5]
1692	mov	%rsp, %rax		# pass key schedule
1693	movdqu	0x60($inp), @XMM[6]
1694	mov	%edx,%r10d		# pass rounds
1695	movdqu	0x70($inp), @XMM[7]
1696	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1697
1698	call	_bsaes_decrypt8
1699
1700	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1701	movdqu	0x00($inp), @XMM[8]	# re-load input
1702	movdqu	0x10($inp), @XMM[9]
1703	pxor	@XMM[8], @XMM[1]
1704	movdqu	0x20($inp), @XMM[10]
1705	pxor	@XMM[9], @XMM[6]
1706	movdqu	0x30($inp), @XMM[11]
1707	pxor	@XMM[10], @XMM[4]
1708	movdqu	0x40($inp), @XMM[12]
1709	pxor	@XMM[11], @XMM[2]
1710	movdqu	0x50($inp), @XMM[13]
1711	pxor	@XMM[12], @XMM[7]
1712	movdqu	0x60($inp), @XMM[14]
1713	pxor	@XMM[13], @XMM[3]
1714	movdqu	0x70($inp), @XMM[15]	# IV
1715	pxor	@XMM[14], @XMM[5]
1716	movdqu	@XMM[0], 0x00($out)	# write output
1717	lea	0x80($inp), $inp
1718	movdqu	@XMM[1], 0x10($out)
1719	movdqu	@XMM[6], 0x20($out)
1720	movdqu	@XMM[4], 0x30($out)
1721	movdqu	@XMM[2], 0x40($out)
1722	movdqu	@XMM[7], 0x50($out)
1723	movdqu	@XMM[3], 0x60($out)
1724	movdqu	@XMM[5], 0x70($out)
1725	lea	0x80($out), $out
1726	sub	\$8,$len
1727	jnc	.Lcbc_dec_loop
1728
1729	add	\$8,$len
1730	jz	.Lcbc_dec_done
1731
1732	movdqu	0x00($inp), @XMM[0]	# load input
1733	mov	%rsp, %rax		# pass key schedule
1734	mov	%edx, %r10d		# pass rounds
1735	cmp	\$2,$len
1736	jb	.Lcbc_dec_one
1737	movdqu	0x10($inp), @XMM[1]
1738	je	.Lcbc_dec_two
1739	movdqu	0x20($inp), @XMM[2]
1740	cmp	\$4,$len
1741	jb	.Lcbc_dec_three
1742	movdqu	0x30($inp), @XMM[3]
1743	je	.Lcbc_dec_four
1744	movdqu	0x40($inp), @XMM[4]
1745	cmp	\$6,$len
1746	jb	.Lcbc_dec_five
1747	movdqu	0x50($inp), @XMM[5]
1748	je	.Lcbc_dec_six
1749	movdqu	0x60($inp), @XMM[6]
1750	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1751	call	_bsaes_decrypt8
1752	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1753	movdqu	0x00($inp), @XMM[8]	# re-load input
1754	movdqu	0x10($inp), @XMM[9]
1755	pxor	@XMM[8], @XMM[1]
1756	movdqu	0x20($inp), @XMM[10]
1757	pxor	@XMM[9], @XMM[6]
1758	movdqu	0x30($inp), @XMM[11]
1759	pxor	@XMM[10], @XMM[4]
1760	movdqu	0x40($inp), @XMM[12]
1761	pxor	@XMM[11], @XMM[2]
1762	movdqu	0x50($inp), @XMM[13]
1763	pxor	@XMM[12], @XMM[7]
1764	movdqu	0x60($inp), @XMM[15]	# IV
1765	pxor	@XMM[13], @XMM[3]
1766	movdqu	@XMM[0], 0x00($out)	# write output
1767	movdqu	@XMM[1], 0x10($out)
1768	movdqu	@XMM[6], 0x20($out)
1769	movdqu	@XMM[4], 0x30($out)
1770	movdqu	@XMM[2], 0x40($out)
1771	movdqu	@XMM[7], 0x50($out)
1772	movdqu	@XMM[3], 0x60($out)
1773	jmp	.Lcbc_dec_done
1774.align	16
1775.Lcbc_dec_six:
1776	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1777	call	_bsaes_decrypt8
1778	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1779	movdqu	0x00($inp), @XMM[8]	# re-load input
1780	movdqu	0x10($inp), @XMM[9]
1781	pxor	@XMM[8], @XMM[1]
1782	movdqu	0x20($inp), @XMM[10]
1783	pxor	@XMM[9], @XMM[6]
1784	movdqu	0x30($inp), @XMM[11]
1785	pxor	@XMM[10], @XMM[4]
1786	movdqu	0x40($inp), @XMM[12]
1787	pxor	@XMM[11], @XMM[2]
1788	movdqu	0x50($inp), @XMM[15]	# IV
1789	pxor	@XMM[12], @XMM[7]
1790	movdqu	@XMM[0], 0x00($out)	# write output
1791	movdqu	@XMM[1], 0x10($out)
1792	movdqu	@XMM[6], 0x20($out)
1793	movdqu	@XMM[4], 0x30($out)
1794	movdqu	@XMM[2], 0x40($out)
1795	movdqu	@XMM[7], 0x50($out)
1796	jmp	.Lcbc_dec_done
1797.align	16
1798.Lcbc_dec_five:
1799	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1800	call	_bsaes_decrypt8
1801	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1802	movdqu	0x00($inp), @XMM[8]	# re-load input
1803	movdqu	0x10($inp), @XMM[9]
1804	pxor	@XMM[8], @XMM[1]
1805	movdqu	0x20($inp), @XMM[10]
1806	pxor	@XMM[9], @XMM[6]
1807	movdqu	0x30($inp), @XMM[11]
1808	pxor	@XMM[10], @XMM[4]
1809	movdqu	0x40($inp), @XMM[15]	# IV
1810	pxor	@XMM[11], @XMM[2]
1811	movdqu	@XMM[0], 0x00($out)	# write output
1812	movdqu	@XMM[1], 0x10($out)
1813	movdqu	@XMM[6], 0x20($out)
1814	movdqu	@XMM[4], 0x30($out)
1815	movdqu	@XMM[2], 0x40($out)
1816	jmp	.Lcbc_dec_done
1817.align	16
1818.Lcbc_dec_four:
1819	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1820	call	_bsaes_decrypt8
1821	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1822	movdqu	0x00($inp), @XMM[8]	# re-load input
1823	movdqu	0x10($inp), @XMM[9]
1824	pxor	@XMM[8], @XMM[1]
1825	movdqu	0x20($inp), @XMM[10]
1826	pxor	@XMM[9], @XMM[6]
1827	movdqu	0x30($inp), @XMM[15]	# IV
1828	pxor	@XMM[10], @XMM[4]
1829	movdqu	@XMM[0], 0x00($out)	# write output
1830	movdqu	@XMM[1], 0x10($out)
1831	movdqu	@XMM[6], 0x20($out)
1832	movdqu	@XMM[4], 0x30($out)
1833	jmp	.Lcbc_dec_done
1834.align	16
1835.Lcbc_dec_three:
1836	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1837	call	_bsaes_decrypt8
1838	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1839	movdqu	0x00($inp), @XMM[8]	# re-load input
1840	movdqu	0x10($inp), @XMM[9]
1841	pxor	@XMM[8], @XMM[1]
1842	movdqu	0x20($inp), @XMM[15]	# IV
1843	pxor	@XMM[9], @XMM[6]
1844	movdqu	@XMM[0], 0x00($out)	# write output
1845	movdqu	@XMM[1], 0x10($out)
1846	movdqu	@XMM[6], 0x20($out)
1847	jmp	.Lcbc_dec_done
1848.align	16
1849.Lcbc_dec_two:
1850	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1851	call	_bsaes_decrypt8
1852	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1853	movdqu	0x00($inp), @XMM[8]	# re-load input
1854	movdqu	0x10($inp), @XMM[15]	# IV
1855	pxor	@XMM[8], @XMM[1]
1856	movdqu	@XMM[0], 0x00($out)	# write output
1857	movdqu	@XMM[1], 0x10($out)
1858	jmp	.Lcbc_dec_done
1859.align	16
1860.Lcbc_dec_one:
1861	lea	($inp), $arg1
1862	lea	0x20(%rbp), $arg2	# buffer output
1863	lea	($key), $arg3
1864	call	asm_AES_decrypt		# doesn't touch %xmm
1865	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1866	movdqu	@XMM[15], ($out)	# write output
1867	movdqa	@XMM[0], @XMM[15]	# IV
1868
1869.Lcbc_dec_done:
1870	movdqu	@XMM[15], (%rbx)	# return IV
1871	lea	(%rsp), %rax
1872	pxor	%xmm0, %xmm0
1873.Lcbc_dec_bzero:			# wipe key schedule [if any]
1874	movdqa	%xmm0, 0x00(%rax)
1875	movdqa	%xmm0, 0x10(%rax)
1876	lea	0x20(%rax), %rax
1877	cmp	%rax, %rbp
1878	ja	.Lcbc_dec_bzero
1879
1880	lea	0x78(%rbp),%rax
1881.cfi_def_cfa	%rax,8
1882___
1883$code.=<<___ if ($win64);
1884	movaps	0x40(%rbp), %xmm6
1885	movaps	0x50(%rbp), %xmm7
1886	movaps	0x60(%rbp), %xmm8
1887	movaps	0x70(%rbp), %xmm9
1888	movaps	0x80(%rbp), %xmm10
1889	movaps	0x90(%rbp), %xmm11
1890	movaps	0xa0(%rbp), %xmm12
1891	movaps	0xb0(%rbp), %xmm13
1892	movaps	0xc0(%rbp), %xmm14
1893	movaps	0xd0(%rbp), %xmm15
1894	lea	0xa0(%rax), %rax
1895.Lcbc_dec_tail:
1896___
1897$code.=<<___;
1898	mov	-48(%rax), %r15
1899.cfi_restore	%r15
1900	mov	-40(%rax), %r14
1901.cfi_restore	%r14
1902	mov	-32(%rax), %r13
1903.cfi_restore	%r13
1904	mov	-24(%rax), %r12
1905.cfi_restore	%r12
1906	mov	-16(%rax), %rbx
1907.cfi_restore	%rbx
1908	mov	-8(%rax), %rbp
1909.cfi_restore	%rbp
1910	lea	(%rax), %rsp		# restore %rsp
1911.cfi_def_cfa_register	%rsp
1912.Lcbc_dec_epilogue:
1913	ret
1914.cfi_endproc
1915.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1916
1917.globl	bsaes_ctr32_encrypt_blocks
1918.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1919.align	16
1920bsaes_ctr32_encrypt_blocks:
1921.cfi_startproc
1922	mov	%rsp, %rax
1923.Lctr_enc_prologue:
1924	push	%rbp
1925.cfi_push	%rbp
1926	push	%rbx
1927.cfi_push	%rbx
1928	push	%r12
1929.cfi_push	%r12
1930	push	%r13
1931.cfi_push	%r13
1932	push	%r14
1933.cfi_push	%r14
1934	push	%r15
1935.cfi_push	%r15
1936	lea	-0x48(%rsp), %rsp
1937.cfi_adjust_cfa_offset	0x48
1938___
1939$code.=<<___ if ($win64);
1940	mov	0xa0(%rsp),$arg5	# pull ivp
1941	lea	-0xa0(%rsp), %rsp
1942	movaps	%xmm6, 0x40(%rsp)
1943	movaps	%xmm7, 0x50(%rsp)
1944	movaps	%xmm8, 0x60(%rsp)
1945	movaps	%xmm9, 0x70(%rsp)
1946	movaps	%xmm10, 0x80(%rsp)
1947	movaps	%xmm11, 0x90(%rsp)
1948	movaps	%xmm12, 0xa0(%rsp)
1949	movaps	%xmm13, 0xb0(%rsp)
1950	movaps	%xmm14, 0xc0(%rsp)
1951	movaps	%xmm15, 0xd0(%rsp)
1952.Lctr_enc_body:
1953___
1954$code.=<<___;
1955	mov	%rsp, %rbp		# backup %rsp
1956.cfi_def_cfa_register	%rbp
1957	movdqu	($arg5), %xmm0		# load counter
1958	mov	240($arg4), %eax	# rounds
1959	mov	$arg1, $inp		# backup arguments
1960	mov	$arg2, $out
1961	mov	$arg3, $len
1962	mov	$arg4, $key
1963	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1964	cmp	\$8, $arg3
1965	jb	.Lctr_enc_short
1966
1967	mov	%eax, %ebx		# rounds
1968	shl	\$7, %rax		# 128 bytes per inner round key
1969	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1970	sub	%rax, %rsp
1971
1972	mov	%rsp, %rax		# pass key schedule
1973	mov	$key, %rcx		# pass key
1974	mov	%ebx, %r10d		# pass rounds
1975	call	_bsaes_key_convert
1976	pxor	%xmm6,%xmm7		# fix up last round key
1977	movdqa	%xmm7,(%rax)		# save last round key
1978
1979	movdqa	(%rsp), @XMM[9]		# load round0 key
1980	lea	.LADD1(%rip), %r11
1981	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1982	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1983	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1984	pshufb	@XMM[8], @XMM[0]
1985	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1986	jmp	.Lctr_enc_loop
1987.align	16
1988.Lctr_enc_loop:
1989	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1990	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1991	movdqa	@XMM[0], @XMM[2]
1992	paddd	0x00(%r11), @XMM[1]	# .LADD1
1993	movdqa	@XMM[0], @XMM[3]
1994	paddd	0x10(%r11), @XMM[2]	# .LADD2
1995	movdqa	@XMM[0], @XMM[4]
1996	paddd	0x20(%r11), @XMM[3]	# .LADD3
1997	movdqa	@XMM[0], @XMM[5]
1998	paddd	0x30(%r11), @XMM[4]	# .LADD4
1999	movdqa	@XMM[0], @XMM[6]
2000	paddd	0x40(%r11), @XMM[5]	# .LADD5
2001	movdqa	@XMM[0], @XMM[7]
2002	paddd	0x50(%r11), @XMM[6]	# .LADD6
2003	paddd	0x60(%r11), @XMM[7]	# .LADD7
2004
2005	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
2006	# to flip byte order in 32-bit counter
2007	movdqa	(%rsp), @XMM[9]		# round 0 key
2008	lea	0x10(%rsp), %rax	# pass key schedule
2009	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
2010	pxor	@XMM[9], @XMM[0]	# xor with round0 key
2011	pxor	@XMM[9], @XMM[1]
2012	pxor	@XMM[9], @XMM[2]
2013	pxor	@XMM[9], @XMM[3]
2014	 pshufb	@XMM[8], @XMM[0]
2015	 pshufb	@XMM[8], @XMM[1]
2016	pxor	@XMM[9], @XMM[4]
2017	pxor	@XMM[9], @XMM[5]
2018	 pshufb	@XMM[8], @XMM[2]
2019	 pshufb	@XMM[8], @XMM[3]
2020	pxor	@XMM[9], @XMM[6]
2021	pxor	@XMM[9], @XMM[7]
2022	 pshufb	@XMM[8], @XMM[4]
2023	 pshufb	@XMM[8], @XMM[5]
2024	 pshufb	@XMM[8], @XMM[6]
2025	 pshufb	@XMM[8], @XMM[7]
2026	lea	.LBS0(%rip), %r11	# constants table
2027	mov	%ebx,%r10d		# pass rounds
2028
2029	call	_bsaes_encrypt8_bitslice
2030
2031	sub	\$8,$len
2032	jc	.Lctr_enc_loop_done
2033
2034	movdqu	0x00($inp), @XMM[8]	# load input
2035	movdqu	0x10($inp), @XMM[9]
2036	movdqu	0x20($inp), @XMM[10]
2037	movdqu	0x30($inp), @XMM[11]
2038	movdqu	0x40($inp), @XMM[12]
2039	movdqu	0x50($inp), @XMM[13]
2040	movdqu	0x60($inp), @XMM[14]
2041	movdqu	0x70($inp), @XMM[15]
2042	lea	0x80($inp),$inp
2043	pxor	@XMM[0], @XMM[8]
2044	movdqa	0x20(%rbp), @XMM[0]	# load counter
2045	pxor	@XMM[9], @XMM[1]
2046	movdqu	@XMM[8], 0x00($out)	# write output
2047	pxor	@XMM[10], @XMM[4]
2048	movdqu	@XMM[1], 0x10($out)
2049	pxor	@XMM[11], @XMM[6]
2050	movdqu	@XMM[4], 0x20($out)
2051	pxor	@XMM[12], @XMM[3]
2052	movdqu	@XMM[6], 0x30($out)
2053	pxor	@XMM[13], @XMM[7]
2054	movdqu	@XMM[3], 0x40($out)
2055	pxor	@XMM[14], @XMM[2]
2056	movdqu	@XMM[7], 0x50($out)
2057	pxor	@XMM[15], @XMM[5]
2058	movdqu	@XMM[2], 0x60($out)
2059	lea	.LADD1(%rip), %r11
2060	movdqu	@XMM[5], 0x70($out)
2061	lea	0x80($out), $out
2062	paddd	0x70(%r11), @XMM[0]	# .LADD8
2063	jnz	.Lctr_enc_loop
2064
2065	jmp	.Lctr_enc_done
2066.align	16
2067.Lctr_enc_loop_done:
2068	add	\$8, $len
2069	movdqu	0x00($inp), @XMM[8]	# load input
2070	pxor	@XMM[8], @XMM[0]
2071	movdqu	@XMM[0], 0x00($out)	# write output
2072	cmp	\$2,$len
2073	jb	.Lctr_enc_done
2074	movdqu	0x10($inp), @XMM[9]
2075	pxor	@XMM[9], @XMM[1]
2076	movdqu	@XMM[1], 0x10($out)
2077	je	.Lctr_enc_done
2078	movdqu	0x20($inp), @XMM[10]
2079	pxor	@XMM[10], @XMM[4]
2080	movdqu	@XMM[4], 0x20($out)
2081	cmp	\$4,$len
2082	jb	.Lctr_enc_done
2083	movdqu	0x30($inp), @XMM[11]
2084	pxor	@XMM[11], @XMM[6]
2085	movdqu	@XMM[6], 0x30($out)
2086	je	.Lctr_enc_done
2087	movdqu	0x40($inp), @XMM[12]
2088	pxor	@XMM[12], @XMM[3]
2089	movdqu	@XMM[3], 0x40($out)
2090	cmp	\$6,$len
2091	jb	.Lctr_enc_done
2092	movdqu	0x50($inp), @XMM[13]
2093	pxor	@XMM[13], @XMM[7]
2094	movdqu	@XMM[7], 0x50($out)
2095	je	.Lctr_enc_done
2096	movdqu	0x60($inp), @XMM[14]
2097	pxor	@XMM[14], @XMM[2]
2098	movdqu	@XMM[2], 0x60($out)
2099	jmp	.Lctr_enc_done
2100
2101.align	16
2102.Lctr_enc_short:
2103	lea	0x20(%rbp), $arg1
2104	lea	0x30(%rbp), $arg2
2105	lea	($key), $arg3
2106	call	asm_AES_encrypt
2107	movdqu	($inp), @XMM[1]
2108	lea	16($inp), $inp
2109	mov	0x2c(%rbp), %eax	# load 32-bit counter
2110	bswap	%eax
2111	pxor	0x30(%rbp), @XMM[1]
2112	inc	%eax			# increment
2113	movdqu	@XMM[1], ($out)
2114	bswap	%eax
2115	lea	16($out), $out
2116	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
2117	dec	$len
2118	jnz	.Lctr_enc_short
2119
2120.Lctr_enc_done:
2121	lea	(%rsp), %rax
2122	pxor	%xmm0, %xmm0
2123.Lctr_enc_bzero:			# wipe key schedule [if any]
2124	movdqa	%xmm0, 0x00(%rax)
2125	movdqa	%xmm0, 0x10(%rax)
2126	lea	0x20(%rax), %rax
2127	cmp	%rax, %rbp
2128	ja	.Lctr_enc_bzero
2129
2130	lea	0x78(%rbp),%rax
2131.cfi_def_cfa	%rax,8
2132___
2133$code.=<<___ if ($win64);
2134	movaps	0x40(%rbp), %xmm6
2135	movaps	0x50(%rbp), %xmm7
2136	movaps	0x60(%rbp), %xmm8
2137	movaps	0x70(%rbp), %xmm9
2138	movaps	0x80(%rbp), %xmm10
2139	movaps	0x90(%rbp), %xmm11
2140	movaps	0xa0(%rbp), %xmm12
2141	movaps	0xb0(%rbp), %xmm13
2142	movaps	0xc0(%rbp), %xmm14
2143	movaps	0xd0(%rbp), %xmm15
2144	lea	0xa0(%rax), %rax
2145.Lctr_enc_tail:
2146___
2147$code.=<<___;
2148	mov	-48(%rax), %r15
2149.cfi_restore	%r15
2150	mov	-40(%rax), %r14
2151.cfi_restore	%r14
2152	mov	-32(%rax), %r13
2153.cfi_restore	%r13
2154	mov	-24(%rax), %r12
2155.cfi_restore	%r12
2156	mov	-16(%rax), %rbx
2157.cfi_restore	%rbx
2158	mov	-8(%rax), %rbp
2159.cfi_restore	%rbp
2160	lea	(%rax), %rsp		# restore %rsp
2161.cfi_def_cfa_register	%rsp
2162.Lctr_enc_epilogue:
2163	ret
2164.cfi_endproc
2165.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2166___
2167######################################################################
2168# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2169#	const AES_KEY *key1, const AES_KEY *key2,
2170#	const unsigned char iv[16]);
2171#
2172my ($twmask,$twres,$twtmp)=@XMM[13..15];
2173$arg6=~s/d$//;
2174
2175$code.=<<___;
2176.globl	bsaes_xts_encrypt
2177.type	bsaes_xts_encrypt,\@abi-omnipotent
2178.align	16
2179bsaes_xts_encrypt:
2180.cfi_startproc
2181	mov	%rsp, %rax
2182.Lxts_enc_prologue:
2183	push	%rbp
2184.cfi_push	%rbp
2185	push	%rbx
2186.cfi_push	%rbx
2187	push	%r12
2188.cfi_push	%r12
2189	push	%r13
2190.cfi_push	%r13
2191	push	%r14
2192.cfi_push	%r14
2193	push	%r15
2194.cfi_push	%r15
2195	lea	-0x48(%rsp), %rsp
2196.cfi_adjust_cfa_offset	0x48
2197___
2198$code.=<<___ if ($win64);
2199	mov	0xa0(%rsp),$arg5	# pull key2
2200	mov	0xa8(%rsp),$arg6	# pull ivp
2201	lea	-0xa0(%rsp), %rsp
2202	movaps	%xmm6, 0x40(%rsp)
2203	movaps	%xmm7, 0x50(%rsp)
2204	movaps	%xmm8, 0x60(%rsp)
2205	movaps	%xmm9, 0x70(%rsp)
2206	movaps	%xmm10, 0x80(%rsp)
2207	movaps	%xmm11, 0x90(%rsp)
2208	movaps	%xmm12, 0xa0(%rsp)
2209	movaps	%xmm13, 0xb0(%rsp)
2210	movaps	%xmm14, 0xc0(%rsp)
2211	movaps	%xmm15, 0xd0(%rsp)
2212.Lxts_enc_body:
2213___
2214$code.=<<___;
2215	mov	%rsp, %rbp		# backup %rsp
2216.cfi_def_cfa_register	%rbp
2217	mov	$arg1, $inp		# backup arguments
2218	mov	$arg2, $out
2219	mov	$arg3, $len
2220	mov	$arg4, $key
2221
2222	lea	($arg6), $arg1
2223	lea	0x20(%rbp), $arg2
2224	lea	($arg5), $arg3
2225	call	asm_AES_encrypt		# generate initial tweak
2226
2227	mov	240($key), %eax		# rounds
2228	mov	$len, %rbx		# backup $len
2229
2230	mov	%eax, %edx		# rounds
2231	shl	\$7, %rax		# 128 bytes per inner round key
2232	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2233	sub	%rax, %rsp
2234
2235	mov	%rsp, %rax		# pass key schedule
2236	mov	$key, %rcx		# pass key
2237	mov	%edx, %r10d		# pass rounds
2238	call	_bsaes_key_convert
2239	pxor	%xmm6, %xmm7		# fix up last round key
2240	movdqa	%xmm7, (%rax)		# save last round key
2241
2242	and	\$-16, $len
2243	sub	\$0x80, %rsp		# place for tweak[8]
2244	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2245
2246	pxor	$twtmp, $twtmp
2247	movdqa	.Lxts_magic(%rip), $twmask
2248	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2249
2250	sub	\$0x80, $len
2251	jc	.Lxts_enc_short
2252	jmp	.Lxts_enc_loop
2253
2254.align	16
2255.Lxts_enc_loop:
2256___
2257    for ($i=0;$i<7;$i++) {
2258    $code.=<<___;
2259	pshufd	\$0x13, $twtmp, $twres
2260	pxor	$twtmp, $twtmp
2261	movdqa	@XMM[7], @XMM[$i]
2262	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2263	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2264	pand	$twmask, $twres		# isolate carry and residue
2265	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2266	pxor	$twres, @XMM[7]
2267___
2268    $code.=<<___ if ($i>=1);
2269	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2270___
2271    $code.=<<___ if ($i>=2);
2272	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2273___
2274    }
2275$code.=<<___;
2276	movdqu	0x60($inp), @XMM[8+6]
2277	pxor	@XMM[8+5], @XMM[5]
2278	movdqu	0x70($inp), @XMM[8+7]
2279	lea	0x80($inp), $inp
2280	movdqa	@XMM[7], 0x70(%rsp)
2281	pxor	@XMM[8+6], @XMM[6]
2282	lea	0x80(%rsp), %rax	# pass key schedule
2283	pxor	@XMM[8+7], @XMM[7]
2284	mov	%edx, %r10d		# pass rounds
2285
2286	call	_bsaes_encrypt8
2287
2288	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2289	pxor	0x10(%rsp), @XMM[1]
2290	movdqu	@XMM[0], 0x00($out)	# write output
2291	pxor	0x20(%rsp), @XMM[4]
2292	movdqu	@XMM[1], 0x10($out)
2293	pxor	0x30(%rsp), @XMM[6]
2294	movdqu	@XMM[4], 0x20($out)
2295	pxor	0x40(%rsp), @XMM[3]
2296	movdqu	@XMM[6], 0x30($out)
2297	pxor	0x50(%rsp), @XMM[7]
2298	movdqu	@XMM[3], 0x40($out)
2299	pxor	0x60(%rsp), @XMM[2]
2300	movdqu	@XMM[7], 0x50($out)
2301	pxor	0x70(%rsp), @XMM[5]
2302	movdqu	@XMM[2], 0x60($out)
2303	movdqu	@XMM[5], 0x70($out)
2304	lea	0x80($out), $out
2305
2306	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2307	pxor	$twtmp, $twtmp
2308	movdqa	.Lxts_magic(%rip), $twmask
2309	pcmpgtd	@XMM[7], $twtmp
2310	pshufd	\$0x13, $twtmp, $twres
2311	pxor	$twtmp, $twtmp
2312	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2313	pand	$twmask, $twres		# isolate carry and residue
2314	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2315	pxor	$twres, @XMM[7]
2316
2317	sub	\$0x80,$len
2318	jnc	.Lxts_enc_loop
2319
2320.Lxts_enc_short:
2321	add	\$0x80, $len
2322	jz	.Lxts_enc_done
2323___
2324    for ($i=0;$i<7;$i++) {
2325    $code.=<<___;
2326	pshufd	\$0x13, $twtmp, $twres
2327	pxor	$twtmp, $twtmp
2328	movdqa	@XMM[7], @XMM[$i]
2329	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2330	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2331	pand	$twmask, $twres		# isolate carry and residue
2332	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2333	pxor	$twres, @XMM[7]
2334___
2335    $code.=<<___ if ($i>=1);
2336	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2337	cmp	\$`0x10*$i`,$len
2338	je	.Lxts_enc_$i
2339___
2340    $code.=<<___ if ($i>=2);
2341	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2342___
2343    }
2344$code.=<<___;
2345	movdqu	0x60($inp), @XMM[8+6]
2346	pxor	@XMM[8+5], @XMM[5]
2347	movdqa	@XMM[7], 0x70(%rsp)
2348	lea	0x70($inp), $inp
2349	pxor	@XMM[8+6], @XMM[6]
2350	lea	0x80(%rsp), %rax	# pass key schedule
2351	mov	%edx, %r10d		# pass rounds
2352
2353	call	_bsaes_encrypt8
2354
2355	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2356	pxor	0x10(%rsp), @XMM[1]
2357	movdqu	@XMM[0], 0x00($out)	# write output
2358	pxor	0x20(%rsp), @XMM[4]
2359	movdqu	@XMM[1], 0x10($out)
2360	pxor	0x30(%rsp), @XMM[6]
2361	movdqu	@XMM[4], 0x20($out)
2362	pxor	0x40(%rsp), @XMM[3]
2363	movdqu	@XMM[6], 0x30($out)
2364	pxor	0x50(%rsp), @XMM[7]
2365	movdqu	@XMM[3], 0x40($out)
2366	pxor	0x60(%rsp), @XMM[2]
2367	movdqu	@XMM[7], 0x50($out)
2368	movdqu	@XMM[2], 0x60($out)
2369	lea	0x70($out), $out
2370
2371	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2372	jmp	.Lxts_enc_done
2373.align	16
2374.Lxts_enc_6:
2375	pxor	@XMM[8+4], @XMM[4]
2376	lea	0x60($inp), $inp
2377	pxor	@XMM[8+5], @XMM[5]
2378	lea	0x80(%rsp), %rax	# pass key schedule
2379	mov	%edx, %r10d		# pass rounds
2380
2381	call	_bsaes_encrypt8
2382
2383	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2384	pxor	0x10(%rsp), @XMM[1]
2385	movdqu	@XMM[0], 0x00($out)	# write output
2386	pxor	0x20(%rsp), @XMM[4]
2387	movdqu	@XMM[1], 0x10($out)
2388	pxor	0x30(%rsp), @XMM[6]
2389	movdqu	@XMM[4], 0x20($out)
2390	pxor	0x40(%rsp), @XMM[3]
2391	movdqu	@XMM[6], 0x30($out)
2392	pxor	0x50(%rsp), @XMM[7]
2393	movdqu	@XMM[3], 0x40($out)
2394	movdqu	@XMM[7], 0x50($out)
2395	lea	0x60($out), $out
2396
2397	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2398	jmp	.Lxts_enc_done
2399.align	16
2400.Lxts_enc_5:
2401	pxor	@XMM[8+3], @XMM[3]
2402	lea	0x50($inp), $inp
2403	pxor	@XMM[8+4], @XMM[4]
2404	lea	0x80(%rsp), %rax	# pass key schedule
2405	mov	%edx, %r10d		# pass rounds
2406
2407	call	_bsaes_encrypt8
2408
2409	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2410	pxor	0x10(%rsp), @XMM[1]
2411	movdqu	@XMM[0], 0x00($out)	# write output
2412	pxor	0x20(%rsp), @XMM[4]
2413	movdqu	@XMM[1], 0x10($out)
2414	pxor	0x30(%rsp), @XMM[6]
2415	movdqu	@XMM[4], 0x20($out)
2416	pxor	0x40(%rsp), @XMM[3]
2417	movdqu	@XMM[6], 0x30($out)
2418	movdqu	@XMM[3], 0x40($out)
2419	lea	0x50($out), $out
2420
2421	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2422	jmp	.Lxts_enc_done
2423.align	16
2424.Lxts_enc_4:
2425	pxor	@XMM[8+2], @XMM[2]
2426	lea	0x40($inp), $inp
2427	pxor	@XMM[8+3], @XMM[3]
2428	lea	0x80(%rsp), %rax	# pass key schedule
2429	mov	%edx, %r10d		# pass rounds
2430
2431	call	_bsaes_encrypt8
2432
2433	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2434	pxor	0x10(%rsp), @XMM[1]
2435	movdqu	@XMM[0], 0x00($out)	# write output
2436	pxor	0x20(%rsp), @XMM[4]
2437	movdqu	@XMM[1], 0x10($out)
2438	pxor	0x30(%rsp), @XMM[6]
2439	movdqu	@XMM[4], 0x20($out)
2440	movdqu	@XMM[6], 0x30($out)
2441	lea	0x40($out), $out
2442
2443	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2444	jmp	.Lxts_enc_done
2445.align	16
2446.Lxts_enc_3:
2447	pxor	@XMM[8+1], @XMM[1]
2448	lea	0x30($inp), $inp
2449	pxor	@XMM[8+2], @XMM[2]
2450	lea	0x80(%rsp), %rax	# pass key schedule
2451	mov	%edx, %r10d		# pass rounds
2452
2453	call	_bsaes_encrypt8
2454
2455	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2456	pxor	0x10(%rsp), @XMM[1]
2457	movdqu	@XMM[0], 0x00($out)	# write output
2458	pxor	0x20(%rsp), @XMM[4]
2459	movdqu	@XMM[1], 0x10($out)
2460	movdqu	@XMM[4], 0x20($out)
2461	lea	0x30($out), $out
2462
2463	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2464	jmp	.Lxts_enc_done
2465.align	16
2466.Lxts_enc_2:
2467	pxor	@XMM[8+0], @XMM[0]
2468	lea	0x20($inp), $inp
2469	pxor	@XMM[8+1], @XMM[1]
2470	lea	0x80(%rsp), %rax	# pass key schedule
2471	mov	%edx, %r10d		# pass rounds
2472
2473	call	_bsaes_encrypt8
2474
2475	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2476	pxor	0x10(%rsp), @XMM[1]
2477	movdqu	@XMM[0], 0x00($out)	# write output
2478	movdqu	@XMM[1], 0x10($out)
2479	lea	0x20($out), $out
2480
2481	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2482	jmp	.Lxts_enc_done
2483.align	16
2484.Lxts_enc_1:
2485	pxor	@XMM[0], @XMM[8]
2486	lea	0x10($inp), $inp
2487	movdqa	@XMM[8], 0x20(%rbp)
2488	lea	0x20(%rbp), $arg1
2489	lea	0x20(%rbp), $arg2
2490	lea	($key), $arg3
2491	call	asm_AES_encrypt		# doesn't touch %xmm
2492	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2493	#pxor	@XMM[8], @XMM[0]
2494	#lea	0x80(%rsp), %rax	# pass key schedule
2495	#mov	%edx, %r10d		# pass rounds
2496	#call	_bsaes_encrypt8
2497	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2498	movdqu	@XMM[0], 0x00($out)	# write output
2499	lea	0x10($out), $out
2500
2501	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2502
2503.Lxts_enc_done:
2504	and	\$15, %ebx
2505	jz	.Lxts_enc_ret
2506	mov	$out, %rdx
2507
2508.Lxts_enc_steal:
2509	movzb	($inp), %eax
2510	movzb	-16(%rdx), %ecx
2511	lea	1($inp), $inp
2512	mov	%al, -16(%rdx)
2513	mov	%cl, 0(%rdx)
2514	lea	1(%rdx), %rdx
2515	sub	\$1,%ebx
2516	jnz	.Lxts_enc_steal
2517
2518	movdqu	-16($out), @XMM[0]
2519	lea	0x20(%rbp), $arg1
2520	pxor	@XMM[7], @XMM[0]
2521	lea	0x20(%rbp), $arg2
2522	movdqa	@XMM[0], 0x20(%rbp)
2523	lea	($key), $arg3
2524	call	asm_AES_encrypt		# doesn't touch %xmm
2525	pxor	0x20(%rbp), @XMM[7]
2526	movdqu	@XMM[7], -16($out)
2527
2528.Lxts_enc_ret:
2529	lea	(%rsp), %rax
2530	pxor	%xmm0, %xmm0
2531.Lxts_enc_bzero:			# wipe key schedule [if any]
2532	movdqa	%xmm0, 0x00(%rax)
2533	movdqa	%xmm0, 0x10(%rax)
2534	lea	0x20(%rax), %rax
2535	cmp	%rax, %rbp
2536	ja	.Lxts_enc_bzero
2537
2538	lea	0x78(%rbp),%rax
2539.cfi_def_cfa	%rax,8
2540___
2541$code.=<<___ if ($win64);
2542	movaps	0x40(%rbp), %xmm6
2543	movaps	0x50(%rbp), %xmm7
2544	movaps	0x60(%rbp), %xmm8
2545	movaps	0x70(%rbp), %xmm9
2546	movaps	0x80(%rbp), %xmm10
2547	movaps	0x90(%rbp), %xmm11
2548	movaps	0xa0(%rbp), %xmm12
2549	movaps	0xb0(%rbp), %xmm13
2550	movaps	0xc0(%rbp), %xmm14
2551	movaps	0xd0(%rbp), %xmm15
2552	lea	0xa0(%rax), %rax
2553.Lxts_enc_tail:
2554___
2555$code.=<<___;
2556	mov	-48(%rax), %r15
2557.cfi_restore	%r15
2558	mov	-40(%rax), %r14
2559.cfi_restore	%r14
2560	mov	-32(%rax), %r13
2561.cfi_restore	%r13
2562	mov	-24(%rax), %r12
2563.cfi_restore	%r12
2564	mov	-16(%rax), %rbx
2565.cfi_restore	%rbx
2566	mov	-8(%rax), %rbp
2567.cfi_restore	%rbp
2568	lea	(%rax), %rsp		# restore %rsp
2569.cfi_def_cfa_register	%rsp
2570.Lxts_enc_epilogue:
2571	ret
2572.cfi_endproc
2573.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2574
2575.globl	bsaes_xts_decrypt
2576.type	bsaes_xts_decrypt,\@abi-omnipotent
2577.align	16
2578bsaes_xts_decrypt:
2579.cfi_startproc
2580	mov	%rsp, %rax
2581.Lxts_dec_prologue:
2582	push	%rbp
2583.cfi_push	%rbp
2584	push	%rbx
2585.cfi_push	%rbx
2586	push	%r12
2587.cfi_push	%r12
2588	push	%r13
2589.cfi_push	%r13
2590	push	%r14
2591.cfi_push	%r14
2592	push	%r15
2593.cfi_push	%r15
2594	lea	-0x48(%rsp), %rsp
2595.cfi_adjust_cfa_offset	0x48
2596___
2597$code.=<<___ if ($win64);
2598	mov	0xa0(%rsp),$arg5	# pull key2
2599	mov	0xa8(%rsp),$arg6	# pull ivp
2600	lea	-0xa0(%rsp), %rsp
2601	movaps	%xmm6, 0x40(%rsp)
2602	movaps	%xmm7, 0x50(%rsp)
2603	movaps	%xmm8, 0x60(%rsp)
2604	movaps	%xmm9, 0x70(%rsp)
2605	movaps	%xmm10, 0x80(%rsp)
2606	movaps	%xmm11, 0x90(%rsp)
2607	movaps	%xmm12, 0xa0(%rsp)
2608	movaps	%xmm13, 0xb0(%rsp)
2609	movaps	%xmm14, 0xc0(%rsp)
2610	movaps	%xmm15, 0xd0(%rsp)
2611.Lxts_dec_body:
2612___
2613$code.=<<___;
2614	mov	%rsp, %rbp		# backup %rsp
2615	mov	$arg1, $inp		# backup arguments
2616	mov	$arg2, $out
2617	mov	$arg3, $len
2618	mov	$arg4, $key
2619
2620	lea	($arg6), $arg1
2621	lea	0x20(%rbp), $arg2
2622	lea	($arg5), $arg3
2623	call	asm_AES_encrypt		# generate initial tweak
2624
2625	mov	240($key), %eax		# rounds
2626	mov	$len, %rbx		# backup $len
2627
2628	mov	%eax, %edx		# rounds
2629	shl	\$7, %rax		# 128 bytes per inner round key
2630	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2631	sub	%rax, %rsp
2632
2633	mov	%rsp, %rax		# pass key schedule
2634	mov	$key, %rcx		# pass key
2635	mov	%edx, %r10d		# pass rounds
2636	call	_bsaes_key_convert
2637	pxor	(%rsp), %xmm7		# fix up round 0 key
2638	movdqa	%xmm6, (%rax)		# save last round key
2639	movdqa	%xmm7, (%rsp)
2640
2641	xor	%eax, %eax		# if ($len%16) len-=16;
2642	and	\$-16, $len
2643	test	\$15, %ebx
2644	setnz	%al
2645	shl	\$4, %rax
2646	sub	%rax, $len
2647
2648	sub	\$0x80, %rsp		# place for tweak[8]
2649	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2650
2651	pxor	$twtmp, $twtmp
2652	movdqa	.Lxts_magic(%rip), $twmask
2653	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2654
2655	sub	\$0x80, $len
2656	jc	.Lxts_dec_short
2657	jmp	.Lxts_dec_loop
2658
2659.align	16
2660.Lxts_dec_loop:
2661___
2662    for ($i=0;$i<7;$i++) {
2663    $code.=<<___;
2664	pshufd	\$0x13, $twtmp, $twres
2665	pxor	$twtmp, $twtmp
2666	movdqa	@XMM[7], @XMM[$i]
2667	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2668	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2669	pand	$twmask, $twres		# isolate carry and residue
2670	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2671	pxor	$twres, @XMM[7]
2672___
2673    $code.=<<___ if ($i>=1);
2674	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2675___
2676    $code.=<<___ if ($i>=2);
2677	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2678___
2679    }
2680$code.=<<___;
2681	movdqu	0x60($inp), @XMM[8+6]
2682	pxor	@XMM[8+5], @XMM[5]
2683	movdqu	0x70($inp), @XMM[8+7]
2684	lea	0x80($inp), $inp
2685	movdqa	@XMM[7], 0x70(%rsp)
2686	pxor	@XMM[8+6], @XMM[6]
2687	lea	0x80(%rsp), %rax	# pass key schedule
2688	pxor	@XMM[8+7], @XMM[7]
2689	mov	%edx, %r10d		# pass rounds
2690
2691	call	_bsaes_decrypt8
2692
2693	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2694	pxor	0x10(%rsp), @XMM[1]
2695	movdqu	@XMM[0], 0x00($out)	# write output
2696	pxor	0x20(%rsp), @XMM[6]
2697	movdqu	@XMM[1], 0x10($out)
2698	pxor	0x30(%rsp), @XMM[4]
2699	movdqu	@XMM[6], 0x20($out)
2700	pxor	0x40(%rsp), @XMM[2]
2701	movdqu	@XMM[4], 0x30($out)
2702	pxor	0x50(%rsp), @XMM[7]
2703	movdqu	@XMM[2], 0x40($out)
2704	pxor	0x60(%rsp), @XMM[3]
2705	movdqu	@XMM[7], 0x50($out)
2706	pxor	0x70(%rsp), @XMM[5]
2707	movdqu	@XMM[3], 0x60($out)
2708	movdqu	@XMM[5], 0x70($out)
2709	lea	0x80($out), $out
2710
2711	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2712	pxor	$twtmp, $twtmp
2713	movdqa	.Lxts_magic(%rip), $twmask
2714	pcmpgtd	@XMM[7], $twtmp
2715	pshufd	\$0x13, $twtmp, $twres
2716	pxor	$twtmp, $twtmp
2717	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2718	pand	$twmask, $twres		# isolate carry and residue
2719	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2720	pxor	$twres, @XMM[7]
2721
2722	sub	\$0x80,$len
2723	jnc	.Lxts_dec_loop
2724
2725.Lxts_dec_short:
2726	add	\$0x80, $len
2727	jz	.Lxts_dec_done
2728___
2729    for ($i=0;$i<7;$i++) {
2730    $code.=<<___;
2731	pshufd	\$0x13, $twtmp, $twres
2732	pxor	$twtmp, $twtmp
2733	movdqa	@XMM[7], @XMM[$i]
2734	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2735	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2736	pand	$twmask, $twres		# isolate carry and residue
2737	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2738	pxor	$twres, @XMM[7]
2739___
2740    $code.=<<___ if ($i>=1);
2741	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2742	cmp	\$`0x10*$i`,$len
2743	je	.Lxts_dec_$i
2744___
2745    $code.=<<___ if ($i>=2);
2746	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2747___
2748    }
2749$code.=<<___;
2750	movdqu	0x60($inp), @XMM[8+6]
2751	pxor	@XMM[8+5], @XMM[5]
2752	movdqa	@XMM[7], 0x70(%rsp)
2753	lea	0x70($inp), $inp
2754	pxor	@XMM[8+6], @XMM[6]
2755	lea	0x80(%rsp), %rax	# pass key schedule
2756	mov	%edx, %r10d		# pass rounds
2757
2758	call	_bsaes_decrypt8
2759
2760	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2761	pxor	0x10(%rsp), @XMM[1]
2762	movdqu	@XMM[0], 0x00($out)	# write output
2763	pxor	0x20(%rsp), @XMM[6]
2764	movdqu	@XMM[1], 0x10($out)
2765	pxor	0x30(%rsp), @XMM[4]
2766	movdqu	@XMM[6], 0x20($out)
2767	pxor	0x40(%rsp), @XMM[2]
2768	movdqu	@XMM[4], 0x30($out)
2769	pxor	0x50(%rsp), @XMM[7]
2770	movdqu	@XMM[2], 0x40($out)
2771	pxor	0x60(%rsp), @XMM[3]
2772	movdqu	@XMM[7], 0x50($out)
2773	movdqu	@XMM[3], 0x60($out)
2774	lea	0x70($out), $out
2775
2776	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2777	jmp	.Lxts_dec_done
2778.align	16
2779.Lxts_dec_6:
2780	pxor	@XMM[8+4], @XMM[4]
2781	lea	0x60($inp), $inp
2782	pxor	@XMM[8+5], @XMM[5]
2783	lea	0x80(%rsp), %rax	# pass key schedule
2784	mov	%edx, %r10d		# pass rounds
2785
2786	call	_bsaes_decrypt8
2787
2788	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2789	pxor	0x10(%rsp), @XMM[1]
2790	movdqu	@XMM[0], 0x00($out)	# write output
2791	pxor	0x20(%rsp), @XMM[6]
2792	movdqu	@XMM[1], 0x10($out)
2793	pxor	0x30(%rsp), @XMM[4]
2794	movdqu	@XMM[6], 0x20($out)
2795	pxor	0x40(%rsp), @XMM[2]
2796	movdqu	@XMM[4], 0x30($out)
2797	pxor	0x50(%rsp), @XMM[7]
2798	movdqu	@XMM[2], 0x40($out)
2799	movdqu	@XMM[7], 0x50($out)
2800	lea	0x60($out), $out
2801
2802	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2803	jmp	.Lxts_dec_done
2804.align	16
2805.Lxts_dec_5:
2806	pxor	@XMM[8+3], @XMM[3]
2807	lea	0x50($inp), $inp
2808	pxor	@XMM[8+4], @XMM[4]
2809	lea	0x80(%rsp), %rax	# pass key schedule
2810	mov	%edx, %r10d		# pass rounds
2811
2812	call	_bsaes_decrypt8
2813
2814	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2815	pxor	0x10(%rsp), @XMM[1]
2816	movdqu	@XMM[0], 0x00($out)	# write output
2817	pxor	0x20(%rsp), @XMM[6]
2818	movdqu	@XMM[1], 0x10($out)
2819	pxor	0x30(%rsp), @XMM[4]
2820	movdqu	@XMM[6], 0x20($out)
2821	pxor	0x40(%rsp), @XMM[2]
2822	movdqu	@XMM[4], 0x30($out)
2823	movdqu	@XMM[2], 0x40($out)
2824	lea	0x50($out), $out
2825
2826	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2827	jmp	.Lxts_dec_done
2828.align	16
2829.Lxts_dec_4:
2830	pxor	@XMM[8+2], @XMM[2]
2831	lea	0x40($inp), $inp
2832	pxor	@XMM[8+3], @XMM[3]
2833	lea	0x80(%rsp), %rax	# pass key schedule
2834	mov	%edx, %r10d		# pass rounds
2835
2836	call	_bsaes_decrypt8
2837
2838	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2839	pxor	0x10(%rsp), @XMM[1]
2840	movdqu	@XMM[0], 0x00($out)	# write output
2841	pxor	0x20(%rsp), @XMM[6]
2842	movdqu	@XMM[1], 0x10($out)
2843	pxor	0x30(%rsp), @XMM[4]
2844	movdqu	@XMM[6], 0x20($out)
2845	movdqu	@XMM[4], 0x30($out)
2846	lea	0x40($out), $out
2847
2848	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2849	jmp	.Lxts_dec_done
2850.align	16
2851.Lxts_dec_3:
2852	pxor	@XMM[8+1], @XMM[1]
2853	lea	0x30($inp), $inp
2854	pxor	@XMM[8+2], @XMM[2]
2855	lea	0x80(%rsp), %rax	# pass key schedule
2856	mov	%edx, %r10d		# pass rounds
2857
2858	call	_bsaes_decrypt8
2859
2860	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2861	pxor	0x10(%rsp), @XMM[1]
2862	movdqu	@XMM[0], 0x00($out)	# write output
2863	pxor	0x20(%rsp), @XMM[6]
2864	movdqu	@XMM[1], 0x10($out)
2865	movdqu	@XMM[6], 0x20($out)
2866	lea	0x30($out), $out
2867
2868	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2869	jmp	.Lxts_dec_done
2870.align	16
2871.Lxts_dec_2:
2872	pxor	@XMM[8+0], @XMM[0]
2873	lea	0x20($inp), $inp
2874	pxor	@XMM[8+1], @XMM[1]
2875	lea	0x80(%rsp), %rax	# pass key schedule
2876	mov	%edx, %r10d		# pass rounds
2877
2878	call	_bsaes_decrypt8
2879
2880	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2881	pxor	0x10(%rsp), @XMM[1]
2882	movdqu	@XMM[0], 0x00($out)	# write output
2883	movdqu	@XMM[1], 0x10($out)
2884	lea	0x20($out), $out
2885
2886	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2887	jmp	.Lxts_dec_done
2888.align	16
2889.Lxts_dec_1:
2890	pxor	@XMM[0], @XMM[8]
2891	lea	0x10($inp), $inp
2892	movdqa	@XMM[8], 0x20(%rbp)
2893	lea	0x20(%rbp), $arg1
2894	lea	0x20(%rbp), $arg2
2895	lea	($key), $arg3
2896	call	asm_AES_decrypt		# doesn't touch %xmm
2897	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2898	#pxor	@XMM[8], @XMM[0]
2899	#lea	0x80(%rsp), %rax	# pass key schedule
2900	#mov	%edx, %r10d		# pass rounds
2901	#call	_bsaes_decrypt8
2902	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2903	movdqu	@XMM[0], 0x00($out)	# write output
2904	lea	0x10($out), $out
2905
2906	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2907
2908.Lxts_dec_done:
2909	and	\$15, %ebx
2910	jz	.Lxts_dec_ret
2911
2912	pxor	$twtmp, $twtmp
2913	movdqa	.Lxts_magic(%rip), $twmask
2914	pcmpgtd	@XMM[7], $twtmp
2915	pshufd	\$0x13, $twtmp, $twres
2916	movdqa	@XMM[7], @XMM[6]
2917	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2918	pand	$twmask, $twres		# isolate carry and residue
2919	movdqu	($inp), @XMM[0]
2920	pxor	$twres, @XMM[7]
2921
2922	lea	0x20(%rbp), $arg1
2923	pxor	@XMM[7], @XMM[0]
2924	lea	0x20(%rbp), $arg2
2925	movdqa	@XMM[0], 0x20(%rbp)
2926	lea	($key), $arg3
2927	call	asm_AES_decrypt		# doesn't touch %xmm
2928	pxor	0x20(%rbp), @XMM[7]
2929	mov	$out, %rdx
2930	movdqu	@XMM[7], ($out)
2931
2932.Lxts_dec_steal:
2933	movzb	16($inp), %eax
2934	movzb	(%rdx), %ecx
2935	lea	1($inp), $inp
2936	mov	%al, (%rdx)
2937	mov	%cl, 16(%rdx)
2938	lea	1(%rdx), %rdx
2939	sub	\$1,%ebx
2940	jnz	.Lxts_dec_steal
2941
2942	movdqu	($out), @XMM[0]
2943	lea	0x20(%rbp), $arg1
2944	pxor	@XMM[6], @XMM[0]
2945	lea	0x20(%rbp), $arg2
2946	movdqa	@XMM[0], 0x20(%rbp)
2947	lea	($key), $arg3
2948	call	asm_AES_decrypt		# doesn't touch %xmm
2949	pxor	0x20(%rbp), @XMM[6]
2950	movdqu	@XMM[6], ($out)
2951
2952.Lxts_dec_ret:
2953	lea	(%rsp), %rax
2954	pxor	%xmm0, %xmm0
2955.Lxts_dec_bzero:			# wipe key schedule [if any]
2956	movdqa	%xmm0, 0x00(%rax)
2957	movdqa	%xmm0, 0x10(%rax)
2958	lea	0x20(%rax), %rax
2959	cmp	%rax, %rbp
2960	ja	.Lxts_dec_bzero
2961
2962	lea	0x78(%rbp),%rax
2963.cfi_def_cfa	%rax,8
2964___
2965$code.=<<___ if ($win64);
2966	movaps	0x40(%rbp), %xmm6
2967	movaps	0x50(%rbp), %xmm7
2968	movaps	0x60(%rbp), %xmm8
2969	movaps	0x70(%rbp), %xmm9
2970	movaps	0x80(%rbp), %xmm10
2971	movaps	0x90(%rbp), %xmm11
2972	movaps	0xa0(%rbp), %xmm12
2973	movaps	0xb0(%rbp), %xmm13
2974	movaps	0xc0(%rbp), %xmm14
2975	movaps	0xd0(%rbp), %xmm15
2976	lea	0xa0(%rax), %rax
2977.Lxts_dec_tail:
2978___
2979$code.=<<___;
2980	mov	-48(%rax), %r15
2981.cfi_restore	%r15
2982	mov	-40(%rax), %r14
2983.cfi_restore	%r14
2984	mov	-32(%rax), %r13
2985.cfi_restore	%r13
2986	mov	-24(%rax), %r12
2987.cfi_restore	%r12
2988	mov	-16(%rax), %rbx
2989.cfi_restore	%rbx
2990	mov	-8(%rax), %rbp
2991.cfi_restore	%rbp
2992	lea	(%rax), %rsp		# restore %rsp
2993.cfi_def_cfa_register	%rsp
2994.Lxts_dec_epilogue:
2995	ret
2996.cfi_endproc
2997.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2998___
2999}
3000$code.=<<___;
3001.type	_bsaes_const,\@object
3002.align	64
3003_bsaes_const:
3004.LM0ISR:	# InvShiftRows constants
3005	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
3006.LISRM0:
3007	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
3008.LISR:
3009	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
3010.LBS0:		# bit-slice constants
3011	.quad	0x5555555555555555, 0x5555555555555555
3012.LBS1:
3013	.quad	0x3333333333333333, 0x3333333333333333
3014.LBS2:
3015	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
3016.LSR:		# shiftrows constants
3017	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
3018.LSRM0:
3019	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
3020.LM0SR:
3021	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
3022.LSWPUP:	# byte-swap upper dword
3023	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
3024.LSWPUPM0SR:
3025	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
3026.LADD1:		# counter increment constants
3027	.quad	0x0000000000000000, 0x0000000100000000
3028.LADD2:
3029	.quad	0x0000000000000000, 0x0000000200000000
3030.LADD3:
3031	.quad	0x0000000000000000, 0x0000000300000000
3032.LADD4:
3033	.quad	0x0000000000000000, 0x0000000400000000
3034.LADD5:
3035	.quad	0x0000000000000000, 0x0000000500000000
3036.LADD6:
3037	.quad	0x0000000000000000, 0x0000000600000000
3038.LADD7:
3039	.quad	0x0000000000000000, 0x0000000700000000
3040.LADD8:
3041	.quad	0x0000000000000000, 0x0000000800000000
3042.Lxts_magic:
3043	.long	0x87,0,1,0
3044.Lmasks:
3045	.quad	0x0101010101010101, 0x0101010101010101
3046	.quad	0x0202020202020202, 0x0202020202020202
3047	.quad	0x0404040404040404, 0x0404040404040404
3048	.quad	0x0808080808080808, 0x0808080808080808
3049.LM0:
3050	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
3051.L63:
3052	.quad	0x6363636363636363, 0x6363636363636363
3053.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
3054.align	64
3055.size	_bsaes_const,.-_bsaes_const
3056___
3057
3058# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3059#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3060if ($win64) {
3061$rec="%rcx";
3062$frame="%rdx";
3063$context="%r8";
3064$disp="%r9";
3065
3066$code.=<<___;
3067.extern	__imp_RtlVirtualUnwind
3068.type	se_handler,\@abi-omnipotent
3069.align	16
3070se_handler:
3071	push	%rsi
3072	push	%rdi
3073	push	%rbx
3074	push	%rbp
3075	push	%r12
3076	push	%r13
3077	push	%r14
3078	push	%r15
3079	pushfq
3080	sub	\$64,%rsp
3081
3082	mov	120($context),%rax	# pull context->Rax
3083	mov	248($context),%rbx	# pull context->Rip
3084
3085	mov	8($disp),%rsi		# disp->ImageBase
3086	mov	56($disp),%r11		# disp->HandlerData
3087
3088	mov	0(%r11),%r10d		# HandlerData[0]
3089	lea	(%rsi,%r10),%r10	# prologue label
3090	cmp	%r10,%rbx		# context->Rip<=prologue label
3091	jbe	.Lin_prologue
3092
3093	mov	4(%r11),%r10d		# HandlerData[1]
3094	lea	(%rsi,%r10),%r10	# epilogue label
3095	cmp	%r10,%rbx		# context->Rip>=epilogue label
3096	jae	.Lin_prologue
3097
3098	mov	8(%r11),%r10d		# HandlerData[2]
3099	lea	(%rsi,%r10),%r10	# epilogue label
3100	cmp	%r10,%rbx		# context->Rip>=tail label
3101	jae	.Lin_tail
3102
3103	mov	160($context),%rax	# pull context->Rbp
3104
3105	lea	0x40(%rax),%rsi		# %xmm save area
3106	lea	512($context),%rdi	# &context.Xmm6
3107	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
3108	.long	0xa548f3fc		# cld; rep movsq
3109	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer
3110
3111.Lin_tail:
3112	mov	-48(%rax),%rbp
3113	mov	-40(%rax),%rbx
3114	mov	-32(%rax),%r12
3115	mov	-24(%rax),%r13
3116	mov	-16(%rax),%r14
3117	mov	-8(%rax),%r15
3118	mov	%rbx,144($context)	# restore context->Rbx
3119	mov	%rbp,160($context)	# restore context->Rbp
3120	mov	%r12,216($context)	# restore context->R12
3121	mov	%r13,224($context)	# restore context->R13
3122	mov	%r14,232($context)	# restore context->R14
3123	mov	%r15,240($context)	# restore context->R15
3124
3125.Lin_prologue:
3126	mov	%rax,152($context)	# restore context->Rsp
3127
3128	mov	40($disp),%rdi		# disp->ContextRecord
3129	mov	$context,%rsi		# context
3130	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3131	.long	0xa548f3fc		# cld; rep movsq
3132
3133	mov	$disp,%rsi
3134	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3135	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3136	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3137	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3138	mov	40(%rsi),%r10		# disp->ContextRecord
3139	lea	56(%rsi),%r11		# &disp->HandlerData
3140	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3141	mov	%r10,32(%rsp)		# arg5
3142	mov	%r11,40(%rsp)		# arg6
3143	mov	%r12,48(%rsp)		# arg7
3144	mov	%rcx,56(%rsp)		# arg8, (NULL)
3145	call	*__imp_RtlVirtualUnwind(%rip)
3146
3147	mov	\$1,%eax		# ExceptionContinueSearch
3148	add	\$64,%rsp
3149	popfq
3150	pop	%r15
3151	pop	%r14
3152	pop	%r13
3153	pop	%r12
3154	pop	%rbp
3155	pop	%rbx
3156	pop	%rdi
3157	pop	%rsi
3158	ret
3159.size	se_handler,.-se_handler
3160
3161.section	.pdata
3162.align	4
3163___
3164$code.=<<___ if ($ecb);
3165	.rva	.Lecb_enc_prologue
3166	.rva	.Lecb_enc_epilogue
3167	.rva	.Lecb_enc_info
3168
3169	.rva	.Lecb_dec_prologue
3170	.rva	.Lecb_dec_epilogue
3171	.rva	.Lecb_dec_info
3172___
3173$code.=<<___;
3174	.rva	.Lcbc_dec_prologue
3175	.rva	.Lcbc_dec_epilogue
3176	.rva	.Lcbc_dec_info
3177
3178	.rva	.Lctr_enc_prologue
3179	.rva	.Lctr_enc_epilogue
3180	.rva	.Lctr_enc_info
3181
3182	.rva	.Lxts_enc_prologue
3183	.rva	.Lxts_enc_epilogue
3184	.rva	.Lxts_enc_info
3185
3186	.rva	.Lxts_dec_prologue
3187	.rva	.Lxts_dec_epilogue
3188	.rva	.Lxts_dec_info
3189
3190.section	.xdata
3191.align	8
3192___
3193$code.=<<___ if ($ecb);
3194.Lecb_enc_info:
3195	.byte	9,0,0,0
3196	.rva	se_handler
3197	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3198	.rva	.Lecb_enc_tail
3199	.long	0
3200.Lecb_dec_info:
3201	.byte	9,0,0,0
3202	.rva	se_handler
3203	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3204	.rva	.Lecb_dec_tail
3205	.long	0
3206___
3207$code.=<<___;
3208.Lcbc_dec_info:
3209	.byte	9,0,0,0
3210	.rva	se_handler
3211	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3212	.rva	.Lcbc_dec_tail
3213	.long	0
3214.Lctr_enc_info:
3215	.byte	9,0,0,0
3216	.rva	se_handler
3217	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3218	.rva	.Lctr_enc_tail
3219	.long	0
3220.Lxts_enc_info:
3221	.byte	9,0,0,0
3222	.rva	se_handler
3223	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3224	.rva	.Lxts_enc_tail
3225	.long	0
3226.Lxts_dec_info:
3227	.byte	9,0,0,0
3228	.rva	se_handler
3229	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3230	.rva	.Lxts_dec_tail
3231	.long	0
3232___
3233}
3234
3235$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3236
3237print $code;
3238
3239close STDOUT;
3240