xref: /freebsd/crypto/openssl/crypto/aes/asm/aesp8-ppc.pl (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1#! /usr/bin/env perl
2# Copyright 2014-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for AES instructions as per PowerISA
18# specification version 2.07, first implemented by POWER8 processor.
19# The module is endian-agnostic in the sense that it supports both big-
20# and little-endian cases. Data alignment in parallelizable modes is
21# handled with VSX loads and stores, which implies MSR.VSX flag being
22# set. It should also be noted that ISA specification doesn't prohibit
23# alignment exceptions for these instructions on page boundaries.
24# Initially alignment was handled in pure AltiVec/VMX way [when data
25# is aligned programmatically, which in turn guarantees exception-
26# free execution], but it turned out to hamper performance when vcipher
27# instructions are interleaved. It's reckoned that eventual
28# misalignment penalties at page boundaries are on average lower
29# than additional overhead in pure AltiVec approach.
30#
31# May 2016
32#
33# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
34# systems were measured.
35#
36######################################################################
37# Current large-block performance in cycles per byte processed with
38# 128-bit key (less is better).
39#
40#		CBC en-/decrypt	CTR	XTS
41# POWER8[le]	3.96/0.72	0.74	1.1
42# POWER8[be]	3.75/0.65	0.66	1.0
43# POWER9[le]	4.02/0.86	0.84	1.05
44# POWER9[be]	3.99/0.78	0.79	0.97
45
46# $output is the last argument if it looks like a file (it has an extension)
47# $flavour is the first argument if it doesn't look like a file
# Split the command line.  A trailing argument that ends in ".<word>" is
# popped off as the output file name; a leading argument that contains no
# dot is shifted off as the flavour string.  Either one is left undef when
# the corresponding argument is absent.
48$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
49$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
50
# Choose the ABI-dependent parameters from the flavour string.  "64" is
# tested before "32", and any flavour matching neither aborts generation.
#   $SIZE_T - size of a pointer/register slot in bytes
#   $LRSAVE - offset from the stack pointer at which the link register is
#             saved (see "$PUSH r11,$LRSAVE($sp)" in the key setup code)
#   $STU    - store-with-update mnemonic used to allocate a stack frame
#   $POP    - load mnemonic for a register-sized value
#   $PUSH   - store mnemonic for a register-sized value
#   $UCMP   - unsigned compare mnemonic (an "i" is appended for immediates)
#   $SHL    - shift-left-immediate mnemonic
51if ($flavour =~ /64/) {
52	$SIZE_T	=8;
53	$LRSAVE	=2*$SIZE_T;
54	$STU	="stdu";
55	$POP	="ld";
56	$PUSH	="std";
57	$UCMP	="cmpld";
58	$SHL	="sldi";
59} elsif ($flavour =~ /32/) {
60	$SIZE_T	=4;
61	$LRSAVE	=$SIZE_T;
62	$STU	="stwu";
63	$POP	="lwz";
64	$PUSH	="stw";
65	$UCMP	="cmplw";
66	$SHL	="slwi";
67} else { die "nonsense $flavour"; }
68
# Equal to $SIZE_T when the flavour string ends in "le", otherwise 0.
# NOTE(review): not referenced in the part of the file visible here.
69$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
70
# Locate the ppc-xlate.pl translator.  $dir is the directory part of $0
# (including the trailing separator); the translator is looked for first in
# that directory and then in ../../perlasm/ relative to it.
71$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
72( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
73( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
74die "can't locate ppc-xlate.pl";
75
# Re-open STDOUT as a pipe into the translator, run with the same perl
# binary ($^X) and given the flavour and the quoted output name.  Anything
# printed to STDOUT from here on is fed to that process.
76open STDOUT,"| $^X $xlate $flavour \"$output\""
77    or die "can't call $xlate: $!";
78
# $FRAME is the size of the stack frame allocated by the generated routines
# that need one; $prefix is the common prefix of the exported symbol names.
79$FRAME=8*$SIZE_T;
80$prefix="aes_p8";
81
81
82$sp="r1";
83$vrsave="r12";
84
85#########################################################################
86{{{	# Key setup procedures						#
# This scope appends to $code the round-constant table "rcon", the local
# helper "Lconsts", and the exported routines ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key.
#
# General-purpose register aliases (r3..r8):
#   $inp    - pointer to the user key (compared against 0 on entry)
#   $bits   - key length in bits
#   $out    - pointer to the key schedule being written (compared against 0)
#   $ptr    - holds the return code during validation, then the address of
#             "rcon" as returned by Lconsts
#   $cnt    - iteration count moved into CTR
#   $rounds - round count (10, 12 or 14) stored through $out at Ldone
87my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
# Vector register aliases (v0..v6) used by the key expansion itself.
88my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
# Vector register aliases (v7..v11) used to stage round keys for storing
# to a possibly unaligned $out (permute, merge with previous tail, store).
89my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
90
# Behaviour of the emitted code, as read from the listing below:
#
#  Lconsts
#    Returns the address of "rcon" in $ptr.  It obtains the current address
#    with "bcl 20,31,$+4" / mflr, subtracts the fixed distance 0x58, and
#    restores the caller's LR from r0.
#
#  ${prefix}_set_encrypt_key (local alias Lset_encrypt_key)
#    Returns in r3:
#      -1  if $inp or $out is 0,
#      -2  if $bits is below 128, above 256, or has any of the low six bits
#          set (i.e. is not a multiple of 64),
#       0  on success.
#    On success the key is expanded by one of three paths chosen by
#    comparing $bits with 192: Loop128 (8 iterations plus two unrolled
#    steps, $rounds=10), L192/Loop192 (4 iterations, $rounds=12) or
#    L256/Loop256 (7 iterations, $rounds=14).  At Ldone the last partial
#    vector is merged and stored, special-purpose register 256 is restored
#    from $vrsave, and $rounds is stored as a 32-bit word at 0($out).
#
#  ${prefix}_set_decrypt_key
#    Allocates a $FRAME-byte stack frame, calls Lset_encrypt_key and passes
#    a non-zero result straight through.  Otherwise it computes the address
#    of the first round key as $out-240 and of the last as first+16*$rounds,
#    then swaps 16-byte round keys pairwise from both ends inwards for
#    $rounds/2 iterations using 32-bit loads and stores, and returns 0.
91$code.=<<___;
92.machine	"any"
93
94.text
95
96.align	7
97rcon:
98.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
99.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
100.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
101.long	0,0,0,0						?asis
102.long	0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
103Lconsts:
104	mflr	r0
105	bcl	20,31,\$+4
106	mflr	$ptr	 #vvvvv "distance between . and rcon
107	addi	$ptr,$ptr,-0x58
108	mtlr	r0
109	blr
110	.long	0
111	.byte	0,12,0x14,0,0,0,0,0
112.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
113
114.globl	.${prefix}_set_encrypt_key
115.align	5
116.${prefix}_set_encrypt_key:
117Lset_encrypt_key:
118	mflr		r11
119	$PUSH		r11,$LRSAVE($sp)
120
121	li		$ptr,-1
122	${UCMP}i	$inp,0
123	beq-		Lenc_key_abort		# if ($inp==0) return -1;
124	${UCMP}i	$out,0
125	beq-		Lenc_key_abort		# if ($out==0) return -1;
126	li		$ptr,-2
127	cmpwi		$bits,128
128	blt-		Lenc_key_abort
129	cmpwi		$bits,256
130	bgt-		Lenc_key_abort
131	andi.		r0,$bits,0x3f
132	bne-		Lenc_key_abort
133
134	lis		r0,0xfff0
135	mfspr		$vrsave,256
136	mtspr		256,r0
137
138	bl		Lconsts
139	mtlr		r11
140
141	neg		r9,$inp
142	lvx		$in0,0,$inp
143	addi		$inp,$inp,15		# 15 is not typo
144	lvsr		$key,0,r9		# borrow $key
145	li		r8,0x20
146	cmpwi		$bits,192
147	lvx		$in1,0,$inp
148	le?vspltisb	$mask,0x0f		# borrow $mask
149	lvx		$rcon,0,$ptr
150	le?vxor		$key,$key,$mask		# adjust for byte swap
151	lvx		$mask,r8,$ptr
152	addi		$ptr,$ptr,0x10
153	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
154	li		$cnt,8
155	vxor		$zero,$zero,$zero
156	mtctr		$cnt
157
158	?lvsr		$outperm,0,$out
159	vspltisb	$outmask,-1
160	lvx		$outhead,0,$out
161	?vperm		$outmask,$zero,$outmask,$outperm
162
163	blt		Loop128
164	addi		$inp,$inp,8
165	beq		L192
166	addi		$inp,$inp,8
167	b		L256
168
169.align	4
170Loop128:
171	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
172	vsldoi		$tmp,$zero,$in0,12	# >>32
173	 vperm		$outtail,$in0,$in0,$outperm	# rotate
174	 vsel		$stage,$outhead,$outtail,$outmask
175	 vmr		$outhead,$outtail
176	vcipherlast	$key,$key,$rcon
177	 stvx		$stage,0,$out
178	 addi		$out,$out,16
179
180	vxor		$in0,$in0,$tmp
181	vsldoi		$tmp,$zero,$tmp,12	# >>32
182	vxor		$in0,$in0,$tmp
183	vsldoi		$tmp,$zero,$tmp,12	# >>32
184	vxor		$in0,$in0,$tmp
185	 vadduwm	$rcon,$rcon,$rcon
186	vxor		$in0,$in0,$key
187	bdnz		Loop128
188
189	lvx		$rcon,0,$ptr		# last two round keys
190
191	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
192	vsldoi		$tmp,$zero,$in0,12	# >>32
193	 vperm		$outtail,$in0,$in0,$outperm	# rotate
194	 vsel		$stage,$outhead,$outtail,$outmask
195	 vmr		$outhead,$outtail
196	vcipherlast	$key,$key,$rcon
197	 stvx		$stage,0,$out
198	 addi		$out,$out,16
199
200	vxor		$in0,$in0,$tmp
201	vsldoi		$tmp,$zero,$tmp,12	# >>32
202	vxor		$in0,$in0,$tmp
203	vsldoi		$tmp,$zero,$tmp,12	# >>32
204	vxor		$in0,$in0,$tmp
205	 vadduwm	$rcon,$rcon,$rcon
206	vxor		$in0,$in0,$key
207
208	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
209	vsldoi		$tmp,$zero,$in0,12	# >>32
210	 vperm		$outtail,$in0,$in0,$outperm	# rotate
211	 vsel		$stage,$outhead,$outtail,$outmask
212	 vmr		$outhead,$outtail
213	vcipherlast	$key,$key,$rcon
214	 stvx		$stage,0,$out
215	 addi		$out,$out,16
216
217	vxor		$in0,$in0,$tmp
218	vsldoi		$tmp,$zero,$tmp,12	# >>32
219	vxor		$in0,$in0,$tmp
220	vsldoi		$tmp,$zero,$tmp,12	# >>32
221	vxor		$in0,$in0,$tmp
222	vxor		$in0,$in0,$key
223	 vperm		$outtail,$in0,$in0,$outperm	# rotate
224	 vsel		$stage,$outhead,$outtail,$outmask
225	 vmr		$outhead,$outtail
226	 stvx		$stage,0,$out
227
228	addi		$inp,$out,15		# 15 is not typo
229	addi		$out,$out,0x50
230
231	li		$rounds,10
232	b		Ldone
233
234.align	4
235L192:
236	lvx		$tmp,0,$inp
237	li		$cnt,4
238	 vperm		$outtail,$in0,$in0,$outperm	# rotate
239	 vsel		$stage,$outhead,$outtail,$outmask
240	 vmr		$outhead,$outtail
241	 stvx		$stage,0,$out
242	 addi		$out,$out,16
243	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
244	vspltisb	$key,8			# borrow $key
245	mtctr		$cnt
246	vsububm		$mask,$mask,$key	# adjust the mask
247
248Loop192:
249	vperm		$key,$in1,$in1,$mask	# roate-n-splat
250	vsldoi		$tmp,$zero,$in0,12	# >>32
251	vcipherlast	$key,$key,$rcon
252
253	vxor		$in0,$in0,$tmp
254	vsldoi		$tmp,$zero,$tmp,12	# >>32
255	vxor		$in0,$in0,$tmp
256	vsldoi		$tmp,$zero,$tmp,12	# >>32
257	vxor		$in0,$in0,$tmp
258
259	 vsldoi		$stage,$zero,$in1,8
260	vspltw		$tmp,$in0,3
261	vxor		$tmp,$tmp,$in1
262	vsldoi		$in1,$zero,$in1,12	# >>32
263	 vadduwm	$rcon,$rcon,$rcon
264	vxor		$in1,$in1,$tmp
265	vxor		$in0,$in0,$key
266	vxor		$in1,$in1,$key
267	 vsldoi		$stage,$stage,$in0,8
268
269	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
270	vsldoi		$tmp,$zero,$in0,12	# >>32
271	 vperm		$outtail,$stage,$stage,$outperm	# rotate
272	 vsel		$stage,$outhead,$outtail,$outmask
273	 vmr		$outhead,$outtail
274	vcipherlast	$key,$key,$rcon
275	 stvx		$stage,0,$out
276	 addi		$out,$out,16
277
278	 vsldoi		$stage,$in0,$in1,8
279	vxor		$in0,$in0,$tmp
280	vsldoi		$tmp,$zero,$tmp,12	# >>32
281	 vperm		$outtail,$stage,$stage,$outperm	# rotate
282	 vsel		$stage,$outhead,$outtail,$outmask
283	 vmr		$outhead,$outtail
284	vxor		$in0,$in0,$tmp
285	vsldoi		$tmp,$zero,$tmp,12	# >>32
286	vxor		$in0,$in0,$tmp
287	 stvx		$stage,0,$out
288	 addi		$out,$out,16
289
290	vspltw		$tmp,$in0,3
291	vxor		$tmp,$tmp,$in1
292	vsldoi		$in1,$zero,$in1,12	# >>32
293	 vadduwm	$rcon,$rcon,$rcon
294	vxor		$in1,$in1,$tmp
295	vxor		$in0,$in0,$key
296	vxor		$in1,$in1,$key
297	 vperm		$outtail,$in0,$in0,$outperm	# rotate
298	 vsel		$stage,$outhead,$outtail,$outmask
299	 vmr		$outhead,$outtail
300	 stvx		$stage,0,$out
301	 addi		$inp,$out,15		# 15 is not typo
302	 addi		$out,$out,16
303	bdnz		Loop192
304
305	li		$rounds,12
306	addi		$out,$out,0x20
307	b		Ldone
308
309.align	4
310L256:
311	lvx		$tmp,0,$inp
312	li		$cnt,7
313	li		$rounds,14
314	 vperm		$outtail,$in0,$in0,$outperm	# rotate
315	 vsel		$stage,$outhead,$outtail,$outmask
316	 vmr		$outhead,$outtail
317	 stvx		$stage,0,$out
318	 addi		$out,$out,16
319	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
320	mtctr		$cnt
321
322Loop256:
323	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
324	vsldoi		$tmp,$zero,$in0,12	# >>32
325	 vperm		$outtail,$in1,$in1,$outperm	# rotate
326	 vsel		$stage,$outhead,$outtail,$outmask
327	 vmr		$outhead,$outtail
328	vcipherlast	$key,$key,$rcon
329	 stvx		$stage,0,$out
330	 addi		$out,$out,16
331
332	vxor		$in0,$in0,$tmp
333	vsldoi		$tmp,$zero,$tmp,12	# >>32
334	vxor		$in0,$in0,$tmp
335	vsldoi		$tmp,$zero,$tmp,12	# >>32
336	vxor		$in0,$in0,$tmp
337	 vadduwm	$rcon,$rcon,$rcon
338	vxor		$in0,$in0,$key
339	 vperm		$outtail,$in0,$in0,$outperm	# rotate
340	 vsel		$stage,$outhead,$outtail,$outmask
341	 vmr		$outhead,$outtail
342	 stvx		$stage,0,$out
343	 addi		$inp,$out,15		# 15 is not typo
344	 addi		$out,$out,16
345	bdz		Ldone
346
347	vspltw		$key,$in0,3		# just splat
348	vsldoi		$tmp,$zero,$in1,12	# >>32
349	vsbox		$key,$key
350
351	vxor		$in1,$in1,$tmp
352	vsldoi		$tmp,$zero,$tmp,12	# >>32
353	vxor		$in1,$in1,$tmp
354	vsldoi		$tmp,$zero,$tmp,12	# >>32
355	vxor		$in1,$in1,$tmp
356
357	vxor		$in1,$in1,$key
358	b		Loop256
359
360.align	4
361Ldone:
362	lvx		$in1,0,$inp		# redundant in aligned case
363	vsel		$in1,$outhead,$in1,$outmask
364	stvx		$in1,0,$inp
365	li		$ptr,0
366	mtspr		256,$vrsave
367	stw		$rounds,0($out)
368
369Lenc_key_abort:
370	mr		r3,$ptr
371	blr
372	.long		0
373	.byte		0,12,0x14,1,0,0,3,0
374	.long		0
375.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
376
377.globl	.${prefix}_set_decrypt_key
378.align	5
379.${prefix}_set_decrypt_key:
380	$STU		$sp,-$FRAME($sp)
381	mflr		r10
382	$PUSH		r10,$FRAME+$LRSAVE($sp)
383	bl		Lset_encrypt_key
384	mtlr		r10
385
386	cmpwi		r3,0
387	bne-		Ldec_key_abort
388
389	slwi		$cnt,$rounds,4
390	subi		$inp,$out,240		# first round key
391	srwi		$rounds,$rounds,1
392	add		$out,$inp,$cnt		# last round key
393	mtctr		$rounds
394
395Ldeckey:
396	lwz		r0, 0($inp)
397	lwz		r6, 4($inp)
398	lwz		r7, 8($inp)
399	lwz		r8, 12($inp)
400	addi		$inp,$inp,16
401	lwz		r9, 0($out)
402	lwz		r10,4($out)
403	lwz		r11,8($out)
404	lwz		r12,12($out)
405	stw		r0, 0($out)
406	stw		r6, 4($out)
407	stw		r7, 8($out)
408	stw		r8, 12($out)
409	subi		$out,$out,16
410	stw		r9, -16($inp)
411	stw		r10,-12($inp)
412	stw		r11,-8($inp)
413	stw		r12,-4($inp)
414	bdnz		Ldeckey
415
416	xor		r3,r3,r3		# return value
417Ldec_key_abort:
418	addi		$sp,$sp,$FRAME
419	blr
420	.long		0
421	.byte		0,12,4,1,0x80,0,3,0
422	.long		0
423.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
424___
425}}}
426#########################################################################
427{{{	# Single block en- and decrypt procedures			#
# gen_block(DIR)
#
# Appends to $code the single-block routine ".${prefix}_DIRcrypt".
#
# Parameter:
#   DIR - "en" or "de".  It becomes part of the symbol and loop-label names;
#         for "de" the mnemonics are spelled vncipher/vncipherlast, for any
#         other value vcipher/vcipherlast.
# Returns nothing useful; the only effect is the text appended to $code.
#
# The sub used to be declared with an empty prototype, "sub gen_block ()",
# although it takes one argument.  That only worked because both calls
# below use the "&" form, which bypasses prototype checking; a plain
# gen_block("en") would have been rejected at compile time with "Too many
# arguments".  The prototype is dropped so that every call form is valid.
#
# Emitted routine, as read from the listing below:
#   - general-purpose registers r3..r7 are named $inp, $out, $key, $rounds
#     and $idx; $rounds is loaded from offset 240 of $key;
#   - the input block is loaded with two lvx and a vperm so that $inp may
#     be unaligned, round keys are likewise permuted through v5 (keyperm);
#   - after the initial xor with the first round key, the loop runs
#     $rounds/2-1 times with two cipher rounds per iteration, followed by
#     one more round and the "last" round;
#   - the result is merged into the two 16-byte lines that cover $out, so
#     $out may be unaligned as well;
#   - special-purpose register 256 is saved in $vrsave on entry and
#     restored before returning.
428sub gen_block {
429my $dir = shift;
# Mnemonic infix: "n" turns v${n}cipher into vncipher for decryption.
430my $n   = $dir eq "de" ? "n" : "";
431my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
432
433$code.=<<___;
434.globl	.${prefix}_${dir}crypt
435.align	5
436.${prefix}_${dir}crypt:
437	lwz		$rounds,240($key)
438	lis		r0,0xfc00
439	mfspr		$vrsave,256
440	li		$idx,15			# 15 is not typo
441	mtspr		256,r0
442
443	lvx		v0,0,$inp
444	neg		r11,$out
445	lvx		v1,$idx,$inp
446	lvsl		v2,0,$inp		# inpperm
447	le?vspltisb	v4,0x0f
448	?lvsl		v3,0,r11		# outperm
449	le?vxor		v2,v2,v4
450	li		$idx,16
451	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
452	lvx		v1,0,$key
453	?lvsl		v5,0,$key		# keyperm
454	srwi		$rounds,$rounds,1
455	lvx		v2,$idx,$key
456	addi		$idx,$idx,16
457	subi		$rounds,$rounds,1
458	?vperm		v1,v1,v2,v5		# align round key
459
460	vxor		v0,v0,v1
461	lvx		v1,$idx,$key
462	addi		$idx,$idx,16
463	mtctr		$rounds
464
465Loop_${dir}c:
466	?vperm		v2,v2,v1,v5
467	v${n}cipher	v0,v0,v2
468	lvx		v2,$idx,$key
469	addi		$idx,$idx,16
470	?vperm		v1,v1,v2,v5
471	v${n}cipher	v0,v0,v1
472	lvx		v1,$idx,$key
473	addi		$idx,$idx,16
474	bdnz		Loop_${dir}c
475
476	?vperm		v2,v2,v1,v5
477	v${n}cipher	v0,v0,v2
478	lvx		v2,$idx,$key
479	?vperm		v1,v1,v2,v5
480	v${n}cipherlast	v0,v0,v1
481
482	vspltisb	v2,-1
483	vxor		v1,v1,v1
484	li		$idx,15			# 15 is not typo
485	?vperm		v2,v1,v2,v3		# outmask
486	le?vxor		v3,v3,v4
487	lvx		v1,0,$out		# outhead
488	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
489	vsel		v1,v1,v0,v2
490	lvx		v4,$idx,$out
491	stvx		v1,0,$out
492	vsel		v0,v0,v4,v2
493	stvx		v0,$idx,$out
494
495	mtspr		256,$vrsave
496	blr
497	.long		0
498	.byte		0,12,0x14,0,0,0,3,0
499	.long		0
500.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
501___
502}
# Emit the encrypt routine first, then the decrypt routine.
503&gen_block("en");
504&gen_block("de");
505}}}
506#########################################################################
507{{{	# CBC en- and decrypt procedures				#
# Appends to $code the exported routine ".${prefix}_cbc_encrypt", which
# handles both CBC directions.
#
# General-purpose register aliases (r3..r10):
#   $inp    - input pointer (may be unaligned)
#   $out    - output pointer (may be unaligned)
#   $len    - byte count
#   $key    - key schedule; the round count is read from offset 240
#   $ivp    - pointer to the IV, read on entry and rewritten on exit
#   $enc    - direction: 0 selects decryption, anything else encryption;
#             reused as a scratch register at Lcbc_done
#   $rounds - loop count derived from the round count (rounds/2-1)
#   $idx    - byte offset used to walk the key schedule
508my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
# Vector register aliases (v0..v3): two alternating round-key registers,
# the block being processed, and a temporary.
509my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
# Vector register aliases (v4..v10): the running IV plus the state needed
# for unaligned input loads, unaligned output stores and unaligned keys.
510my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
511						map("v$_",(4..10));
# Behaviour of the emitted code, as read from the listing below:
#   - returns immediately when $len is below 16;
#   - the direction is tested once on entry ("cmpwi $enc,0") and the result
#     is consumed by "beq Lcbc_dec" after the common setup;
#   - Lcbc_enc: one 16-byte block per pass; the block is xored with the
#     first round key and the running IV, run through the cipher rounds,
#     and the result becomes both the output block and the next IV;
#   - Lcbc_dec: when $len is at least 128 control transfers to
#     _aesp8_cbc_decrypt8x; otherwise one block per pass is decrypted,
#     xored with the running IV, and the ciphertext block just consumed
#     becomes the next IV;
#   - both loops subtract 16 from $len per block and repeat while at least
#     16 bytes remain, so a trailing partial block is not processed;
#   - Lcbc_done flushes the last partial output vector, writes the running
#     IV back to $ivp (which may be unaligned) and restores special-purpose
#     register 256 from $vrsave.
512$code.=<<___;
513.globl	.${prefix}_cbc_encrypt
514.align	5
515.${prefix}_cbc_encrypt:
516	${UCMP}i	$len,16
517	bltlr-
518
519	cmpwi		$enc,0			# test direction
520	lis		r0,0xffe0
521	mfspr		$vrsave,256
522	mtspr		256,r0
523
524	li		$idx,15
525	vxor		$rndkey0,$rndkey0,$rndkey0
526	le?vspltisb	$tmp,0x0f
527
528	lvx		$ivec,0,$ivp		# load [unaligned] iv
529	lvsl		$inpperm,0,$ivp
530	lvx		$inptail,$idx,$ivp
531	le?vxor		$inpperm,$inpperm,$tmp
532	vperm		$ivec,$ivec,$inptail,$inpperm
533
534	neg		r11,$inp
535	?lvsl		$keyperm,0,$key		# prepare for unaligned key
536	lwz		$rounds,240($key)
537
538	lvsr		$inpperm,0,r11		# prepare for unaligned load
539	lvx		$inptail,0,$inp
540	addi		$inp,$inp,15		# 15 is not typo
541	le?vxor		$inpperm,$inpperm,$tmp
542
543	?lvsr		$outperm,0,$out		# prepare for unaligned store
544	vspltisb	$outmask,-1
545	lvx		$outhead,0,$out
546	?vperm		$outmask,$rndkey0,$outmask,$outperm
547	le?vxor		$outperm,$outperm,$tmp
548
549	srwi		$rounds,$rounds,1
550	li		$idx,16
551	subi		$rounds,$rounds,1
552	beq		Lcbc_dec
553
554Lcbc_enc:
555	vmr		$inout,$inptail
556	lvx		$inptail,0,$inp
557	addi		$inp,$inp,16
558	mtctr		$rounds
559	subi		$len,$len,16		# len-=16
560
561	lvx		$rndkey0,0,$key
562	 vperm		$inout,$inout,$inptail,$inpperm
563	lvx		$rndkey1,$idx,$key
564	addi		$idx,$idx,16
565	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
566	vxor		$inout,$inout,$rndkey0
567	lvx		$rndkey0,$idx,$key
568	addi		$idx,$idx,16
569	vxor		$inout,$inout,$ivec
570
571Loop_cbc_enc:
572	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
573	vcipher		$inout,$inout,$rndkey1
574	lvx		$rndkey1,$idx,$key
575	addi		$idx,$idx,16
576	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
577	vcipher		$inout,$inout,$rndkey0
578	lvx		$rndkey0,$idx,$key
579	addi		$idx,$idx,16
580	bdnz		Loop_cbc_enc
581
582	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
583	vcipher		$inout,$inout,$rndkey1
584	lvx		$rndkey1,$idx,$key
585	li		$idx,16
586	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
587	vcipherlast	$ivec,$inout,$rndkey0
588	${UCMP}i	$len,16
589
590	vperm		$tmp,$ivec,$ivec,$outperm
591	vsel		$inout,$outhead,$tmp,$outmask
592	vmr		$outhead,$tmp
593	stvx		$inout,0,$out
594	addi		$out,$out,16
595	bge		Lcbc_enc
596
597	b		Lcbc_done
598
599.align	4
600Lcbc_dec:
601	${UCMP}i	$len,128
602	bge		_aesp8_cbc_decrypt8x
603	vmr		$tmp,$inptail
604	lvx		$inptail,0,$inp
605	addi		$inp,$inp,16
606	mtctr		$rounds
607	subi		$len,$len,16		# len-=16
608
609	lvx		$rndkey0,0,$key
610	 vperm		$tmp,$tmp,$inptail,$inpperm
611	lvx		$rndkey1,$idx,$key
612	addi		$idx,$idx,16
613	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
614	vxor		$inout,$tmp,$rndkey0
615	lvx		$rndkey0,$idx,$key
616	addi		$idx,$idx,16
617
618Loop_cbc_dec:
619	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
620	vncipher	$inout,$inout,$rndkey1
621	lvx		$rndkey1,$idx,$key
622	addi		$idx,$idx,16
623	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
624	vncipher	$inout,$inout,$rndkey0
625	lvx		$rndkey0,$idx,$key
626	addi		$idx,$idx,16
627	bdnz		Loop_cbc_dec
628
629	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
630	vncipher	$inout,$inout,$rndkey1
631	lvx		$rndkey1,$idx,$key
632	li		$idx,16
633	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
634	vncipherlast	$inout,$inout,$rndkey0
635	${UCMP}i	$len,16
636
637	vxor		$inout,$inout,$ivec
638	vmr		$ivec,$tmp
639	vperm		$tmp,$inout,$inout,$outperm
640	vsel		$inout,$outhead,$tmp,$outmask
641	vmr		$outhead,$tmp
642	stvx		$inout,0,$out
643	addi		$out,$out,16
644	bge		Lcbc_dec
645
646Lcbc_done:
647	addi		$out,$out,-1
648	lvx		$inout,0,$out		# redundant in aligned case
649	vsel		$inout,$outhead,$inout,$outmask
650	stvx		$inout,0,$out
651
652	neg		$enc,$ivp		# write [unaligned] iv
653	li		$idx,15			# 15 is not typo
654	vxor		$rndkey0,$rndkey0,$rndkey0
655	vspltisb	$outmask,-1
656	le?vspltisb	$tmp,0x0f
657	?lvsl		$outperm,0,$enc
658	?vperm		$outmask,$rndkey0,$outmask,$outperm
659	le?vxor		$outperm,$outperm,$tmp
660	lvx		$outhead,0,$ivp
661	vperm		$ivec,$ivec,$ivec,$outperm
662	vsel		$inout,$outhead,$ivec,$outmask
663	lvx		$inptail,$idx,$ivp
664	stvx		$inout,0,$ivp
665	vsel		$inout,$ivec,$inptail,$outmask
666	stvx		$inout,$idx,$ivp
667
668	mtspr		256,$vrsave
669	blr
670	.long		0
671	.byte		0,12,0x14,0,0,0,6,0
672	.long		0
673___
674#########################################################################
675{{	# Optimized CBC decrypt procedure				#
676my $key_="r11";
677my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
678    $x00=0 if ($flavour =~ /osx/);
679my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
680my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
681my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
682			# v26-v31 last 6 round keys
683my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
684
685$code.=<<___;
686.align	5
687_aesp8_cbc_decrypt8x:
688	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
689	li		r10,`$FRAME+8*16+15`
690	li		r11,`$FRAME+8*16+31`
691	stvx		v20,r10,$sp		# ABI says so
692	addi		r10,r10,32
693	stvx		v21,r11,$sp
694	addi		r11,r11,32
695	stvx		v22,r10,$sp
696	addi		r10,r10,32
697	stvx		v23,r11,$sp
698	addi		r11,r11,32
699	stvx		v24,r10,$sp
700	addi		r10,r10,32
701	stvx		v25,r11,$sp
702	addi		r11,r11,32
703	stvx		v26,r10,$sp
704	addi		r10,r10,32
705	stvx		v27,r11,$sp
706	addi		r11,r11,32
707	stvx		v28,r10,$sp
708	addi		r10,r10,32
709	stvx		v29,r11,$sp
710	addi		r11,r11,32
711	stvx		v30,r10,$sp
712	stvx		v31,r11,$sp
713	li		r0,-1
714	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
715	li		$x10,0x10
716	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
717	li		$x20,0x20
718	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
719	li		$x30,0x30
720	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
721	li		$x40,0x40
722	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
723	li		$x50,0x50
724	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
725	li		$x60,0x60
726	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
727	li		$x70,0x70
728	mtspr		256,r0
729
730	subi		$rounds,$rounds,3	# -4 in total
731	subi		$len,$len,128		# bias
732
733	lvx		$rndkey0,$x00,$key	# load key schedule
734	lvx		v30,$x10,$key
735	addi		$key,$key,0x20
736	lvx		v31,$x00,$key
737	?vperm		$rndkey0,$rndkey0,v30,$keyperm
738	addi		$key_,$sp,$FRAME+15
739	mtctr		$rounds
740
741Load_cbc_dec_key:
742	?vperm		v24,v30,v31,$keyperm
743	lvx		v30,$x10,$key
744	addi		$key,$key,0x20
745	stvx		v24,$x00,$key_		# off-load round[1]
746	?vperm		v25,v31,v30,$keyperm
747	lvx		v31,$x00,$key
748	stvx		v25,$x10,$key_		# off-load round[2]
749	addi		$key_,$key_,0x20
750	bdnz		Load_cbc_dec_key
751
752	lvx		v26,$x10,$key
753	?vperm		v24,v30,v31,$keyperm
754	lvx		v27,$x20,$key
755	stvx		v24,$x00,$key_		# off-load round[3]
756	?vperm		v25,v31,v26,$keyperm
757	lvx		v28,$x30,$key
758	stvx		v25,$x10,$key_		# off-load round[4]
759	addi		$key_,$sp,$FRAME+15	# rewind $key_
760	?vperm		v26,v26,v27,$keyperm
761	lvx		v29,$x40,$key
762	?vperm		v27,v27,v28,$keyperm
763	lvx		v30,$x50,$key
764	?vperm		v28,v28,v29,$keyperm
765	lvx		v31,$x60,$key
766	?vperm		v29,v29,v30,$keyperm
767	lvx		$out0,$x70,$key		# borrow $out0
768	?vperm		v30,v30,v31,$keyperm
769	lvx		v24,$x00,$key_		# pre-load round[1]
770	?vperm		v31,v31,$out0,$keyperm
771	lvx		v25,$x10,$key_		# pre-load round[2]
772
773	#lvx		$inptail,0,$inp		# "caller" already did this
774	#addi		$inp,$inp,15		# 15 is not typo
775	subi		$inp,$inp,15		# undo "caller"
776
777	 le?li		$idx,8
778	lvx_u		$in0,$x00,$inp		# load first 8 "words"
779	 le?lvsl	$inpperm,0,$idx
780	 le?vspltisb	$tmp,0x0f
781	lvx_u		$in1,$x10,$inp
782	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
783	lvx_u		$in2,$x20,$inp
784	 le?vperm	$in0,$in0,$in0,$inpperm
785	lvx_u		$in3,$x30,$inp
786	 le?vperm	$in1,$in1,$in1,$inpperm
787	lvx_u		$in4,$x40,$inp
788	 le?vperm	$in2,$in2,$in2,$inpperm
789	vxor		$out0,$in0,$rndkey0
790	lvx_u		$in5,$x50,$inp
791	 le?vperm	$in3,$in3,$in3,$inpperm
792	vxor		$out1,$in1,$rndkey0
793	lvx_u		$in6,$x60,$inp
794	 le?vperm	$in4,$in4,$in4,$inpperm
795	vxor		$out2,$in2,$rndkey0
796	lvx_u		$in7,$x70,$inp
797	addi		$inp,$inp,0x80
798	 le?vperm	$in5,$in5,$in5,$inpperm
799	vxor		$out3,$in3,$rndkey0
800	 le?vperm	$in6,$in6,$in6,$inpperm
801	vxor		$out4,$in4,$rndkey0
802	 le?vperm	$in7,$in7,$in7,$inpperm
803	vxor		$out5,$in5,$rndkey0
804	vxor		$out6,$in6,$rndkey0
805	vxor		$out7,$in7,$rndkey0
806
807	mtctr		$rounds
808	b		Loop_cbc_dec8x
809.align	5
810Loop_cbc_dec8x:
811	vncipher	$out0,$out0,v24
812	vncipher	$out1,$out1,v24
813	vncipher	$out2,$out2,v24
814	vncipher	$out3,$out3,v24
815	vncipher	$out4,$out4,v24
816	vncipher	$out5,$out5,v24
817	vncipher	$out6,$out6,v24
818	vncipher	$out7,$out7,v24
819	lvx		v24,$x20,$key_		# round[3]
820	addi		$key_,$key_,0x20
821
822	vncipher	$out0,$out0,v25
823	vncipher	$out1,$out1,v25
824	vncipher	$out2,$out2,v25
825	vncipher	$out3,$out3,v25
826	vncipher	$out4,$out4,v25
827	vncipher	$out5,$out5,v25
828	vncipher	$out6,$out6,v25
829	vncipher	$out7,$out7,v25
830	lvx		v25,$x10,$key_		# round[4]
831	bdnz		Loop_cbc_dec8x
832
833	subic		$len,$len,128		# $len-=128
834	vncipher	$out0,$out0,v24
835	vncipher	$out1,$out1,v24
836	vncipher	$out2,$out2,v24
837	vncipher	$out3,$out3,v24
838	vncipher	$out4,$out4,v24
839	vncipher	$out5,$out5,v24
840	vncipher	$out6,$out6,v24
841	vncipher	$out7,$out7,v24
842
843	subfe.		r0,r0,r0		# borrow?-1:0
844	vncipher	$out0,$out0,v25
845	vncipher	$out1,$out1,v25
846	vncipher	$out2,$out2,v25
847	vncipher	$out3,$out3,v25
848	vncipher	$out4,$out4,v25
849	vncipher	$out5,$out5,v25
850	vncipher	$out6,$out6,v25
851	vncipher	$out7,$out7,v25
852
853	and		r0,r0,$len
854	vncipher	$out0,$out0,v26
855	vncipher	$out1,$out1,v26
856	vncipher	$out2,$out2,v26
857	vncipher	$out3,$out3,v26
858	vncipher	$out4,$out4,v26
859	vncipher	$out5,$out5,v26
860	vncipher	$out6,$out6,v26
861	vncipher	$out7,$out7,v26
862
863	add		$inp,$inp,r0		# $inp is adjusted in such
864						# way that at exit from the
865						# loop inX-in7 are loaded
866						# with last "words"
867	vncipher	$out0,$out0,v27
868	vncipher	$out1,$out1,v27
869	vncipher	$out2,$out2,v27
870	vncipher	$out3,$out3,v27
871	vncipher	$out4,$out4,v27
872	vncipher	$out5,$out5,v27
873	vncipher	$out6,$out6,v27
874	vncipher	$out7,$out7,v27
875
876	addi		$key_,$sp,$FRAME+15	# rewind $key_
877	vncipher	$out0,$out0,v28
878	vncipher	$out1,$out1,v28
879	vncipher	$out2,$out2,v28
880	vncipher	$out3,$out3,v28
881	vncipher	$out4,$out4,v28
882	vncipher	$out5,$out5,v28
883	vncipher	$out6,$out6,v28
884	vncipher	$out7,$out7,v28
885	lvx		v24,$x00,$key_		# re-pre-load round[1]
886
887	vncipher	$out0,$out0,v29
888	vncipher	$out1,$out1,v29
889	vncipher	$out2,$out2,v29
890	vncipher	$out3,$out3,v29
891	vncipher	$out4,$out4,v29
892	vncipher	$out5,$out5,v29
893	vncipher	$out6,$out6,v29
894	vncipher	$out7,$out7,v29
895	lvx		v25,$x10,$key_		# re-pre-load round[2]
896
897	vncipher	$out0,$out0,v30
898	 vxor		$ivec,$ivec,v31		# xor with last round key
899	vncipher	$out1,$out1,v30
900	 vxor		$in0,$in0,v31
901	vncipher	$out2,$out2,v30
902	 vxor		$in1,$in1,v31
903	vncipher	$out3,$out3,v30
904	 vxor		$in2,$in2,v31
905	vncipher	$out4,$out4,v30
906	 vxor		$in3,$in3,v31
907	vncipher	$out5,$out5,v30
908	 vxor		$in4,$in4,v31
909	vncipher	$out6,$out6,v30
910	 vxor		$in5,$in5,v31
911	vncipher	$out7,$out7,v30
912	 vxor		$in6,$in6,v31
913
914	vncipherlast	$out0,$out0,$ivec
915	vncipherlast	$out1,$out1,$in0
916	 lvx_u		$in0,$x00,$inp		# load next input block
917	vncipherlast	$out2,$out2,$in1
918	 lvx_u		$in1,$x10,$inp
919	vncipherlast	$out3,$out3,$in2
920	 le?vperm	$in0,$in0,$in0,$inpperm
921	 lvx_u		$in2,$x20,$inp
922	vncipherlast	$out4,$out4,$in3
923	 le?vperm	$in1,$in1,$in1,$inpperm
924	 lvx_u		$in3,$x30,$inp
925	vncipherlast	$out5,$out5,$in4
926	 le?vperm	$in2,$in2,$in2,$inpperm
927	 lvx_u		$in4,$x40,$inp
928	vncipherlast	$out6,$out6,$in5
929	 le?vperm	$in3,$in3,$in3,$inpperm
930	 lvx_u		$in5,$x50,$inp
931	vncipherlast	$out7,$out7,$in6
932	 le?vperm	$in4,$in4,$in4,$inpperm
933	 lvx_u		$in6,$x60,$inp
934	vmr		$ivec,$in7
935	 le?vperm	$in5,$in5,$in5,$inpperm
936	 lvx_u		$in7,$x70,$inp
937	 addi		$inp,$inp,0x80
938
939	le?vperm	$out0,$out0,$out0,$inpperm
940	le?vperm	$out1,$out1,$out1,$inpperm
941	stvx_u		$out0,$x00,$out
942	 le?vperm	$in6,$in6,$in6,$inpperm
943	 vxor		$out0,$in0,$rndkey0
944	le?vperm	$out2,$out2,$out2,$inpperm
945	stvx_u		$out1,$x10,$out
946	 le?vperm	$in7,$in7,$in7,$inpperm
947	 vxor		$out1,$in1,$rndkey0
948	le?vperm	$out3,$out3,$out3,$inpperm
949	stvx_u		$out2,$x20,$out
950	 vxor		$out2,$in2,$rndkey0
951	le?vperm	$out4,$out4,$out4,$inpperm
952	stvx_u		$out3,$x30,$out
953	 vxor		$out3,$in3,$rndkey0
954	le?vperm	$out5,$out5,$out5,$inpperm
955	stvx_u		$out4,$x40,$out
956	 vxor		$out4,$in4,$rndkey0
957	le?vperm	$out6,$out6,$out6,$inpperm
958	stvx_u		$out5,$x50,$out
959	 vxor		$out5,$in5,$rndkey0
960	le?vperm	$out7,$out7,$out7,$inpperm
961	stvx_u		$out6,$x60,$out
962	 vxor		$out6,$in6,$rndkey0
963	stvx_u		$out7,$x70,$out
964	addi		$out,$out,0x80
965	 vxor		$out7,$in7,$rndkey0
966
967	mtctr		$rounds
968	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
969
970	addic.		$len,$len,128
971	beq		Lcbc_dec8x_done
972	nop
973	nop
974
975Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
976	vncipher	$out1,$out1,v24
977	vncipher	$out2,$out2,v24
978	vncipher	$out3,$out3,v24
979	vncipher	$out4,$out4,v24
980	vncipher	$out5,$out5,v24
981	vncipher	$out6,$out6,v24
982	vncipher	$out7,$out7,v24
983	lvx		v24,$x20,$key_		# round[3]
984	addi		$key_,$key_,0x20
985
986	vncipher	$out1,$out1,v25
987	vncipher	$out2,$out2,v25
988	vncipher	$out3,$out3,v25
989	vncipher	$out4,$out4,v25
990	vncipher	$out5,$out5,v25
991	vncipher	$out6,$out6,v25
992	vncipher	$out7,$out7,v25
993	lvx		v25,$x10,$key_		# round[4]
994	bdnz		Loop_cbc_dec8x_tail
995
996	vncipher	$out1,$out1,v24
997	vncipher	$out2,$out2,v24
998	vncipher	$out3,$out3,v24
999	vncipher	$out4,$out4,v24
1000	vncipher	$out5,$out5,v24
1001	vncipher	$out6,$out6,v24
1002	vncipher	$out7,$out7,v24
1003
1004	vncipher	$out1,$out1,v25
1005	vncipher	$out2,$out2,v25
1006	vncipher	$out3,$out3,v25
1007	vncipher	$out4,$out4,v25
1008	vncipher	$out5,$out5,v25
1009	vncipher	$out6,$out6,v25
1010	vncipher	$out7,$out7,v25
1011
1012	vncipher	$out1,$out1,v26
1013	vncipher	$out2,$out2,v26
1014	vncipher	$out3,$out3,v26
1015	vncipher	$out4,$out4,v26
1016	vncipher	$out5,$out5,v26
1017	vncipher	$out6,$out6,v26
1018	vncipher	$out7,$out7,v26
1019
1020	vncipher	$out1,$out1,v27
1021	vncipher	$out2,$out2,v27
1022	vncipher	$out3,$out3,v27
1023	vncipher	$out4,$out4,v27
1024	vncipher	$out5,$out5,v27
1025	vncipher	$out6,$out6,v27
1026	vncipher	$out7,$out7,v27
1027
1028	vncipher	$out1,$out1,v28
1029	vncipher	$out2,$out2,v28
1030	vncipher	$out3,$out3,v28
1031	vncipher	$out4,$out4,v28
1032	vncipher	$out5,$out5,v28
1033	vncipher	$out6,$out6,v28
1034	vncipher	$out7,$out7,v28
1035
1036	vncipher	$out1,$out1,v29
1037	vncipher	$out2,$out2,v29
1038	vncipher	$out3,$out3,v29
1039	vncipher	$out4,$out4,v29
1040	vncipher	$out5,$out5,v29
1041	vncipher	$out6,$out6,v29
1042	vncipher	$out7,$out7,v29
1043
1044	vncipher	$out1,$out1,v30
1045	 vxor		$ivec,$ivec,v31		# last round key
1046	vncipher	$out2,$out2,v30
1047	 vxor		$in1,$in1,v31
1048	vncipher	$out3,$out3,v30
1049	 vxor		$in2,$in2,v31
1050	vncipher	$out4,$out4,v30
1051	 vxor		$in3,$in3,v31
1052	vncipher	$out5,$out5,v30
1053	 vxor		$in4,$in4,v31
1054	vncipher	$out6,$out6,v30
1055	 vxor		$in5,$in5,v31
1056	vncipher	$out7,$out7,v30
1057	 vxor		$in6,$in6,v31
1058
1059	cmplwi		$len,32			# switch($len)
1060	blt		Lcbc_dec8x_one
1061	nop
1062	beq		Lcbc_dec8x_two
1063	cmplwi		$len,64
1064	blt		Lcbc_dec8x_three
1065	nop
1066	beq		Lcbc_dec8x_four
1067	cmplwi		$len,96
1068	blt		Lcbc_dec8x_five
1069	nop
1070	beq		Lcbc_dec8x_six
1071
1072Lcbc_dec8x_seven:
1073	vncipherlast	$out1,$out1,$ivec
1074	vncipherlast	$out2,$out2,$in1
1075	vncipherlast	$out3,$out3,$in2
1076	vncipherlast	$out4,$out4,$in3
1077	vncipherlast	$out5,$out5,$in4
1078	vncipherlast	$out6,$out6,$in5
1079	vncipherlast	$out7,$out7,$in6
1080	vmr		$ivec,$in7
1081
1082	le?vperm	$out1,$out1,$out1,$inpperm
1083	le?vperm	$out2,$out2,$out2,$inpperm
1084	stvx_u		$out1,$x00,$out
1085	le?vperm	$out3,$out3,$out3,$inpperm
1086	stvx_u		$out2,$x10,$out
1087	le?vperm	$out4,$out4,$out4,$inpperm
1088	stvx_u		$out3,$x20,$out
1089	le?vperm	$out5,$out5,$out5,$inpperm
1090	stvx_u		$out4,$x30,$out
1091	le?vperm	$out6,$out6,$out6,$inpperm
1092	stvx_u		$out5,$x40,$out
1093	le?vperm	$out7,$out7,$out7,$inpperm
1094	stvx_u		$out6,$x50,$out
1095	stvx_u		$out7,$x60,$out
1096	addi		$out,$out,0x70
1097	b		Lcbc_dec8x_done
1098
1099.align	5
1100Lcbc_dec8x_six:
1101	vncipherlast	$out2,$out2,$ivec
1102	vncipherlast	$out3,$out3,$in2
1103	vncipherlast	$out4,$out4,$in3
1104	vncipherlast	$out5,$out5,$in4
1105	vncipherlast	$out6,$out6,$in5
1106	vncipherlast	$out7,$out7,$in6
1107	vmr		$ivec,$in7
1108
1109	le?vperm	$out2,$out2,$out2,$inpperm
1110	le?vperm	$out3,$out3,$out3,$inpperm
1111	stvx_u		$out2,$x00,$out
1112	le?vperm	$out4,$out4,$out4,$inpperm
1113	stvx_u		$out3,$x10,$out
1114	le?vperm	$out5,$out5,$out5,$inpperm
1115	stvx_u		$out4,$x20,$out
1116	le?vperm	$out6,$out6,$out6,$inpperm
1117	stvx_u		$out5,$x30,$out
1118	le?vperm	$out7,$out7,$out7,$inpperm
1119	stvx_u		$out6,$x40,$out
1120	stvx_u		$out7,$x50,$out
1121	addi		$out,$out,0x60
1122	b		Lcbc_dec8x_done
1123
1124.align	5
1125Lcbc_dec8x_five:
1126	vncipherlast	$out3,$out3,$ivec
1127	vncipherlast	$out4,$out4,$in3
1128	vncipherlast	$out5,$out5,$in4
1129	vncipherlast	$out6,$out6,$in5
1130	vncipherlast	$out7,$out7,$in6
1131	vmr		$ivec,$in7
1132
1133	le?vperm	$out3,$out3,$out3,$inpperm
1134	le?vperm	$out4,$out4,$out4,$inpperm
1135	stvx_u		$out3,$x00,$out
1136	le?vperm	$out5,$out5,$out5,$inpperm
1137	stvx_u		$out4,$x10,$out
1138	le?vperm	$out6,$out6,$out6,$inpperm
1139	stvx_u		$out5,$x20,$out
1140	le?vperm	$out7,$out7,$out7,$inpperm
1141	stvx_u		$out6,$x30,$out
1142	stvx_u		$out7,$x40,$out
1143	addi		$out,$out,0x50
1144	b		Lcbc_dec8x_done
1145
1146.align	5
1147Lcbc_dec8x_four:
1148	vncipherlast	$out4,$out4,$ivec
1149	vncipherlast	$out5,$out5,$in4
1150	vncipherlast	$out6,$out6,$in5
1151	vncipherlast	$out7,$out7,$in6
1152	vmr		$ivec,$in7
1153
1154	le?vperm	$out4,$out4,$out4,$inpperm
1155	le?vperm	$out5,$out5,$out5,$inpperm
1156	stvx_u		$out4,$x00,$out
1157	le?vperm	$out6,$out6,$out6,$inpperm
1158	stvx_u		$out5,$x10,$out
1159	le?vperm	$out7,$out7,$out7,$inpperm
1160	stvx_u		$out6,$x20,$out
1161	stvx_u		$out7,$x30,$out
1162	addi		$out,$out,0x40
1163	b		Lcbc_dec8x_done
1164
1165.align	5
1166Lcbc_dec8x_three:
1167	vncipherlast	$out5,$out5,$ivec
1168	vncipherlast	$out6,$out6,$in5
1169	vncipherlast	$out7,$out7,$in6
1170	vmr		$ivec,$in7
1171
1172	le?vperm	$out5,$out5,$out5,$inpperm
1173	le?vperm	$out6,$out6,$out6,$inpperm
1174	stvx_u		$out5,$x00,$out
1175	le?vperm	$out7,$out7,$out7,$inpperm
1176	stvx_u		$out6,$x10,$out
1177	stvx_u		$out7,$x20,$out
1178	addi		$out,$out,0x30
1179	b		Lcbc_dec8x_done
1180
1181.align	5
1182Lcbc_dec8x_two:
1183	vncipherlast	$out6,$out6,$ivec
1184	vncipherlast	$out7,$out7,$in6
1185	vmr		$ivec,$in7
1186
1187	le?vperm	$out6,$out6,$out6,$inpperm
1188	le?vperm	$out7,$out7,$out7,$inpperm
1189	stvx_u		$out6,$x00,$out
1190	stvx_u		$out7,$x10,$out
1191	addi		$out,$out,0x20
1192	b		Lcbc_dec8x_done
1193
1194.align	5
1195Lcbc_dec8x_one:
1196	vncipherlast	$out7,$out7,$ivec
1197	vmr		$ivec,$in7
1198
1199	le?vperm	$out7,$out7,$out7,$inpperm
1200	stvx_u		$out7,0,$out
1201	addi		$out,$out,0x10
1202
1203Lcbc_dec8x_done:
1204	le?vperm	$ivec,$ivec,$ivec,$inpperm
1205	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1206
1207	li		r10,`$FRAME+15`
1208	li		r11,`$FRAME+31`
1209	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1210	addi		r10,r10,32
1211	stvx		$inpperm,r11,$sp
1212	addi		r11,r11,32
1213	stvx		$inpperm,r10,$sp
1214	addi		r10,r10,32
1215	stvx		$inpperm,r11,$sp
1216	addi		r11,r11,32
1217	stvx		$inpperm,r10,$sp
1218	addi		r10,r10,32
1219	stvx		$inpperm,r11,$sp
1220	addi		r11,r11,32
1221	stvx		$inpperm,r10,$sp
1222	addi		r10,r10,32
1223	stvx		$inpperm,r11,$sp
1224	addi		r11,r11,32
1225
1226	mtspr		256,$vrsave
1227	lvx		v20,r10,$sp		# ABI says so
1228	addi		r10,r10,32
1229	lvx		v21,r11,$sp
1230	addi		r11,r11,32
1231	lvx		v22,r10,$sp
1232	addi		r10,r10,32
1233	lvx		v23,r11,$sp
1234	addi		r11,r11,32
1235	lvx		v24,r10,$sp
1236	addi		r10,r10,32
1237	lvx		v25,r11,$sp
1238	addi		r11,r11,32
1239	lvx		v26,r10,$sp
1240	addi		r10,r10,32
1241	lvx		v27,r11,$sp
1242	addi		r11,r11,32
1243	lvx		v28,r10,$sp
1244	addi		r10,r10,32
1245	lvx		v29,r11,$sp
1246	addi		r11,r11,32
1247	lvx		v30,r10,$sp
1248	lvx		v31,r11,$sp
1249	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1250	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1251	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1252	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1253	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1254	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1255	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1256	blr
1257	.long		0
1258	.byte		0,12,0x04,0,0x80,6,6,0
1259	.long		0
1260.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1261___
1262}}	}}}
1263
1264#########################################################################
1265{{{	# CTR procedure[s]						#
# Register allocation for the CTR code.
#   GPRs : $inp=r3 $out=r4 $len=r5 $key=r6 $ivp=r7 $x10=r8 $rounds=r9
#          $idx=r10 (assigned in this order by the map over 3..10).
#   VRs  : $rndkey0=v0 $rndkey1=v1 $inout=v2 $tmp=v3, then
#          $ivec=v4 $inptail=v5 $inpperm=v6 $outhead=v7 $outperm=v8
#          $outmask=v9 $keyperm=v10 $one=v11.
# $dat is only another name for $tmp (v3); the two must never be live
# at the same time.
1266my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1267my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1268my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1269						map("v$_",(4..11));
1270my $dat=$tmp;
1271
# Scalar (one block per iteration) CTR32 entry point,
# .${prefix}_ctr32_encrypt_blocks.  As coded in the template below:
#   - $len counts 16-byte blocks (it is decremented by one per block);
#     the routine returns immediately when $len is below 1.
#   - VRSAVE is copied to $vrsave on entry and written back before blr.
#   - The IV at $ivp is fetched with the lvx/lvsl/lvx/vperm idiom, i.e.
#     it is allowed to be unaligned.
#   - $one is built as fifteen zero bytes followed by 0x01 (vspltisb +
#     vsldoi against the zeroed $rndkey0) and is added to $ivec with
#     vadduwm after every block.
#   - The loop counter is (word at offset 240 of $key)/2 - 1; every pass
#     of Loop_ctr32_enc applies two round keys.
#   - Requests of 8 or more blocks branch to _aesp8_ctr32_encrypt8x,
#     which is emitted by the following heredoc (that heredoc also
#     carries the .size directive for this symbol).
#   - The input block is XORed into the last round key before
#     vcipherlast, so that instruction produces the output block directly.
#   - Output goes through aligned stvx stores merged with
#     $outperm/$outmask/$outhead; the code after the loop merges the
#     remaining $outhead bytes with what is already at the destination.
1272$code.=<<___;
1273.globl	.${prefix}_ctr32_encrypt_blocks
1274.align	5
1275.${prefix}_ctr32_encrypt_blocks:
1276	${UCMP}i	$len,1
1277	bltlr-
1278
1279	lis		r0,0xfff0
1280	mfspr		$vrsave,256
1281	mtspr		256,r0
1282
1283	li		$idx,15
1284	vxor		$rndkey0,$rndkey0,$rndkey0
1285	le?vspltisb	$tmp,0x0f
1286
1287	lvx		$ivec,0,$ivp		# load [unaligned] iv
1288	lvsl		$inpperm,0,$ivp
1289	lvx		$inptail,$idx,$ivp
1290	 vspltisb	$one,1
1291	le?vxor		$inpperm,$inpperm,$tmp
1292	vperm		$ivec,$ivec,$inptail,$inpperm
1293	 vsldoi		$one,$rndkey0,$one,1
1294
1295	neg		r11,$inp
1296	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1297	lwz		$rounds,240($key)
1298
1299	lvsr		$inpperm,0,r11		# prepare for unaligned load
1300	lvx		$inptail,0,$inp
1301	addi		$inp,$inp,15		# 15 is not typo
1302	le?vxor		$inpperm,$inpperm,$tmp
1303
1304	srwi		$rounds,$rounds,1
1305	li		$idx,16
1306	subi		$rounds,$rounds,1
1307
1308	${UCMP}i	$len,8
1309	bge		_aesp8_ctr32_encrypt8x
1310
1311	?lvsr		$outperm,0,$out		# prepare for unaligned store
1312	vspltisb	$outmask,-1
1313	lvx		$outhead,0,$out
1314	?vperm		$outmask,$rndkey0,$outmask,$outperm
1315	le?vxor		$outperm,$outperm,$tmp
1316
1317	lvx		$rndkey0,0,$key
1318	mtctr		$rounds
1319	lvx		$rndkey1,$idx,$key
1320	addi		$idx,$idx,16
1321	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1322	vxor		$inout,$ivec,$rndkey0
1323	lvx		$rndkey0,$idx,$key
1324	addi		$idx,$idx,16
1325	b		Loop_ctr32_enc
1326
1327.align	5
1328Loop_ctr32_enc:
1329	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1330	vcipher		$inout,$inout,$rndkey1
1331	lvx		$rndkey1,$idx,$key
1332	addi		$idx,$idx,16
1333	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1334	vcipher		$inout,$inout,$rndkey0
1335	lvx		$rndkey0,$idx,$key
1336	addi		$idx,$idx,16
1337	bdnz		Loop_ctr32_enc
1338
1339	vadduwm		$ivec,$ivec,$one
1340	 vmr		$dat,$inptail
1341	 lvx		$inptail,0,$inp
1342	 addi		$inp,$inp,16
1343	 subic.		$len,$len,1		# blocks--
1344
1345	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1346	vcipher		$inout,$inout,$rndkey1
1347	lvx		$rndkey1,$idx,$key
1348	 vperm		$dat,$dat,$inptail,$inpperm
1349	 li		$idx,16
1350	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1351	 lvx		$rndkey0,0,$key
1352	vxor		$dat,$dat,$rndkey1	# last round key
1353	vcipherlast	$inout,$inout,$dat
1354
1355	 lvx		$rndkey1,$idx,$key
1356	 addi		$idx,$idx,16
1357	vperm		$inout,$inout,$inout,$outperm
1358	vsel		$dat,$outhead,$inout,$outmask
1359	 mtctr		$rounds
1360	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1361	vmr		$outhead,$inout
1362	 vxor		$inout,$ivec,$rndkey0
1363	 lvx		$rndkey0,$idx,$key
1364	 addi		$idx,$idx,16
1365	stvx		$dat,0,$out
1366	addi		$out,$out,16
1367	bne		Loop_ctr32_enc
1368
1369	addi		$out,$out,-1
1370	lvx		$inout,0,$out		# redundant in aligned case
1371	vsel		$inout,$outhead,$inout,$outmask
1372	stvx		$inout,0,$out
1373
1374	mtspr		256,$vrsave
1375	blr
1376	.long		0
1377	.byte		0,12,0x14,0,0,0,6,0
1378	.long		0
1379___
1380#########################################################################
1381{{	# Optimized CTR procedure					#
# Register allocation for the 8x CTR path.  These lexicals shadow or extend
# the ones declared for the scalar CTR code in the enclosing scope.
#   $key_          r11, pointer into the on-stack copy of the round keys.
#   $x00..$x70     r0, r8, r26..r31; loaded with the constants 0x10..0x70
#                  by the prologue ($x00 is used as a zero index; on "osx"
#                  flavours it is the literal 0 rather than a register name).
#   $in0..$in7     v0..v3, v10, v12..v14 (input blocks).
#   $out0..$out7   v15..v22 (counter blocks / cipher state).
#   $rndkey0       v23 (round key 0).
#   $tmp,$keyperm  v3 and v10, the same registers the outer scope uses for
#                  these names, so values set up by the caller stay valid.
#   $two,$three,$four reuse v7..v9 ($outhead,$outperm,$outmask of the scalar
#                  path); only $two is referenced by the template below.
1382my $key_="r11";
1383my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1384    $x00=0 if ($flavour =~ /osx/);
1385my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1386my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1387my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
1388			# v26-v31 last 6 round keys
1389my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
1390my ($two,$three,$four)=($outhead,$outperm,$outmask);
1391
# Eight-blocks-at-a-time CTR32 path (_aesp8_ctr32_encrypt8x), reached from
# .${prefix}_ctr32_encrypt_blocks when $len is 8 blocks or more.  It relies
# on state prepared by that entry code ($ivec, $one, $keyperm, $rounds,
# $vrsave) and undoes the +15 bias the entry code applied to $inp.
#
# Structure of the template below:
#   Prologue   Allocates a frame of $FRAME+21*16+6*$SIZE_T bytes, saves
#              v20-v31 and r26-r31, stores $vrsave in the frame and sets
#              VRSAVE to -1.
#   Key setup  Round keys are aligned with ?vperm.  Round key 0 stays in
#              $rndkey0, the last six stay in v26-v31, and the ones in
#              between are off-loaded to the stack at $sp+$FRAME+15 and
#              cycled through v24/v25 during the rounds.
#   Length     $len is converted from blocks to bytes ($SHL by 4).
#   Main loop  Eight counter values are derived from $ivec per iteration
#              and pre-XORed with round key 0.  Input is loaded with lvx_u
#              and XORed with the last round key (v31) so vcipherlast
#              yields the output blocks directly.  When fewer than 256
#              bytes remain, r0 pulls $inp back so that the final group of
#              loads ends on the last input block.
#   Tail       Lctr32_enc8x_break dispatches on the residual $len (zero or
#              negative at that point) to the _one .. _eight handlers,
#              which finish and store only the blocks that are still due.
#   Epilogue   Lctr32_enc8x_done overwrites the stack copies of the round
#              keys with $inpperm, restores VRSAVE, v20-v31 and r26-r31,
#              and releases the frame.
1392$code.=<<___;
1393.align	5
1394_aesp8_ctr32_encrypt8x:
1395	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1396	li		r10,`$FRAME+8*16+15`
1397	li		r11,`$FRAME+8*16+31`
1398	stvx		v20,r10,$sp		# ABI says so
1399	addi		r10,r10,32
1400	stvx		v21,r11,$sp
1401	addi		r11,r11,32
1402	stvx		v22,r10,$sp
1403	addi		r10,r10,32
1404	stvx		v23,r11,$sp
1405	addi		r11,r11,32
1406	stvx		v24,r10,$sp
1407	addi		r10,r10,32
1408	stvx		v25,r11,$sp
1409	addi		r11,r11,32
1410	stvx		v26,r10,$sp
1411	addi		r10,r10,32
1412	stvx		v27,r11,$sp
1413	addi		r11,r11,32
1414	stvx		v28,r10,$sp
1415	addi		r10,r10,32
1416	stvx		v29,r11,$sp
1417	addi		r11,r11,32
1418	stvx		v30,r10,$sp
1419	stvx		v31,r11,$sp
1420	li		r0,-1
1421	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1422	li		$x10,0x10
1423	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1424	li		$x20,0x20
1425	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1426	li		$x30,0x30
1427	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1428	li		$x40,0x40
1429	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1430	li		$x50,0x50
1431	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1432	li		$x60,0x60
1433	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1434	li		$x70,0x70
1435	mtspr		256,r0
1436
1437	subi		$rounds,$rounds,3	# -4 in total
1438
1439	lvx		$rndkey0,$x00,$key	# load key schedule
1440	lvx		v30,$x10,$key
1441	addi		$key,$key,0x20
1442	lvx		v31,$x00,$key
1443	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1444	addi		$key_,$sp,$FRAME+15
1445	mtctr		$rounds
1446
1447Load_ctr32_enc_key:
1448	?vperm		v24,v30,v31,$keyperm
1449	lvx		v30,$x10,$key
1450	addi		$key,$key,0x20
1451	stvx		v24,$x00,$key_		# off-load round[1]
1452	?vperm		v25,v31,v30,$keyperm
1453	lvx		v31,$x00,$key
1454	stvx		v25,$x10,$key_		# off-load round[2]
1455	addi		$key_,$key_,0x20
1456	bdnz		Load_ctr32_enc_key
1457
1458	lvx		v26,$x10,$key
1459	?vperm		v24,v30,v31,$keyperm
1460	lvx		v27,$x20,$key
1461	stvx		v24,$x00,$key_		# off-load round[3]
1462	?vperm		v25,v31,v26,$keyperm
1463	lvx		v28,$x30,$key
1464	stvx		v25,$x10,$key_		# off-load round[4]
1465	addi		$key_,$sp,$FRAME+15	# rewind $key_
1466	?vperm		v26,v26,v27,$keyperm
1467	lvx		v29,$x40,$key
1468	?vperm		v27,v27,v28,$keyperm
1469	lvx		v30,$x50,$key
1470	?vperm		v28,v28,v29,$keyperm
1471	lvx		v31,$x60,$key
1472	?vperm		v29,v29,v30,$keyperm
1473	lvx		$out0,$x70,$key		# borrow $out0
1474	?vperm		v30,v30,v31,$keyperm
1475	lvx		v24,$x00,$key_		# pre-load round[1]
1476	?vperm		v31,v31,$out0,$keyperm
1477	lvx		v25,$x10,$key_		# pre-load round[2]
1478
1479	vadduwm		$two,$one,$one
1480	subi		$inp,$inp,15		# undo "caller"
1481	$SHL		$len,$len,4
1482
1483	vadduwm		$out1,$ivec,$one	# counter values ...
1484	vadduwm		$out2,$ivec,$two
1485	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1486	 le?li		$idx,8
1487	vadduwm		$out3,$out1,$two
1488	vxor		$out1,$out1,$rndkey0
1489	 le?lvsl	$inpperm,0,$idx
1490	vadduwm		$out4,$out2,$two
1491	vxor		$out2,$out2,$rndkey0
1492	 le?vspltisb	$tmp,0x0f
1493	vadduwm		$out5,$out3,$two
1494	vxor		$out3,$out3,$rndkey0
1495	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1496	vadduwm		$out6,$out4,$two
1497	vxor		$out4,$out4,$rndkey0
1498	vadduwm		$out7,$out5,$two
1499	vxor		$out5,$out5,$rndkey0
1500	vadduwm		$ivec,$out6,$two	# next counter value
1501	vxor		$out6,$out6,$rndkey0
1502	vxor		$out7,$out7,$rndkey0
1503
1504	mtctr		$rounds
1505	b		Loop_ctr32_enc8x
1506.align	5
1507Loop_ctr32_enc8x:
1508	vcipher 	$out0,$out0,v24
1509	vcipher 	$out1,$out1,v24
1510	vcipher 	$out2,$out2,v24
1511	vcipher 	$out3,$out3,v24
1512	vcipher 	$out4,$out4,v24
1513	vcipher 	$out5,$out5,v24
1514	vcipher 	$out6,$out6,v24
1515	vcipher 	$out7,$out7,v24
1516Loop_ctr32_enc8x_middle:
1517	lvx		v24,$x20,$key_		# round[3]
1518	addi		$key_,$key_,0x20
1519
1520	vcipher 	$out0,$out0,v25
1521	vcipher 	$out1,$out1,v25
1522	vcipher 	$out2,$out2,v25
1523	vcipher 	$out3,$out3,v25
1524	vcipher 	$out4,$out4,v25
1525	vcipher 	$out5,$out5,v25
1526	vcipher 	$out6,$out6,v25
1527	vcipher 	$out7,$out7,v25
1528	lvx		v25,$x10,$key_		# round[4]
1529	bdnz		Loop_ctr32_enc8x
1530
1531	subic		r11,$len,256		# $len-256, borrow $key_
1532	vcipher 	$out0,$out0,v24
1533	vcipher 	$out1,$out1,v24
1534	vcipher 	$out2,$out2,v24
1535	vcipher 	$out3,$out3,v24
1536	vcipher 	$out4,$out4,v24
1537	vcipher 	$out5,$out5,v24
1538	vcipher 	$out6,$out6,v24
1539	vcipher 	$out7,$out7,v24
1540
1541	subfe		r0,r0,r0		# borrow?-1:0
1542	vcipher 	$out0,$out0,v25
1543	vcipher 	$out1,$out1,v25
1544	vcipher 	$out2,$out2,v25
1545	vcipher 	$out3,$out3,v25
1546	vcipher 	$out4,$out4,v25
1547	vcipher		$out5,$out5,v25
1548	vcipher		$out6,$out6,v25
1549	vcipher		$out7,$out7,v25
1550
1551	and		r0,r0,r11
1552	addi		$key_,$sp,$FRAME+15	# rewind $key_
1553	vcipher		$out0,$out0,v26
1554	vcipher		$out1,$out1,v26
1555	vcipher		$out2,$out2,v26
1556	vcipher		$out3,$out3,v26
1557	vcipher		$out4,$out4,v26
1558	vcipher		$out5,$out5,v26
1559	vcipher		$out6,$out6,v26
1560	vcipher		$out7,$out7,v26
1561	lvx		v24,$x00,$key_		# re-pre-load round[1]
1562
1563	subic		$len,$len,129		# $len-=129
1564	vcipher		$out0,$out0,v27
1565	addi		$len,$len,1		# $len-=128 really
1566	vcipher		$out1,$out1,v27
1567	vcipher		$out2,$out2,v27
1568	vcipher		$out3,$out3,v27
1569	vcipher		$out4,$out4,v27
1570	vcipher		$out5,$out5,v27
1571	vcipher		$out6,$out6,v27
1572	vcipher		$out7,$out7,v27
1573	lvx		v25,$x10,$key_		# re-pre-load round[2]
1574
1575	vcipher		$out0,$out0,v28
1576	 lvx_u		$in0,$x00,$inp		# load input
1577	vcipher		$out1,$out1,v28
1578	 lvx_u		$in1,$x10,$inp
1579	vcipher		$out2,$out2,v28
1580	 lvx_u		$in2,$x20,$inp
1581	vcipher		$out3,$out3,v28
1582	 lvx_u		$in3,$x30,$inp
1583	vcipher		$out4,$out4,v28
1584	 lvx_u		$in4,$x40,$inp
1585	vcipher		$out5,$out5,v28
1586	 lvx_u		$in5,$x50,$inp
1587	vcipher		$out6,$out6,v28
1588	 lvx_u		$in6,$x60,$inp
1589	vcipher		$out7,$out7,v28
1590	 lvx_u		$in7,$x70,$inp
1591	 addi		$inp,$inp,0x80
1592
1593	vcipher		$out0,$out0,v29
1594	 le?vperm	$in0,$in0,$in0,$inpperm
1595	vcipher		$out1,$out1,v29
1596	 le?vperm	$in1,$in1,$in1,$inpperm
1597	vcipher		$out2,$out2,v29
1598	 le?vperm	$in2,$in2,$in2,$inpperm
1599	vcipher		$out3,$out3,v29
1600	 le?vperm	$in3,$in3,$in3,$inpperm
1601	vcipher		$out4,$out4,v29
1602	 le?vperm	$in4,$in4,$in4,$inpperm
1603	vcipher		$out5,$out5,v29
1604	 le?vperm	$in5,$in5,$in5,$inpperm
1605	vcipher		$out6,$out6,v29
1606	 le?vperm	$in6,$in6,$in6,$inpperm
1607	vcipher		$out7,$out7,v29
1608	 le?vperm	$in7,$in7,$in7,$inpperm
1609
1610	add		$inp,$inp,r0		# $inp is adjusted in such
1611						# way that at exit from the
1612						# loop inX-in7 are loaded
1613						# with last "words"
1614	subfe.		r0,r0,r0		# borrow?-1:0
1615	vcipher		$out0,$out0,v30
1616	 vxor		$in0,$in0,v31		# xor with last round key
1617	vcipher		$out1,$out1,v30
1618	 vxor		$in1,$in1,v31
1619	vcipher		$out2,$out2,v30
1620	 vxor		$in2,$in2,v31
1621	vcipher		$out3,$out3,v30
1622	 vxor		$in3,$in3,v31
1623	vcipher		$out4,$out4,v30
1624	 vxor		$in4,$in4,v31
1625	vcipher		$out5,$out5,v30
1626	 vxor		$in5,$in5,v31
1627	vcipher		$out6,$out6,v30
1628	 vxor		$in6,$in6,v31
1629	vcipher		$out7,$out7,v30
1630	 vxor		$in7,$in7,v31
1631
1632	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1633
1634	vcipherlast	$in0,$out0,$in0
1635	vcipherlast	$in1,$out1,$in1
1636	 vadduwm	$out1,$ivec,$one	# counter values ...
1637	vcipherlast	$in2,$out2,$in2
1638	 vadduwm	$out2,$ivec,$two
1639	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1640	vcipherlast	$in3,$out3,$in3
1641	 vadduwm	$out3,$out1,$two
1642	 vxor		$out1,$out1,$rndkey0
1643	vcipherlast	$in4,$out4,$in4
1644	 vadduwm	$out4,$out2,$two
1645	 vxor		$out2,$out2,$rndkey0
1646	vcipherlast	$in5,$out5,$in5
1647	 vadduwm	$out5,$out3,$two
1648	 vxor		$out3,$out3,$rndkey0
1649	vcipherlast	$in6,$out6,$in6
1650	 vadduwm	$out6,$out4,$two
1651	 vxor		$out4,$out4,$rndkey0
1652	vcipherlast	$in7,$out7,$in7
1653	 vadduwm	$out7,$out5,$two
1654	 vxor		$out5,$out5,$rndkey0
1655	le?vperm	$in0,$in0,$in0,$inpperm
1656	 vadduwm	$ivec,$out6,$two	# next counter value
1657	 vxor		$out6,$out6,$rndkey0
1658	le?vperm	$in1,$in1,$in1,$inpperm
1659	 vxor		$out7,$out7,$rndkey0
1660	mtctr		$rounds
1661
1662	 vcipher	$out0,$out0,v24
1663	stvx_u		$in0,$x00,$out
1664	le?vperm	$in2,$in2,$in2,$inpperm
1665	 vcipher	$out1,$out1,v24
1666	stvx_u		$in1,$x10,$out
1667	le?vperm	$in3,$in3,$in3,$inpperm
1668	 vcipher	$out2,$out2,v24
1669	stvx_u		$in2,$x20,$out
1670	le?vperm	$in4,$in4,$in4,$inpperm
1671	 vcipher	$out3,$out3,v24
1672	stvx_u		$in3,$x30,$out
1673	le?vperm	$in5,$in5,$in5,$inpperm
1674	 vcipher	$out4,$out4,v24
1675	stvx_u		$in4,$x40,$out
1676	le?vperm	$in6,$in6,$in6,$inpperm
1677	 vcipher	$out5,$out5,v24
1678	stvx_u		$in5,$x50,$out
1679	le?vperm	$in7,$in7,$in7,$inpperm
1680	 vcipher	$out6,$out6,v24
1681	stvx_u		$in6,$x60,$out
1682	 vcipher	$out7,$out7,v24
1683	stvx_u		$in7,$x70,$out
1684	addi		$out,$out,0x80
1685
1686	b		Loop_ctr32_enc8x_middle
1687
1688.align	5
1689Lctr32_enc8x_break:
1690	cmpwi		$len,-0x60
1691	blt		Lctr32_enc8x_one
1692	nop
1693	beq		Lctr32_enc8x_two
1694	cmpwi		$len,-0x40
1695	blt		Lctr32_enc8x_three
1696	nop
1697	beq		Lctr32_enc8x_four
1698	cmpwi		$len,-0x20
1699	blt		Lctr32_enc8x_five
1700	nop
1701	beq		Lctr32_enc8x_six
1702	cmpwi		$len,0x00
1703	blt		Lctr32_enc8x_seven
1704
1705Lctr32_enc8x_eight:
1706	vcipherlast	$out0,$out0,$in0
1707	vcipherlast	$out1,$out1,$in1
1708	vcipherlast	$out2,$out2,$in2
1709	vcipherlast	$out3,$out3,$in3
1710	vcipherlast	$out4,$out4,$in4
1711	vcipherlast	$out5,$out5,$in5
1712	vcipherlast	$out6,$out6,$in6
1713	vcipherlast	$out7,$out7,$in7
1714
1715	le?vperm	$out0,$out0,$out0,$inpperm
1716	le?vperm	$out1,$out1,$out1,$inpperm
1717	stvx_u		$out0,$x00,$out
1718	le?vperm	$out2,$out2,$out2,$inpperm
1719	stvx_u		$out1,$x10,$out
1720	le?vperm	$out3,$out3,$out3,$inpperm
1721	stvx_u		$out2,$x20,$out
1722	le?vperm	$out4,$out4,$out4,$inpperm
1723	stvx_u		$out3,$x30,$out
1724	le?vperm	$out5,$out5,$out5,$inpperm
1725	stvx_u		$out4,$x40,$out
1726	le?vperm	$out6,$out6,$out6,$inpperm
1727	stvx_u		$out5,$x50,$out
1728	le?vperm	$out7,$out7,$out7,$inpperm
1729	stvx_u		$out6,$x60,$out
1730	stvx_u		$out7,$x70,$out
1731	addi		$out,$out,0x80
1732	b		Lctr32_enc8x_done
1733
1734.align	5
1735Lctr32_enc8x_seven:
1736	vcipherlast	$out0,$out0,$in1
1737	vcipherlast	$out1,$out1,$in2
1738	vcipherlast	$out2,$out2,$in3
1739	vcipherlast	$out3,$out3,$in4
1740	vcipherlast	$out4,$out4,$in5
1741	vcipherlast	$out5,$out5,$in6
1742	vcipherlast	$out6,$out6,$in7
1743
1744	le?vperm	$out0,$out0,$out0,$inpperm
1745	le?vperm	$out1,$out1,$out1,$inpperm
1746	stvx_u		$out0,$x00,$out
1747	le?vperm	$out2,$out2,$out2,$inpperm
1748	stvx_u		$out1,$x10,$out
1749	le?vperm	$out3,$out3,$out3,$inpperm
1750	stvx_u		$out2,$x20,$out
1751	le?vperm	$out4,$out4,$out4,$inpperm
1752	stvx_u		$out3,$x30,$out
1753	le?vperm	$out5,$out5,$out5,$inpperm
1754	stvx_u		$out4,$x40,$out
1755	le?vperm	$out6,$out6,$out6,$inpperm
1756	stvx_u		$out5,$x50,$out
1757	stvx_u		$out6,$x60,$out
1758	addi		$out,$out,0x70
1759	b		Lctr32_enc8x_done
1760
1761.align	5
1762Lctr32_enc8x_six:
1763	vcipherlast	$out0,$out0,$in2
1764	vcipherlast	$out1,$out1,$in3
1765	vcipherlast	$out2,$out2,$in4
1766	vcipherlast	$out3,$out3,$in5
1767	vcipherlast	$out4,$out4,$in6
1768	vcipherlast	$out5,$out5,$in7
1769
1770	le?vperm	$out0,$out0,$out0,$inpperm
1771	le?vperm	$out1,$out1,$out1,$inpperm
1772	stvx_u		$out0,$x00,$out
1773	le?vperm	$out2,$out2,$out2,$inpperm
1774	stvx_u		$out1,$x10,$out
1775	le?vperm	$out3,$out3,$out3,$inpperm
1776	stvx_u		$out2,$x20,$out
1777	le?vperm	$out4,$out4,$out4,$inpperm
1778	stvx_u		$out3,$x30,$out
1779	le?vperm	$out5,$out5,$out5,$inpperm
1780	stvx_u		$out4,$x40,$out
1781	stvx_u		$out5,$x50,$out
1782	addi		$out,$out,0x60
1783	b		Lctr32_enc8x_done
1784
1785.align	5
1786Lctr32_enc8x_five:
1787	vcipherlast	$out0,$out0,$in3
1788	vcipherlast	$out1,$out1,$in4
1789	vcipherlast	$out2,$out2,$in5
1790	vcipherlast	$out3,$out3,$in6
1791	vcipherlast	$out4,$out4,$in7
1792
1793	le?vperm	$out0,$out0,$out0,$inpperm
1794	le?vperm	$out1,$out1,$out1,$inpperm
1795	stvx_u		$out0,$x00,$out
1796	le?vperm	$out2,$out2,$out2,$inpperm
1797	stvx_u		$out1,$x10,$out
1798	le?vperm	$out3,$out3,$out3,$inpperm
1799	stvx_u		$out2,$x20,$out
1800	le?vperm	$out4,$out4,$out4,$inpperm
1801	stvx_u		$out3,$x30,$out
1802	stvx_u		$out4,$x40,$out
1803	addi		$out,$out,0x50
1804	b		Lctr32_enc8x_done
1805
1806.align	5
1807Lctr32_enc8x_four:
1808	vcipherlast	$out0,$out0,$in4
1809	vcipherlast	$out1,$out1,$in5
1810	vcipherlast	$out2,$out2,$in6
1811	vcipherlast	$out3,$out3,$in7
1812
1813	le?vperm	$out0,$out0,$out0,$inpperm
1814	le?vperm	$out1,$out1,$out1,$inpperm
1815	stvx_u		$out0,$x00,$out
1816	le?vperm	$out2,$out2,$out2,$inpperm
1817	stvx_u		$out1,$x10,$out
1818	le?vperm	$out3,$out3,$out3,$inpperm
1819	stvx_u		$out2,$x20,$out
1820	stvx_u		$out3,$x30,$out
1821	addi		$out,$out,0x40
1822	b		Lctr32_enc8x_done
1823
1824.align	5
1825Lctr32_enc8x_three:
1826	vcipherlast	$out0,$out0,$in5
1827	vcipherlast	$out1,$out1,$in6
1828	vcipherlast	$out2,$out2,$in7
1829
1830	le?vperm	$out0,$out0,$out0,$inpperm
1831	le?vperm	$out1,$out1,$out1,$inpperm
1832	stvx_u		$out0,$x00,$out
1833	le?vperm	$out2,$out2,$out2,$inpperm
1834	stvx_u		$out1,$x10,$out
1835	stvx_u		$out2,$x20,$out
1836	addi		$out,$out,0x30
1837	b		Lctr32_enc8x_done
1838
1839.align	5
1840Lctr32_enc8x_two:
1841	vcipherlast	$out0,$out0,$in6
1842	vcipherlast	$out1,$out1,$in7
1843
1844	le?vperm	$out0,$out0,$out0,$inpperm
1845	le?vperm	$out1,$out1,$out1,$inpperm
1846	stvx_u		$out0,$x00,$out
1847	stvx_u		$out1,$x10,$out
1848	addi		$out,$out,0x20
1849	b		Lctr32_enc8x_done
1850
1851.align	5
1852Lctr32_enc8x_one:
1853	vcipherlast	$out0,$out0,$in7
1854
1855	le?vperm	$out0,$out0,$out0,$inpperm
1856	stvx_u		$out0,0,$out
1857	addi		$out,$out,0x10
1858
1859Lctr32_enc8x_done:
1860	li		r10,`$FRAME+15`
1861	li		r11,`$FRAME+31`
1862	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1863	addi		r10,r10,32
1864	stvx		$inpperm,r11,$sp
1865	addi		r11,r11,32
1866	stvx		$inpperm,r10,$sp
1867	addi		r10,r10,32
1868	stvx		$inpperm,r11,$sp
1869	addi		r11,r11,32
1870	stvx		$inpperm,r10,$sp
1871	addi		r10,r10,32
1872	stvx		$inpperm,r11,$sp
1873	addi		r11,r11,32
1874	stvx		$inpperm,r10,$sp
1875	addi		r10,r10,32
1876	stvx		$inpperm,r11,$sp
1877	addi		r11,r11,32
1878
1879	mtspr		256,$vrsave
1880	lvx		v20,r10,$sp		# ABI says so
1881	addi		r10,r10,32
1882	lvx		v21,r11,$sp
1883	addi		r11,r11,32
1884	lvx		v22,r10,$sp
1885	addi		r10,r10,32
1886	lvx		v23,r11,$sp
1887	addi		r11,r11,32
1888	lvx		v24,r10,$sp
1889	addi		r10,r10,32
1890	lvx		v25,r11,$sp
1891	addi		r11,r11,32
1892	lvx		v26,r10,$sp
1893	addi		r10,r10,32
1894	lvx		v27,r11,$sp
1895	addi		r11,r11,32
1896	lvx		v28,r10,$sp
1897	addi		r10,r10,32
1898	lvx		v29,r11,$sp
1899	addi		r11,r11,32
1900	lvx		v30,r10,$sp
1901	lvx		v31,r11,$sp
1902	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1903	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1904	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1905	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1906	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1907	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1908	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1909	blr
1910	.long		0
1911	.byte		0,12,0x04,0,0x80,6,6,0
1912	.long		0
1913.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1914___
1915}}	}}}
1916
1917#########################################################################
1918{{{	# XTS procedures						#
1919# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1920#                             const AES_KEY *key1, const AES_KEY *key2,	#
1921#                             [const] unsigned char iv[16]);		#
1922# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1923# input tweak value is assumed to be encrypted already, and last tweak	#
1924# value, one suitable for consecutive call on same chunk of data, is	#
1925# written back to original buffer. In addition, in "tweak chaining"	#
1926# mode only complete input blocks are processed.			#
1927
# Register allocation for the XTS code.
#   GPRs : the map over 3..10 first binds the names in argument order
#          ($inp=r3 ... $idx=r10); the swap at the end of this group then
#          exchanges $inp and $idx, leaving $inp=r10 and $idx=r3.  The
#          entry code copies r3 into $inp and afterwards uses r3 for the
#          return value (-1 when $len is below 16, 0 on normal exit).
#          $out=r4 $len=r5 $key1=r6 $key2=r7 $ivp=r8 $rounds=r9.
#   VRs  : $rndkey0=v0 $rndkey1=v1 $inout=v2,
#          $output=v3 $inptail=v4 $inpperm=v5 $leperm=v6 $keyperm=v7,
#          $tweak=v8 $seven=v9 $eighty7=v10 $tmp=v11 $tweak1=v12.
# $taillen is another name for $key2 (r7); it holds a tail-length value
# once the tweak has been computed and $key2 is no longer needed.
1928my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1929my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1930my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1931my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
1932my $taillen = $key2;
1933
1934   ($inp,$idx) = ($idx,$inp);				# reassign
1935
1936$code.=<<___;
1937.globl	.${prefix}_xts_encrypt
1938.align	5
1939.${prefix}_xts_encrypt:
1940	mr		$inp,r3				# reassign
1941	li		r3,-1
1942	${UCMP}i	$len,16
1943	bltlr-
1944
1945	lis		r0,0xfff0
1946	mfspr		r12,256				# save vrsave
1947	li		r11,0
1948	mtspr		256,r0
1949
1950	vspltisb	$seven,0x07			# 0x070707..07
1951	le?lvsl		$leperm,r11,r11
1952	le?vspltisb	$tmp,0x0f
1953	le?vxor		$leperm,$leperm,$seven
1954
1955	li		$idx,15
1956	lvx		$tweak,0,$ivp			# load [unaligned] iv
1957	lvsl		$inpperm,0,$ivp
1958	lvx		$inptail,$idx,$ivp
1959	le?vxor		$inpperm,$inpperm,$tmp
1960	vperm		$tweak,$tweak,$inptail,$inpperm
1961
1962	neg		r11,$inp
1963	lvsr		$inpperm,0,r11			# prepare for unaligned load
1964	lvx		$inout,0,$inp
1965	addi		$inp,$inp,15			# 15 is not typo
1966	le?vxor		$inpperm,$inpperm,$tmp
1967
1968	${UCMP}i	$key2,0				# key2==NULL?
1969	beq		Lxts_enc_no_key2
1970
1971	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
1972	lwz		$rounds,240($key2)
1973	srwi		$rounds,$rounds,1
1974	subi		$rounds,$rounds,1
1975	li		$idx,16
1976
1977	lvx		$rndkey0,0,$key2
1978	lvx		$rndkey1,$idx,$key2
1979	addi		$idx,$idx,16
1980	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1981	vxor		$tweak,$tweak,$rndkey0
1982	lvx		$rndkey0,$idx,$key2
1983	addi		$idx,$idx,16
1984	mtctr		$rounds
1985
1986Ltweak_xts_enc:
1987	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1988	vcipher		$tweak,$tweak,$rndkey1
1989	lvx		$rndkey1,$idx,$key2
1990	addi		$idx,$idx,16
1991	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1992	vcipher		$tweak,$tweak,$rndkey0
1993	lvx		$rndkey0,$idx,$key2
1994	addi		$idx,$idx,16
1995	bdnz		Ltweak_xts_enc
1996
1997	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1998	vcipher		$tweak,$tweak,$rndkey1
1999	lvx		$rndkey1,$idx,$key2
2000	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2001	vcipherlast	$tweak,$tweak,$rndkey0
2002
2003	li		$ivp,0				# don't chain the tweak
2004	b		Lxts_enc
2005
2006Lxts_enc_no_key2:
2007	li		$idx,-16
2008	and		$len,$len,$idx			# in "tweak chaining"
2009							# mode only complete
2010							# blocks are processed
2011Lxts_enc:
2012	lvx		$inptail,0,$inp
2013	addi		$inp,$inp,16
2014
2015	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2016	lwz		$rounds,240($key1)
2017	srwi		$rounds,$rounds,1
2018	subi		$rounds,$rounds,1
2019	li		$idx,16
2020
2021	vslb		$eighty7,$seven,$seven		# 0x808080..80
2022	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2023	vspltisb	$tmp,1				# 0x010101..01
2024	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2025
2026	${UCMP}i	$len,96
2027	bge		_aesp8_xts_encrypt6x
2028
2029	andi.		$taillen,$len,15
2030	subic		r0,$len,32
2031	subi		$taillen,$taillen,16
2032	subfe		r0,r0,r0
2033	and		r0,r0,$taillen
2034	add		$inp,$inp,r0
2035
2036	lvx		$rndkey0,0,$key1
2037	lvx		$rndkey1,$idx,$key1
2038	addi		$idx,$idx,16
2039	vperm		$inout,$inout,$inptail,$inpperm
2040	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2041	vxor		$inout,$inout,$tweak
2042	vxor		$inout,$inout,$rndkey0
2043	lvx		$rndkey0,$idx,$key1
2044	addi		$idx,$idx,16
2045	mtctr		$rounds
2046	b		Loop_xts_enc
2047
2048.align	5
2049Loop_xts_enc:
2050	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2051	vcipher		$inout,$inout,$rndkey1
2052	lvx		$rndkey1,$idx,$key1
2053	addi		$idx,$idx,16
2054	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2055	vcipher		$inout,$inout,$rndkey0
2056	lvx		$rndkey0,$idx,$key1
2057	addi		$idx,$idx,16
2058	bdnz		Loop_xts_enc
2059
2060	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2061	vcipher		$inout,$inout,$rndkey1
2062	lvx		$rndkey1,$idx,$key1
2063	li		$idx,16
2064	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2065	vxor		$rndkey0,$rndkey0,$tweak
2066	vcipherlast	$output,$inout,$rndkey0
2067
2068	le?vperm	$tmp,$output,$output,$leperm
2069	be?nop
2070	le?stvx_u	$tmp,0,$out
2071	be?stvx_u	$output,0,$out
2072	addi		$out,$out,16
2073
2074	subic.		$len,$len,16
2075	beq		Lxts_enc_done
2076
2077	vmr		$inout,$inptail
2078	lvx		$inptail,0,$inp
2079	addi		$inp,$inp,16
2080	lvx		$rndkey0,0,$key1
2081	lvx		$rndkey1,$idx,$key1
2082	addi		$idx,$idx,16
2083
2084	subic		r0,$len,32
2085	subfe		r0,r0,r0
2086	and		r0,r0,$taillen
2087	add		$inp,$inp,r0
2088
2089	vsrab		$tmp,$tweak,$seven		# next tweak value
2090	vaddubm		$tweak,$tweak,$tweak
2091	vsldoi		$tmp,$tmp,$tmp,15
2092	vand		$tmp,$tmp,$eighty7
2093	vxor		$tweak,$tweak,$tmp
2094
2095	vperm		$inout,$inout,$inptail,$inpperm
2096	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2097	vxor		$inout,$inout,$tweak
2098	vxor		$output,$output,$rndkey0	# just in case $len<16
2099	vxor		$inout,$inout,$rndkey0
2100	lvx		$rndkey0,$idx,$key1
2101	addi		$idx,$idx,16
2102
2103	mtctr		$rounds
2104	${UCMP}i	$len,16
2105	bge		Loop_xts_enc
2106
2107	vxor		$output,$output,$tweak
2108	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2109	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2110	vspltisb	$tmp,-1
2111	vperm		$inptail,$inptail,$tmp,$inpperm
2112	vsel		$inout,$inout,$output,$inptail
2113
2114	subi		r11,$out,17
2115	subi		$out,$out,16
2116	mtctr		$len
2117	li		$len,16
2118Loop_xts_enc_steal:
2119	lbzu		r0,1(r11)
2120	stb		r0,16(r11)
2121	bdnz		Loop_xts_enc_steal
2122
2123	mtctr		$rounds
2124	b		Loop_xts_enc			# one more time...
2125
2126Lxts_enc_done:
2127	${UCMP}i	$ivp,0
2128	beq		Lxts_enc_ret
2129
2130	vsrab		$tmp,$tweak,$seven		# next tweak value
2131	vaddubm		$tweak,$tweak,$tweak
2132	vsldoi		$tmp,$tmp,$tmp,15
2133	vand		$tmp,$tmp,$eighty7
2134	vxor		$tweak,$tweak,$tmp
2135
2136	le?vperm	$tweak,$tweak,$tweak,$leperm
2137	stvx_u		$tweak,0,$ivp
2138
2139Lxts_enc_ret:
2140	mtspr		256,r12				# restore vrsave
2141	li		r3,0
2142	blr
2143	.long		0
2144	.byte		0,12,0x04,0,0x80,6,6,0
2145	.long		0
2146.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2147
2148.globl	.${prefix}_xts_decrypt
2149.align	5
2150.${prefix}_xts_decrypt:
2151	mr		$inp,r3				# reassign
2152	li		r3,-1
2153	${UCMP}i	$len,16
2154	bltlr-
2155
2156	lis		r0,0xfff8
2157	mfspr		r12,256				# save vrsave
2158	li		r11,0
2159	mtspr		256,r0
2160
2161	andi.		r0,$len,15
2162	neg		r0,r0
2163	andi.		r0,r0,16
2164	sub		$len,$len,r0
2165
2166	vspltisb	$seven,0x07			# 0x070707..07
2167	le?lvsl		$leperm,r11,r11
2168	le?vspltisb	$tmp,0x0f
2169	le?vxor		$leperm,$leperm,$seven
2170
2171	li		$idx,15
2172	lvx		$tweak,0,$ivp			# load [unaligned] iv
2173	lvsl		$inpperm,0,$ivp
2174	lvx		$inptail,$idx,$ivp
2175	le?vxor		$inpperm,$inpperm,$tmp
2176	vperm		$tweak,$tweak,$inptail,$inpperm
2177
2178	neg		r11,$inp
2179	lvsr		$inpperm,0,r11			# prepare for unaligned load
2180	lvx		$inout,0,$inp
2181	addi		$inp,$inp,15			# 15 is not typo
2182	le?vxor		$inpperm,$inpperm,$tmp
2183
2184	${UCMP}i	$key2,0				# key2==NULL?
2185	beq		Lxts_dec_no_key2
2186
2187	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2188	lwz		$rounds,240($key2)
2189	srwi		$rounds,$rounds,1
2190	subi		$rounds,$rounds,1
2191	li		$idx,16
2192
2193	lvx		$rndkey0,0,$key2
2194	lvx		$rndkey1,$idx,$key2
2195	addi		$idx,$idx,16
2196	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2197	vxor		$tweak,$tweak,$rndkey0
2198	lvx		$rndkey0,$idx,$key2
2199	addi		$idx,$idx,16
2200	mtctr		$rounds
2201
2202Ltweak_xts_dec:
2203	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2204	vcipher		$tweak,$tweak,$rndkey1
2205	lvx		$rndkey1,$idx,$key2
2206	addi		$idx,$idx,16
2207	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2208	vcipher		$tweak,$tweak,$rndkey0
2209	lvx		$rndkey0,$idx,$key2
2210	addi		$idx,$idx,16
2211	bdnz		Ltweak_xts_dec
2212
2213	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2214	vcipher		$tweak,$tweak,$rndkey1
2215	lvx		$rndkey1,$idx,$key2
2216	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2217	vcipherlast	$tweak,$tweak,$rndkey0
2218
2219	li		$ivp,0				# don't chain the tweak
2220	b		Lxts_dec
2221
2222Lxts_dec_no_key2:
2223	neg		$idx,$len
2224	andi.		$idx,$idx,15
2225	add		$len,$len,$idx			# in "tweak chaining"
2226							# mode only complete
2227							# blocks are processed
2228Lxts_dec:
2229	lvx		$inptail,0,$inp
2230	addi		$inp,$inp,16
2231
2232	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2233	lwz		$rounds,240($key1)
2234	srwi		$rounds,$rounds,1
2235	subi		$rounds,$rounds,1
2236	li		$idx,16
2237
2238	vslb		$eighty7,$seven,$seven		# 0x808080..80
2239	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2240	vspltisb	$tmp,1				# 0x010101..01
2241	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2242
2243	${UCMP}i	$len,96
2244	bge		_aesp8_xts_decrypt6x
2245
2246	lvx		$rndkey0,0,$key1
2247	lvx		$rndkey1,$idx,$key1
2248	addi		$idx,$idx,16
2249	vperm		$inout,$inout,$inptail,$inpperm
2250	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2251	vxor		$inout,$inout,$tweak
2252	vxor		$inout,$inout,$rndkey0
2253	lvx		$rndkey0,$idx,$key1
2254	addi		$idx,$idx,16
2255	mtctr		$rounds
2256
2257	${UCMP}i	$len,16
2258	blt		Ltail_xts_dec
2259	be?b		Loop_xts_dec
2260
2261.align	5
2262Loop_xts_dec:
2263	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2264	vncipher	$inout,$inout,$rndkey1
2265	lvx		$rndkey1,$idx,$key1
2266	addi		$idx,$idx,16
2267	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2268	vncipher	$inout,$inout,$rndkey0
2269	lvx		$rndkey0,$idx,$key1
2270	addi		$idx,$idx,16
2271	bdnz		Loop_xts_dec
2272
2273	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2274	vncipher	$inout,$inout,$rndkey1
2275	lvx		$rndkey1,$idx,$key1
2276	li		$idx,16
2277	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2278	vxor		$rndkey0,$rndkey0,$tweak
2279	vncipherlast	$output,$inout,$rndkey0
2280
2281	le?vperm	$tmp,$output,$output,$leperm
2282	be?nop
2283	le?stvx_u	$tmp,0,$out
2284	be?stvx_u	$output,0,$out
2285	addi		$out,$out,16
2286
2287	subic.		$len,$len,16
2288	beq		Lxts_dec_done
2289
2290	vmr		$inout,$inptail
2291	lvx		$inptail,0,$inp
2292	addi		$inp,$inp,16
2293	lvx		$rndkey0,0,$key1
2294	lvx		$rndkey1,$idx,$key1
2295	addi		$idx,$idx,16
2296
2297	vsrab		$tmp,$tweak,$seven		# next tweak value
2298	vaddubm		$tweak,$tweak,$tweak
2299	vsldoi		$tmp,$tmp,$tmp,15
2300	vand		$tmp,$tmp,$eighty7
2301	vxor		$tweak,$tweak,$tmp
2302
2303	vperm		$inout,$inout,$inptail,$inpperm
2304	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2305	vxor		$inout,$inout,$tweak
2306	vxor		$inout,$inout,$rndkey0
2307	lvx		$rndkey0,$idx,$key1
2308	addi		$idx,$idx,16
2309
2310	mtctr		$rounds
2311	${UCMP}i	$len,16
2312	bge		Loop_xts_dec
2313
2314Ltail_xts_dec:
2315	vsrab		$tmp,$tweak,$seven		# next tweak value
2316	vaddubm		$tweak1,$tweak,$tweak
2317	vsldoi		$tmp,$tmp,$tmp,15
2318	vand		$tmp,$tmp,$eighty7
2319	vxor		$tweak1,$tweak1,$tmp
2320
2321	subi		$inp,$inp,16
2322	add		$inp,$inp,$len
2323
2324	vxor		$inout,$inout,$tweak		# :-(
2325	vxor		$inout,$inout,$tweak1		# :-)
2326
2327Loop_xts_dec_short:
2328	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2329	vncipher	$inout,$inout,$rndkey1
2330	lvx		$rndkey1,$idx,$key1
2331	addi		$idx,$idx,16
2332	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2333	vncipher	$inout,$inout,$rndkey0
2334	lvx		$rndkey0,$idx,$key1
2335	addi		$idx,$idx,16
2336	bdnz		Loop_xts_dec_short
2337
2338	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2339	vncipher	$inout,$inout,$rndkey1
2340	lvx		$rndkey1,$idx,$key1
2341	li		$idx,16
2342	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2343	vxor		$rndkey0,$rndkey0,$tweak1
2344	vncipherlast	$output,$inout,$rndkey0
2345
2346	le?vperm	$tmp,$output,$output,$leperm
2347	be?nop
2348	le?stvx_u	$tmp,0,$out
2349	be?stvx_u	$output,0,$out
2350
2351	vmr		$inout,$inptail
2352	lvx		$inptail,0,$inp
2353	#addi		$inp,$inp,16
2354	lvx		$rndkey0,0,$key1
2355	lvx		$rndkey1,$idx,$key1
2356	addi		$idx,$idx,16
2357	vperm		$inout,$inout,$inptail,$inpperm
2358	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2359
2360	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2361	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2362	vspltisb	$tmp,-1
2363	vperm		$inptail,$inptail,$tmp,$inpperm
2364	vsel		$inout,$inout,$output,$inptail
2365
2366	vxor		$rndkey0,$rndkey0,$tweak
2367	vxor		$inout,$inout,$rndkey0
2368	lvx		$rndkey0,$idx,$key1
2369	addi		$idx,$idx,16
2370
2371	subi		r11,$out,1
2372	mtctr		$len
2373	li		$len,16
2374Loop_xts_dec_steal:
2375	lbzu		r0,1(r11)
2376	stb		r0,16(r11)
2377	bdnz		Loop_xts_dec_steal
2378
2379	mtctr		$rounds
2380	b		Loop_xts_dec			# one more time...
2381
2382Lxts_dec_done:
2383	${UCMP}i	$ivp,0
2384	beq		Lxts_dec_ret
2385
2386	vsrab		$tmp,$tweak,$seven		# next tweak value
2387	vaddubm		$tweak,$tweak,$tweak
2388	vsldoi		$tmp,$tmp,$tmp,15
2389	vand		$tmp,$tmp,$eighty7
2390	vxor		$tweak,$tweak,$tmp
2391
2392	le?vperm	$tweak,$tweak,$tweak,$leperm
2393	stvx_u		$tweak,0,$ivp
2394
2395Lxts_dec_ret:
2396	mtspr		256,r12				# restore vrsave
2397	li		r3,0
2398	blr
2399	.long		0
2400	.byte		0,12,0x04,0,0x80,6,6,0
2401	.long		0
2402.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2403___
2404#########################################################################
2405{{	# Optimized XTS procedures					#
2406my $key_=$key2;	# $key_ takes over $key2's register; the assembly points it at the on-stack copy of the round keys
2407my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));	# r0, r3, r26-r31: index operands; the assembly loads 0x10..0x70 into $x10..$x70
2408    $x00=0 if ($flavour =~ /osx/);	# osx flavours get a literal 0 in place of the name r0
2409my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5)=map("v$_",(0..5));	# six input blocks: v0-v5
2410my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));	# six cipher states: v7, v12-v16
2411my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));	# six per-block tweaks: v17-v22
2412my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
2413			# v26-v31 last 6 round keys
2414my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
2415my $taillen=$x70;	# length of the trailing partial block ($len & 15); shares r31 with $x70
2416
2417$code.=<<___;
2418.align	5
2419_aesp8_xts_encrypt6x:		# six-blocks-per-pass XTS encryption; continues from the vector state left by the "caller" code
2420	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)	# new frame: FRAME + 21*16 bytes of vector space + 6 GPR slots
2421	mflr		r11			# return address stays in r11 until the mtlr at exit
2422	li		r7,`$FRAME+8*16+15`	# r7 and r3 index the vector save area, 16 bytes apart;
2423	li		r3,`$FRAME+8*16+31`	# each advances by 32 so the stores below alternate
2424	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)	# LR slot, addressed past the frame just opened
2425	stvx		v20,r7,$sp		# ABI says so
2426	addi		r7,r7,32
2427	stvx		v21,r3,$sp
2428	addi		r3,r3,32
2429	stvx		v22,r7,$sp
2430	addi		r7,r7,32
2431	stvx		v23,r3,$sp
2432	addi		r3,r3,32
2433	stvx		v24,r7,$sp
2434	addi		r7,r7,32
2435	stvx		v25,r3,$sp
2436	addi		r3,r3,32
2437	stvx		v26,r7,$sp
2438	addi		r7,r7,32
2439	stvx		v27,r3,$sp
2440	addi		r3,r3,32
2441	stvx		v28,r7,$sp
2442	addi		r7,r7,32
2443	stvx		v29,r3,$sp
2444	addi		r3,r3,32
2445	stvx		v30,r7,$sp
2446	stvx		v31,r3,$sp
2447	li		r0,-1			# all-ones value written to SPR 256 (vrsave) below
2448	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2449	li		$x10,0x10		# r3 is done as a save offset; it becomes the 0x10 index
2450	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)	# r26-r31 are saved as they become the 0x20-0x70 indexes
2451	li		$x20,0x20
2452	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2453	li		$x30,0x30
2454	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2455	li		$x40,0x40
2456	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2457	li		$x50,0x50
2458	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2459	li		$x60,0x60
2460	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2461	li		$x70,0x70
2462	mtspr		256,r0			# SPR 256 (vrsave) = -1
2463
2464	# Reverse eighty7 to 0x010101..87 (assumes tmp still holds the 0x01 splat from the caller - TODO confirm)
2465	xxlor		2, 32+$eighty7, 32+$eighty7	# vs2 = eighty7 as received; copied back after the 6x loop
2466	vsldoi		$eighty7,$tmp,$eighty7,1	# 0x010101..87
2467	xxlor		1, 32+$eighty7, 32+$eighty7	# vs1 = reversed eighty7; reloaded inside the 6x loop
2468
2469	# Load XOR contents (16 bytes). 0x0f102132435465768798a9bacbdcedfe
2470	mr		$x70, r6		# park r6 in x70 across the call
2471	bl		Lconsts			# NOTE(review): presumably leaves the constants address in r6 - confirm in Lconsts
2472	lxvw4x		0, $x40, r6		# load XOR contents
2473	mr		r6, $x70		# put r6 back
2474	li		$x70,0x70		# x70 is the 0x70 index again
2475
2476	subi		$rounds,$rounds,3	# -4 in total
2477
2478	lvx		$rndkey0,$x00,$key1	# load key schedule
2479	lvx		v30,$x10,$key1
2480	addi		$key1,$key1,0x20
2481	lvx		v31,$x00,$key1
2482	?vperm		$rndkey0,$rndkey0,v30,$keyperm	# rndkey0 = first round key, realigned through keyperm
2483	addi		$key_,$sp,$FRAME+15	# key_ -> stack area that receives the realigned round keys
2484	mtctr		$rounds			# CTR = pass count for the key staging loop
2485
2486Load_xts_enc_key:			# each pass realigns two round keys and parks them on the stack
2487	?vperm		v24,v30,v31,$keyperm
2488	lvx		v30,$x10,$key1
2489	addi		$key1,$key1,0x20
2490	stvx		v24,$x00,$key_		# off-load round[1]
2491	?vperm		v25,v31,v30,$keyperm
2492	lvx		v31,$x00,$key1
2493	stvx		v25,$x10,$key_		# off-load round[2]
2494	addi		$key_,$key_,0x20
2495	bdnz		Load_xts_enc_key
2496
2497	lvx		v26,$x10,$key1		# end of the schedule: two more keys go to the stack, six stay in v26-v31
2498	?vperm		v24,v30,v31,$keyperm
2499	lvx		v27,$x20,$key1
2500	stvx		v24,$x00,$key_		# off-load round[3]
2501	?vperm		v25,v31,v26,$keyperm
2502	lvx		v28,$x30,$key1
2503	stvx		v25,$x10,$key_		# off-load round[4]
2504	addi		$key_,$sp,$FRAME+15	# rewind $key_
2505	?vperm		v26,v26,v27,$keyperm
2506	lvx		v29,$x40,$key1
2507	?vperm		v27,v27,v28,$keyperm
2508	lvx		v30,$x50,$key1
2509	?vperm		v28,v28,v29,$keyperm
2510	lvx		v31,$x60,$key1
2511	?vperm		v29,v29,v30,$keyperm
2512	lvx		$twk5,$x70,$key1	# borrow $twk5
2513	?vperm		v30,v30,v31,$keyperm
2514	lvx		v24,$x00,$key_		# pre-load round[1]
2515	?vperm		v31,v31,$twk5,$keyperm	# v31 = last round key
2516	lvx		v25,$x10,$key_		# pre-load round[2]
2517
2518	# From here on each next tweak is generated with the sequence below,
2519	# which needs the reversed constant eighty7 = 0x010101..87:
2520	# vsrab		tmp, tweak, seven	# next tweak value, right shift 7 bits
2521	# vand		tmp, tmp, eighty7	# last byte with carry
2522	# vaddubm	tweak, tweak, tweak	# left shift 1 bit (x2)
2523	# xxlor		vsx, 0, 0		# copy the vs0 constant into a scratch vector register
2524	# vpermxor	tweak, tweak, tmp, vsx	# NOTE(review): appears to stand in for the vsldoi+vxor pair of the one-block path - confirm
2525
2526	 vperm		$in0,$inout,$inptail,$inpperm	# block 0 is assembled from vectors the caller already loaded
2527	 subi		$inp,$inp,31		# undo "caller"
2528	vxor		$twk0,$tweak,$rndkey0	# twk0 = tweak xor first round key, so one xor applies both
2529	vsrab		$tmp,$tweak,$seven	# next tweak value
2530	vaddubm		$tweak,$tweak,$tweak
2531	vand		$tmp,$tmp,$eighty7
2532	 vxor		$out0,$in0,$twk0	# state 0 = input xor tweak xor first round key
2533	xxlor		32+$in1, 0, 0		# in1 is not loaded yet: borrow it for the vs0 constant
2534	vpermxor	$tweak, $tweak, $tmp, $in1	# tweak now holds the value for block 1
2535
2536	 lvx_u		$in1,$x10,$inp
2537	vxor		$twk1,$tweak,$rndkey0
2538	vsrab		$tmp,$tweak,$seven	# next tweak value
2539	vaddubm		$tweak,$tweak,$tweak
2540	 le?vperm	$in1,$in1,$in1,$leperm
2541	vand		$tmp,$tmp,$eighty7
2542	 vxor		$out1,$in1,$twk1
2543	xxlor		32+$in2, 0, 0
2544	vpermxor	$tweak, $tweak, $tmp, $in2
2545
2546	 lvx_u		$in2,$x20,$inp
2547	 andi.		$taillen,$len,15	# taillen = size of the trailing partial block (reuses the x70 register)
2548	vxor		$twk2,$tweak,$rndkey0
2549	vsrab		$tmp,$tweak,$seven	# next tweak value
2550	vaddubm		$tweak,$tweak,$tweak
2551	 le?vperm	$in2,$in2,$in2,$leperm
2552	vand		$tmp,$tmp,$eighty7
2553	 vxor		$out2,$in2,$twk2
2554	xxlor		32+$in3, 0, 0
2555	vpermxor	$tweak, $tweak, $tmp, $in3
2556
2557	 lvx_u		$in3,$x30,$inp
2558	 sub		$len,$len,$taillen	# len = byte count of whole blocks only
2559	vxor		$twk3,$tweak,$rndkey0
2560	vsrab		$tmp,$tweak,$seven	# next tweak value
2561	vaddubm		$tweak,$tweak,$tweak
2562	 le?vperm	$in3,$in3,$in3,$leperm
2563	vand		$tmp,$tmp,$eighty7
2564	 vxor		$out3,$in3,$twk3
2565	xxlor		32+$in4, 0, 0
2566	vpermxor	$tweak, $tweak, $tmp, $in4
2567
2568	 lvx_u		$in4,$x40,$inp
2569	 subi		$len,$len,0x60		# the six blocks loaded here are accounted for
2570	vxor		$twk4,$tweak,$rndkey0
2571	vsrab		$tmp,$tweak,$seven	# next tweak value
2572	vaddubm		$tweak,$tweak,$tweak
2573	 le?vperm	$in4,$in4,$in4,$leperm
2574	vand		$tmp,$tmp,$eighty7
2575	 vxor		$out4,$in4,$twk4
2576	xxlor		32+$in5, 0, 0
2577	vpermxor	$tweak, $tweak, $tmp, $in5
2578
2579	 lvx_u		$in5,$x50,$inp
2580	 addi		$inp,$inp,0x60		# inp -> next group of six blocks
2581	vxor		$twk5,$tweak,$rndkey0
2582	vsrab		$tmp,$tweak,$seven	# next tweak value
2583	vaddubm		$tweak,$tweak,$tweak
2584	 le?vperm	$in5,$in5,$in5,$leperm
2585	vand		$tmp,$tmp,$eighty7
2586	 vxor		$out5,$in5,$twk5
2587	xxlor		32+$in0, 0, 0		# in0 was already consumed into out0, so it is free as scratch
2588	vpermxor	$tweak, $tweak, $tmp, $in0
2589
2590	vxor		v31,v31,$rndkey0	# pre-xor the last round key with the first one; cancels the copy folded into each twkN
2591	mtctr		$rounds
2592	b		Loop_xts_enc6x
2593
2594.align	5
2595Loop_xts_enc6x:			# two AES rounds on all six states per pass; v24/v25 are refilled from the stack copy
2596	vcipher		$out0,$out0,v24
2597	vcipher		$out1,$out1,v24
2598	vcipher		$out2,$out2,v24
2599	vcipher		$out3,$out3,v24
2600	vcipher		$out4,$out4,v24
2601	vcipher		$out5,$out5,v24
2602	lvx		v24,$x20,$key_		# round[3]
2603	addi		$key_,$key_,0x20
2604
2605	vcipher		$out0,$out0,v25
2606	vcipher		$out1,$out1,v25
2607	vcipher		$out2,$out2,v25
2608	vcipher		$out3,$out3,v25
2609	vcipher		$out4,$out4,v25
2610	vcipher		$out5,$out5,v25
2611	lvx		v25,$x10,$key_		# round[4]
2612	bdnz		Loop_xts_enc6x
2613
2614	xxlor		32+$eighty7, 1, 1		# 0x010101..87
2615
2616	subic		$len,$len,96		# $len-=96
2617	 vxor		$in0,$twk0,v31		# xor with last round key
2618	vcipher		$out0,$out0,v24
2619	vcipher		$out1,$out1,v24
2620	 vsrab		$tmp,$tweak,$seven	# next tweak value
2621	 vxor		$twk0,$tweak,$rndkey0	# twk0 for the next group; the old value lives on in in0
2622	 vaddubm	$tweak,$tweak,$tweak
2623	vcipher		$out2,$out2,v24
2624	vcipher		$out3,$out3,v24
2625	vcipher		$out4,$out4,v24
2626	vcipher		$out5,$out5,v24
2627
2628	subfe.		r0,r0,r0		# borrow?-1:0
2629	 vand		$tmp,$tmp,$eighty7
2630	vcipher		$out0,$out0,v25
2631	vcipher		$out1,$out1,v25
2632	 xxlor		32+$in1, 0, 0
2633	 vpermxor	$tweak, $tweak, $tmp, $in1
2634	vcipher		$out2,$out2,v25
2635	vcipher		$out3,$out3,v25
2636	 vxor		$in1,$twk1,v31
2637	 vsrab		$tmp,$tweak,$seven	# next tweak value
2638	 vxor		$twk1,$tweak,$rndkey0
2639	vcipher		$out4,$out4,v25
2640	vcipher		$out5,$out5,v25
2641
2642	and		r0,r0,$len		# r0 = the negative len when the subtraction borrowed, else 0
2643	 vaddubm	$tweak,$tweak,$tweak
2644	vcipher		$out0,$out0,v26
2645	vcipher		$out1,$out1,v26
2646	 vand		$tmp,$tmp,$eighty7
2647	vcipher		$out2,$out2,v26
2648	vcipher		$out3,$out3,v26
2649	 xxlor		32+$in2, 0, 0
2650	 vpermxor	$tweak, $tweak, $tmp, $in2
2651	vcipher		$out4,$out4,v26
2652	vcipher		$out5,$out5,v26
2653
2654	add		$inp,$inp,r0		# $inp is adjusted in such
2655						# way that at exit from the
2656						# loop inX-in5 are loaded
2657						# with last "words"
2658	 vxor		$in2,$twk2,v31
2659	 vsrab		$tmp,$tweak,$seven	# next tweak value
2660	 vxor		$twk2,$tweak,$rndkey0
2661	 vaddubm	$tweak,$tweak,$tweak
2662	vcipher		$out0,$out0,v27
2663	vcipher		$out1,$out1,v27
2664	vcipher		$out2,$out2,v27
2665	vcipher		$out3,$out3,v27
2666	 vand		$tmp,$tmp,$eighty7
2667	vcipher		$out4,$out4,v27
2668	vcipher		$out5,$out5,v27
2669
2670	addi		$key_,$sp,$FRAME+15	# rewind $key_
2671	 xxlor		32+$in3, 0, 0
2672	 vpermxor	$tweak, $tweak, $tmp, $in3
2673	vcipher		$out0,$out0,v28
2674	vcipher		$out1,$out1,v28
2675	 vxor		$in3,$twk3,v31
2676	 vsrab		$tmp,$tweak,$seven	# next tweak value
2677	 vxor		$twk3,$tweak,$rndkey0
2678	vcipher		$out2,$out2,v28
2679	vcipher		$out3,$out3,v28
2680	 vaddubm	$tweak,$tweak,$tweak
2681	vcipher		$out4,$out4,v28
2682	vcipher		$out5,$out5,v28
2683	lvx		v24,$x00,$key_		# re-pre-load round[1]
2684	 vand		$tmp,$tmp,$eighty7
2685
2686	vcipher		$out0,$out0,v29
2687	vcipher		$out1,$out1,v29
2688	 xxlor		32+$in4, 0, 0
2689	 vpermxor	$tweak, $tweak, $tmp, $in4
2690	vcipher		$out2,$out2,v29
2691	vcipher		$out3,$out3,v29
2692	 vxor		$in4,$twk4,v31
2693	 vsrab		$tmp,$tweak,$seven	# next tweak value
2694	 vxor		$twk4,$tweak,$rndkey0
2695	vcipher		$out4,$out4,v29
2696	vcipher		$out5,$out5,v29
2697	lvx		v25,$x10,$key_		# re-pre-load round[2]
2698	 vaddubm	$tweak,$tweak,$tweak
2699
2700	vcipher		$out0,$out0,v30
2701	vcipher		$out1,$out1,v30
2702	 vand		$tmp,$tmp,$eighty7
2703	vcipher		$out2,$out2,v30
2704	vcipher		$out3,$out3,v30
2705	 xxlor		32+$in5, 0, 0
2706	 vpermxor	$tweak, $tweak, $tmp, $in5
2707	vcipher		$out4,$out4,v30
2708	vcipher		$out5,$out5,v30
2709	 vxor		$in5,$twk5,v31
2710	 vsrab		$tmp,$tweak,$seven	# next tweak value
2711	 vxor		$twk5,$tweak,$rndkey0
2712
2713	vcipherlast	$out0,$out0,$in0	# final round; in0-in5 carry tweak xor last round key
2714	 lvx_u		$in0,$x00,$inp		# load next input block
2715	 vaddubm	$tweak,$tweak,$tweak
2716	vcipherlast	$out1,$out1,$in1
2717	 lvx_u		$in1,$x10,$inp
2718	vcipherlast	$out2,$out2,$in2
2719	 le?vperm	$in0,$in0,$in0,$leperm
2720	 lvx_u		$in2,$x20,$inp
2721	 vand		$tmp,$tmp,$eighty7
2722	vcipherlast	$out3,$out3,$in3
2723	 le?vperm	$in1,$in1,$in1,$leperm
2724	 lvx_u		$in3,$x30,$inp
2725	vcipherlast	$out4,$out4,$in4
2726	 le?vperm	$in2,$in2,$in2,$leperm
2727	 lvx_u		$in4,$x40,$inp
2728	 xxlor		10, 32+$in0, 32+$in0	# in0 already holds fresh input: park it in vs10
2729	 xxlor		32+$in0, 0, 0		# so in0 can carry the vs0 constant for one instruction
2730	 vpermxor	$tweak, $tweak, $tmp, $in0
2731	 xxlor		32+$in0, 10, 10		# bring the parked input back
2732	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2733						# in stealing mode
2734	 le?vperm	$in3,$in3,$in3,$leperm
2735	 lvx_u		$in5,$x50,$inp
2736	 addi		$inp,$inp,0x60
2737	 le?vperm	$in4,$in4,$in4,$leperm
2738	 le?vperm	$in5,$in5,$in5,$leperm
2739
2740	le?vperm	$out0,$out0,$out0,$leperm
2741	le?vperm	$out1,$out1,$out1,$leperm
2742	stvx_u		$out0,$x00,$out		# store output
2743	 vxor		$out0,$in0,$twk0	# start the next group: input xor tweak xor first round key
2744	le?vperm	$out2,$out2,$out2,$leperm
2745	stvx_u		$out1,$x10,$out
2746	 vxor		$out1,$in1,$twk1
2747	le?vperm	$out3,$out3,$out3,$leperm
2748	stvx_u		$out2,$x20,$out
2749	 vxor		$out2,$in2,$twk2
2750	le?vperm	$out4,$out4,$out4,$leperm
2751	stvx_u		$out3,$x30,$out
2752	 vxor		$out3,$in3,$twk3
2753	le?vperm	$out5,$tmp,$tmp,$leperm	# tmp itself stays unpermuted for the stealing path
2754	stvx_u		$out4,$x40,$out
2755	 vxor		$out4,$in4,$twk4
2756	le?stvx_u	$out5,$x50,$out
2757	be?stvx_u	$tmp, $x50,$out
2758	 vxor		$out5,$in5,$twk5
2759	addi		$out,$out,0x60
2760
2761	mtctr		$rounds			# CTR is reloaded for whichever path runs next; CR0 below is still the one set by subfe.
2762	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2763
2764	xxlor		32+$eighty7, 2, 2		# 0x870101..01
2765
2766	addic.		$len,$len,0x60		# undo the last subtraction: len = bytes of whole blocks still pending
2767	beq		Lxts_enc6x_zero		# 0x00
2768	cmpwi		$len,0x20
2769	blt		Lxts_enc6x_one		# 0x10
2770	nop
2771	beq		Lxts_enc6x_two		# 0x20
2772	cmpwi		$len,0x40
2773	blt		Lxts_enc6x_three	# 0x30
2774	nop
2775	beq		Lxts_enc6x_four		# 0x40; anything larger falls through to the five-block tail
2776
2777Lxts_enc6x_five:			# five whole blocks pending; they sit in in1-in5 because inp was backed up in the loop
2778	vxor		$out0,$in1,$twk0
2779	vxor		$out1,$in2,$twk1
2780	vxor		$out2,$in3,$twk2
2781	vxor		$out3,$in4,$twk3
2782	vxor		$out4,$in5,$twk4
2783
2784	bl		_aesp8_xts_enc5x	# also compares taillen with 0 and preloads in0/inpperm for stealing
2785
2786	le?vperm	$out0,$out0,$out0,$leperm
2787	vmr		$twk0,$twk5		# unused tweak
2788	le?vperm	$out1,$out1,$out1,$leperm
2789	stvx_u		$out0,$x00,$out		# store output
2790	le?vperm	$out2,$out2,$out2,$leperm
2791	stvx_u		$out1,$x10,$out
2792	le?vperm	$out3,$out3,$out3,$leperm
2793	stvx_u		$out2,$x20,$out
2794	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2795	le?vperm	$out4,$out4,$out4,$leperm
2796	stvx_u		$out3,$x30,$out
2797	stvx_u		$out4,$x40,$out
2798	addi		$out,$out,0x50
2799	bne		Lxts_enc6x_steal	# taillen is non-zero (CR0 from the cmpwi inside _aesp8_xts_enc5x)
2800	b		Lxts_enc6x_done
2801
2802.align	4
2803Lxts_enc6x_four:			# four whole blocks pending, held in in2-in5
2804	vxor		$out0,$in2,$twk0
2805	vxor		$out1,$in3,$twk1
2806	vxor		$out2,$in4,$twk2
2807	vxor		$out3,$in5,$twk3
2808	vxor		$out4,$out4,$out4	# lane 4 carries no data here: cleared
2809
2810	bl		_aesp8_xts_enc5x	# runs all five lanes; the lane 4 result is not stored
2811
2812	le?vperm	$out0,$out0,$out0,$leperm
2813	vmr		$twk0,$twk4		# unused tweak
2814	le?vperm	$out1,$out1,$out1,$leperm
2815	stvx_u		$out0,$x00,$out		# store output
2816	le?vperm	$out2,$out2,$out2,$leperm
2817	stvx_u		$out1,$x10,$out
2818	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2819	le?vperm	$out3,$out3,$out3,$leperm
2820	stvx_u		$out2,$x20,$out
2821	stvx_u		$out3,$x30,$out
2822	addi		$out,$out,0x40
2823	bne		Lxts_enc6x_steal	# taillen is non-zero (CR0 from the cmpwi inside _aesp8_xts_enc5x)
2824	b		Lxts_enc6x_done
2825
2826.align	4
2827Lxts_enc6x_three:			# three whole blocks pending, held in in3-in5
2828	vxor		$out0,$in3,$twk0
2829	vxor		$out1,$in4,$twk1
2830	vxor		$out2,$in5,$twk2
2831	vxor		$out3,$out3,$out3	# lanes 3 and 4 carry no data here: cleared
2832	vxor		$out4,$out4,$out4
2833
2834	bl		_aesp8_xts_enc5x	# runs all five lanes; only lanes 0-2 are stored
2835
2836	le?vperm	$out0,$out0,$out0,$leperm
2837	vmr		$twk0,$twk3		# unused tweak
2838	le?vperm	$out1,$out1,$out1,$leperm
2839	stvx_u		$out0,$x00,$out		# store output
2840	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2841	le?vperm	$out2,$out2,$out2,$leperm
2842	stvx_u		$out1,$x10,$out
2843	stvx_u		$out2,$x20,$out
2844	addi		$out,$out,0x30
2845	bne		Lxts_enc6x_steal	# taillen is non-zero (CR0 from the cmpwi inside _aesp8_xts_enc5x)
2846	b		Lxts_enc6x_done
2847
2848.align	4
2849Lxts_enc6x_two:				# two whole blocks pending, held in in4-in5
2850	vxor		$out0,$in4,$twk0
2851	vxor		$out1,$in5,$twk1
2852	vxor		$out2,$out2,$out2	# lanes 2-4 carry no data here: cleared
2853	vxor		$out3,$out3,$out3
2854	vxor		$out4,$out4,$out4
2855
2856	bl		_aesp8_xts_enc5x	# runs all five lanes; only lanes 0-1 are stored
2857
2858	le?vperm	$out0,$out0,$out0,$leperm
2859	vmr		$twk0,$twk2		# unused tweak
2860	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2861	le?vperm	$out1,$out1,$out1,$leperm
2862	stvx_u		$out0,$x00,$out		# store output
2863	stvx_u		$out1,$x10,$out
2864	addi		$out,$out,0x20
2865	bne		Lxts_enc6x_steal	# taillen is non-zero (CR0 from the cmpwi inside _aesp8_xts_enc5x)
2866	b		Lxts_enc6x_done
2867
2868.align	4
2869Lxts_enc6x_one:				# one whole block pending, held in in5
2870	vxor		$out0,$in5,$twk0
2871	nop
2872Loop_xts_enc1x:				# single-state rounds; also the re-entry point after ciphertext stealing
2873	vcipher		$out0,$out0,v24
2874	lvx		v24,$x20,$key_		# round[3]
2875	addi		$key_,$key_,0x20
2876
2877	vcipher		$out0,$out0,v25
2878	lvx		v25,$x10,$key_		# round[4]
2879	bdnz		Loop_xts_enc1x
2880
2881	add		$inp,$inp,$taillen
2882	cmpwi		$taillen,0		# CR0 feeds the bne at the end of this block
2883	vcipher		$out0,$out0,v24
2884
2885	subi		$inp,$inp,16		# inp = inp + taillen - 16, the block read for stealing
2886	vcipher		$out0,$out0,v25
2887
2888	lvsr		$inpperm,0,$taillen	# permute control derived from taillen
2889	vcipher		$out0,$out0,v26
2890
2891	lvx_u		$in0,0,$inp
2892	vcipher		$out0,$out0,v27
2893
2894	addi		$key_,$sp,$FRAME+15	# rewind $key_
2895	vcipher		$out0,$out0,v28
2896	lvx		v24,$x00,$key_		# re-pre-load round[1]
2897
2898	vcipher		$out0,$out0,v29
2899	lvx		v25,$x10,$key_		# re-pre-load round[2]
2900	 vxor		$twk0,$twk0,v31		# twk0 becomes tweak xor last round key for vcipherlast
2901
2902	le?vperm	$in0,$in0,$in0,$leperm
2903	vcipher		$out0,$out0,v30
2904
2905	vperm		$in0,$in0,$in0,$inpperm	# rotate in0 by the taillen-derived control
2906	vcipherlast	$out0,$out0,$twk0
2907
2908	vmr		$twk0,$twk1		# unused tweak
2909	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2910	le?vperm	$out0,$out0,$out0,$leperm
2911	stvx_u		$out0,$x00,$out		# store output
2912	addi		$out,$out,0x10
2913	bne		Lxts_enc6x_steal	# taillen is non-zero; on the re-entry pass taillen is 0 so this falls through
2914	b		Lxts_enc6x_done
2915
2916.align	4
2917Lxts_enc6x_zero:			# no whole block pending after the 6x loop
2918	cmpwi		$taillen,0
2919	beq		Lxts_enc6x_done		# no partial block either: finished
2920
2921	add		$inp,$inp,$taillen
2922	subi		$inp,$inp,16		# inp = inp + taillen - 16, the block read for stealing
2923	lvx_u		$in0,0,$inp
2924	lvsr		$inpperm,0,$taillen	# $in5 is no more
2925	le?vperm	$in0,$in0,$in0,$leperm
2926	vperm		$in0,$in0,$in0,$inpperm	# rotate in0 by the taillen-derived control
2927	vxor		$tmp,$tmp,$twk0		# tmp is the last cipher block from the loop; same pre-xor the other tails apply
2928Lxts_enc6x_steal:			# ciphertext stealing: merge the partial input with the tail of the last cipher block
2929	vxor		$in0,$in0,$twk0		# tweak (with first round key folded in) applied to the rotated input
2930	vxor		$out0,$out0,$out0
2931	vspltisb	$out1,-1
2932	vperm		$out0,$out0,$out1,$inpperm	# byte mask built from all-zero and all-ones vectors via inpperm
2933	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2934
2935	subi		r30,$out,17		# r30 = out - 17: byte cursor for the pre-incrementing lbzu (r30 was the x60 index)
2936	subi		$out,$out,16		# out steps back to the last whole output block, which gets rewritten
2937	mtctr		$taillen
2938Loop_xts_enc6x_steal:			# copy taillen bytes of the last whole output block to the 16 bytes that follow it
2939	lbzu		r0,1(r30)
2940	stb		r0,16(r30)
2941	bdnz		Loop_xts_enc6x_steal
2942
2943	li		$taillen,0		# the re-entry pass must not steal again
2944	mtctr		$rounds
2945	b		Loop_xts_enc1x		# one more time...
2946
2947.align	4
2948Lxts_enc6x_done:			# optional tweak write-back, then the common exit
2949	${UCMP}i	$ivp,0
2950	beq		Lxts_enc6x_ret		# ivp is 0: no tweak is written back
2951
2952	vxor		$tweak,$twk0,$rndkey0	# strip the first round key that was folded into twk0
2953	le?vperm	$tweak,$tweak,$tweak,$leperm
2954	stvx_u		$tweak,0,$ivp		# next unused tweak goes to the ivp buffer
2955
2956Lxts_enc6x_ret:
2957	mtlr		r11			# return address restored first; r11 then serves as an offset
2958	li		r10,`$FRAME+15`
2959	li		r11,`$FRAME+31`
2960	stvx		$seven,r10,$sp		# wipe copies of round keys
2961	addi		r10,r10,32
2962	stvx		$seven,r11,$sp
2963	addi		r11,r11,32
2964	stvx		$seven,r10,$sp
2965	addi		r10,r10,32
2966	stvx		$seven,r11,$sp
2967	addi		r11,r11,32
2968	stvx		$seven,r10,$sp
2969	addi		r10,r10,32
2970	stvx		$seven,r11,$sp
2971	addi		r11,r11,32
2972	stvx		$seven,r10,$sp
2973	addi		r10,r10,32
2974	stvx		$seven,r11,$sp
2975	addi		r11,r11,32		# after eight 16-byte stores r10/r11 sit at the v20/v21 save offsets of the prologue
2976
2977	mtspr		256,$vrsave		# SPR 256 gets the vrsave value back
2978	lvx		v20,r10,$sp		# ABI says so
2979	addi		r10,r10,32
2980	lvx		v21,r11,$sp
2981	addi		r11,r11,32
2982	lvx		v22,r10,$sp
2983	addi		r10,r10,32
2984	lvx		v23,r11,$sp
2985	addi		r11,r11,32
2986	lvx		v24,r10,$sp
2987	addi		r10,r10,32
2988	lvx		v25,r11,$sp
2989	addi		r11,r11,32
2990	lvx		v26,r10,$sp
2991	addi		r10,r10,32
2992	lvx		v27,r11,$sp
2993	addi		r11,r11,32
2994	lvx		v28,r10,$sp
2995	addi		r10,r10,32
2996	lvx		v29,r11,$sp
2997	addi		r11,r11,32
2998	lvx		v30,r10,$sp
2999	lvx		v31,r11,$sp
3000	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)	# r26-r31 back from the slots filled in the prologue
3001	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3002	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3003	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3004	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3005	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3006	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`	# drop the frame opened at entry
3007	blr
3008	.long		0			# NOTE(review): these three data lines look like a traceback table - confirm
3009	.byte		0,12,0x04,1,0x80,6,6,0
3010	.long		0
3011
3012.align	5
3013_aesp8_xts_enc5x:			# tail helper: finishes AES on out0-out4; expects CTR preset and leaves CR0 = taillen compared with 0
3014	vcipher		$out0,$out0,v24
3015	vcipher		$out1,$out1,v24
3016	vcipher		$out2,$out2,v24
3017	vcipher		$out3,$out3,v24
3018	vcipher		$out4,$out4,v24
3019	lvx		v24,$x20,$key_		# round[3]
3020	addi		$key_,$key_,0x20
3021
3022	vcipher		$out0,$out0,v25
3023	vcipher		$out1,$out1,v25
3024	vcipher		$out2,$out2,v25
3025	vcipher		$out3,$out3,v25
3026	vcipher		$out4,$out4,v25
3027	lvx		v25,$x10,$key_		# round[4]
3028	bdnz		_aesp8_xts_enc5x	# the entry label doubles as the loop head
3029
3030	add		$inp,$inp,$taillen
3031	cmpwi		$taillen,0		# CR0 is consumed by the bne that follows each call site
3032	vcipher		$out0,$out0,v24
3033	vcipher		$out1,$out1,v24
3034	vcipher		$out2,$out2,v24
3035	vcipher		$out3,$out3,v24
3036	vcipher		$out4,$out4,v24
3037
3038	subi		$inp,$inp,16		# inp = inp + taillen - 16, the block read for stealing
3039	vcipher		$out0,$out0,v25
3040	vcipher		$out1,$out1,v25
3041	vcipher		$out2,$out2,v25
3042	vcipher		$out3,$out3,v25
3043	vcipher		$out4,$out4,v25
3044	 vxor		$twk0,$twk0,v31		# twk0 becomes tweak xor last round key for vcipherlast
3045
3046	vcipher		$out0,$out0,v26
3047	lvsr		$inpperm,0,$taillen	# $in5 is no more
3048	vcipher		$out1,$out1,v26
3049	vcipher		$out2,$out2,v26
3050	vcipher		$out3,$out3,v26
3051	vcipher		$out4,$out4,v26
3052	 vxor		$in1,$twk1,v31		# in1-in4 are free by now and carry the last-round operands
3053
3054	vcipher		$out0,$out0,v27
3055	lvx_u		$in0,0,$inp
3056	vcipher		$out1,$out1,v27
3057	vcipher		$out2,$out2,v27
3058	vcipher		$out3,$out3,v27
3059	vcipher		$out4,$out4,v27
3060	 vxor		$in2,$twk2,v31
3061
3062	addi		$key_,$sp,$FRAME+15	# rewind $key_
3063	vcipher		$out0,$out0,v28
3064	vcipher		$out1,$out1,v28
3065	vcipher		$out2,$out2,v28
3066	vcipher		$out3,$out3,v28
3067	vcipher		$out4,$out4,v28
3068	lvx		v24,$x00,$key_		# re-pre-load round[1]
3069	 vxor		$in3,$twk3,v31
3070
3071	vcipher		$out0,$out0,v29
3072	le?vperm	$in0,$in0,$in0,$leperm
3073	vcipher		$out1,$out1,v29
3074	vcipher		$out2,$out2,v29
3075	vcipher		$out3,$out3,v29
3076	vcipher		$out4,$out4,v29
3077	lvx		v25,$x10,$key_		# re-pre-load round[2]
3078	 vxor		$in4,$twk4,v31
3079
3080	vcipher		$out0,$out0,v30
3081	vperm		$in0,$in0,$in0,$inpperm	# rotate in0 by the taillen-derived control, ready for the stealing path
3082	vcipher		$out1,$out1,v30
3083	vcipher		$out2,$out2,v30
3084	vcipher		$out3,$out3,v30
3085	vcipher		$out4,$out4,v30
3086
3087	vcipherlast	$out0,$out0,$twk0
3088	vcipherlast	$out1,$out1,$in1
3089	vcipherlast	$out2,$out2,$in2
3090	vcipherlast	$out3,$out3,$in3
3091	vcipherlast	$out4,$out4,$in4
3092	blr
3093        .long   	0
3094        .byte   	0,12,0x14,0,0,0,0,0
3095
3096.align	5
3097_aesp8_xts_decrypt6x:
3098	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3099	mflr		r11
3100	li		r7,`$FRAME+8*16+15`
3101	li		r3,`$FRAME+8*16+31`
3102	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3103	stvx		v20,r7,$sp		# ABI says so
3104	addi		r7,r7,32
3105	stvx		v21,r3,$sp
3106	addi		r3,r3,32
3107	stvx		v22,r7,$sp
3108	addi		r7,r7,32
3109	stvx		v23,r3,$sp
3110	addi		r3,r3,32
3111	stvx		v24,r7,$sp
3112	addi		r7,r7,32
3113	stvx		v25,r3,$sp
3114	addi		r3,r3,32
3115	stvx		v26,r7,$sp
3116	addi		r7,r7,32
3117	stvx		v27,r3,$sp
3118	addi		r3,r3,32
3119	stvx		v28,r7,$sp
3120	addi		r7,r7,32
3121	stvx		v29,r3,$sp
3122	addi		r3,r3,32
3123	stvx		v30,r7,$sp
3124	stvx		v31,r3,$sp
3125	li		r0,-1
3126	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3127	li		$x10,0x10
3128	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3129	li		$x20,0x20
3130	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3131	li		$x30,0x30
3132	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3133	li		$x40,0x40
3134	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3135	li		$x50,0x50
3136	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3137	li		$x60,0x60
3138	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3139	li		$x70,0x70
3140	mtspr		256,r0
3141
3142	# Reverse eighty7 to 0x010101..87
3143	xxlor		2, 32+$eighty7, 32+$eighty7
3144	vsldoi		$eighty7,$tmp,$eighty7,1	# 0x010101..87
3145	xxlor		1, 32+$eighty7, 32+$eighty7
3146
3147	# Load XOR contents. 0xf102132435465768798a9bacbdcedfe
3148	mr		$x70, r6
3149	bl		Lconsts
3150	lxvw4x		0, $x40, r6		# load XOR contents
3151	mr		r6, $x70
3152	li		$x70,0x70
3153
3154	subi		$rounds,$rounds,3	# -4 in total
3155
3156	lvx		$rndkey0,$x00,$key1	# load key schedule
3157	lvx		v30,$x10,$key1
3158	addi		$key1,$key1,0x20
3159	lvx		v31,$x00,$key1
3160	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3161	addi		$key_,$sp,$FRAME+15
3162	mtctr		$rounds
3163
3164Load_xts_dec_key:
3165	?vperm		v24,v30,v31,$keyperm
3166	lvx		v30,$x10,$key1
3167	addi		$key1,$key1,0x20
3168	stvx		v24,$x00,$key_		# off-load round[1]
3169	?vperm		v25,v31,v30,$keyperm
3170	lvx		v31,$x00,$key1
3171	stvx		v25,$x10,$key_		# off-load round[2]
3172	addi		$key_,$key_,0x20
3173	bdnz		Load_xts_dec_key
3174
3175	lvx		v26,$x10,$key1
3176	?vperm		v24,v30,v31,$keyperm
3177	lvx		v27,$x20,$key1
3178	stvx		v24,$x00,$key_		# off-load round[3]
3179	?vperm		v25,v31,v26,$keyperm
3180	lvx		v28,$x30,$key1
3181	stvx		v25,$x10,$key_		# off-load round[4]
3182	addi		$key_,$sp,$FRAME+15	# rewind $key_
3183	?vperm		v26,v26,v27,$keyperm
3184	lvx		v29,$x40,$key1
3185	?vperm		v27,v27,v28,$keyperm
3186	lvx		v30,$x50,$key1
3187	?vperm		v28,v28,v29,$keyperm
3188	lvx		v31,$x60,$key1
3189	?vperm		v29,v29,v30,$keyperm
3190	lvx		$twk5,$x70,$key1	# borrow $twk5
3191	?vperm		v30,v30,v31,$keyperm
3192	lvx		v24,$x00,$key_		# pre-load round[1]
3193	?vperm		v31,v31,$twk5,$keyperm
3194	lvx		v25,$x10,$key_		# pre-load round[2]
3195
3196	 vperm		$in0,$inout,$inptail,$inpperm
3197	 subi		$inp,$inp,31		# undo "caller"
3198	vxor		$twk0,$tweak,$rndkey0
3199	vsrab		$tmp,$tweak,$seven	# next tweak value
3200	vaddubm		$tweak,$tweak,$tweak
3201	vand		$tmp,$tmp,$eighty7
3202	 vxor		$out0,$in0,$twk0
3203	xxlor		32+$in1, 0, 0
3204	vpermxor	$tweak, $tweak, $tmp, $in1
3205
3206	 lvx_u		$in1,$x10,$inp
3207	vxor		$twk1,$tweak,$rndkey0
3208	vsrab		$tmp,$tweak,$seven	# next tweak value
3209	vaddubm		$tweak,$tweak,$tweak
3210	 le?vperm	$in1,$in1,$in1,$leperm
3211	vand		$tmp,$tmp,$eighty7
3212	 vxor		$out1,$in1,$twk1
3213	xxlor		32+$in2, 0, 0
3214	vpermxor	$tweak, $tweak, $tmp, $in2
3215
3216	 lvx_u		$in2,$x20,$inp
3217	 andi.		$taillen,$len,15
3218	vxor		$twk2,$tweak,$rndkey0
3219	vsrab		$tmp,$tweak,$seven	# next tweak value
3220	vaddubm		$tweak,$tweak,$tweak
3221	 le?vperm	$in2,$in2,$in2,$leperm
3222	vand		$tmp,$tmp,$eighty7
3223	 vxor		$out2,$in2,$twk2
3224	xxlor		32+$in3, 0, 0
3225	vpermxor	$tweak, $tweak, $tmp, $in3
3226
3227	 lvx_u		$in3,$x30,$inp
3228	 sub		$len,$len,$taillen
3229	vxor		$twk3,$tweak,$rndkey0
3230	vsrab		$tmp,$tweak,$seven	# next tweak value
3231	vaddubm		$tweak,$tweak,$tweak
3232	 le?vperm	$in3,$in3,$in3,$leperm
3233	vand		$tmp,$tmp,$eighty7
3234	 vxor		$out3,$in3,$twk3
3235	xxlor		32+$in4, 0, 0
3236	vpermxor	$tweak, $tweak, $tmp, $in4
3237
3238	 lvx_u		$in4,$x40,$inp
3239	 subi		$len,$len,0x60
3240	vxor		$twk4,$tweak,$rndkey0
3241	vsrab		$tmp,$tweak,$seven	# next tweak value
3242	vaddubm		$tweak,$tweak,$tweak
3243	 le?vperm	$in4,$in4,$in4,$leperm
3244	vand		$tmp,$tmp,$eighty7
3245	 vxor		$out4,$in4,$twk4
3246	xxlor		32+$in5, 0, 0
3247	vpermxor	$tweak, $tweak, $tmp, $in5
3248
3249	 lvx_u		$in5,$x50,$inp
3250	 addi		$inp,$inp,0x60
3251	vxor		$twk5,$tweak,$rndkey0
3252	vsrab		$tmp,$tweak,$seven	# next tweak value
3253	vaddubm		$tweak,$tweak,$tweak
3254	 le?vperm	$in5,$in5,$in5,$leperm
3255	vand		$tmp,$tmp,$eighty7
3256	 vxor		$out5,$in5,$twk5
3257	xxlor		32+$in0, 0, 0
3258	vpermxor	$tweak, $tweak, $tmp, $in0
3259
3260	vxor		v31,v31,$rndkey0
3261	mtctr		$rounds
3262	b		Loop_xts_dec6x
3263
3264.align	5
3265Loop_xts_dec6x:
3266	vncipher	$out0,$out0,v24
3267	vncipher	$out1,$out1,v24
3268	vncipher	$out2,$out2,v24
3269	vncipher	$out3,$out3,v24
3270	vncipher	$out4,$out4,v24
3271	vncipher	$out5,$out5,v24
3272	lvx		v24,$x20,$key_		# round[3]
3273	addi		$key_,$key_,0x20
3274
3275	vncipher	$out0,$out0,v25
3276	vncipher	$out1,$out1,v25
3277	vncipher	$out2,$out2,v25
3278	vncipher	$out3,$out3,v25
3279	vncipher	$out4,$out4,v25
3280	vncipher	$out5,$out5,v25
3281	lvx		v25,$x10,$key_		# round[4]
3282	bdnz		Loop_xts_dec6x
3283
3284	xxlor		32+$eighty7, 1, 1
3285
3286	subic		$len,$len,96		# $len-=96
3287	 vxor		$in0,$twk0,v31		# xor with last round key
3288	vncipher	$out0,$out0,v24
3289	vncipher	$out1,$out1,v24
3290	 vsrab		$tmp,$tweak,$seven	# next tweak value
3291	 vxor		$twk0,$tweak,$rndkey0
3292	 vaddubm	$tweak,$tweak,$tweak
3293	vncipher	$out2,$out2,v24
3294	vncipher	$out3,$out3,v24
3295	vncipher	$out4,$out4,v24
3296	vncipher	$out5,$out5,v24
3297
3298	subfe.		r0,r0,r0		# borrow?-1:0
3299	 vand		$tmp,$tmp,$eighty7
3300	vncipher	$out0,$out0,v25
3301	vncipher	$out1,$out1,v25
3302	 xxlor		32+$in1, 0, 0
3303	 vpermxor	$tweak, $tweak, $tmp, $in1
3304	vncipher	$out2,$out2,v25
3305	vncipher	$out3,$out3,v25
3306	 vxor		$in1,$twk1,v31
3307	 vsrab		$tmp,$tweak,$seven	# next tweak value
3308	 vxor		$twk1,$tweak,$rndkey0
3309	vncipher	$out4,$out4,v25
3310	vncipher	$out5,$out5,v25
3311
3312	and		r0,r0,$len
3313	 vaddubm	$tweak,$tweak,$tweak
3314	vncipher	$out0,$out0,v26
3315	vncipher	$out1,$out1,v26
3316	 vand		$tmp,$tmp,$eighty7
3317	vncipher	$out2,$out2,v26
3318	vncipher	$out3,$out3,v26
3319	 xxlor		32+$in2, 0, 0
3320	 vpermxor	$tweak, $tweak, $tmp, $in2
3321	vncipher	$out4,$out4,v26
3322	vncipher	$out5,$out5,v26
3323
3324	add		$inp,$inp,r0		# $inp is adjusted in such
3325						# way that at exit from the
3326						# loop inX-in5 are loaded
3327						# with last "words"
3328	 vxor		$in2,$twk2,v31
3329	 vsrab		$tmp,$tweak,$seven	# next tweak value
3330	 vxor		$twk2,$tweak,$rndkey0
3331	 vaddubm	$tweak,$tweak,$tweak
3332	vncipher	$out0,$out0,v27
3333	vncipher	$out1,$out1,v27
3334	vncipher	$out2,$out2,v27
3335	vncipher	$out3,$out3,v27
3336	 vand		$tmp,$tmp,$eighty7
3337	vncipher	$out4,$out4,v27
3338	vncipher	$out5,$out5,v27
3339
3340	addi		$key_,$sp,$FRAME+15	# rewind $key_
3341	 xxlor		32+$in3, 0, 0
3342	 vpermxor	$tweak, $tweak, $tmp, $in3
3343	vncipher	$out0,$out0,v28
3344	vncipher	$out1,$out1,v28
3345	 vxor		$in3,$twk3,v31
3346	 vsrab		$tmp,$tweak,$seven	# next tweak value
3347	 vxor		$twk3,$tweak,$rndkey0
3348	vncipher	$out2,$out2,v28
3349	vncipher	$out3,$out3,v28
3350	 vaddubm	$tweak,$tweak,$tweak
3351	vncipher	$out4,$out4,v28
3352	vncipher	$out5,$out5,v28
3353	lvx		v24,$x00,$key_		# re-pre-load round[1]
3354	 vand		$tmp,$tmp,$eighty7
3355
3356	vncipher	$out0,$out0,v29
3357	vncipher	$out1,$out1,v29
3358	 xxlor		32+$in4, 0, 0
3359	 vpermxor	$tweak, $tweak, $tmp, $in4
3360	vncipher	$out2,$out2,v29
3361	vncipher	$out3,$out3,v29
3362	 vxor		$in4,$twk4,v31
3363	 vsrab		$tmp,$tweak,$seven	# next tweak value
3364	 vxor		$twk4,$tweak,$rndkey0
3365	vncipher	$out4,$out4,v29
3366	vncipher	$out5,$out5,v29
3367	lvx		v25,$x10,$key_		# re-pre-load round[2]
3368	 vaddubm	$tweak,$tweak,$tweak
3369
3370	vncipher	$out0,$out0,v30
3371	vncipher	$out1,$out1,v30
3372	 vand		$tmp,$tmp,$eighty7
3373	vncipher	$out2,$out2,v30
3374	vncipher	$out3,$out3,v30
3375	 xxlor		32+$in5, 0, 0
3376	 vpermxor	$tweak, $tweak, $tmp, $in5
3377	vncipher	$out4,$out4,v30
3378	vncipher	$out5,$out5,v30
3379	 vxor		$in5,$twk5,v31
3380	 vsrab		$tmp,$tweak,$seven	# next tweak value
3381	 vxor		$twk5,$tweak,$rndkey0
3382
3383	vncipherlast	$out0,$out0,$in0
3384	 lvx_u		$in0,$x00,$inp		# load next input block
3385	 vaddubm	$tweak,$tweak,$tweak
3386	vncipherlast	$out1,$out1,$in1
3387	 lvx_u		$in1,$x10,$inp
3388	vncipherlast	$out2,$out2,$in2
3389	 le?vperm	$in0,$in0,$in0,$leperm
3390	 lvx_u		$in2,$x20,$inp
3391	 vand		$tmp,$tmp,$eighty7
3392	vncipherlast	$out3,$out3,$in3
3393	 le?vperm	$in1,$in1,$in1,$leperm
3394	 lvx_u		$in3,$x30,$inp
3395	vncipherlast	$out4,$out4,$in4
3396	 le?vperm	$in2,$in2,$in2,$leperm
3397	 lvx_u		$in4,$x40,$inp
3398	 xxlor		10, 32+$in0, 32+$in0
3399	 xxlor		32+$in0, 0, 0
3400	 vpermxor	$tweak, $tweak, $tmp, $in0
3401	 xxlor		32+$in0, 10, 10
3402	vncipherlast	$out5,$out5,$in5
3403	 le?vperm	$in3,$in3,$in3,$leperm
3404	 lvx_u		$in5,$x50,$inp
3405	 addi		$inp,$inp,0x60
3406	 le?vperm	$in4,$in4,$in4,$leperm
3407	 le?vperm	$in5,$in5,$in5,$leperm
3408
3409	le?vperm	$out0,$out0,$out0,$leperm
3410	le?vperm	$out1,$out1,$out1,$leperm
3411	stvx_u		$out0,$x00,$out		# store output
3412	 vxor		$out0,$in0,$twk0
3413	le?vperm	$out2,$out2,$out2,$leperm
3414	stvx_u		$out1,$x10,$out
3415	 vxor		$out1,$in1,$twk1
3416	le?vperm	$out3,$out3,$out3,$leperm
3417	stvx_u		$out2,$x20,$out
3418	 vxor		$out2,$in2,$twk2
3419	le?vperm	$out4,$out4,$out4,$leperm
3420	stvx_u		$out3,$x30,$out
3421	 vxor		$out3,$in3,$twk3
3422	le?vperm	$out5,$out5,$out5,$leperm
3423	stvx_u		$out4,$x40,$out
3424	 vxor		$out4,$in4,$twk4
3425	stvx_u		$out5,$x50,$out
3426	 vxor		$out5,$in5,$twk5
3427	addi		$out,$out,0x60
3428
3429	mtctr		$rounds
3430	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3431
3432	xxlor		32+$eighty7, 2, 2
3433
3434	addic.		$len,$len,0x60
3435	beq		Lxts_dec6x_zero
3436	cmpwi		$len,0x20
3437	blt		Lxts_dec6x_one
3438	nop
3439	beq		Lxts_dec6x_two
3440	cmpwi		$len,0x40
3441	blt		Lxts_dec6x_three
3442	nop
3443	beq		Lxts_dec6x_four
3444
3445Lxts_dec6x_five:
3446	vxor		$out0,$in1,$twk0
3447	vxor		$out1,$in2,$twk1
3448	vxor		$out2,$in3,$twk2
3449	vxor		$out3,$in4,$twk3
3450	vxor		$out4,$in5,$twk4
3451
3452	bl		_aesp8_xts_dec5x
3453
3454	le?vperm	$out0,$out0,$out0,$leperm
3455	vmr		$twk0,$twk5		# unused tweak
3456	vxor		$twk1,$tweak,$rndkey0
3457	le?vperm	$out1,$out1,$out1,$leperm
3458	stvx_u		$out0,$x00,$out		# store output
3459	vxor		$out0,$in0,$twk1
3460	le?vperm	$out2,$out2,$out2,$leperm
3461	stvx_u		$out1,$x10,$out
3462	le?vperm	$out3,$out3,$out3,$leperm
3463	stvx_u		$out2,$x20,$out
3464	le?vperm	$out4,$out4,$out4,$leperm
3465	stvx_u		$out3,$x30,$out
3466	stvx_u		$out4,$x40,$out
3467	addi		$out,$out,0x50
3468	bne		Lxts_dec6x_steal
3469	b		Lxts_dec6x_done
3470
3471.align	4
3472Lxts_dec6x_four:
3473	vxor		$out0,$in2,$twk0
3474	vxor		$out1,$in3,$twk1
3475	vxor		$out2,$in4,$twk2
3476	vxor		$out3,$in5,$twk3
3477	vxor		$out4,$out4,$out4
3478
3479	bl		_aesp8_xts_dec5x
3480
3481	le?vperm	$out0,$out0,$out0,$leperm
3482	vmr		$twk0,$twk4		# unused tweak
3483	vmr		$twk1,$twk5
3484	le?vperm	$out1,$out1,$out1,$leperm
3485	stvx_u		$out0,$x00,$out		# store output
3486	vxor		$out0,$in0,$twk5
3487	le?vperm	$out2,$out2,$out2,$leperm
3488	stvx_u		$out1,$x10,$out
3489	le?vperm	$out3,$out3,$out3,$leperm
3490	stvx_u		$out2,$x20,$out
3491	stvx_u		$out3,$x30,$out
3492	addi		$out,$out,0x40
3493	bne		Lxts_dec6x_steal
3494	b		Lxts_dec6x_done
3495
3496.align	4
3497Lxts_dec6x_three:
3498	vxor		$out0,$in3,$twk0
3499	vxor		$out1,$in4,$twk1
3500	vxor		$out2,$in5,$twk2
3501	vxor		$out3,$out3,$out3
3502	vxor		$out4,$out4,$out4
3503
3504	bl		_aesp8_xts_dec5x
3505
3506	le?vperm	$out0,$out0,$out0,$leperm
3507	vmr		$twk0,$twk3		# unused tweak
3508	vmr		$twk1,$twk4
3509	le?vperm	$out1,$out1,$out1,$leperm
3510	stvx_u		$out0,$x00,$out		# store output
3511	vxor		$out0,$in0,$twk4
3512	le?vperm	$out2,$out2,$out2,$leperm
3513	stvx_u		$out1,$x10,$out
3514	stvx_u		$out2,$x20,$out
3515	addi		$out,$out,0x30
3516	bne		Lxts_dec6x_steal
3517	b		Lxts_dec6x_done
3518
3519.align	4
3520Lxts_dec6x_two:
3521	vxor		$out0,$in4,$twk0
3522	vxor		$out1,$in5,$twk1
3523	vxor		$out2,$out2,$out2
3524	vxor		$out3,$out3,$out3
3525	vxor		$out4,$out4,$out4
3526
3527	bl		_aesp8_xts_dec5x
3528
3529	le?vperm	$out0,$out0,$out0,$leperm
3530	vmr		$twk0,$twk2		# unused tweak
3531	vmr		$twk1,$twk3
3532	le?vperm	$out1,$out1,$out1,$leperm
3533	stvx_u		$out0,$x00,$out		# store output
3534	vxor		$out0,$in0,$twk3
3535	stvx_u		$out1,$x10,$out
3536	addi		$out,$out,0x20
3537	bne		Lxts_dec6x_steal
3538	b		Lxts_dec6x_done
3539
3540.align	4
3541Lxts_dec6x_one:
3542	vxor		$out0,$in5,$twk0
3543	nop
3544Loop_xts_dec1x:
3545	vncipher	$out0,$out0,v24
3546	lvx		v24,$x20,$key_		# round[3]
3547	addi		$key_,$key_,0x20
3548
3549	vncipher	$out0,$out0,v25
3550	lvx		v25,$x10,$key_		# round[4]
3551	bdnz		Loop_xts_dec1x
3552
3553	subi		r0,$taillen,1
3554	vncipher	$out0,$out0,v24
3555
3556	andi.		r0,r0,16
3557	cmpwi		$taillen,0
3558	vncipher	$out0,$out0,v25
3559
3560	sub		$inp,$inp,r0
3561	vncipher	$out0,$out0,v26
3562
3563	lvx_u		$in0,0,$inp
3564	vncipher	$out0,$out0,v27
3565
3566	addi		$key_,$sp,$FRAME+15	# rewind $key_
3567	vncipher	$out0,$out0,v28
3568	lvx		v24,$x00,$key_		# re-pre-load round[1]
3569
3570	vncipher	$out0,$out0,v29
3571	lvx		v25,$x10,$key_		# re-pre-load round[2]
3572	 vxor		$twk0,$twk0,v31
3573
3574	le?vperm	$in0,$in0,$in0,$leperm
3575	vncipher	$out0,$out0,v30
3576
3577	mtctr		$rounds
3578	vncipherlast	$out0,$out0,$twk0
3579
3580	vmr		$twk0,$twk1		# unused tweak
3581	vmr		$twk1,$twk2
3582	le?vperm	$out0,$out0,$out0,$leperm
3583	stvx_u		$out0,$x00,$out		# store output
3584	addi		$out,$out,0x10
3585	vxor		$out0,$in0,$twk2
3586	bne		Lxts_dec6x_steal
3587	b		Lxts_dec6x_done
3588
3589.align	4
3590Lxts_dec6x_zero:
3591	cmpwi		$taillen,0
3592	beq		Lxts_dec6x_done
3593
3594	lvx_u		$in0,0,$inp
3595	le?vperm	$in0,$in0,$in0,$leperm
3596	vxor		$out0,$in0,$twk1
3597Lxts_dec6x_steal:
3598	vncipher	$out0,$out0,v24
3599	lvx		v24,$x20,$key_		# round[3]
3600	addi		$key_,$key_,0x20
3601
3602	vncipher	$out0,$out0,v25
3603	lvx		v25,$x10,$key_		# round[4]
3604	bdnz		Lxts_dec6x_steal
3605
3606	add		$inp,$inp,$taillen
3607	vncipher	$out0,$out0,v24
3608
3609	cmpwi		$taillen,0
3610	vncipher	$out0,$out0,v25
3611
3612	lvx_u		$in0,0,$inp
3613	vncipher	$out0,$out0,v26
3614
3615	lvsr		$inpperm,0,$taillen	# $in5 is no more
3616	vncipher	$out0,$out0,v27
3617
3618	addi		$key_,$sp,$FRAME+15	# rewind $key_
3619	vncipher	$out0,$out0,v28
3620	lvx		v24,$x00,$key_		# re-pre-load round[1]
3621
3622	vncipher	$out0,$out0,v29
3623	lvx		v25,$x10,$key_		# re-pre-load round[2]
3624	 vxor		$twk1,$twk1,v31
3625
3626	le?vperm	$in0,$in0,$in0,$leperm
3627	vncipher	$out0,$out0,v30
3628
3629	vperm		$in0,$in0,$in0,$inpperm
3630	vncipherlast	$tmp,$out0,$twk1
3631
3632	le?vperm	$out0,$tmp,$tmp,$leperm
3633	le?stvx_u	$out0,0,$out
3634	be?stvx_u	$tmp,0,$out
3635
3636	vxor		$out0,$out0,$out0
3637	vspltisb	$out1,-1
3638	vperm		$out0,$out0,$out1,$inpperm
3639	vsel		$out0,$in0,$tmp,$out0
3640	vxor		$out0,$out0,$twk0
3641
3642	subi		r30,$out,1
3643	mtctr		$taillen
3644Loop_xts_dec6x_steal:
3645	lbzu		r0,1(r30)
3646	stb		r0,16(r30)
3647	bdnz		Loop_xts_dec6x_steal
3648
3649	li		$taillen,0
3650	mtctr		$rounds
3651	b		Loop_xts_dec1x		# one more time...
3652
3653.align	4
3654Lxts_dec6x_done:
3655	${UCMP}i	$ivp,0
3656	beq		Lxts_dec6x_ret
3657
3658	vxor		$tweak,$twk0,$rndkey0
3659	le?vperm	$tweak,$tweak,$tweak,$leperm
3660	stvx_u		$tweak,0,$ivp
3661
3662Lxts_dec6x_ret:
3663	mtlr		r11
3664	li		r10,`$FRAME+15`
3665	li		r11,`$FRAME+31`
3666	stvx		$seven,r10,$sp		# wipe copies of round keys
3667	addi		r10,r10,32
3668	stvx		$seven,r11,$sp
3669	addi		r11,r11,32
3670	stvx		$seven,r10,$sp
3671	addi		r10,r10,32
3672	stvx		$seven,r11,$sp
3673	addi		r11,r11,32
3674	stvx		$seven,r10,$sp
3675	addi		r10,r10,32
3676	stvx		$seven,r11,$sp
3677	addi		r11,r11,32
3678	stvx		$seven,r10,$sp
3679	addi		r10,r10,32
3680	stvx		$seven,r11,$sp
3681	addi		r11,r11,32
3682
3683	mtspr		256,$vrsave
3684	lvx		v20,r10,$sp		# ABI says so
3685	addi		r10,r10,32
3686	lvx		v21,r11,$sp
3687	addi		r11,r11,32
3688	lvx		v22,r10,$sp
3689	addi		r10,r10,32
3690	lvx		v23,r11,$sp
3691	addi		r11,r11,32
3692	lvx		v24,r10,$sp
3693	addi		r10,r10,32
3694	lvx		v25,r11,$sp
3695	addi		r11,r11,32
3696	lvx		v26,r10,$sp
3697	addi		r10,r10,32
3698	lvx		v27,r11,$sp
3699	addi		r11,r11,32
3700	lvx		v28,r10,$sp
3701	addi		r10,r10,32
3702	lvx		v29,r11,$sp
3703	addi		r11,r11,32
3704	lvx		v30,r10,$sp
3705	lvx		v31,r11,$sp
3706	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3707	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3708	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3709	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3710	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3711	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3712	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3713	blr
3714	.long		0
3715	.byte		0,12,0x04,1,0x80,6,6,0
3716	.long		0
3717
3718.align	5
3719_aesp8_xts_dec5x:
3720	vncipher	$out0,$out0,v24
3721	vncipher	$out1,$out1,v24
3722	vncipher	$out2,$out2,v24
3723	vncipher	$out3,$out3,v24
3724	vncipher	$out4,$out4,v24
3725	lvx		v24,$x20,$key_		# round[3]
3726	addi		$key_,$key_,0x20
3727
3728	vncipher	$out0,$out0,v25
3729	vncipher	$out1,$out1,v25
3730	vncipher	$out2,$out2,v25
3731	vncipher	$out3,$out3,v25
3732	vncipher	$out4,$out4,v25
3733	lvx		v25,$x10,$key_		# round[4]
3734	bdnz		_aesp8_xts_dec5x
3735
3736	subi		r0,$taillen,1
3737	vncipher	$out0,$out0,v24
3738	vncipher	$out1,$out1,v24
3739	vncipher	$out2,$out2,v24
3740	vncipher	$out3,$out3,v24
3741	vncipher	$out4,$out4,v24
3742
3743	andi.		r0,r0,16
3744	cmpwi		$taillen,0
3745	vncipher	$out0,$out0,v25
3746	vncipher	$out1,$out1,v25
3747	vncipher	$out2,$out2,v25
3748	vncipher	$out3,$out3,v25
3749	vncipher	$out4,$out4,v25
3750	 vxor		$twk0,$twk0,v31
3751
3752	sub		$inp,$inp,r0
3753	vncipher	$out0,$out0,v26
3754	vncipher	$out1,$out1,v26
3755	vncipher	$out2,$out2,v26
3756	vncipher	$out3,$out3,v26
3757	vncipher	$out4,$out4,v26
3758	 vxor		$in1,$twk1,v31
3759
3760	vncipher	$out0,$out0,v27
3761	lvx_u		$in0,0,$inp
3762	vncipher	$out1,$out1,v27
3763	vncipher	$out2,$out2,v27
3764	vncipher	$out3,$out3,v27
3765	vncipher	$out4,$out4,v27
3766	 vxor		$in2,$twk2,v31
3767
3768	addi		$key_,$sp,$FRAME+15	# rewind $key_
3769	vncipher	$out0,$out0,v28
3770	vncipher	$out1,$out1,v28
3771	vncipher	$out2,$out2,v28
3772	vncipher	$out3,$out3,v28
3773	vncipher	$out4,$out4,v28
3774	lvx		v24,$x00,$key_		# re-pre-load round[1]
3775	 vxor		$in3,$twk3,v31
3776
3777	vncipher	$out0,$out0,v29
3778	le?vperm	$in0,$in0,$in0,$leperm
3779	vncipher	$out1,$out1,v29
3780	vncipher	$out2,$out2,v29
3781	vncipher	$out3,$out3,v29
3782	vncipher	$out4,$out4,v29
3783	lvx		v25,$x10,$key_		# re-pre-load round[2]
3784	 vxor		$in4,$twk4,v31
3785
3786	vncipher	$out0,$out0,v30
3787	vncipher	$out1,$out1,v30
3788	vncipher	$out2,$out2,v30
3789	vncipher	$out3,$out3,v30
3790	vncipher	$out4,$out4,v30
3791
3792	vncipherlast	$out0,$out0,$twk0
3793	vncipherlast	$out1,$out1,$in1
3794	vncipherlast	$out2,$out2,$in2
3795	vncipherlast	$out3,$out3,$in3
3796	vncipherlast	$out4,$out4,$in4
3797	mtctr		$rounds
3798	blr
3799        .long   	0
3800        .byte   	0,12,0x14,0,0,0,0,0
3801___
3802}}	}}}
3803
# Final pass over the generated text: walk $code line by line, expand the
# `...` expressions, turn the rows of the leading constants table into
# endian-neutral .byte directives, resolve the endian markers ("le?",
# "be?" and '?'-prefixed instructions) for the requested $flavour, and
# print every resulting line to STDOUT.
my $consts=1;				# true until the Lconsts: label is seen
my $emit_le=($flavour =~ /le$/o);	# does $flavour end in "le"?

# Marker rewrites for a little-endian target.  They are tried in order and
# only the first one that matches is applied to a line; each closure edits
# the current line through $_.
my @le_rules=(
	sub { s/le\?//o },		# keep little-endian-only lines
	sub { s/be\?/#be#/o },		# comment out big-endian-only lines
	sub { s/\?lvsr/lvsl/o },	# swap lvsr <-> lvsl
	sub { s/\?lvsl/lvsr/o },
	# swap the two source operands of vperm
	sub { s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o },
	# swap the sources of vsldoi and emit the shift as "16-N"
	sub { s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o },
	# emit the vspltw word index as "3-N"
	sub { s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o },
);

# Marker rewrites for a big-endian target, same first-match-wins rule.
my @be_rules=(
	sub { s/le\?/#le#/o },		# comment out little-endian-only lines
	sub { s/be\?//o },		# keep big-endian-only lines
	sub { s/\?([a-z]+)/$1/o },	# just drop the '?' prefix
);

foreach (split("\n",$code)) {
	# replace each back-quoted expression with its evaluated value
	s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion: rows look like
	# ".long|.byte <values> ?<tag>"
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    my ($directive,$operands,$conv)=($1,$2,$3);

	    # values with a leading 0 go through oct(), the rest through int()
	    my @values=map(/^0/ ? oct($_) : int($_), split(/,\s*/,$operands));

	    # convert to endian-agnostic format: a .long is split into its
	    # four bytes, most significant first; .byte values are kept
	    my @bytes=@values;
	    if ($directive eq "long") {
		@bytes=map {
		    my $word=$_;
		    (($word>>24)&0xff,($word>>16)&0xff,($word>>8)&0xff,$word&0xff);
		} @values;
	    }

	    # little-endian conversion, selected by the trailing tag
	    if ($emit_le) {
		if ($conv =~ /\?inv/) {
		    @bytes=map { $_^0xf } @bytes;	# XOR every byte with 0xf
		} elsif ($conv =~ /\?rev/) {
		    @bytes=reverse(@bytes);		# reverse the byte order
		}
	    }

	    # emit the row as one .byte directive and skip the marker pass
	    my $row=join(',',map { sprintf("0x%02x",$_) } @bytes);
	    print ".byte\t",$row,"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	foreach my $rule ($emit_le ? @le_rules : @be_rules) {
	    last if $rule->();
	}

	print $_,"\n";
}

close(STDOUT) or die "error closing STDOUT: $!";
3857