#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per the PowerISA
# specification version 2.07, first implemented by the POWER8 processor.
# The module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies the MSR.VSX flag being
# set. It should also be noted that the ISA specification doesn't prohibit
# alignment exceptions for these instructions on page boundaries.
# Initially alignment was handled in a pure AltiVec/VMX way [with data
# aligned programmatically, which in turn guarantees exception-
# free execution], but that turned out to hamper performance when vcipher
# instructions are interleaved. It's reckoned that the occasional
# misalignment penalties at page boundaries are on average lower
# than the additional overhead of the pure AltiVec approach.
#
# May 2016
#
# Added XTS subroutine; a 9x improvement on little-endian and a 12x
# improvement on big-endian systems was measured.
#
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS
# POWER8[le]	3.96/0.72	0.74	1.1
# POWER8[be]	3.75/0.65	0.66	1.0
# POWER9[le]	4.02/0.86	0.84	1.05
# POWER9[be]	3.99/0.78	0.79	0.97

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;
$prefix="aes_p8";

$sp="r1";
$vrsave="r12";

#########################################################################
{{{	# Key setup procedures						#
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
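	# The bcl below is a branch-always whose only effect is to load
	# the link register with the address of the following instruction;
	# the rcon table starts 0x48 bytes before that point (0x40 of table
	# data plus the two preceding instructions), hence the -0x48
	# adjustment.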
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr	 # "distance" between . and rcon
	addi	$ptr,$ptr,-0x48
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

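# Expected C prototype (an assumption, mirroring AES_set_encrypt_key;
# returns 0 on success, negative on bad arguments):
#
# int ${prefix}_set_encrypt_key(const unsigned char *userKey,
#                               const int bits, AES_KEY *key);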
.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.		r0,$bits,0x3f
	bne-		Lenc_key_abort

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	bl		Lconsts
	mtlr		r11

	neg		r9,$inp
	lvx		$in0,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	lvsr		$key,0,r9		# borrow $key
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li		$cnt,8
	vxor		$zero,$zero,$zero
	mtctr		$cnt

	?lvsr		$outperm,0,$out
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$zero,$outmask,$outperm

	blt		Loop128
	addi		$inp,$inp,8
	beq		L192
	addi		$inp,$inp,8
	b		L256

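	# Each Loop128 iteration computes the next 128-bit round key: vperm
	# with $mask rotates and splats the last word, and because all four
	# words are then identical, the ShiftRows step inside vcipherlast is
	# a no-op, leaving SubBytes(RotWord(w)) xor $rcon. The vsldoi/vxor
	# ladder below propagates the xor chain across the other three words.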
.align	4
Loop128:
	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	bdnz		Loop128

	lvx		$rcon,0,$ptr		# last two round keys

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out

	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,0x50

	li		$rounds,10
	b		Ldone

.align	4
L192:
	lvx		$tmp,0,$inp
	li		$cnt,4
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8			# borrow $key
	mtctr		$cnt
	vsububm		$mask,$mask,$key	# adjust the mask

Loop192:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp

	 vsldoi		$stage,$zero,$in1,8
	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vsldoi		$stage,$stage,$in0,8

	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	 vsldoi		$stage,$in0,$in1,8
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdnz		Loop192

	li		$rounds,12
	addi		$out,$out,0x20
	b		Ldone

.align	4
L256:
	lvx		$tmp,0,$inp
	li		$cnt,7
	li		$rounds,14
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr		$cnt

Loop256:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in1,$in1,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdz		Ldone

	vspltw		$key,$in0,3		# just splat
	vsldoi		$tmp,$zero,$in1,12	# >>32
	vsbox		$key,$key

	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp

	vxor		$in1,$in1,$key
	b		Loop256

.align	4
Ldone:
	lvx		$in1,0,$inp		# redundant in aligned case
	vsel		$in1,$outhead,$in1,$outmask
	stvx		$in1,0,$inp
	li		$ptr,0
	mtspr		256,$vrsave
	stw		$rounds,0($out)

Lenc_key_abort:
	mr		r3,$ptr
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

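# Expected C prototype (an assumption, mirroring AES_set_decrypt_key):
#
# int ${prefix}_set_decrypt_key(const unsigned char *userKey,
#                               const int bits, AES_KEY *key);
#
# The decryption schedule is the encryption schedule with the round keys
# swapped end-for-end in place (the Ldeckey loop below); unlike AES-NI,
# vncipher consumes unmodified round keys in reverse order.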
.globl	.${prefix}_set_decrypt_key
.align	5
.${prefix}_set_decrypt_key:
	$STU		$sp,-$FRAME($sp)
	mflr		r10
	$PUSH		r10,$FRAME+$LRSAVE($sp)
	bl		Lset_encrypt_key
	mtlr		r10

	cmpwi		r3,0
	bne-		Ldec_key_abort

	slwi		$cnt,$rounds,4
	subi		$inp,$out,240		# first round key
	srwi		$rounds,$rounds,1
	add		$out,$inp,$cnt		# last round key
	mtctr		$rounds

Ldeckey:
	lwz		r0, 0($inp)
	lwz		r6, 4($inp)
	lwz		r7, 8($inp)
	lwz		r8, 12($inp)
	addi		$inp,$inp,16
	lwz		r9, 0($out)
	lwz		r10,4($out)
	lwz		r11,8($out)
	lwz		r12,12($out)
	stw		r0, 0($out)
	stw		r6, 4($out)
	stw		r7, 8($out)
	stw		r8, 12($out)
	subi		$out,$out,16
	stw		r9, -16($inp)
	stw		r10,-12($inp)
	stw		r11,-8($inp)
	stw		r12,-4($inp)
	bdnz		Ldeckey

	xor		r3,r3,r3		# return value
Ldec_key_abort:
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
sub gen_block () {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
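# Expected C prototype (an assumption, mirroring AES_${dir}crypt):
#
# void ${prefix}_${dir}crypt(const unsigned char *in, unsigned char *out,
#                            const AES_KEY *key);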
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{	# CBC en- and decrypt procedures				#
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));
$code.=<<___;
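# Expected C prototype (an assumption, mirroring AES_cbc_encrypt):
#
# void ${prefix}_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                            size_t length, const AES_KEY *key,
#                            unsigned char *ivec, const int enc);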
.globl	.${prefix}_cbc_encrypt
.align	5
.${prefix}_cbc_encrypt:
	${UCMP}i	$len,16
	bltlr-

	cmpwi		$enc,0			# test direction
	lis		r0,0xffe0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1
	beq		Lcbc_dec

Lcbc_enc:
	vmr		$inout,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$inout,$inout,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	vxor		$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_enc

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm		$tmp,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_enc

	b		Lcbc_done

.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge		_aesp8_cbc_decrypt8x
	vmr		$tmp,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$tmp,$tmp,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$tmp,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16

Loop_cbc_dec:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_dec

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor		$inout,$inout,$ivec
	vmr		$ivec,$tmp
	vperm		$tmp,$inout,$inout,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_dec

Lcbc_done:
	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$inout,0,$ivp
	vsel		$inout,$ivec,$inptail,$outmask
	stvx		$inout,$idx,$ivp

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CBC decrypt procedure				#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for the first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment

$code.=<<___;
.align	5
_aesp8_cbc_decrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total
	subi		$len,$len,128		# bias

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_cbc_dec_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_cbc_dec_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	#lvx		$inptail,0,$inp		# "caller" already did this
	#addi		$inp,$inp,15		# 15 is not typo
	subi		$inp,$inp,15		# undo "caller"

	 le?li		$idx,8
	lvx_u		$in0,$x00,$inp		# load first 8 "words"
	 le?lvsl	$inpperm,0,$idx
	 le?vspltisb	$tmp,0x0f
	lvx_u		$in1,$x10,$inp
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	lvx_u		$in2,$x20,$inp
	 le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u		$in3,$x30,$inp
	 le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u		$in4,$x40,$inp
	 le?vperm	$in2,$in2,$in2,$inpperm
	vxor		$out0,$in0,$rndkey0
	lvx_u		$in5,$x50,$inp
	 le?vperm	$in3,$in3,$in3,$inpperm
	vxor		$out1,$in1,$rndkey0
	lvx_u		$in6,$x60,$inp
	 le?vperm	$in4,$in4,$in4,$inpperm
	vxor		$out2,$in2,$rndkey0
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80
	 le?vperm	$in5,$in5,$in5,$inpperm
	vxor		$out3,$in3,$rndkey0
	 le?vperm	$in6,$in6,$in6,$inpperm
	vxor		$out4,$in4,$rndkey0
	 le?vperm	$in7,$in7,$in7,$inpperm
	vxor		$out5,$in5,$rndkey0
	vxor		$out6,$in6,$rndkey0
	vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	b		Loop_cbc_dec8x
.align	5
Loop_cbc_dec8x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x

	subic		$len,$len,128		# $len-=128
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	subfe.		r0,r0,r0		# borrow?-1:0
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	and		r0,r0,$len
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
	vncipher	$out0,$out0,v27
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vncipher	$out0,$out0,v30
	 vxor		$ivec,$ivec,v31		# xor with last round key
	vncipher	$out1,$out1,v30
	 vxor		$in0,$in0,v31
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	vncipherlast	$out0,$out0,$ivec
	vncipherlast	$out1,$out1,$in0
	 lvx_u		$in0,$x00,$inp		# load next input block
	vncipherlast	$out2,$out2,$in1
	 lvx_u		$in1,$x10,$inp
	vncipherlast	$out3,$out3,$in2
	 le?vperm	$in0,$in0,$in0,$inpperm
	 lvx_u		$in2,$x20,$inp
	vncipherlast	$out4,$out4,$in3
	 le?vperm	$in1,$in1,$in1,$inpperm
	 lvx_u		$in3,$x30,$inp
	vncipherlast	$out5,$out5,$in4
	 le?vperm	$in2,$in2,$in2,$inpperm
	 lvx_u		$in4,$x40,$inp
	vncipherlast	$out6,$out6,$in5
	 le?vperm	$in3,$in3,$in3,$inpperm
	 lvx_u		$in5,$x50,$inp
	vncipherlast	$out7,$out7,$in6
	 le?vperm	$in4,$in4,$in4,$inpperm
	 lvx_u		$in6,$x60,$inp
	vmr		$ivec,$in7
	 le?vperm	$in5,$in5,$in5,$inpperm
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	 le?vperm	$in6,$in6,$in6,$inpperm
	 vxor		$out0,$in0,$rndkey0
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	 le?vperm	$in7,$in7,$in7,$inpperm
	 vxor		$out1,$in1,$rndkey0
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	 vxor		$out2,$in2,$rndkey0
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	 vxor		$out3,$in3,$rndkey0
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	 vxor		$out4,$in4,$rndkey0
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	 vxor		$out5,$in5,$rndkey0
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	 vxor		$out6,$in6,$rndkey0
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	 vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	beq		Loop_cbc_dec8x		# did $len-=128 borrow?

	addic.		$len,$len,128
	beq		Lcbc_dec8x_done
	nop
	nop

Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x_tail

	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28

	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29

	vncipher	$out1,$out1,v30
	 vxor		$ivec,$ivec,v31		# last round key
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	cmplwi		$len,32			# switch($len)
	blt		Lcbc_dec8x_one
	nop
	beq		Lcbc_dec8x_two
	cmplwi		$len,64
	blt		Lcbc_dec8x_three
	nop
	beq		Lcbc_dec8x_four
	cmplwi		$len,96
	blt		Lcbc_dec8x_five
	nop
	beq		Lcbc_dec8x_six

Lcbc_dec8x_seven:
	vncipherlast	$out1,$out1,$ivec
	vncipherlast	$out2,$out2,$in1
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out1,$out1,$out1,$inpperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x00,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x10,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x20,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x30,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x40,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x50,$out
	stvx_u		$out7,$x60,$out
	addi		$out,$out,0x70
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_six:
	vncipherlast	$out2,$out2,$ivec
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out2,$out2,$out2,$inpperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x00,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x10,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x20,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x30,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x40,$out
	stvx_u		$out7,$x50,$out
	addi		$out,$out,0x60
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_five:
	vncipherlast	$out3,$out3,$ivec
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out3,$out3,$out3,$inpperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x00,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x10,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x20,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x30,$out
	stvx_u		$out7,$x40,$out
	addi		$out,$out,0x50
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_four:
	vncipherlast	$out4,$out4,$ivec
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out4,$out4,$out4,$inpperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x00,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x10,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x20,$out
	stvx_u		$out7,$x30,$out
	addi		$out,$out,0x40
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_three:
	vncipherlast	$out5,$out5,$ivec
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out5,$out5,$out5,$inpperm
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x00,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x10,$out
	stvx_u		$out7,$x20,$out
	addi		$out,$out,0x30
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_two:
	vncipherlast	$out6,$out6,$ivec
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out6,$out6,$out6,$inpperm
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x00,$out
	stvx_u		$out7,$x10,$out
	addi		$out,$out,0x20
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_one:
	vncipherlast	$out7,$out7,$ivec
	vmr		$ivec,$in7

	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out7,0,$out
	addi		$out,$out,0x10

Lcbc_dec8x_done:
	le?vperm	$ivec,$ivec,$ivec,$inpperm
	stvx_u		$ivec,0,$ivp		# write [unaligned] iv

	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,6,6,0
	.long		0
.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}	}}}

#########################################################################
{{{	# CTR procedure[s]						#
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
my $dat=$tmp;

$code.=<<___;
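# Expected C prototype (an assumption, following the usual ctr32 convention
# where the length argument is a count of 16-byte blocks and only the low
# 32 bits of the IV are incremented):
#
# void ${prefix}_ctr32_encrypt_blocks(const unsigned char *in,
#                                     unsigned char *out, size_t blocks,
#                                     const AES_KEY *key,
#                                     const unsigned char ivec[16]);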
.globl	.${prefix}_ctr32_encrypt_blocks
.align	5
.${prefix}_ctr32_encrypt_blocks:
	${UCMP}i	$len,1
	bltlr-

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	 vspltisb	$one,1
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm
	 vsldoi		$one,$rndkey0,$one,1
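	# $one = 0x00..01, so the vadduwm further down increments only the
	# low-order 32-bit word of the counter block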

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1

	${UCMP}i	$len,8
	bge		_aesp8_ctr32_encrypt8x

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	lvx		$rndkey0,0,$key
	mtctr		$rounds
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$ivec,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	b		Loop_ctr32_enc

.align	5
Loop_ctr32_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_ctr32_enc

	vadduwm		$ivec,$ivec,$one
	 vmr		$dat,$inptail
	 lvx		$inptail,0,$inp
	 addi		$inp,$inp,16
	 subic.		$len,$len,1		# blocks--

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	 vperm		$dat,$dat,$inptail,$inpperm
	 li		$idx,16
	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
	 lvx		$rndkey0,0,$key
	vxor		$dat,$dat,$rndkey1	# last round key
	vcipherlast	$inout,$inout,$dat

	 lvx		$rndkey1,$idx,$key
	 addi		$idx,$idx,16
	vperm		$inout,$inout,$inout,$outperm
	vsel		$dat,$outhead,$inout,$outmask
	 mtctr		$rounds
	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vmr		$outhead,$inout
	 vxor		$inout,$ivec,$rndkey0
	 lvx		$rndkey0,$idx,$key
	 addi		$idx,$idx,16
	stvx		$dat,0,$out
	addi		$out,$out,16
	bne		Loop_ctr32_enc

	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CTR procedure					#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for the first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);

$code.=<<___;
.align	5
_aesp8_ctr32_encrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_ctr32_enc_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_ctr32_enc_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	vadduwm		$two,$one,$one
	subi		$inp,$inp,15		# undo "caller"
	$SHL		$len,$len,4

	vadduwm		$out1,$ivec,$one	# counter values ...
	vadduwm		$out2,$ivec,$two
	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	 le?li		$idx,8
	vadduwm		$out3,$out1,$two
	vxor		$out1,$out1,$rndkey0
	 le?lvsl	$inpperm,0,$idx
	vadduwm		$out4,$out2,$two
	vxor		$out2,$out2,$rndkey0
	 le?vspltisb	$tmp,0x0f
	vadduwm		$out5,$out3,$two
	vxor		$out3,$out3,$rndkey0
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	vadduwm		$out6,$out4,$two
	vxor		$out4,$out4,$rndkey0
	vadduwm		$out7,$out5,$two
	vxor		$out5,$out5,$rndkey0
	vadduwm		$ivec,$out6,$two	# next counter value
	vxor		$out6,$out6,$rndkey0
	vxor		$out7,$out7,$rndkey0

	mtctr		$rounds
	b		Loop_ctr32_enc8x
.align	5
Loop_ctr32_enc8x:
	vcipher 	$out0,$out0,v24
	vcipher 	$out1,$out1,v24
	vcipher 	$out2,$out2,v24
	vcipher 	$out3,$out3,v24
	vcipher 	$out4,$out4,v24
	vcipher 	$out5,$out5,v24
	vcipher 	$out6,$out6,v24
	vcipher 	$out7,$out7,v24
Loop_ctr32_enc8x_middle:
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vcipher 	$out0,$out0,v25
	vcipher 	$out1,$out1,v25
	vcipher 	$out2,$out2,v25
	vcipher 	$out3,$out3,v25
	vcipher 	$out4,$out4,v25
	vcipher 	$out5,$out5,v25
	vcipher 	$out6,$out6,v25
	vcipher 	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_ctr32_enc8x

	subic		r11,$len,256		# $len-256, borrow $key_
	vcipher 	$out0,$out0,v24
	vcipher 	$out1,$out1,v24
	vcipher 	$out2,$out2,v24
	vcipher 	$out3,$out3,v24
	vcipher 	$out4,$out4,v24
	vcipher 	$out5,$out5,v24
	vcipher 	$out6,$out6,v24
	vcipher 	$out7,$out7,v24

	subfe		r0,r0,r0		# borrow?-1:0
	vcipher 	$out0,$out0,v25
	vcipher 	$out1,$out1,v25
	vcipher 	$out2,$out2,v25
	vcipher 	$out3,$out3,v25
	vcipher 	$out4,$out4,v25
	vcipher		$out5,$out5,v25
	vcipher		$out6,$out6,v25
	vcipher		$out7,$out7,v25

	and		r0,r0,r11
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vcipher		$out0,$out0,v26
	vcipher		$out1,$out1,v26
	vcipher		$out2,$out2,v26
	vcipher		$out3,$out3,v26
	vcipher		$out4,$out4,v26
	vcipher		$out5,$out5,v26
	vcipher		$out6,$out6,v26
	vcipher		$out7,$out7,v26
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	subic		$len,$len,129		# $len-=129
	vcipher		$out0,$out0,v27
	addi		$len,$len,1		# $len-=128 really
	vcipher		$out1,$out1,v27
	vcipher		$out2,$out2,v27
	vcipher		$out3,$out3,v27
	vcipher		$out4,$out4,v27
	vcipher		$out5,$out5,v27
	vcipher		$out6,$out6,v27
	vcipher		$out7,$out7,v27
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vcipher		$out0,$out0,v28
	 lvx_u		$in0,$x00,$inp		# load input
	vcipher		$out1,$out1,v28
	 lvx_u		$in1,$x10,$inp
	vcipher		$out2,$out2,v28
	 lvx_u		$in2,$x20,$inp
	vcipher		$out3,$out3,v28
	 lvx_u		$in3,$x30,$inp
	vcipher		$out4,$out4,v28
	 lvx_u		$in4,$x40,$inp
	vcipher		$out5,$out5,v28
	 lvx_u		$in5,$x50,$inp
	vcipher		$out6,$out6,v28
	 lvx_u		$in6,$x60,$inp
	vcipher		$out7,$out7,v28
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	vcipher		$out0,$out0,v29
	 le?vperm	$in0,$in0,$in0,$inpperm
	vcipher		$out1,$out1,v29
	 le?vperm	$in1,$in1,$in1,$inpperm
	vcipher		$out2,$out2,v29
	 le?vperm	$in2,$in2,$in2,$inpperm
	vcipher		$out3,$out3,v29
	 le?vperm	$in3,$in3,$in3,$inpperm
	vcipher		$out4,$out4,v29
	 le?vperm	$in4,$in4,$in4,$inpperm
	vcipher		$out5,$out5,v29
	 le?vperm	$in5,$in5,$in5,$inpperm
	vcipher		$out6,$out6,v29
	 le?vperm	$in6,$in6,$in6,$inpperm
	vcipher		$out7,$out7,v29
	 le?vperm	$in7,$in7,$in7,$inpperm

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
	subfe.		r0,r0,r0		# borrow?-1:0
	vcipher		$out0,$out0,v30
	 vxor		$in0,$in0,v31		# xor with last round key
	vcipher		$out1,$out1,v30
	 vxor		$in1,$in1,v31
	vcipher		$out2,$out2,v30
	 vxor		$in2,$in2,v31
	vcipher		$out3,$out3,v30
	 vxor		$in3,$in3,v31
	vcipher		$out4,$out4,v30
	 vxor		$in4,$in4,v31
	vcipher		$out5,$out5,v30
	 vxor		$in5,$in5,v31
	vcipher		$out6,$out6,v30
	 vxor		$in6,$in6,v31
	vcipher		$out7,$out7,v30
	 vxor		$in7,$in7,v31

	bne		Lctr32_enc8x_break	# did $len-129 borrow?

	vcipherlast	$in0,$out0,$in0
	vcipherlast	$in1,$out1,$in1
	 vadduwm	$out1,$ivec,$one	# counter values ...
	vcipherlast	$in2,$out2,$in2
	 vadduwm	$out2,$ivec,$two
	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	vcipherlast	$in3,$out3,$in3
	 vadduwm	$out3,$out1,$two
	 vxor		$out1,$out1,$rndkey0
	vcipherlast	$in4,$out4,$in4
	 vadduwm	$out4,$out2,$two
	 vxor		$out2,$out2,$rndkey0
	vcipherlast	$in5,$out5,$in5
	 vadduwm	$out5,$out3,$two
	 vxor		$out3,$out3,$rndkey0
	vcipherlast	$in6,$out6,$in6
	 vadduwm	$out6,$out4,$two
	 vxor		$out4,$out4,$rndkey0
	vcipherlast	$in7,$out7,$in7
	 vadduwm	$out7,$out5,$two
	 vxor		$out5,$out5,$rndkey0
	le?vperm	$in0,$in0,$in0,$inpperm
	 vadduwm	$ivec,$out6,$two	# next counter value
	 vxor		$out6,$out6,$rndkey0
	le?vperm	$in1,$in1,$in1,$inpperm
	 vxor		$out7,$out7,$rndkey0
	mtctr		$rounds

	 vcipher	$out0,$out0,v24
	stvx_u		$in0,$x00,$out
	le?vperm	$in2,$in2,$in2,$inpperm
	 vcipher	$out1,$out1,v24
	stvx_u		$in1,$x10,$out
	le?vperm	$in3,$in3,$in3,$inpperm
	 vcipher	$out2,$out2,v24
	stvx_u		$in2,$x20,$out
	le?vperm	$in4,$in4,$in4,$inpperm
	 vcipher	$out3,$out3,v24
	stvx_u		$in3,$x30,$out
	le?vperm	$in5,$in5,$in5,$inpperm
	 vcipher	$out4,$out4,v24
	stvx_u		$in4,$x40,$out
	le?vperm	$in6,$in6,$in6,$inpperm
	 vcipher	$out5,$out5,v24
	stvx_u		$in5,$x50,$out
	le?vperm	$in7,$in7,$in7,$inpperm
	 vcipher	$out6,$out6,v24
	stvx_u		$in6,$x60,$out
	 vcipher	$out7,$out7,v24
	stvx_u		$in7,$x70,$out
	addi		$out,$out,0x80

	b		Loop_ctr32_enc8x_middle

.align	5
Lctr32_enc8x_break:
	cmpwi		$len,-0x60
	blt		Lctr32_enc8x_one
	nop
	beq		Lctr32_enc8x_two
	cmpwi		$len,-0x40
	blt		Lctr32_enc8x_three
	nop
	beq		Lctr32_enc8x_four
	cmpwi		$len,-0x20
	blt		Lctr32_enc8x_five
	nop
	beq		Lctr32_enc8x_six
	cmpwi		$len,0x00
	blt		Lctr32_enc8x_seven

Lctr32_enc8x_eight:
	vcipherlast	$out0,$out0,$in0
	vcipherlast	$out1,$out1,$in1
	vcipherlast	$out2,$out2,$in2
	vcipherlast	$out3,$out3,$in3
	vcipherlast	$out4,$out4,$in4
	vcipherlast	$out5,$out5,$in5
	vcipherlast	$out6,$out6,$in6
	vcipherlast	$out7,$out7,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_seven:
	vcipherlast	$out0,$out0,$in1
	vcipherlast	$out1,$out1,$in2
	vcipherlast	$out2,$out2,$in3
	vcipherlast	$out3,$out3,$in4
	vcipherlast	$out4,$out4,$in5
	vcipherlast	$out5,$out5,$in6
	vcipherlast	$out6,$out6,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	stvx_u		$out6,$x60,$out
	addi		$out,$out,0x70
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_six:
	vcipherlast	$out0,$out0,$in2
	vcipherlast	$out1,$out1,$in3
	vcipherlast	$out2,$out2,$in4
	vcipherlast	$out3,$out3,$in5
	vcipherlast	$out4,$out4,$in6
	vcipherlast	$out5,$out5,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	stvx_u		$out5,$x50,$out
	addi		$out,$out,0x60
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_five:
	vcipherlast	$out0,$out0,$in3
	vcipherlast	$out1,$out1,$in4
	vcipherlast	$out2,$out2,$in5
	vcipherlast	$out3,$out3,$in6
	vcipherlast	$out4,$out4,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	stvx_u		$out4,$x40,$out
	addi		$out,$out,0x50
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_four:
	vcipherlast	$out0,$out0,$in4
	vcipherlast	$out1,$out1,$in5
	vcipherlast	$out2,$out2,$in6
	vcipherlast	$out3,$out3,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	stvx_u		$out3,$x30,$out
	addi		$out,$out,0x40
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_three:
	vcipherlast	$out0,$out0,$in5
	vcipherlast	$out1,$out1,$in6
	vcipherlast	$out2,$out2,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_two:
	vcipherlast	$out0,$out0,$in6
	vcipherlast	$out1,$out1,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_one:
	vcipherlast	$out0,$out0,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	stvx_u		$out0,0,$out
	addi		$out,$out,0x10

Lctr32_enc8x_done:
	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,6,6,0
	.long		0
.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}}	}}}

#########################################################################
{{{	# XTS procedures						#
# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
#                             const AES_KEY *key1, const AES_KEY *key2,	#
#                             [const] unsigned char iv[16]);		#
# If $key2 is NULL, a "tweak chaining" mode is engaged, in which the	#
# input tweak value is assumed to be already encrypted, and the last	#
# tweak value, suitable for a consecutive call on the same chunk of	#
# data, is written back to the original buffer. In addition, in	#
# "tweak chaining" mode only complete input blocks are processed.	#
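#									#
# A caller-side usage sketch (an assumption; AES_KEY and the key-setup	#
# entry points are the ones defined earlier in this module):		#
#									#
#	AES_KEY k1, k2;							#
#	aes_p8_set_encrypt_key(data_key, 128, &k1);			#
#	aes_p8_set_encrypt_key(tweak_key, 128, &k2);			#
#	aes_p8_xts_encrypt(in, out, len, &k1, &k2, iv);			#
#									#
# For aes_p8_xts_decrypt, key1 is presumed to be a decryption schedule	#
# from aes_p8_set_decrypt_key, while key2 remains an encryption one.	#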
1926
1927my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1928my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1929my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1930my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
1931my $taillen = $key2;
1932
1933   ($inp,$idx) = ($idx,$inp);				# reassign
1934
1935$code.=<<___;
1936.globl	.${prefix}_xts_encrypt
1937.align	5
1938.${prefix}_xts_encrypt:
1939	mr		$inp,r3				# reassign
1940	li		r3,-1
1941	${UCMP}i	$len,16
1942	bltlr-
1943
1944	lis		r0,0xfff0
1945	mfspr		r12,256				# save vrsave
1946	li		r11,0
1947	mtspr		256,r0
1948
1949	vspltisb	$seven,0x07			# 0x070707..07
1950	le?lvsl		$leperm,r11,r11
1951	le?vspltisb	$tmp,0x0f
1952	le?vxor		$leperm,$leperm,$seven
1953
1954	li		$idx,15
1955	lvx		$tweak,0,$ivp			# load [unaligned] iv
1956	lvsl		$inpperm,0,$ivp
1957	lvx		$inptail,$idx,$ivp
1958	le?vxor		$inpperm,$inpperm,$tmp
1959	vperm		$tweak,$tweak,$inptail,$inpperm
1960
1961	neg		r11,$inp
1962	lvsr		$inpperm,0,r11			# prepare for unaligned load
1963	lvx		$inout,0,$inp
1964	addi		$inp,$inp,15			# 15 is not typo
1965	le?vxor		$inpperm,$inpperm,$tmp
1966
1967	${UCMP}i	$key2,0				# key2==NULL?
1968	beq		Lxts_enc_no_key2
1969
1970	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
1971	lwz		$rounds,240($key2)
1972	srwi		$rounds,$rounds,1
1973	subi		$rounds,$rounds,1
1974	li		$idx,16
1975
1976	lvx		$rndkey0,0,$key2
1977	lvx		$rndkey1,$idx,$key2
1978	addi		$idx,$idx,16
1979	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1980	vxor		$tweak,$tweak,$rndkey0
1981	lvx		$rndkey0,$idx,$key2
1982	addi		$idx,$idx,16
1983	mtctr		$rounds
1984
1985Ltweak_xts_enc:
1986	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1987	vcipher		$tweak,$tweak,$rndkey1
1988	lvx		$rndkey1,$idx,$key2
1989	addi		$idx,$idx,16
1990	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1991	vcipher		$tweak,$tweak,$rndkey0
1992	lvx		$rndkey0,$idx,$key2
1993	addi		$idx,$idx,16
1994	bdnz		Ltweak_xts_enc
1995
1996	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1997	vcipher		$tweak,$tweak,$rndkey1
1998	lvx		$rndkey1,$idx,$key2
1999	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2000	vcipherlast	$tweak,$tweak,$rndkey0
2001
2002	li		$ivp,0				# don't chain the tweak
2003	b		Lxts_enc
2004
2005Lxts_enc_no_key2:
2006	li		$idx,-16
2007	and		$len,$len,$idx			# in "tweak chaining"
2008							# mode only complete
2009							# blocks are processed
2010Lxts_enc:
2011	lvx		$inptail,0,$inp
2012	addi		$inp,$inp,16
2013
2014	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2015	lwz		$rounds,240($key1)
2016	srwi		$rounds,$rounds,1
2017	subi		$rounds,$rounds,1
2018	li		$idx,16
2019
2020	vslb		$eighty7,$seven,$seven		# 0x808080..80
2021	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2022	vspltisb	$tmp,1				# 0x010101..01
2023	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
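	# $eighty7 now holds the XTS tweak-update constant: the 0x87
	# byte reduces the bit that wraps out of the 128-bit tweak
	# (x^128 = x^7 + x^2 + x + 1), while the 0x01 bytes re-create
	# the inter-byte carries lost by the bytewise doubling below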
2024
2025	${UCMP}i	$len,96
2026	bge		_aesp8_xts_encrypt6x
2027
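	# branchless ciphertext-stealing setup: r0 becomes $taillen,
	# i.e. (len mod 16)-16, when fewer than 32 bytes remain and 0
	# otherwise, pulling $inp back just enough for the final
	# 16-byte load to end exactly at the end of the input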
2028	andi.		$taillen,$len,15
2029	subic		r0,$len,32
2030	subi		$taillen,$taillen,16
2031	subfe		r0,r0,r0
2032	and		r0,r0,$taillen
2033	add		$inp,$inp,r0
2034
2035	lvx		$rndkey0,0,$key1
2036	lvx		$rndkey1,$idx,$key1
2037	addi		$idx,$idx,16
2038	vperm		$inout,$inout,$inptail,$inpperm
2039	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2040	vxor		$inout,$inout,$tweak
2041	vxor		$inout,$inout,$rndkey0
2042	lvx		$rndkey0,$idx,$key1
2043	addi		$idx,$idx,16
2044	mtctr		$rounds
2045	b		Loop_xts_enc
2046
2047.align	5
2048Loop_xts_enc:
2049	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2050	vcipher		$inout,$inout,$rndkey1
2051	lvx		$rndkey1,$idx,$key1
2052	addi		$idx,$idx,16
2053	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2054	vcipher		$inout,$inout,$rndkey0
2055	lvx		$rndkey0,$idx,$key1
2056	addi		$idx,$idx,16
2057	bdnz		Loop_xts_enc
2058
2059	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2060	vcipher		$inout,$inout,$rndkey1
2061	lvx		$rndkey1,$idx,$key1
2062	li		$idx,16
2063	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2064	vxor		$rndkey0,$rndkey0,$tweak
2065	vcipherlast	$output,$inout,$rndkey0
2066
2067	le?vperm	$tmp,$output,$output,$leperm
2068	be?nop
2069	le?stvx_u	$tmp,0,$out
2070	be?stvx_u	$output,0,$out
2071	addi		$out,$out,16
2072
2073	subic.		$len,$len,16
2074	beq		Lxts_enc_done
2075
2076	vmr		$inout,$inptail
2077	lvx		$inptail,0,$inp
2078	addi		$inp,$inp,16
2079	lvx		$rndkey0,0,$key1
2080	lvx		$rndkey1,$idx,$key1
2081	addi		$idx,$idx,16
2082
2083	subic		r0,$len,32
2084	subfe		r0,r0,r0
2085	and		r0,r0,$taillen
2086	add		$inp,$inp,r0
2087
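	# next tweak = tweak * x in GF(2^128): vaddubm doubles every
	# byte but drops the inter-byte carries, vsrab smears each
	# byte's top bit into a 0x00/0xff mask, vsldoi rotates that
	# mask to the carry destinations, and the vand with $eighty7
	# turns it into the 0x87 reduction plus the 0x01 carry-ins
	# that the final vxor re-injects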
2088	vsrab		$tmp,$tweak,$seven		# next tweak value
2089	vaddubm		$tweak,$tweak,$tweak
2090	vsldoi		$tmp,$tmp,$tmp,15
2091	vand		$tmp,$tmp,$eighty7
2092	vxor		$tweak,$tweak,$tmp
2093
2094	vperm		$inout,$inout,$inptail,$inpperm
2095	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2096	vxor		$inout,$inout,$tweak
2097	vxor		$output,$output,$rndkey0	# just in case $len<16
2098	vxor		$inout,$inout,$rndkey0
2099	lvx		$rndkey0,$idx,$key1
2100	addi		$idx,$idx,16
2101
2102	mtctr		$rounds
2103	${UCMP}i	$len,16
2104	bge		Loop_xts_enc
2105
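	# encrypt-side ciphertext stealing: build a select mask that is
	# zero for the first $len bytes and all-ones afterwards, then
	# splice the short tail input with the previous ciphertext
	# block before pushing it through the cipher once more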
2106	vxor		$output,$output,$tweak
2107	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2108	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2109	vspltisb	$tmp,-1
2110	vperm		$inptail,$inptail,$tmp,$inpperm
2111	vsel		$inout,$inout,$output,$inptail
2112
2113	subi		r11,$out,17
2114	subi		$out,$out,16
2115	mtctr		$len
2116	li		$len,16
2117Loop_xts_enc_steal:
2118	lbzu		r0,1(r11)
2119	stb		r0,16(r11)
2120	bdnz		Loop_xts_enc_steal
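	# the byte loop above copied the first $len bytes of the last
	# full ciphertext block 16 bytes forward, forming the short
	# final output block; the full-block slot is re-encrypted next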
2121
2122	mtctr		$rounds
2123	b		Loop_xts_enc			# one more time...
2124
2125Lxts_enc_done:
2126	${UCMP}i	$ivp,0
2127	beq		Lxts_enc_ret
2128
2129	vsrab		$tmp,$tweak,$seven		# next tweak value
2130	vaddubm		$tweak,$tweak,$tweak
2131	vsldoi		$tmp,$tmp,$tmp,15
2132	vand		$tmp,$tmp,$eighty7
2133	vxor		$tweak,$tweak,$tmp
2134
2135	le?vperm	$tweak,$tweak,$tweak,$leperm
2136	stvx_u		$tweak,0,$ivp
2137
2138Lxts_enc_ret:
2139	mtspr		256,r12				# restore vrsave
2140	li		r3,0
2141	blr
2142	.long		0
2143	.byte		0,12,0x04,0,0x80,6,6,0
2144	.long		0
2145.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2146
2147.globl	.${prefix}_xts_decrypt
2148.align	5
2149.${prefix}_xts_decrypt:
2150	mr		$inp,r3				# reassign
2151	li		r3,-1
2152	${UCMP}i	$len,16
2153	bltlr-
2154
2155	lis		r0,0xfff8
2156	mfspr		r12,256				# save vrsave
2157	li		r11,0
2158	mtspr		256,r0
2159
2160	andi.		r0,$len,15
2161	neg		r0,r0
2162	andi.		r0,r0,16
2163	sub		$len,$len,r0
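	# when the length is not a multiple of 16, hold the last
	# complete block back (len -= 16): decrypt-side stealing must
	# process that block under the tweak that follows the tail's
	# (see Ltail_xts_dec below)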
2164
2165	vspltisb	$seven,0x07			# 0x070707..07
2166	le?lvsl		$leperm,r11,r11
2167	le?vspltisb	$tmp,0x0f
2168	le?vxor		$leperm,$leperm,$seven
2169
2170	li		$idx,15
2171	lvx		$tweak,0,$ivp			# load [unaligned] iv
2172	lvsl		$inpperm,0,$ivp
2173	lvx		$inptail,$idx,$ivp
2174	le?vxor		$inpperm,$inpperm,$tmp
2175	vperm		$tweak,$tweak,$inptail,$inpperm
2176
2177	neg		r11,$inp
2178	lvsr		$inpperm,0,r11			# prepare for unaligned load
2179	lvx		$inout,0,$inp
2180	addi		$inp,$inp,15			# 15 is not a typo
2181	le?vxor		$inpperm,$inpperm,$tmp
2182
2183	${UCMP}i	$key2,0				# key2==NULL?
2184	beq		Lxts_dec_no_key2
2185
2186	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2187	lwz		$rounds,240($key2)
2188	srwi		$rounds,$rounds,1
2189	subi		$rounds,$rounds,1
2190	li		$idx,16
2191
2192	lvx		$rndkey0,0,$key2
2193	lvx		$rndkey1,$idx,$key2
2194	addi		$idx,$idx,16
2195	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2196	vxor		$tweak,$tweak,$rndkey0
2197	lvx		$rndkey0,$idx,$key2
2198	addi		$idx,$idx,16
2199	mtctr		$rounds
2200
2201Ltweak_xts_dec:
2202	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2203	vcipher		$tweak,$tweak,$rndkey1
2204	lvx		$rndkey1,$idx,$key2
2205	addi		$idx,$idx,16
2206	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2207	vcipher		$tweak,$tweak,$rndkey0
2208	lvx		$rndkey0,$idx,$key2
2209	addi		$idx,$idx,16
2210	bdnz		Ltweak_xts_dec
2211
2212	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2213	vcipher		$tweak,$tweak,$rndkey1
2214	lvx		$rndkey1,$idx,$key2
2215	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2216	vcipherlast	$tweak,$tweak,$rndkey0
2217
2218	li		$ivp,0				# don't chain the tweak
2219	b		Lxts_dec
2220
2221Lxts_dec_no_key2:
2222	neg		$idx,$len
2223	andi.		$idx,$idx,15
2224	add		$len,$len,$idx			# in "tweak chaining"
2225							# mode only complete
2226							# blocks are processed
2227Lxts_dec:
2228	lvx		$inptail,0,$inp
2229	addi		$inp,$inp,16
2230
2231	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2232	lwz		$rounds,240($key1)
2233	srwi		$rounds,$rounds,1
2234	subi		$rounds,$rounds,1
2235	li		$idx,16
2236
2237	vslb		$eighty7,$seven,$seven		# 0x808080..80
2238	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2239	vspltisb	$tmp,1				# 0x010101..01
2240	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2241
2242	${UCMP}i	$len,96
2243	bge		_aesp8_xts_decrypt6x
2244
2245	lvx		$rndkey0,0,$key1
2246	lvx		$rndkey1,$idx,$key1
2247	addi		$idx,$idx,16
2248	vperm		$inout,$inout,$inptail,$inpperm
2249	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2250	vxor		$inout,$inout,$tweak
2251	vxor		$inout,$inout,$rndkey0
2252	lvx		$rndkey0,$idx,$key1
2253	addi		$idx,$idx,16
2254	mtctr		$rounds
2255
2256	${UCMP}i	$len,16
2257	blt		Ltail_xts_dec
2258	be?b		Loop_xts_dec
2259
2260.align	5
2261Loop_xts_dec:
2262	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2263	vncipher	$inout,$inout,$rndkey1
2264	lvx		$rndkey1,$idx,$key1
2265	addi		$idx,$idx,16
2266	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2267	vncipher	$inout,$inout,$rndkey0
2268	lvx		$rndkey0,$idx,$key1
2269	addi		$idx,$idx,16
2270	bdnz		Loop_xts_dec
2271
2272	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2273	vncipher	$inout,$inout,$rndkey1
2274	lvx		$rndkey1,$idx,$key1
2275	li		$idx,16
2276	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2277	vxor		$rndkey0,$rndkey0,$tweak
2278	vncipherlast	$output,$inout,$rndkey0
2279
2280	le?vperm	$tmp,$output,$output,$leperm
2281	be?nop
2282	le?stvx_u	$tmp,0,$out
2283	be?stvx_u	$output,0,$out
2284	addi		$out,$out,16
2285
2286	subic.		$len,$len,16
2287	beq		Lxts_dec_done
2288
2289	vmr		$inout,$inptail
2290	lvx		$inptail,0,$inp
2291	addi		$inp,$inp,16
2292	lvx		$rndkey0,0,$key1
2293	lvx		$rndkey1,$idx,$key1
2294	addi		$idx,$idx,16
2295
2296	vsrab		$tmp,$tweak,$seven		# next tweak value
2297	vaddubm		$tweak,$tweak,$tweak
2298	vsldoi		$tmp,$tmp,$tmp,15
2299	vand		$tmp,$tmp,$eighty7
2300	vxor		$tweak,$tweak,$tmp
2301
2302	vperm		$inout,$inout,$inptail,$inpperm
2303	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2304	vxor		$inout,$inout,$tweak
2305	vxor		$inout,$inout,$rndkey0
2306	lvx		$rndkey0,$idx,$key1
2307	addi		$idx,$idx,16
2308
2309	mtctr		$rounds
2310	${UCMP}i	$len,16
2311	bge		Loop_xts_dec
2312
2313Ltail_xts_dec:
2314	vsrab		$tmp,$tweak,$seven		# next tweak value
2315	vaddubm		$tweak1,$tweak,$tweak
2316	vsldoi		$tmp,$tmp,$tmp,15
2317	vand		$tmp,$tmp,$eighty7
2318	vxor		$tweak1,$tweak1,$tmp
2319
2320	subi		$inp,$inp,16
2321	add		$inp,$inp,$len
2322
2323	vxor		$inout,$inout,$tweak		# :-(
2324	vxor		$inout,$inout,$tweak1		# :-)
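	# the held-back block is decrypted under the *next* tweak:
	# the first vxor undoes the $tweak whitening applied earlier,
	# the second re-whitens with $tweak1 instead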
2325
2326Loop_xts_dec_short:
2327	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2328	vncipher	$inout,$inout,$rndkey1
2329	lvx		$rndkey1,$idx,$key1
2330	addi		$idx,$idx,16
2331	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2332	vncipher	$inout,$inout,$rndkey0
2333	lvx		$rndkey0,$idx,$key1
2334	addi		$idx,$idx,16
2335	bdnz		Loop_xts_dec_short
2336
2337	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2338	vncipher	$inout,$inout,$rndkey1
2339	lvx		$rndkey1,$idx,$key1
2340	li		$idx,16
2341	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2342	vxor		$rndkey0,$rndkey0,$tweak1
2343	vncipherlast	$output,$inout,$rndkey0
2344
2345	le?vperm	$tmp,$output,$output,$leperm
2346	be?nop
2347	le?stvx_u	$tmp,0,$out
2348	be?stvx_u	$output,0,$out
2349
2350	vmr		$inout,$inptail
2351	lvx		$inptail,0,$inp
2352	#addi		$inp,$inp,16
2353	lvx		$rndkey0,0,$key1
2354	lvx		$rndkey1,$idx,$key1
2355	addi		$idx,$idx,16
2356	vperm		$inout,$inout,$inptail,$inpperm
2357	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2358
2359	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2360	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2361	vspltisb	$tmp,-1
2362	vperm		$inptail,$inptail,$tmp,$inpperm
2363	vsel		$inout,$inout,$output,$inptail
2364
2365	vxor		$rndkey0,$rndkey0,$tweak
2366	vxor		$inout,$inout,$rndkey0
2367	lvx		$rndkey0,$idx,$key1
2368	addi		$idx,$idx,16
2369
2370	subi		r11,$out,1
2371	mtctr		$len
2372	li		$len,16
2373Loop_xts_dec_steal:
2374	lbzu		r0,1(r11)
2375	stb		r0,16(r11)
2376	bdnz		Loop_xts_dec_steal
2377
2378	mtctr		$rounds
2379	b		Loop_xts_dec			# one more time...
2380
2381Lxts_dec_done:
2382	${UCMP}i	$ivp,0
2383	beq		Lxts_dec_ret
2384
2385	vsrab		$tmp,$tweak,$seven		# next tweak value
2386	vaddubm		$tweak,$tweak,$tweak
2387	vsldoi		$tmp,$tmp,$tmp,15
2388	vand		$tmp,$tmp,$eighty7
2389	vxor		$tweak,$tweak,$tmp
2390
2391	le?vperm	$tweak,$tweak,$tweak,$leperm
2392	stvx_u		$tweak,0,$ivp
2393
2394Lxts_dec_ret:
2395	mtspr		256,r12				# restore vrsave
2396	li		r3,0
2397	blr
2398	.long		0
2399	.byte		0,12,0x04,0,0x80,6,6,0
2400	.long		0
2401.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2402___
2403#########################################################################
2404{{	# Optimized XTS procedures					#
2405my $key_=$key2;
2406my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2407    $x00=0 if ($flavour =~ /osx/);
2408my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2409my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2410my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2411my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
2412			# v26-v31 last 6 round keys
2413my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
2414my $taillen=$x70;
2415
2416$code.=<<___;
2417.align	5
2418_aesp8_xts_encrypt6x:
2419	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2420	mflr		r11
2421	li		r7,`$FRAME+8*16+15`
2422	li		r3,`$FRAME+8*16+31`
2423	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2424	stvx		v20,r7,$sp		# ABI says so
2425	addi		r7,r7,32
2426	stvx		v21,r3,$sp
2427	addi		r3,r3,32
2428	stvx		v22,r7,$sp
2429	addi		r7,r7,32
2430	stvx		v23,r3,$sp
2431	addi		r3,r3,32
2432	stvx		v24,r7,$sp
2433	addi		r7,r7,32
2434	stvx		v25,r3,$sp
2435	addi		r3,r3,32
2436	stvx		v26,r7,$sp
2437	addi		r7,r7,32
2438	stvx		v27,r3,$sp
2439	addi		r3,r3,32
2440	stvx		v28,r7,$sp
2441	addi		r7,r7,32
2442	stvx		v29,r3,$sp
2443	addi		r3,r3,32
2444	stvx		v30,r7,$sp
2445	stvx		v31,r3,$sp
2446	li		r0,-1
2447	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2448	li		$x10,0x10
2449	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2450	li		$x20,0x20
2451	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2452	li		$x30,0x30
2453	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2454	li		$x40,0x40
2455	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2456	li		$x50,0x50
2457	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2458	li		$x60,0x60
2459	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2460	li		$x70,0x70
2461	mtspr		256,r0
2462
2463	subi		$rounds,$rounds,3	# -4 in total
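	# CTR gets rounds/2-4: each pass of the main loop applies two
	# rounds from the v24/v25 rotating buffer, while the final
	# eight rounds are unrolled below with v24-v31 held resident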
2464
2465	lvx		$rndkey0,$x00,$key1	# load key schedule
2466	lvx		v30,$x10,$key1
2467	addi		$key1,$key1,0x20
2468	lvx		v31,$x00,$key1
2469	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2470	addi		$key_,$sp,$FRAME+15
2471	mtctr		$rounds
2472
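	# spill the permuted key schedule into the stack frame so the
	# main loop can refetch round keys with plain aligned lvx
	# instead of redoing the unaligned fixup on every pass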
2473Load_xts_enc_key:
2474	?vperm		v24,v30,v31,$keyperm
2475	lvx		v30,$x10,$key1
2476	addi		$key1,$key1,0x20
2477	stvx		v24,$x00,$key_		# off-load round[1]
2478	?vperm		v25,v31,v30,$keyperm
2479	lvx		v31,$x00,$key1
2480	stvx		v25,$x10,$key_		# off-load round[2]
2481	addi		$key_,$key_,0x20
2482	bdnz		Load_xts_enc_key
2483
2484	lvx		v26,$x10,$key1
2485	?vperm		v24,v30,v31,$keyperm
2486	lvx		v27,$x20,$key1
2487	stvx		v24,$x00,$key_		# off-load round[3]
2488	?vperm		v25,v31,v26,$keyperm
2489	lvx		v28,$x30,$key1
2490	stvx		v25,$x10,$key_		# off-load round[4]
2491	addi		$key_,$sp,$FRAME+15	# rewind $key_
2492	?vperm		v26,v26,v27,$keyperm
2493	lvx		v29,$x40,$key1
2494	?vperm		v27,v27,v28,$keyperm
2495	lvx		v30,$x50,$key1
2496	?vperm		v28,v28,v29,$keyperm
2497	lvx		v31,$x60,$key1
2498	?vperm		v29,v29,v30,$keyperm
2499	lvx		$twk5,$x70,$key1	# borrow $twk5
2500	?vperm		v30,v30,v31,$keyperm
2501	lvx		v24,$x00,$key_		# pre-load round[1]
2502	?vperm		v31,v31,$twk5,$keyperm
2503	lvx		v25,$x10,$key_		# pre-load round[2]
2504
2505	 vperm		$in0,$inout,$inptail,$inpperm
2506	 subi		$inp,$inp,31		# undo "caller"
2507	vxor		$twk0,$tweak,$rndkey0
2508	vsrab		$tmp,$tweak,$seven	# next tweak value
2509	vaddubm		$tweak,$tweak,$tweak
2510	vsldoi		$tmp,$tmp,$tmp,15
2511	vand		$tmp,$tmp,$eighty7
2512	 vxor		$out0,$in0,$twk0
2513	vxor		$tweak,$tweak,$tmp
2514
2515	 lvx_u		$in1,$x10,$inp
2516	vxor		$twk1,$tweak,$rndkey0
2517	vsrab		$tmp,$tweak,$seven	# next tweak value
2518	vaddubm		$tweak,$tweak,$tweak
2519	vsldoi		$tmp,$tmp,$tmp,15
2520	 le?vperm	$in1,$in1,$in1,$leperm
2521	vand		$tmp,$tmp,$eighty7
2522	 vxor		$out1,$in1,$twk1
2523	vxor		$tweak,$tweak,$tmp
2524
2525	 lvx_u		$in2,$x20,$inp
2526	 andi.		$taillen,$len,15
2527	vxor		$twk2,$tweak,$rndkey0
2528	vsrab		$tmp,$tweak,$seven	# next tweak value
2529	vaddubm		$tweak,$tweak,$tweak
2530	vsldoi		$tmp,$tmp,$tmp,15
2531	 le?vperm	$in2,$in2,$in2,$leperm
2532	vand		$tmp,$tmp,$eighty7
2533	 vxor		$out2,$in2,$twk2
2534	vxor		$tweak,$tweak,$tmp
2535
2536	 lvx_u		$in3,$x30,$inp
2537	 sub		$len,$len,$taillen
2538	vxor		$twk3,$tweak,$rndkey0
2539	vsrab		$tmp,$tweak,$seven	# next tweak value
2540	vaddubm		$tweak,$tweak,$tweak
2541	vsldoi		$tmp,$tmp,$tmp,15
2542	 le?vperm	$in3,$in3,$in3,$leperm
2543	vand		$tmp,$tmp,$eighty7
2544	 vxor		$out3,$in3,$twk3
2545	vxor		$tweak,$tweak,$tmp
2546
2547	 lvx_u		$in4,$x40,$inp
2548	 subi		$len,$len,0x60
2549	vxor		$twk4,$tweak,$rndkey0
2550	vsrab		$tmp,$tweak,$seven	# next tweak value
2551	vaddubm		$tweak,$tweak,$tweak
2552	vsldoi		$tmp,$tmp,$tmp,15
2553	 le?vperm	$in4,$in4,$in4,$leperm
2554	vand		$tmp,$tmp,$eighty7
2555	 vxor		$out4,$in4,$twk4
2556	vxor		$tweak,$tweak,$tmp
2557
2558	 lvx_u		$in5,$x50,$inp
2559	 addi		$inp,$inp,0x60
2560	vxor		$twk5,$tweak,$rndkey0
2561	vsrab		$tmp,$tweak,$seven	# next tweak value
2562	vaddubm		$tweak,$tweak,$tweak
2563	vsldoi		$tmp,$tmp,$tmp,15
2564	 le?vperm	$in5,$in5,$in5,$leperm
2565	vand		$tmp,$tmp,$eighty7
2566	 vxor		$out5,$in5,$twk5
2567	vxor		$tweak,$tweak,$tmp
2568
2569	vxor		v31,v31,$rndkey0
2570	mtctr		$rounds
2571	b		Loop_xts_enc6x
2572
2573.align	5
2574Loop_xts_enc6x:
2575	vcipher		$out0,$out0,v24
2576	vcipher		$out1,$out1,v24
2577	vcipher		$out2,$out2,v24
2578	vcipher		$out3,$out3,v24
2579	vcipher		$out4,$out4,v24
2580	vcipher		$out5,$out5,v24
2581	lvx		v24,$x20,$key_		# round[3]
2582	addi		$key_,$key_,0x20
2583
2584	vcipher		$out0,$out0,v25
2585	vcipher		$out1,$out1,v25
2586	vcipher		$out2,$out2,v25
2587	vcipher		$out3,$out3,v25
2588	vcipher		$out4,$out4,v25
2589	vcipher		$out5,$out5,v25
2590	lvx		v25,$x10,$key_		# round[4]
2591	bdnz		Loop_xts_enc6x
2592
2593	subic		$len,$len,96		# $len-=96
2594	 vxor		$in0,$twk0,v31		# xor with last round key
2595	vcipher		$out0,$out0,v24
2596	vcipher		$out1,$out1,v24
2597	 vsrab		$tmp,$tweak,$seven	# next tweak value
2598	 vxor		$twk0,$tweak,$rndkey0
2599	 vaddubm	$tweak,$tweak,$tweak
2600	vcipher		$out2,$out2,v24
2601	vcipher		$out3,$out3,v24
2602	 vsldoi		$tmp,$tmp,$tmp,15
2603	vcipher		$out4,$out4,v24
2604	vcipher		$out5,$out5,v24
2605
2606	subfe.		r0,r0,r0		# borrow?-1:0
2607	 vand		$tmp,$tmp,$eighty7
2608	vcipher		$out0,$out0,v25
2609	vcipher		$out1,$out1,v25
2610	 vxor		$tweak,$tweak,$tmp
2611	vcipher		$out2,$out2,v25
2612	vcipher		$out3,$out3,v25
2613	 vxor		$in1,$twk1,v31
2614	 vsrab		$tmp,$tweak,$seven	# next tweak value
2615	 vxor		$twk1,$tweak,$rndkey0
2616	vcipher		$out4,$out4,v25
2617	vcipher		$out5,$out5,v25
2618
2619	and		r0,r0,$len
2620	 vaddubm	$tweak,$tweak,$tweak
2621	 vsldoi		$tmp,$tmp,$tmp,15
2622	vcipher		$out0,$out0,v26
2623	vcipher		$out1,$out1,v26
2624	 vand		$tmp,$tmp,$eighty7
2625	vcipher		$out2,$out2,v26
2626	vcipher		$out3,$out3,v26
2627	 vxor		$tweak,$tweak,$tmp
2628	vcipher		$out4,$out4,v26
2629	vcipher		$out5,$out5,v26
2630
2631	add		$inp,$inp,r0		# $inp is adjusted in such a
2632						# way that at exit from the
2633						# loop inX-in5 are loaded
2634						# with last "words"
2635	 vxor		$in2,$twk2,v31
2636	 vsrab		$tmp,$tweak,$seven	# next tweak value
2637	 vxor		$twk2,$tweak,$rndkey0
2638	 vaddubm	$tweak,$tweak,$tweak
2639	vcipher		$out0,$out0,v27
2640	vcipher		$out1,$out1,v27
2641	 vsldoi		$tmp,$tmp,$tmp,15
2642	vcipher		$out2,$out2,v27
2643	vcipher		$out3,$out3,v27
2644	 vand		$tmp,$tmp,$eighty7
2645	vcipher		$out4,$out4,v27
2646	vcipher		$out5,$out5,v27
2647
2648	addi		$key_,$sp,$FRAME+15	# rewind $key_
2649	 vxor		$tweak,$tweak,$tmp
2650	vcipher		$out0,$out0,v28
2651	vcipher		$out1,$out1,v28
2652	 vxor		$in3,$twk3,v31
2653	 vsrab		$tmp,$tweak,$seven	# next tweak value
2654	 vxor		$twk3,$tweak,$rndkey0
2655	vcipher		$out2,$out2,v28
2656	vcipher		$out3,$out3,v28
2657	 vaddubm	$tweak,$tweak,$tweak
2658	 vsldoi		$tmp,$tmp,$tmp,15
2659	vcipher		$out4,$out4,v28
2660	vcipher		$out5,$out5,v28
2661	lvx		v24,$x00,$key_		# re-pre-load round[1]
2662	 vand		$tmp,$tmp,$eighty7
2663
2664	vcipher		$out0,$out0,v29
2665	vcipher		$out1,$out1,v29
2666	 vxor		$tweak,$tweak,$tmp
2667	vcipher		$out2,$out2,v29
2668	vcipher		$out3,$out3,v29
2669	 vxor		$in4,$twk4,v31
2670	 vsrab		$tmp,$tweak,$seven	# next tweak value
2671	 vxor		$twk4,$tweak,$rndkey0
2672	vcipher		$out4,$out4,v29
2673	vcipher		$out5,$out5,v29
2674	lvx		v25,$x10,$key_		# re-pre-load round[2]
2675	 vaddubm	$tweak,$tweak,$tweak
2676	 vsldoi		$tmp,$tmp,$tmp,15
2677
2678	vcipher		$out0,$out0,v30
2679	vcipher		$out1,$out1,v30
2680	 vand		$tmp,$tmp,$eighty7
2681	vcipher		$out2,$out2,v30
2682	vcipher		$out3,$out3,v30
2683	 vxor		$tweak,$tweak,$tmp
2684	vcipher		$out4,$out4,v30
2685	vcipher		$out5,$out5,v30
2686	 vxor		$in5,$twk5,v31
2687	 vsrab		$tmp,$tweak,$seven	# next tweak value
2688	 vxor		$twk5,$tweak,$rndkey0
2689
2690	vcipherlast	$out0,$out0,$in0
2691	 lvx_u		$in0,$x00,$inp		# load next input block
2692	 vaddubm	$tweak,$tweak,$tweak
2693	 vsldoi		$tmp,$tmp,$tmp,15
2694	vcipherlast	$out1,$out1,$in1
2695	 lvx_u		$in1,$x10,$inp
2696	vcipherlast	$out2,$out2,$in2
2697	 le?vperm	$in0,$in0,$in0,$leperm
2698	 lvx_u		$in2,$x20,$inp
2699	 vand		$tmp,$tmp,$eighty7
2700	vcipherlast	$out3,$out3,$in3
2701	 le?vperm	$in1,$in1,$in1,$leperm
2702	 lvx_u		$in3,$x30,$inp
2703	vcipherlast	$out4,$out4,$in4
2704	 le?vperm	$in2,$in2,$in2,$leperm
2705	 lvx_u		$in4,$x40,$inp
2706	 vxor		$tweak,$tweak,$tmp
2707	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2708						# in stealing mode
2709	 le?vperm	$in3,$in3,$in3,$leperm
2710	 lvx_u		$in5,$x50,$inp
2711	 addi		$inp,$inp,0x60
2712	 le?vperm	$in4,$in4,$in4,$leperm
2713	 le?vperm	$in5,$in5,$in5,$leperm
2714
2715	le?vperm	$out0,$out0,$out0,$leperm
2716	le?vperm	$out1,$out1,$out1,$leperm
2717	stvx_u		$out0,$x00,$out		# store output
2718	 vxor		$out0,$in0,$twk0
2719	le?vperm	$out2,$out2,$out2,$leperm
2720	stvx_u		$out1,$x10,$out
2721	 vxor		$out1,$in1,$twk1
2722	le?vperm	$out3,$out3,$out3,$leperm
2723	stvx_u		$out2,$x20,$out
2724	 vxor		$out2,$in2,$twk2
2725	le?vperm	$out4,$out4,$out4,$leperm
2726	stvx_u		$out3,$x30,$out
2727	 vxor		$out3,$in3,$twk3
2728	le?vperm	$out5,$tmp,$tmp,$leperm
2729	stvx_u		$out4,$x40,$out
2730	 vxor		$out4,$in4,$twk4
2731	le?stvx_u	$out5,$x50,$out
2732	be?stvx_u	$tmp, $x50,$out
2733	 vxor		$out5,$in5,$twk5
2734	addi		$out,$out,0x60
2735
2736	mtctr		$rounds
2737	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2738
2739	addic.		$len,$len,0x60
2740	beq		Lxts_enc6x_zero
2741	cmpwi		$len,0x20
2742	blt		Lxts_enc6x_one
2743	nop
2744	beq		Lxts_enc6x_two
2745	cmpwi		$len,0x40
2746	blt		Lxts_enc6x_three
2747	nop
2748	beq		Lxts_enc6x_four
2749
2750Lxts_enc6x_five:
2751	vxor		$out0,$in1,$twk0
2752	vxor		$out1,$in2,$twk1
2753	vxor		$out2,$in3,$twk2
2754	vxor		$out3,$in4,$twk3
2755	vxor		$out4,$in5,$twk4
2756
2757	bl		_aesp8_xts_enc5x
2758
2759	le?vperm	$out0,$out0,$out0,$leperm
2760	vmr		$twk0,$twk5		# unused tweak
2761	le?vperm	$out1,$out1,$out1,$leperm
2762	stvx_u		$out0,$x00,$out		# store output
2763	le?vperm	$out2,$out2,$out2,$leperm
2764	stvx_u		$out1,$x10,$out
2765	le?vperm	$out3,$out3,$out3,$leperm
2766	stvx_u		$out2,$x20,$out
2767	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2768	le?vperm	$out4,$out4,$out4,$leperm
2769	stvx_u		$out3,$x30,$out
2770	stvx_u		$out4,$x40,$out
2771	addi		$out,$out,0x50
2772	bne		Lxts_enc6x_steal
2773	b		Lxts_enc6x_done
2774
2775.align	4
2776Lxts_enc6x_four:
2777	vxor		$out0,$in2,$twk0
2778	vxor		$out1,$in3,$twk1
2779	vxor		$out2,$in4,$twk2
2780	vxor		$out3,$in5,$twk3
2781	vxor		$out4,$out4,$out4
2782
2783	bl		_aesp8_xts_enc5x
2784
2785	le?vperm	$out0,$out0,$out0,$leperm
2786	vmr		$twk0,$twk4		# unused tweak
2787	le?vperm	$out1,$out1,$out1,$leperm
2788	stvx_u		$out0,$x00,$out		# store output
2789	le?vperm	$out2,$out2,$out2,$leperm
2790	stvx_u		$out1,$x10,$out
2791	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2792	le?vperm	$out3,$out3,$out3,$leperm
2793	stvx_u		$out2,$x20,$out
2794	stvx_u		$out3,$x30,$out
2795	addi		$out,$out,0x40
2796	bne		Lxts_enc6x_steal
2797	b		Lxts_enc6x_done
2798
2799.align	4
2800Lxts_enc6x_three:
2801	vxor		$out0,$in3,$twk0
2802	vxor		$out1,$in4,$twk1
2803	vxor		$out2,$in5,$twk2
2804	vxor		$out3,$out3,$out3
2805	vxor		$out4,$out4,$out4
2806
2807	bl		_aesp8_xts_enc5x
2808
2809	le?vperm	$out0,$out0,$out0,$leperm
2810	vmr		$twk0,$twk3		# unused tweak
2811	le?vperm	$out1,$out1,$out1,$leperm
2812	stvx_u		$out0,$x00,$out		# store output
2813	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2814	le?vperm	$out2,$out2,$out2,$leperm
2815	stvx_u		$out1,$x10,$out
2816	stvx_u		$out2,$x20,$out
2817	addi		$out,$out,0x30
2818	bne		Lxts_enc6x_steal
2819	b		Lxts_enc6x_done
2820
2821.align	4
2822Lxts_enc6x_two:
2823	vxor		$out0,$in4,$twk0
2824	vxor		$out1,$in5,$twk1
2825	vxor		$out2,$out2,$out2
2826	vxor		$out3,$out3,$out3
2827	vxor		$out4,$out4,$out4
2828
2829	bl		_aesp8_xts_enc5x
2830
2831	le?vperm	$out0,$out0,$out0,$leperm
2832	vmr		$twk0,$twk2		# unused tweak
2833	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2834	le?vperm	$out1,$out1,$out1,$leperm
2835	stvx_u		$out0,$x00,$out		# store output
2836	stvx_u		$out1,$x10,$out
2837	addi		$out,$out,0x20
2838	bne		Lxts_enc6x_steal
2839	b		Lxts_enc6x_done
2840
2841.align	4
2842Lxts_enc6x_one:
2843	vxor		$out0,$in5,$twk0
2844	nop
2845Loop_xts_enc1x:
2846	vcipher		$out0,$out0,v24
2847	lvx		v24,$x20,$key_		# round[3]
2848	addi		$key_,$key_,0x20
2849
2850	vcipher		$out0,$out0,v25
2851	lvx		v25,$x10,$key_		# round[4]
2852	bdnz		Loop_xts_enc1x
2853
2854	add		$inp,$inp,$taillen
2855	cmpwi		$taillen,0
2856	vcipher		$out0,$out0,v24
2857
2858	subi		$inp,$inp,16
2859	vcipher		$out0,$out0,v25
2860
2861	lvsr		$inpperm,0,$taillen
2862	vcipher		$out0,$out0,v26
2863
2864	lvx_u		$in0,0,$inp
2865	vcipher		$out0,$out0,v27
2866
2867	addi		$key_,$sp,$FRAME+15	# rewind $key_
2868	vcipher		$out0,$out0,v28
2869	lvx		v24,$x00,$key_		# re-pre-load round[1]
2870
2871	vcipher		$out0,$out0,v29
2872	lvx		v25,$x10,$key_		# re-pre-load round[2]
2873	 vxor		$twk0,$twk0,v31
2874
2875	le?vperm	$in0,$in0,$in0,$leperm
2876	vcipher		$out0,$out0,v30
2877
2878	vperm		$in0,$in0,$in0,$inpperm
2879	vcipherlast	$out0,$out0,$twk0
2880
2881	vmr		$twk0,$twk1		# unused tweak
2882	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2883	le?vperm	$out0,$out0,$out0,$leperm
2884	stvx_u		$out0,$x00,$out		# store output
2885	addi		$out,$out,0x10
2886	bne		Lxts_enc6x_steal
2887	b		Lxts_enc6x_done
2888
2889.align	4
2890Lxts_enc6x_zero:
2891	cmpwi		$taillen,0
2892	beq		Lxts_enc6x_done
2893
2894	add		$inp,$inp,$taillen
2895	subi		$inp,$inp,16
2896	lvx_u		$in0,0,$inp
2897	lvsr		$inpperm,0,$taillen	# $in5 is no more
2898	le?vperm	$in0,$in0,$in0,$leperm
2899	vperm		$in0,$in0,$in0,$inpperm
2900	vxor		$tmp,$tmp,$twk0
2901Lxts_enc6x_steal:
2902	vxor		$in0,$in0,$twk0
2903	vxor		$out0,$out0,$out0
2904	vspltisb	$out1,-1
2905	vperm		$out0,$out0,$out1,$inpperm
2906	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2907
2908	subi		r30,$out,17
2909	subi		$out,$out,16
2910	mtctr		$taillen
2911Loop_xts_enc6x_steal:
2912	lbzu		r0,1(r30)
2913	stb		r0,16(r30)
2914	bdnz		Loop_xts_enc6x_steal
2915
2916	li		$taillen,0
2917	mtctr		$rounds
2918	b		Loop_xts_enc1x		# one more time...
2919
2920.align	4
2921Lxts_enc6x_done:
2922	${UCMP}i	$ivp,0
2923	beq		Lxts_enc6x_ret
2924
2925	vxor		$tweak,$twk0,$rndkey0
2926	le?vperm	$tweak,$tweak,$tweak,$leperm
2927	stvx_u		$tweak,0,$ivp
2928
2929Lxts_enc6x_ret:
2930	mtlr		r11
2931	li		r10,`$FRAME+15`
2932	li		r11,`$FRAME+31`
2933	stvx		$seven,r10,$sp		# wipe copies of round keys
2934	addi		r10,r10,32
2935	stvx		$seven,r11,$sp
2936	addi		r11,r11,32
2937	stvx		$seven,r10,$sp
2938	addi		r10,r10,32
2939	stvx		$seven,r11,$sp
2940	addi		r11,r11,32
2941	stvx		$seven,r10,$sp
2942	addi		r10,r10,32
2943	stvx		$seven,r11,$sp
2944	addi		r11,r11,32
2945	stvx		$seven,r10,$sp
2946	addi		r10,r10,32
2947	stvx		$seven,r11,$sp
2948	addi		r11,r11,32
2949
2950	mtspr		256,$vrsave
2951	lvx		v20,r10,$sp		# ABI says so
2952	addi		r10,r10,32
2953	lvx		v21,r11,$sp
2954	addi		r11,r11,32
2955	lvx		v22,r10,$sp
2956	addi		r10,r10,32
2957	lvx		v23,r11,$sp
2958	addi		r11,r11,32
2959	lvx		v24,r10,$sp
2960	addi		r10,r10,32
2961	lvx		v25,r11,$sp
2962	addi		r11,r11,32
2963	lvx		v26,r10,$sp
2964	addi		r10,r10,32
2965	lvx		v27,r11,$sp
2966	addi		r11,r11,32
2967	lvx		v28,r10,$sp
2968	addi		r10,r10,32
2969	lvx		v29,r11,$sp
2970	addi		r11,r11,32
2971	lvx		v30,r10,$sp
2972	lvx		v31,r11,$sp
2973	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2974	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2975	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2976	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2977	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2978	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2979	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
2980	blr
2981	.long		0
2982	.byte		0,12,0x04,1,0x80,6,6,0
2983	.long		0
2984
2985.align	5
2986_aesp8_xts_enc5x:
2987	vcipher		$out0,$out0,v24
2988	vcipher		$out1,$out1,v24
2989	vcipher		$out2,$out2,v24
2990	vcipher		$out3,$out3,v24
2991	vcipher		$out4,$out4,v24
2992	lvx		v24,$x20,$key_		# round[3]
2993	addi		$key_,$key_,0x20
2994
2995	vcipher		$out0,$out0,v25
2996	vcipher		$out1,$out1,v25
2997	vcipher		$out2,$out2,v25
2998	vcipher		$out3,$out3,v25
2999	vcipher		$out4,$out4,v25
3000	lvx		v25,$x10,$key_		# round[4]
3001	bdnz		_aesp8_xts_enc5x
3002
3003	add		$inp,$inp,$taillen
3004	cmpwi		$taillen,0
3005	vcipher		$out0,$out0,v24
3006	vcipher		$out1,$out1,v24
3007	vcipher		$out2,$out2,v24
3008	vcipher		$out3,$out3,v24
3009	vcipher		$out4,$out4,v24
3010
3011	subi		$inp,$inp,16
3012	vcipher		$out0,$out0,v25
3013	vcipher		$out1,$out1,v25
3014	vcipher		$out2,$out2,v25
3015	vcipher		$out3,$out3,v25
3016	vcipher		$out4,$out4,v25
3017	 vxor		$twk0,$twk0,v31
3018
3019	vcipher		$out0,$out0,v26
3020	lvsr		$inpperm,0,$taillen	# $in5 is no more
3021	vcipher		$out1,$out1,v26
3022	vcipher		$out2,$out2,v26
3023	vcipher		$out3,$out3,v26
3024	vcipher		$out4,$out4,v26
3025	 vxor		$in1,$twk1,v31
3026
3027	vcipher		$out0,$out0,v27
3028	lvx_u		$in0,0,$inp
3029	vcipher		$out1,$out1,v27
3030	vcipher		$out2,$out2,v27
3031	vcipher		$out3,$out3,v27
3032	vcipher		$out4,$out4,v27
3033	 vxor		$in2,$twk2,v31
3034
3035	addi		$key_,$sp,$FRAME+15	# rewind $key_
3036	vcipher		$out0,$out0,v28
3037	vcipher		$out1,$out1,v28
3038	vcipher		$out2,$out2,v28
3039	vcipher		$out3,$out3,v28
3040	vcipher		$out4,$out4,v28
3041	lvx		v24,$x00,$key_		# re-pre-load round[1]
3042	 vxor		$in3,$twk3,v31
3043
3044	vcipher		$out0,$out0,v29
3045	le?vperm	$in0,$in0,$in0,$leperm
3046	vcipher		$out1,$out1,v29
3047	vcipher		$out2,$out2,v29
3048	vcipher		$out3,$out3,v29
3049	vcipher		$out4,$out4,v29
3050	lvx		v25,$x10,$key_		# re-pre-load round[2]
3051	 vxor		$in4,$twk4,v31
3052
3053	vcipher		$out0,$out0,v30
3054	vperm		$in0,$in0,$in0,$inpperm
3055	vcipher		$out1,$out1,v30
3056	vcipher		$out2,$out2,v30
3057	vcipher		$out3,$out3,v30
3058	vcipher		$out4,$out4,v30
3059
3060	vcipherlast	$out0,$out0,$twk0
3061	vcipherlast	$out1,$out1,$in1
3062	vcipherlast	$out2,$out2,$in2
3063	vcipherlast	$out3,$out3,$in3
3064	vcipherlast	$out4,$out4,$in4
3065	blr
3066	.long		0
3067	.byte		0,12,0x14,0,0,0,0,0
3068
3069.align	5
3070_aesp8_xts_decrypt6x:
3071	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3072	mflr		r11
3073	li		r7,`$FRAME+8*16+15`
3074	li		r3,`$FRAME+8*16+31`
3075	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3076	stvx		v20,r7,$sp		# ABI says so
3077	addi		r7,r7,32
3078	stvx		v21,r3,$sp
3079	addi		r3,r3,32
3080	stvx		v22,r7,$sp
3081	addi		r7,r7,32
3082	stvx		v23,r3,$sp
3083	addi		r3,r3,32
3084	stvx		v24,r7,$sp
3085	addi		r7,r7,32
3086	stvx		v25,r3,$sp
3087	addi		r3,r3,32
3088	stvx		v26,r7,$sp
3089	addi		r7,r7,32
3090	stvx		v27,r3,$sp
3091	addi		r3,r3,32
3092	stvx		v28,r7,$sp
3093	addi		r7,r7,32
3094	stvx		v29,r3,$sp
3095	addi		r3,r3,32
3096	stvx		v30,r7,$sp
3097	stvx		v31,r3,$sp
3098	li		r0,-1
3099	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3100	li		$x10,0x10
3101	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3102	li		$x20,0x20
3103	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3104	li		$x30,0x30
3105	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3106	li		$x40,0x40
3107	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3108	li		$x50,0x50
3109	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3110	li		$x60,0x60
3111	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3112	li		$x70,0x70
3113	mtspr		256,r0
3114
3115	subi		$rounds,$rounds,3	# -4 in total
3116
3117	lvx		$rndkey0,$x00,$key1	# load key schedule
3118	lvx		v30,$x10,$key1
3119	addi		$key1,$key1,0x20
3120	lvx		v31,$x00,$key1
3121	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3122	addi		$key_,$sp,$FRAME+15
3123	mtctr		$rounds
3124
3125Load_xts_dec_key:
3126	?vperm		v24,v30,v31,$keyperm
3127	lvx		v30,$x10,$key1
3128	addi		$key1,$key1,0x20
3129	stvx		v24,$x00,$key_		# off-load round[1]
3130	?vperm		v25,v31,v30,$keyperm
3131	lvx		v31,$x00,$key1
3132	stvx		v25,$x10,$key_		# off-load round[2]
3133	addi		$key_,$key_,0x20
3134	bdnz		Load_xts_dec_key
3135
3136	lvx		v26,$x10,$key1
3137	?vperm		v24,v30,v31,$keyperm
3138	lvx		v27,$x20,$key1
3139	stvx		v24,$x00,$key_		# off-load round[3]
3140	?vperm		v25,v31,v26,$keyperm
3141	lvx		v28,$x30,$key1
3142	stvx		v25,$x10,$key_		# off-load round[4]
3143	addi		$key_,$sp,$FRAME+15	# rewind $key_
3144	?vperm		v26,v26,v27,$keyperm
3145	lvx		v29,$x40,$key1
3146	?vperm		v27,v27,v28,$keyperm
3147	lvx		v30,$x50,$key1
3148	?vperm		v28,v28,v29,$keyperm
3149	lvx		v31,$x60,$key1
3150	?vperm		v29,v29,v30,$keyperm
3151	lvx		$twk5,$x70,$key1	# borrow $twk5
3152	?vperm		v30,v30,v31,$keyperm
3153	lvx		v24,$x00,$key_		# pre-load round[1]
3154	?vperm		v31,v31,$twk5,$keyperm
3155	lvx		v25,$x10,$key_		# pre-load round[2]
3156
3157	 vperm		$in0,$inout,$inptail,$inpperm
3158	 subi		$inp,$inp,31		# undo "caller"
3159	vxor		$twk0,$tweak,$rndkey0
3160	vsrab		$tmp,$tweak,$seven	# next tweak value
3161	vaddubm		$tweak,$tweak,$tweak
3162	vsldoi		$tmp,$tmp,$tmp,15
3163	vand		$tmp,$tmp,$eighty7
3164	 vxor		$out0,$in0,$twk0
3165	vxor		$tweak,$tweak,$tmp
3166
3167	 lvx_u		$in1,$x10,$inp
3168	vxor		$twk1,$tweak,$rndkey0
3169	vsrab		$tmp,$tweak,$seven	# next tweak value
3170	vaddubm		$tweak,$tweak,$tweak
3171	vsldoi		$tmp,$tmp,$tmp,15
3172	 le?vperm	$in1,$in1,$in1,$leperm
3173	vand		$tmp,$tmp,$eighty7
3174	 vxor		$out1,$in1,$twk1
3175	vxor		$tweak,$tweak,$tmp
3176
3177	 lvx_u		$in2,$x20,$inp
3178	 andi.		$taillen,$len,15
3179	vxor		$twk2,$tweak,$rndkey0
3180	vsrab		$tmp,$tweak,$seven	# next tweak value
3181	vaddubm		$tweak,$tweak,$tweak
3182	vsldoi		$tmp,$tmp,$tmp,15
3183	 le?vperm	$in2,$in2,$in2,$leperm
3184	vand		$tmp,$tmp,$eighty7
3185	 vxor		$out2,$in2,$twk2
3186	vxor		$tweak,$tweak,$tmp
3187
3188	 lvx_u		$in3,$x30,$inp
3189	 sub		$len,$len,$taillen
3190	vxor		$twk3,$tweak,$rndkey0
3191	vsrab		$tmp,$tweak,$seven	# next tweak value
3192	vaddubm		$tweak,$tweak,$tweak
3193	vsldoi		$tmp,$tmp,$tmp,15
3194	 le?vperm	$in3,$in3,$in3,$leperm
3195	vand		$tmp,$tmp,$eighty7
3196	 vxor		$out3,$in3,$twk3
3197	vxor		$tweak,$tweak,$tmp
3198
3199	 lvx_u		$in4,$x40,$inp
3200	 subi		$len,$len,0x60
3201	vxor		$twk4,$tweak,$rndkey0
3202	vsrab		$tmp,$tweak,$seven	# next tweak value
3203	vaddubm		$tweak,$tweak,$tweak
3204	vsldoi		$tmp,$tmp,$tmp,15
3205	 le?vperm	$in4,$in4,$in4,$leperm
3206	vand		$tmp,$tmp,$eighty7
3207	 vxor		$out4,$in4,$twk4
3208	vxor		$tweak,$tweak,$tmp
3209
3210	 lvx_u		$in5,$x50,$inp
3211	 addi		$inp,$inp,0x60
3212	vxor		$twk5,$tweak,$rndkey0
3213	vsrab		$tmp,$tweak,$seven	# next tweak value
3214	vaddubm		$tweak,$tweak,$tweak
3215	vsldoi		$tmp,$tmp,$tmp,15
3216	 le?vperm	$in5,$in5,$in5,$leperm
3217	vand		$tmp,$tmp,$eighty7
3218	 vxor		$out5,$in5,$twk5
3219	vxor		$tweak,$tweak,$tmp
3220
3221	vxor		v31,v31,$rndkey0
3222	mtctr		$rounds
3223	b		Loop_xts_dec6x
3224
3225.align	5
3226Loop_xts_dec6x:
3227	vncipher	$out0,$out0,v24
3228	vncipher	$out1,$out1,v24
3229	vncipher	$out2,$out2,v24
3230	vncipher	$out3,$out3,v24
3231	vncipher	$out4,$out4,v24
3232	vncipher	$out5,$out5,v24
3233	lvx		v24,$x20,$key_		# round[3]
3234	addi		$key_,$key_,0x20
3235
3236	vncipher	$out0,$out0,v25
3237	vncipher	$out1,$out1,v25
3238	vncipher	$out2,$out2,v25
3239	vncipher	$out3,$out3,v25
3240	vncipher	$out4,$out4,v25
3241	vncipher	$out5,$out5,v25
3242	lvx		v25,$x10,$key_		# round[4]
3243	bdnz		Loop_xts_dec6x
3244
3245	subic		$len,$len,96		# $len-=96
3246	 vxor		$in0,$twk0,v31		# xor with last round key
3247	vncipher	$out0,$out0,v24
3248	vncipher	$out1,$out1,v24
3249	 vsrab		$tmp,$tweak,$seven	# next tweak value
3250	 vxor		$twk0,$tweak,$rndkey0
3251	 vaddubm	$tweak,$tweak,$tweak
3252	vncipher	$out2,$out2,v24
3253	vncipher	$out3,$out3,v24
3254	 vsldoi		$tmp,$tmp,$tmp,15
3255	vncipher	$out4,$out4,v24
3256	vncipher	$out5,$out5,v24
3257
3258	subfe.		r0,r0,r0		# borrow?-1:0
3259	 vand		$tmp,$tmp,$eighty7
3260	vncipher	$out0,$out0,v25
3261	vncipher	$out1,$out1,v25
3262	 vxor		$tweak,$tweak,$tmp
3263	vncipher	$out2,$out2,v25
3264	vncipher	$out3,$out3,v25
3265	 vxor		$in1,$twk1,v31
3266	 vsrab		$tmp,$tweak,$seven	# next tweak value
3267	 vxor		$twk1,$tweak,$rndkey0
3268	vncipher	$out4,$out4,v25
3269	vncipher	$out5,$out5,v25
3270
3271	and		r0,r0,$len
3272	 vaddubm	$tweak,$tweak,$tweak
3273	 vsldoi		$tmp,$tmp,$tmp,15
3274	vncipher	$out0,$out0,v26
3275	vncipher	$out1,$out1,v26
3276	 vand		$tmp,$tmp,$eighty7
3277	vncipher	$out2,$out2,v26
3278	vncipher	$out3,$out3,v26
3279	 vxor		$tweak,$tweak,$tmp
3280	vncipher	$out4,$out4,v26
3281	vncipher	$out5,$out5,v26
3282
3283	add		$inp,$inp,r0		# $inp is adjusted in such a
3284						# way that at exit from the
3285						# loop inX-in5 are loaded
3286						# with last "words"
3287	 vxor		$in2,$twk2,v31
3288	 vsrab		$tmp,$tweak,$seven	# next tweak value
3289	 vxor		$twk2,$tweak,$rndkey0
3290	 vaddubm	$tweak,$tweak,$tweak
3291	vncipher	$out0,$out0,v27
3292	vncipher	$out1,$out1,v27
3293	 vsldoi		$tmp,$tmp,$tmp,15
3294	vncipher	$out2,$out2,v27
3295	vncipher	$out3,$out3,v27
3296	 vand		$tmp,$tmp,$eighty7
3297	vncipher	$out4,$out4,v27
3298	vncipher	$out5,$out5,v27
3299
3300	addi		$key_,$sp,$FRAME+15	# rewind $key_
3301	 vxor		$tweak,$tweak,$tmp
3302	vncipher	$out0,$out0,v28
3303	vncipher	$out1,$out1,v28
3304	 vxor		$in3,$twk3,v31
3305	 vsrab		$tmp,$tweak,$seven	# next tweak value
3306	 vxor		$twk3,$tweak,$rndkey0
3307	vncipher	$out2,$out2,v28
3308	vncipher	$out3,$out3,v28
3309	 vaddubm	$tweak,$tweak,$tweak
3310	 vsldoi		$tmp,$tmp,$tmp,15
3311	vncipher	$out4,$out4,v28
3312	vncipher	$out5,$out5,v28
3313	lvx		v24,$x00,$key_		# re-pre-load round[1]
3314	 vand		$tmp,$tmp,$eighty7
3315
3316	vncipher	$out0,$out0,v29
3317	vncipher	$out1,$out1,v29
3318	 vxor		$tweak,$tweak,$tmp
3319	vncipher	$out2,$out2,v29
3320	vncipher	$out3,$out3,v29
3321	 vxor		$in4,$twk4,v31
3322	 vsrab		$tmp,$tweak,$seven	# next tweak value
3323	 vxor		$twk4,$tweak,$rndkey0
3324	vncipher	$out4,$out4,v29
3325	vncipher	$out5,$out5,v29
3326	lvx		v25,$x10,$key_		# re-pre-load round[2]
3327	 vaddubm	$tweak,$tweak,$tweak
3328	 vsldoi		$tmp,$tmp,$tmp,15
3329
3330	vncipher	$out0,$out0,v30
3331	vncipher	$out1,$out1,v30
3332	 vand		$tmp,$tmp,$eighty7
3333	vncipher	$out2,$out2,v30
3334	vncipher	$out3,$out3,v30
3335	 vxor		$tweak,$tweak,$tmp
3336	vncipher	$out4,$out4,v30
3337	vncipher	$out5,$out5,v30
3338	 vxor		$in5,$twk5,v31
3339	 vsrab		$tmp,$tweak,$seven	# next tweak value
3340	 vxor		$twk5,$tweak,$rndkey0
3341
3342	vncipherlast	$out0,$out0,$in0
3343	 lvx_u		$in0,$x00,$inp		# load next input block
3344	 vaddubm	$tweak,$tweak,$tweak
3345	 vsldoi		$tmp,$tmp,$tmp,15
3346	vncipherlast	$out1,$out1,$in1
3347	 lvx_u		$in1,$x10,$inp
3348	vncipherlast	$out2,$out2,$in2
3349	 le?vperm	$in0,$in0,$in0,$leperm
3350	 lvx_u		$in2,$x20,$inp
3351	 vand		$tmp,$tmp,$eighty7
3352	vncipherlast	$out3,$out3,$in3
3353	 le?vperm	$in1,$in1,$in1,$leperm
3354	 lvx_u		$in3,$x30,$inp
3355	vncipherlast	$out4,$out4,$in4
3356	 le?vperm	$in2,$in2,$in2,$leperm
3357	 lvx_u		$in4,$x40,$inp
3358	 vxor		$tweak,$tweak,$tmp
3359	vncipherlast	$out5,$out5,$in5
3360	 le?vperm	$in3,$in3,$in3,$leperm
3361	 lvx_u		$in5,$x50,$inp
3362	 addi		$inp,$inp,0x60
3363	 le?vperm	$in4,$in4,$in4,$leperm
3364	 le?vperm	$in5,$in5,$in5,$leperm
3365
3366	le?vperm	$out0,$out0,$out0,$leperm
3367	le?vperm	$out1,$out1,$out1,$leperm
3368	stvx_u		$out0,$x00,$out		# store output
3369	 vxor		$out0,$in0,$twk0
3370	le?vperm	$out2,$out2,$out2,$leperm
3371	stvx_u		$out1,$x10,$out
3372	 vxor		$out1,$in1,$twk1
3373	le?vperm	$out3,$out3,$out3,$leperm
3374	stvx_u		$out2,$x20,$out
3375	 vxor		$out2,$in2,$twk2
3376	le?vperm	$out4,$out4,$out4,$leperm
3377	stvx_u		$out3,$x30,$out
3378	 vxor		$out3,$in3,$twk3
3379	le?vperm	$out5,$out5,$out5,$leperm
3380	stvx_u		$out4,$x40,$out
3381	 vxor		$out4,$in4,$twk4
3382	stvx_u		$out5,$x50,$out
3383	 vxor		$out5,$in5,$twk5
3384	addi		$out,$out,0x60
3385
3386	mtctr		$rounds
3387	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3388
3389	addic.		$len,$len,0x60
3390	beq		Lxts_dec6x_zero
3391	cmpwi		$len,0x20
3392	blt		Lxts_dec6x_one
3393	nop
3394	beq		Lxts_dec6x_two
3395	cmpwi		$len,0x40
3396	blt		Lxts_dec6x_three
3397	nop
3398	beq		Lxts_dec6x_four
3399
3400Lxts_dec6x_five:
3401	vxor		$out0,$in1,$twk0
3402	vxor		$out1,$in2,$twk1
3403	vxor		$out2,$in3,$twk2
3404	vxor		$out3,$in4,$twk3
3405	vxor		$out4,$in5,$twk4
3406
3407	bl		_aesp8_xts_dec5x
3408
3409	le?vperm	$out0,$out0,$out0,$leperm
3410	vmr		$twk0,$twk5		# unused tweak
3411	vxor		$twk1,$tweak,$rndkey0
3412	le?vperm	$out1,$out1,$out1,$leperm
3413	stvx_u		$out0,$x00,$out		# store output
3414	vxor		$out0,$in0,$twk1
3415	le?vperm	$out2,$out2,$out2,$leperm
3416	stvx_u		$out1,$x10,$out
3417	le?vperm	$out3,$out3,$out3,$leperm
3418	stvx_u		$out2,$x20,$out
3419	le?vperm	$out4,$out4,$out4,$leperm
3420	stvx_u		$out3,$x30,$out
3421	stvx_u		$out4,$x40,$out
3422	addi		$out,$out,0x50
3423	bne		Lxts_dec6x_steal
3424	b		Lxts_dec6x_done
3425
3426.align	4
3427Lxts_dec6x_four:
3428	vxor		$out0,$in2,$twk0
3429	vxor		$out1,$in3,$twk1
3430	vxor		$out2,$in4,$twk2
3431	vxor		$out3,$in5,$twk3
3432	vxor		$out4,$out4,$out4
3433
3434	bl		_aesp8_xts_dec5x
3435
3436	le?vperm	$out0,$out0,$out0,$leperm
3437	vmr		$twk0,$twk4		# unused tweak
3438	vmr		$twk1,$twk5
3439	le?vperm	$out1,$out1,$out1,$leperm
3440	stvx_u		$out0,$x00,$out		# store output
3441	vxor		$out0,$in0,$twk5
3442	le?vperm	$out2,$out2,$out2,$leperm
3443	stvx_u		$out1,$x10,$out
3444	le?vperm	$out3,$out3,$out3,$leperm
3445	stvx_u		$out2,$x20,$out
3446	stvx_u		$out3,$x30,$out
3447	addi		$out,$out,0x40
3448	bne		Lxts_dec6x_steal
3449	b		Lxts_dec6x_done
3450
3451.align	4
3452Lxts_dec6x_three:
3453	vxor		$out0,$in3,$twk0
3454	vxor		$out1,$in4,$twk1
3455	vxor		$out2,$in5,$twk2
3456	vxor		$out3,$out3,$out3
3457	vxor		$out4,$out4,$out4
3458
3459	bl		_aesp8_xts_dec5x
3460
3461	le?vperm	$out0,$out0,$out0,$leperm
3462	vmr		$twk0,$twk3		# unused tweak
3463	vmr		$twk1,$twk4
3464	le?vperm	$out1,$out1,$out1,$leperm
3465	stvx_u		$out0,$x00,$out		# store output
3466	vxor		$out0,$in0,$twk4
3467	le?vperm	$out2,$out2,$out2,$leperm
3468	stvx_u		$out1,$x10,$out
3469	stvx_u		$out2,$x20,$out
3470	addi		$out,$out,0x30
3471	bne		Lxts_dec6x_steal
3472	b		Lxts_dec6x_done
3473
3474.align	4
3475Lxts_dec6x_two:
3476	vxor		$out0,$in4,$twk0
3477	vxor		$out1,$in5,$twk1
3478	vxor		$out2,$out2,$out2
3479	vxor		$out3,$out3,$out3
3480	vxor		$out4,$out4,$out4
3481
3482	bl		_aesp8_xts_dec5x
3483
3484	le?vperm	$out0,$out0,$out0,$leperm
3485	vmr		$twk0,$twk2		# unused tweak
3486	vmr		$twk1,$twk3
3487	le?vperm	$out1,$out1,$out1,$leperm
3488	stvx_u		$out0,$x00,$out		# store output
3489	vxor		$out0,$in0,$twk3
3490	stvx_u		$out1,$x10,$out
3491	addi		$out,$out,0x20
3492	bne		Lxts_dec6x_steal
3493	b		Lxts_dec6x_done
3494
3495.align	4
3496Lxts_dec6x_one:
3497	vxor		$out0,$in5,$twk0
3498	nop
3499Loop_xts_dec1x:
3500	vncipher	$out0,$out0,v24
3501	lvx		v24,$x20,$key_		# round[3]
3502	addi		$key_,$key_,0x20
3503
3504	vncipher	$out0,$out0,v25
3505	lvx		v25,$x10,$key_		# round[4]
3506	bdnz		Loop_xts_dec1x
3507
3508	subi		r0,$taillen,1
3509	vncipher	$out0,$out0,v24
3510
3511	andi.		r0,r0,16
3512	cmpwi		$taillen,0
3513	vncipher	$out0,$out0,v25
3514
3515	sub		$inp,$inp,r0
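	# r0 is 16 only when $taillen is zero; in that case $inp backs
	# up one block so the load below stays inside the input buffer
	# (the loaded data is consumed only on the stealing path)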
3516	vncipher	$out0,$out0,v26
3517
3518	lvx_u		$in0,0,$inp
3519	vncipher	$out0,$out0,v27
3520
3521	addi		$key_,$sp,$FRAME+15	# rewind $key_
3522	vncipher	$out0,$out0,v28
3523	lvx		v24,$x00,$key_		# re-pre-load round[1]
3524
3525	vncipher	$out0,$out0,v29
3526	lvx		v25,$x10,$key_		# re-pre-load round[2]
3527	 vxor		$twk0,$twk0,v31
3528
3529	le?vperm	$in0,$in0,$in0,$leperm
3530	vncipher	$out0,$out0,v30
3531
3532	mtctr		$rounds
3533	vncipherlast	$out0,$out0,$twk0
3534
3535	vmr		$twk0,$twk1		# unused tweak
3536	vmr		$twk1,$twk2
3537	le?vperm	$out0,$out0,$out0,$leperm
3538	stvx_u		$out0,$x00,$out		# store output
3539	addi		$out,$out,0x10
3540	vxor		$out0,$in0,$twk2
3541	bne		Lxts_dec6x_steal
3542	b		Lxts_dec6x_done
3543
3544.align	4
3545Lxts_dec6x_zero:
3546	cmpwi		$taillen,0
3547	beq		Lxts_dec6x_done
3548
3549	lvx_u		$in0,0,$inp
3550	le?vperm	$in0,$in0,$in0,$leperm
3551	vxor		$out0,$in0,$twk1
3552Lxts_dec6x_steal:
3553	vncipher	$out0,$out0,v24
3554	lvx		v24,$x20,$key_		# round[3]
3555	addi		$key_,$key_,0x20
3556
3557	vncipher	$out0,$out0,v25
3558	lvx		v25,$x10,$key_		# round[4]
3559	bdnz		Lxts_dec6x_steal
3560
3561	add		$inp,$inp,$taillen
3562	vncipher	$out0,$out0,v24
3563
3564	cmpwi		$taillen,0
3565	vncipher	$out0,$out0,v25
3566
3567	lvx_u		$in0,0,$inp
3568	vncipher	$out0,$out0,v26
3569
3570	lvsr		$inpperm,0,$taillen	# $in5 is no more
3571	vncipher	$out0,$out0,v27
3572
3573	addi		$key_,$sp,$FRAME+15	# rewind $key_
3574	vncipher	$out0,$out0,v28
3575	lvx		v24,$x00,$key_		# re-pre-load round[1]
3576
3577	vncipher	$out0,$out0,v29
3578	lvx		v25,$x10,$key_		# re-pre-load round[2]
3579	 vxor		$twk1,$twk1,v31
3580
3581	le?vperm	$in0,$in0,$in0,$leperm
3582	vncipher	$out0,$out0,v30
3583
3584	vperm		$in0,$in0,$in0,$inpperm
3585	vncipherlast	$tmp,$out0,$twk1
3586
3587	le?vperm	$out0,$tmp,$tmp,$leperm
3588	le?stvx_u	$out0,0,$out
3589	be?stvx_u	$tmp,0,$out
3590
3591	vxor		$out0,$out0,$out0
3592	vspltisb	$out1,-1
3593	vperm		$out0,$out0,$out1,$inpperm
3594	vsel		$out0,$in0,$tmp,$out0
3595	vxor		$out0,$out0,$twk0
3596
3597	subi		r30,$out,1
3598	mtctr		$taillen
3599Loop_xts_dec6x_steal:
3600	lbzu		r0,1(r30)
3601	stb		r0,16(r30)
3602	bdnz		Loop_xts_dec6x_steal
3603
3604	li		$taillen,0
3605	mtctr		$rounds
3606	b		Loop_xts_dec1x		# one more time...
3607
3608.align	4
3609Lxts_dec6x_done:
3610	${UCMP}i	$ivp,0
3611	beq		Lxts_dec6x_ret
3612
3613	vxor		$tweak,$twk0,$rndkey0
3614	le?vperm	$tweak,$tweak,$tweak,$leperm
3615	stvx_u		$tweak,0,$ivp
3616
3617Lxts_dec6x_ret:
3618	mtlr		r11
3619	li		r10,`$FRAME+15`
3620	li		r11,`$FRAME+31`
3621	stvx		$seven,r10,$sp		# wipe copies of round keys
3622	addi		r10,r10,32
3623	stvx		$seven,r11,$sp
3624	addi		r11,r11,32
3625	stvx		$seven,r10,$sp
3626	addi		r10,r10,32
3627	stvx		$seven,r11,$sp
3628	addi		r11,r11,32
3629	stvx		$seven,r10,$sp
3630	addi		r10,r10,32
3631	stvx		$seven,r11,$sp
3632	addi		r11,r11,32
3633	stvx		$seven,r10,$sp
3634	addi		r10,r10,32
3635	stvx		$seven,r11,$sp
3636	addi		r11,r11,32
3637
3638	mtspr		256,$vrsave
3639	lvx		v20,r10,$sp		# ABI says so
3640	addi		r10,r10,32
3641	lvx		v21,r11,$sp
3642	addi		r11,r11,32
3643	lvx		v22,r10,$sp
3644	addi		r10,r10,32
3645	lvx		v23,r11,$sp
3646	addi		r11,r11,32
3647	lvx		v24,r10,$sp
3648	addi		r10,r10,32
3649	lvx		v25,r11,$sp
3650	addi		r11,r11,32
3651	lvx		v26,r10,$sp
3652	addi		r10,r10,32
3653	lvx		v27,r11,$sp
3654	addi		r11,r11,32
3655	lvx		v28,r10,$sp
3656	addi		r10,r10,32
3657	lvx		v29,r11,$sp
3658	addi		r11,r11,32
3659	lvx		v30,r10,$sp
3660	lvx		v31,r11,$sp
3661	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3662	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3663	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3664	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3665	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3666	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3667	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3668	blr
3669	.long		0
3670	.byte		0,12,0x04,1,0x80,6,6,0
3671	.long		0
3672
3673.align	5
3674_aesp8_xts_dec5x:
3675	vncipher	$out0,$out0,v24
3676	vncipher	$out1,$out1,v24
3677	vncipher	$out2,$out2,v24
3678	vncipher	$out3,$out3,v24
3679	vncipher	$out4,$out4,v24
3680	lvx		v24,$x20,$key_		# round[3]
3681	addi		$key_,$key_,0x20
3682
3683	vncipher	$out0,$out0,v25
3684	vncipher	$out1,$out1,v25
3685	vncipher	$out2,$out2,v25
3686	vncipher	$out3,$out3,v25
3687	vncipher	$out4,$out4,v25
3688	lvx		v25,$x10,$key_		# round[4]
3689	bdnz		_aesp8_xts_dec5x
3690
3691	subi		r0,$taillen,1
3692	vncipher	$out0,$out0,v24
3693	vncipher	$out1,$out1,v24
3694	vncipher	$out2,$out2,v24
3695	vncipher	$out3,$out3,v24
3696	vncipher	$out4,$out4,v24
3697
3698	andi.		r0,r0,16
3699	cmpwi		$taillen,0
3700	vncipher	$out0,$out0,v25
3701	vncipher	$out1,$out1,v25
3702	vncipher	$out2,$out2,v25
3703	vncipher	$out3,$out3,v25
3704	vncipher	$out4,$out4,v25
3705	 vxor		$twk0,$twk0,v31
3706
3707	sub		$inp,$inp,r0
3708	vncipher	$out0,$out0,v26
3709	vncipher	$out1,$out1,v26
3710	vncipher	$out2,$out2,v26
3711	vncipher	$out3,$out3,v26
3712	vncipher	$out4,$out4,v26
3713	 vxor		$in1,$twk1,v31
3714
3715	vncipher	$out0,$out0,v27
3716	lvx_u		$in0,0,$inp
3717	vncipher	$out1,$out1,v27
3718	vncipher	$out2,$out2,v27
3719	vncipher	$out3,$out3,v27
3720	vncipher	$out4,$out4,v27
3721	 vxor		$in2,$twk2,v31
3722
3723	addi		$key_,$sp,$FRAME+15	# rewind $key_
3724	vncipher	$out0,$out0,v28
3725	vncipher	$out1,$out1,v28
3726	vncipher	$out2,$out2,v28
3727	vncipher	$out3,$out3,v28
3728	vncipher	$out4,$out4,v28
3729	lvx		v24,$x00,$key_		# re-pre-load round[1]
3730	 vxor		$in3,$twk3,v31
3731
3732	vncipher	$out0,$out0,v29
3733	le?vperm	$in0,$in0,$in0,$leperm
3734	vncipher	$out1,$out1,v29
3735	vncipher	$out2,$out2,v29
3736	vncipher	$out3,$out3,v29
3737	vncipher	$out4,$out4,v29
3738	lvx		v25,$x10,$key_		# re-pre-load round[2]
3739	 vxor		$in4,$twk4,v31
3740
3741	vncipher	$out0,$out0,v30
3742	vncipher	$out1,$out1,v30
3743	vncipher	$out2,$out2,v30
3744	vncipher	$out3,$out3,v30
3745	vncipher	$out4,$out4,v30
3746
3747	vncipherlast	$out0,$out0,$twk0
3748	vncipherlast	$out1,$out1,$in1
3749	vncipherlast	$out2,$out2,$in2
3750	vncipherlast	$out3,$out3,$in3
3751	vncipherlast	$out4,$out4,$in4
3752	mtctr		$rounds
3753	blr
3754	.long		0
3755	.byte		0,12,0x14,0,0,0,0,0
3756___
3757}}	}}}
3758
3759my $consts=1;
3760foreach(split("\n",$code)) {
3761        s/\`([^\`]*)\`/eval($1)/geo;
3762
3763	# constants table endian-specific conversion
3764	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3765	    my $conv=$3;
3766	    my @bytes=();
3767
3768	    # convert to endian-agnostic format
3769	    if ($1 eq "long") {
3770	      foreach (split(/,\s*/,$2)) {
3771		my $l = /^0/?oct:int;
3772		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3773	      }
3774	    } else {
3775		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3776	    }
3777
3778	    # little-endian conversion
3779	    if ($flavour =~ /le$/o) {
3780		SWITCH: for($conv)  {
3781		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
3782		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
3783		}
3784	    }
3785
3786	    #emit
3787	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3788	    next;
3789	}
3790	$consts=0 if (m/Lconsts:/o);	# end of table
3791
3792	# instructions prefixed with '?' are endian-specific and need
3793	# to be adjusted accordingly...
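	# e.g. on little-endian "?vperm vD,vA,vB,vC" is emitted as
	# "vperm vD,vB,vA,vC" (source operands swapped) and a prefixed
	# lvsl turns into lvsr; on big-endian the '?' marker is dropped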
3794	if ($flavour =~ /le$/o) {	# little-endian
3795	    s/le\?//o		or
3796	    s/be\?/#be#/o	or
3797	    s/\?lvsr/lvsl/o	or
3798	    s/\?lvsl/lvsr/o	or
3799	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3800	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3801	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3802	} else {			# big-endian
3803	    s/le\?/#le#/o	or
3804	    s/be\?//o		or
3805	    s/\?([a-z]+)/$1/o;
3806	}
3807
3808        print $_,"\n";
3809}
3810
3811close STDOUT or die "error closing STDOUT: $!";
3812