# xref: /freebsd/crypto/openssl/crypto/aes/asm/aesp8-ppc.pl (revision 562894f0dc310f658284863ff329906e7737a0a0)
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by POWER8 processor.
# The module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies MSR.VSX flag being
# set. It should also be noted that ISA specification doesn't prohibit
# alignment exceptions for these instructions on page boundaries.
# Initially alignment was handled in pure AltiVec/VMX way [when data
# is aligned programmatically, which in turn guarantees exception-
# free execution], but it turned out to hamper performance when vcipher
# instructions are interleaved. It's reckoned that eventual
# misalignment penalties at page boundaries are on average lower
# than additional overhead in pure AltiVec approach.
#
# May 2016
#
# Add XTS subroutine; 9x improvement on little-endian and 12x on
# big-endian systems was measured.
#
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS
# POWER8[le]	3.96/0.72	0.74	1.1
# POWER8[be]	3.75/0.65	0.66	1.0
# POWER9[le]	4.02/0.86	0.84	1.05
# POWER9[be]	3.99/0.78	0.79	0.97

# ----------------------------------------------------------------------
# Script setup.
#
# Command line: <flavour> [<output>]
#   <flavour>  must contain "64" or "32" (selects the word size and the
#              matching load/store/compare/shift mnemonics); a trailing
#              "le" selects little-endian.
#   <output>   passed through verbatim as the last argument of
#              ppc-xlate.pl, which receives everything this script
#              prints on STDOUT.
#
# All settings are deliberately package globals: the code-generating
# sections further down interpolate them into assembly heredocs.
# ----------------------------------------------------------------------
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

# Non-zero (== $SIZE_T) for little-endian flavours, 0 otherwise.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate ppc-xlate.pl next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything we print through the translator.  The call is
# parenthesised and checked with low-precedence "or": the previous
# `open STDOUT,"...".shift || die ...` bound `||` to `shift`, so a
# failing open() was silently ignored (and a missing <output> argument
# was misreported as "can't call").
open(STDOUT, "| $^X $xlate $flavour " . shift(@ARGV))
	or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;		# minimal stack frame, in bytes
$prefix="aes_p8";		# prefix of every exported symbol

$sp="r1";
$vrsave="r12";

#########################################################################
{{{	# Key setup procedures						#
#
# Appends to $code:
#   rcon / Lconsts
#	constant table and a helper that returns the table address in
#	$ptr (position-independent, via bcl/mflr).
#   .${prefix}_set_encrypt_key(inp=r3, bits=r4, out=r5)
#	returns -1 in r3 if inp or out is 0, -2 if bits is below 128,
#	above 256 or not a multiple of 64; otherwise expands the key
#	through $out (handling unaligned inp/out), stores the round
#	count (10, 12 or 14) via "stw $rounds,0($out)" and returns 0.
#   .${prefix}_set_decrypt_key(inp, bits, out)
#	calls Lset_encrypt_key and, on success, swaps the round keys
#	end-for-end in place (Ldeckey loop); returns 0 or the error
#	code from the encrypt-key setup.
#
# GPR/VR assignments below are interpolated into the heredoc.
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr	 #vvvvv "distance between . and rcon
	addi	$ptr,$ptr,-0x48
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.		r0,$bits,0x3f
	bne-		Lenc_key_abort

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	bl		Lconsts
	mtlr		r11

	neg		r9,$inp
	lvx		$in0,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	lvsr		$key,0,r9		# borrow $key
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li		$cnt,8
	vxor		$zero,$zero,$zero
	mtctr		$cnt

	?lvsr		$outperm,0,$out
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$zero,$outmask,$outperm

	blt		Loop128
	addi		$inp,$inp,8
	beq		L192
	addi		$inp,$inp,8
	b		L256

.align	4
Loop128:
	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	bdnz		Loop128

	lvx		$rcon,0,$ptr		# last two round keys

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out

	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,0x50

	li		$rounds,10
	b		Ldone

.align	4
L192:
	lvx		$tmp,0,$inp
	li		$cnt,4
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8			# borrow $key
	mtctr		$cnt
	vsububm		$mask,$mask,$key	# adjust the mask

Loop192:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp

	 vsldoi		$stage,$zero,$in1,8
	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vsldoi		$stage,$stage,$in0,8

	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	 vsldoi		$stage,$in0,$in1,8
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdnz		Loop192

	li		$rounds,12
	addi		$out,$out,0x20
	b		Ldone

.align	4
L256:
	lvx		$tmp,0,$inp
	li		$cnt,7
	li		$rounds,14
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr		$cnt

Loop256:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in1,$in1,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdz		Ldone

	vspltw		$key,$in0,3		# just splat
	vsldoi		$tmp,$zero,$in1,12	# >>32
	vsbox		$key,$key

	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp

	vxor		$in1,$in1,$key
	b		Loop256

.align	4
Ldone:
	lvx		$in1,0,$inp		# redundant in aligned case
	vsel		$in1,$outhead,$in1,$outmask
	stvx		$in1,0,$inp
	li		$ptr,0
	mtspr		256,$vrsave
	stw		$rounds,0($out)

Lenc_key_abort:
	mr		r3,$ptr
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl	.${prefix}_set_decrypt_key
.align	5
.${prefix}_set_decrypt_key:
	$STU		$sp,-$FRAME($sp)
	mflr		r10
	$PUSH		r10,$FRAME+$LRSAVE($sp)
	bl		Lset_encrypt_key
	mtlr		r10

	cmpwi		r3,0
	bne-		Ldec_key_abort

	slwi		$cnt,$rounds,4
	subi		$inp,$out,240		# first round key
	srwi		$rounds,$rounds,1
	add		$out,$inp,$cnt		# last round key
	mtctr		$rounds

Ldeckey:
	lwz		r0, 0($inp)
	lwz		r6, 4($inp)
	lwz		r7, 8($inp)
	lwz		r8, 12($inp)
	addi		$inp,$inp,16
	lwz		r9, 0($out)
	lwz		r10,4($out)
	lwz		r11,8($out)
	lwz		r12,12($out)
	stw		r0, 0($out)
	stw		r6, 4($out)
	stw		r7, 8($out)
	stw		r8, 12($out)
	subi		$out,$out,16
	stw		r9, -16($inp)
	stw		r10,-12($inp)
	stw		r11,-8($inp)
	stw		r12,-4($inp)
	bdnz		Ldeckey

	xor		r3,r3,r3		# return value
Ldec_key_abort:
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
#
# gen_block($dir)
#
# Appends the single-block routine .${prefix}_${dir}crypt to $code.
#   $dir  "en" emits the routine with vcipher/vcipherlast,
#         "de" emits the same skeleton with vncipher/vncipherlast.
#
# The generated routine takes inp=r3, out=r4, key=r5, reads the round
# count from offset 240 of the key schedule, and copes with unaligned
# input, output and key via lvsl/vperm.
#
# Declared without a prototype: the old empty "()" prototype claimed
# the sub takes no arguments and only worked because every call went
# through "&", which bypasses prototypes.  Existing &gen_block(...)
# calls keep working.
sub gen_block {
my ($dir) = @_;
my $n   = $dir eq "de" ? "n" : "";	# selects v{n}cipher mnemonics
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
501#########################################################################
502{{{	# CBC en- and decrypt procedures				#
# Appends .${prefix}_cbc_encrypt to $code.
#
# Generated routine arguments: inp=r3, out=r4, len=r5, key=r6, ivp=r7,
# enc=r8.  It returns immediately when len < 16.  enc != 0 runs the
# one-block-at-a-time Lcbc_enc loop; enc == 0 runs Lcbc_dec, which
# hands off to _aesp8_cbc_decrypt8x when len >= 128.  Both one-block
# paths finish in Lcbc_done, which flushes the pending output tail and
# writes the updated iv back through $ivp (unaligned-safe).
#
# NOTE: these lexicals are also interpolated by the optimized 8x
# decrypt section that follows in the same scope, so their names and
# register numbers must not change.
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));
$code.=<<___;
.globl	.${prefix}_cbc_encrypt
.align	5
.${prefix}_cbc_encrypt:
	${UCMP}i	$len,16
	bltlr-

	cmpwi		$enc,0			# test direction
	lis		r0,0xffe0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1
	beq		Lcbc_dec

Lcbc_enc:
	vmr		$inout,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$inout,$inout,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	vxor		$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_enc

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm		$tmp,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_enc

	b		Lcbc_done

.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge		_aesp8_cbc_decrypt8x
	vmr		$tmp,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$tmp,$tmp,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$tmp,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16

Loop_cbc_dec:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_dec

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor		$inout,$inout,$ivec
	vmr		$ivec,$tmp
	vperm		$tmp,$inout,$inout,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_dec

Lcbc_done:
	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$inout,0,$ivp
	vsel		$inout,$ivec,$inptail,$outmask
	stvx		$inout,$idx,$ivp

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
669#########################################################################
670{{	# Optimized CBC decrypt procedure				#
671my $key_="r11";
672my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
673    $x00=0 if ($flavour =~ /osx/);
674my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
675my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
677			# v26-v31 last 6 round keys
678my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
679
680$code.=<<___;
681.align	5
682_aesp8_cbc_decrypt8x:
683	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
684	li		r10,`$FRAME+8*16+15`
685	li		r11,`$FRAME+8*16+31`
686	stvx		v20,r10,$sp		# ABI says so
687	addi		r10,r10,32
688	stvx		v21,r11,$sp
689	addi		r11,r11,32
690	stvx		v22,r10,$sp
691	addi		r10,r10,32
692	stvx		v23,r11,$sp
693	addi		r11,r11,32
694	stvx		v24,r10,$sp
695	addi		r10,r10,32
696	stvx		v25,r11,$sp
697	addi		r11,r11,32
698	stvx		v26,r10,$sp
699	addi		r10,r10,32
700	stvx		v27,r11,$sp
701	addi		r11,r11,32
702	stvx		v28,r10,$sp
703	addi		r10,r10,32
704	stvx		v29,r11,$sp
705	addi		r11,r11,32
706	stvx		v30,r10,$sp
707	stvx		v31,r11,$sp
708	li		r0,-1
709	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
710	li		$x10,0x10
711	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
712	li		$x20,0x20
713	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
714	li		$x30,0x30
715	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
716	li		$x40,0x40
717	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
718	li		$x50,0x50
719	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
720	li		$x60,0x60
721	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
722	li		$x70,0x70
723	mtspr		256,r0
724
725	subi		$rounds,$rounds,3	# -4 in total
726	subi		$len,$len,128		# bias
727
728	lvx		$rndkey0,$x00,$key	# load key schedule
729	lvx		v30,$x10,$key
730	addi		$key,$key,0x20
731	lvx		v31,$x00,$key
732	?vperm		$rndkey0,$rndkey0,v30,$keyperm
733	addi		$key_,$sp,$FRAME+15
734	mtctr		$rounds
735
736Load_cbc_dec_key:
737	?vperm		v24,v30,v31,$keyperm
738	lvx		v30,$x10,$key
739	addi		$key,$key,0x20
740	stvx		v24,$x00,$key_		# off-load round[1]
741	?vperm		v25,v31,v30,$keyperm
742	lvx		v31,$x00,$key
743	stvx		v25,$x10,$key_		# off-load round[2]
744	addi		$key_,$key_,0x20
745	bdnz		Load_cbc_dec_key
746
747	lvx		v26,$x10,$key
748	?vperm		v24,v30,v31,$keyperm
749	lvx		v27,$x20,$key
750	stvx		v24,$x00,$key_		# off-load round[3]
751	?vperm		v25,v31,v26,$keyperm
752	lvx		v28,$x30,$key
753	stvx		v25,$x10,$key_		# off-load round[4]
754	addi		$key_,$sp,$FRAME+15	# rewind $key_
755	?vperm		v26,v26,v27,$keyperm
756	lvx		v29,$x40,$key
757	?vperm		v27,v27,v28,$keyperm
758	lvx		v30,$x50,$key
759	?vperm		v28,v28,v29,$keyperm
760	lvx		v31,$x60,$key
761	?vperm		v29,v29,v30,$keyperm
762	lvx		$out0,$x70,$key		# borrow $out0
763	?vperm		v30,v30,v31,$keyperm
764	lvx		v24,$x00,$key_		# pre-load round[1]
765	?vperm		v31,v31,$out0,$keyperm
766	lvx		v25,$x10,$key_		# pre-load round[2]
767
768	#lvx		$inptail,0,$inp		# "caller" already did this
769	#addi		$inp,$inp,15		# 15 is not typo
770	subi		$inp,$inp,15		# undo "caller"
771
772	 le?li		$idx,8
773	lvx_u		$in0,$x00,$inp		# load first 8 "words"
774	 le?lvsl	$inpperm,0,$idx
775	 le?vspltisb	$tmp,0x0f
776	lvx_u		$in1,$x10,$inp
777	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
778	lvx_u		$in2,$x20,$inp
779	 le?vperm	$in0,$in0,$in0,$inpperm
780	lvx_u		$in3,$x30,$inp
781	 le?vperm	$in1,$in1,$in1,$inpperm
782	lvx_u		$in4,$x40,$inp
783	 le?vperm	$in2,$in2,$in2,$inpperm
784	vxor		$out0,$in0,$rndkey0
785	lvx_u		$in5,$x50,$inp
786	 le?vperm	$in3,$in3,$in3,$inpperm
787	vxor		$out1,$in1,$rndkey0
788	lvx_u		$in6,$x60,$inp
789	 le?vperm	$in4,$in4,$in4,$inpperm
790	vxor		$out2,$in2,$rndkey0
791	lvx_u		$in7,$x70,$inp
792	addi		$inp,$inp,0x80
793	 le?vperm	$in5,$in5,$in5,$inpperm
794	vxor		$out3,$in3,$rndkey0
795	 le?vperm	$in6,$in6,$in6,$inpperm
796	vxor		$out4,$in4,$rndkey0
797	 le?vperm	$in7,$in7,$in7,$inpperm
798	vxor		$out5,$in5,$rndkey0
799	vxor		$out6,$in6,$rndkey0
800	vxor		$out7,$in7,$rndkey0
801
802	mtctr		$rounds
803	b		Loop_cbc_dec8x
804.align	5
805Loop_cbc_dec8x:
806	vncipher	$out0,$out0,v24
807	vncipher	$out1,$out1,v24
808	vncipher	$out2,$out2,v24
809	vncipher	$out3,$out3,v24
810	vncipher	$out4,$out4,v24
811	vncipher	$out5,$out5,v24
812	vncipher	$out6,$out6,v24
813	vncipher	$out7,$out7,v24
814	lvx		v24,$x20,$key_		# round[3]
815	addi		$key_,$key_,0x20
816
817	vncipher	$out0,$out0,v25
818	vncipher	$out1,$out1,v25
819	vncipher	$out2,$out2,v25
820	vncipher	$out3,$out3,v25
821	vncipher	$out4,$out4,v25
822	vncipher	$out5,$out5,v25
823	vncipher	$out6,$out6,v25
824	vncipher	$out7,$out7,v25
825	lvx		v25,$x10,$key_		# round[4]
826	bdnz		Loop_cbc_dec8x
827
828	subic		$len,$len,128		# $len-=128
829	vncipher	$out0,$out0,v24
830	vncipher	$out1,$out1,v24
831	vncipher	$out2,$out2,v24
832	vncipher	$out3,$out3,v24
833	vncipher	$out4,$out4,v24
834	vncipher	$out5,$out5,v24
835	vncipher	$out6,$out6,v24
836	vncipher	$out7,$out7,v24
837
838	subfe.		r0,r0,r0		# borrow?-1:0
839	vncipher	$out0,$out0,v25
840	vncipher	$out1,$out1,v25
841	vncipher	$out2,$out2,v25
842	vncipher	$out3,$out3,v25
843	vncipher	$out4,$out4,v25
844	vncipher	$out5,$out5,v25
845	vncipher	$out6,$out6,v25
846	vncipher	$out7,$out7,v25
847
848	and		r0,r0,$len
849	vncipher	$out0,$out0,v26
850	vncipher	$out1,$out1,v26
851	vncipher	$out2,$out2,v26
852	vncipher	$out3,$out3,v26
853	vncipher	$out4,$out4,v26
854	vncipher	$out5,$out5,v26
855	vncipher	$out6,$out6,v26
856	vncipher	$out7,$out7,v26
857
858	add		$inp,$inp,r0		# $inp is adjusted in such
859						# way that at exit from the
860						# loop inX-in7 are loaded
861						# with last "words"
862	vncipher	$out0,$out0,v27
863	vncipher	$out1,$out1,v27
864	vncipher	$out2,$out2,v27
865	vncipher	$out3,$out3,v27
866	vncipher	$out4,$out4,v27
867	vncipher	$out5,$out5,v27
868	vncipher	$out6,$out6,v27
869	vncipher	$out7,$out7,v27
870
871	addi		$key_,$sp,$FRAME+15	# rewind $key_
872	vncipher	$out0,$out0,v28
873	vncipher	$out1,$out1,v28
874	vncipher	$out2,$out2,v28
875	vncipher	$out3,$out3,v28
876	vncipher	$out4,$out4,v28
877	vncipher	$out5,$out5,v28
878	vncipher	$out6,$out6,v28
879	vncipher	$out7,$out7,v28
880	lvx		v24,$x00,$key_		# re-pre-load round[1]
881
882	vncipher	$out0,$out0,v29
883	vncipher	$out1,$out1,v29
884	vncipher	$out2,$out2,v29
885	vncipher	$out3,$out3,v29
886	vncipher	$out4,$out4,v29
887	vncipher	$out5,$out5,v29
888	vncipher	$out6,$out6,v29
889	vncipher	$out7,$out7,v29
890	lvx		v25,$x10,$key_		# re-pre-load round[2]
891
892	vncipher	$out0,$out0,v30
893	 vxor		$ivec,$ivec,v31		# xor with last round key
894	vncipher	$out1,$out1,v30
895	 vxor		$in0,$in0,v31
896	vncipher	$out2,$out2,v30
897	 vxor		$in1,$in1,v31
898	vncipher	$out3,$out3,v30
899	 vxor		$in2,$in2,v31
900	vncipher	$out4,$out4,v30
901	 vxor		$in3,$in3,v31
902	vncipher	$out5,$out5,v30
903	 vxor		$in4,$in4,v31
904	vncipher	$out6,$out6,v30
905	 vxor		$in5,$in5,v31
906	vncipher	$out7,$out7,v30
907	 vxor		$in6,$in6,v31
908
909	vncipherlast	$out0,$out0,$ivec
910	vncipherlast	$out1,$out1,$in0
911	 lvx_u		$in0,$x00,$inp		# load next input block
912	vncipherlast	$out2,$out2,$in1
913	 lvx_u		$in1,$x10,$inp
914	vncipherlast	$out3,$out3,$in2
915	 le?vperm	$in0,$in0,$in0,$inpperm
916	 lvx_u		$in2,$x20,$inp
917	vncipherlast	$out4,$out4,$in3
918	 le?vperm	$in1,$in1,$in1,$inpperm
919	 lvx_u		$in3,$x30,$inp
920	vncipherlast	$out5,$out5,$in4
921	 le?vperm	$in2,$in2,$in2,$inpperm
922	 lvx_u		$in4,$x40,$inp
923	vncipherlast	$out6,$out6,$in5
924	 le?vperm	$in3,$in3,$in3,$inpperm
925	 lvx_u		$in5,$x50,$inp
926	vncipherlast	$out7,$out7,$in6
927	 le?vperm	$in4,$in4,$in4,$inpperm
928	 lvx_u		$in6,$x60,$inp
929	vmr		$ivec,$in7
930	 le?vperm	$in5,$in5,$in5,$inpperm
931	 lvx_u		$in7,$x70,$inp
932	 addi		$inp,$inp,0x80
933
934	le?vperm	$out0,$out0,$out0,$inpperm
935	le?vperm	$out1,$out1,$out1,$inpperm
936	stvx_u		$out0,$x00,$out
937	 le?vperm	$in6,$in6,$in6,$inpperm
938	 vxor		$out0,$in0,$rndkey0
939	le?vperm	$out2,$out2,$out2,$inpperm
940	stvx_u		$out1,$x10,$out
941	 le?vperm	$in7,$in7,$in7,$inpperm
942	 vxor		$out1,$in1,$rndkey0
943	le?vperm	$out3,$out3,$out3,$inpperm
944	stvx_u		$out2,$x20,$out
945	 vxor		$out2,$in2,$rndkey0
946	le?vperm	$out4,$out4,$out4,$inpperm
947	stvx_u		$out3,$x30,$out
948	 vxor		$out3,$in3,$rndkey0
949	le?vperm	$out5,$out5,$out5,$inpperm
950	stvx_u		$out4,$x40,$out
951	 vxor		$out4,$in4,$rndkey0
952	le?vperm	$out6,$out6,$out6,$inpperm
953	stvx_u		$out5,$x50,$out
954	 vxor		$out5,$in5,$rndkey0
955	le?vperm	$out7,$out7,$out7,$inpperm
956	stvx_u		$out6,$x60,$out
957	 vxor		$out6,$in6,$rndkey0
958	stvx_u		$out7,$x70,$out
959	addi		$out,$out,0x80
960	 vxor		$out7,$in7,$rndkey0
961
962	mtctr		$rounds
963	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
964
965	addic.		$len,$len,128
966	beq		Lcbc_dec8x_done
967	nop
968	nop
969
970Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
971	vncipher	$out1,$out1,v24
972	vncipher	$out2,$out2,v24
973	vncipher	$out3,$out3,v24
974	vncipher	$out4,$out4,v24
975	vncipher	$out5,$out5,v24
976	vncipher	$out6,$out6,v24
977	vncipher	$out7,$out7,v24
978	lvx		v24,$x20,$key_		# round[3]
979	addi		$key_,$key_,0x20
980
981	vncipher	$out1,$out1,v25
982	vncipher	$out2,$out2,v25
983	vncipher	$out3,$out3,v25
984	vncipher	$out4,$out4,v25
985	vncipher	$out5,$out5,v25
986	vncipher	$out6,$out6,v25
987	vncipher	$out7,$out7,v25
988	lvx		v25,$x10,$key_		# round[4]
989	bdnz		Loop_cbc_dec8x_tail
990
991	vncipher	$out1,$out1,v24
992	vncipher	$out2,$out2,v24
993	vncipher	$out3,$out3,v24
994	vncipher	$out4,$out4,v24
995	vncipher	$out5,$out5,v24
996	vncipher	$out6,$out6,v24
997	vncipher	$out7,$out7,v24
998
999	vncipher	$out1,$out1,v25
1000	vncipher	$out2,$out2,v25
1001	vncipher	$out3,$out3,v25
1002	vncipher	$out4,$out4,v25
1003	vncipher	$out5,$out5,v25
1004	vncipher	$out6,$out6,v25
1005	vncipher	$out7,$out7,v25
1006
1007	vncipher	$out1,$out1,v26
1008	vncipher	$out2,$out2,v26
1009	vncipher	$out3,$out3,v26
1010	vncipher	$out4,$out4,v26
1011	vncipher	$out5,$out5,v26
1012	vncipher	$out6,$out6,v26
1013	vncipher	$out7,$out7,v26
1014
1015	vncipher	$out1,$out1,v27
1016	vncipher	$out2,$out2,v27
1017	vncipher	$out3,$out3,v27
1018	vncipher	$out4,$out4,v27
1019	vncipher	$out5,$out5,v27
1020	vncipher	$out6,$out6,v27
1021	vncipher	$out7,$out7,v27
1022
1023	vncipher	$out1,$out1,v28
1024	vncipher	$out2,$out2,v28
1025	vncipher	$out3,$out3,v28
1026	vncipher	$out4,$out4,v28
1027	vncipher	$out5,$out5,v28
1028	vncipher	$out6,$out6,v28
1029	vncipher	$out7,$out7,v28
1030
1031	vncipher	$out1,$out1,v29
1032	vncipher	$out2,$out2,v29
1033	vncipher	$out3,$out3,v29
1034	vncipher	$out4,$out4,v29
1035	vncipher	$out5,$out5,v29
1036	vncipher	$out6,$out6,v29
1037	vncipher	$out7,$out7,v29
1038
1039	vncipher	$out1,$out1,v30
1040	 vxor		$ivec,$ivec,v31		# last round key
1041	vncipher	$out2,$out2,v30
1042	 vxor		$in1,$in1,v31
1043	vncipher	$out3,$out3,v30
1044	 vxor		$in2,$in2,v31
1045	vncipher	$out4,$out4,v30
1046	 vxor		$in3,$in3,v31
1047	vncipher	$out5,$out5,v30
1048	 vxor		$in4,$in4,v31
1049	vncipher	$out6,$out6,v30
1050	 vxor		$in5,$in5,v31
1051	vncipher	$out7,$out7,v30
1052	 vxor		$in6,$in6,v31
1053
1054	cmplwi		$len,32			# switch($len)
1055	blt		Lcbc_dec8x_one
1056	nop
1057	beq		Lcbc_dec8x_two
1058	cmplwi		$len,64
1059	blt		Lcbc_dec8x_three
1060	nop
1061	beq		Lcbc_dec8x_four
1062	cmplwi		$len,96
1063	blt		Lcbc_dec8x_five
1064	nop
1065	beq		Lcbc_dec8x_six
1066
1067Lcbc_dec8x_seven:
1068	vncipherlast	$out1,$out1,$ivec
1069	vncipherlast	$out2,$out2,$in1
1070	vncipherlast	$out3,$out3,$in2
1071	vncipherlast	$out4,$out4,$in3
1072	vncipherlast	$out5,$out5,$in4
1073	vncipherlast	$out6,$out6,$in5
1074	vncipherlast	$out7,$out7,$in6
1075	vmr		$ivec,$in7
1076
1077	le?vperm	$out1,$out1,$out1,$inpperm
1078	le?vperm	$out2,$out2,$out2,$inpperm
1079	stvx_u		$out1,$x00,$out
1080	le?vperm	$out3,$out3,$out3,$inpperm
1081	stvx_u		$out2,$x10,$out
1082	le?vperm	$out4,$out4,$out4,$inpperm
1083	stvx_u		$out3,$x20,$out
1084	le?vperm	$out5,$out5,$out5,$inpperm
1085	stvx_u		$out4,$x30,$out
1086	le?vperm	$out6,$out6,$out6,$inpperm
1087	stvx_u		$out5,$x40,$out
1088	le?vperm	$out7,$out7,$out7,$inpperm
1089	stvx_u		$out6,$x50,$out
1090	stvx_u		$out7,$x60,$out
1091	addi		$out,$out,0x70
1092	b		Lcbc_dec8x_done
1093
1094.align	5
1095Lcbc_dec8x_six:
1096	vncipherlast	$out2,$out2,$ivec
1097	vncipherlast	$out3,$out3,$in2
1098	vncipherlast	$out4,$out4,$in3
1099	vncipherlast	$out5,$out5,$in4
1100	vncipherlast	$out6,$out6,$in5
1101	vncipherlast	$out7,$out7,$in6
1102	vmr		$ivec,$in7
1103
1104	le?vperm	$out2,$out2,$out2,$inpperm
1105	le?vperm	$out3,$out3,$out3,$inpperm
1106	stvx_u		$out2,$x00,$out
1107	le?vperm	$out4,$out4,$out4,$inpperm
1108	stvx_u		$out3,$x10,$out
1109	le?vperm	$out5,$out5,$out5,$inpperm
1110	stvx_u		$out4,$x20,$out
1111	le?vperm	$out6,$out6,$out6,$inpperm
1112	stvx_u		$out5,$x30,$out
1113	le?vperm	$out7,$out7,$out7,$inpperm
1114	stvx_u		$out6,$x40,$out
1115	stvx_u		$out7,$x50,$out
1116	addi		$out,$out,0x60
1117	b		Lcbc_dec8x_done
1118
1119.align	5
1120Lcbc_dec8x_five:
1121	vncipherlast	$out3,$out3,$ivec
1122	vncipherlast	$out4,$out4,$in3
1123	vncipherlast	$out5,$out5,$in4
1124	vncipherlast	$out6,$out6,$in5
1125	vncipherlast	$out7,$out7,$in6
1126	vmr		$ivec,$in7
1127
1128	le?vperm	$out3,$out3,$out3,$inpperm
1129	le?vperm	$out4,$out4,$out4,$inpperm
1130	stvx_u		$out3,$x00,$out
1131	le?vperm	$out5,$out5,$out5,$inpperm
1132	stvx_u		$out4,$x10,$out
1133	le?vperm	$out6,$out6,$out6,$inpperm
1134	stvx_u		$out5,$x20,$out
1135	le?vperm	$out7,$out7,$out7,$inpperm
1136	stvx_u		$out6,$x30,$out
1137	stvx_u		$out7,$x40,$out
1138	addi		$out,$out,0x50
1139	b		Lcbc_dec8x_done
1140
1141.align	5
1142Lcbc_dec8x_four:
1143	vncipherlast	$out4,$out4,$ivec
1144	vncipherlast	$out5,$out5,$in4
1145	vncipherlast	$out6,$out6,$in5
1146	vncipherlast	$out7,$out7,$in6
1147	vmr		$ivec,$in7
1148
1149	le?vperm	$out4,$out4,$out4,$inpperm
1150	le?vperm	$out5,$out5,$out5,$inpperm
1151	stvx_u		$out4,$x00,$out
1152	le?vperm	$out6,$out6,$out6,$inpperm
1153	stvx_u		$out5,$x10,$out
1154	le?vperm	$out7,$out7,$out7,$inpperm
1155	stvx_u		$out6,$x20,$out
1156	stvx_u		$out7,$x30,$out
1157	addi		$out,$out,0x40
1158	b		Lcbc_dec8x_done
1159
1160.align	5
1161Lcbc_dec8x_three:
1162	vncipherlast	$out5,$out5,$ivec
1163	vncipherlast	$out6,$out6,$in5
1164	vncipherlast	$out7,$out7,$in6
1165	vmr		$ivec,$in7
1166
1167	le?vperm	$out5,$out5,$out5,$inpperm
1168	le?vperm	$out6,$out6,$out6,$inpperm
1169	stvx_u		$out5,$x00,$out
1170	le?vperm	$out7,$out7,$out7,$inpperm
1171	stvx_u		$out6,$x10,$out
1172	stvx_u		$out7,$x20,$out
1173	addi		$out,$out,0x30
1174	b		Lcbc_dec8x_done
1175
1176.align	5
1177Lcbc_dec8x_two:
1178	vncipherlast	$out6,$out6,$ivec
1179	vncipherlast	$out7,$out7,$in6
1180	vmr		$ivec,$in7
1181
1182	le?vperm	$out6,$out6,$out6,$inpperm
1183	le?vperm	$out7,$out7,$out7,$inpperm
1184	stvx_u		$out6,$x00,$out
1185	stvx_u		$out7,$x10,$out
1186	addi		$out,$out,0x20
1187	b		Lcbc_dec8x_done
1188
1189.align	5
1190Lcbc_dec8x_one:
1191	vncipherlast	$out7,$out7,$ivec
1192	vmr		$ivec,$in7
1193
1194	le?vperm	$out7,$out7,$out7,$inpperm
1195	stvx_u		$out7,0,$out
1196	addi		$out,$out,0x10
1197
1198Lcbc_dec8x_done:
1199	le?vperm	$ivec,$ivec,$ivec,$inpperm
1200	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1201
1202	li		r10,`$FRAME+15`
1203	li		r11,`$FRAME+31`
1204	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1205	addi		r10,r10,32
1206	stvx		$inpperm,r11,$sp
1207	addi		r11,r11,32
1208	stvx		$inpperm,r10,$sp
1209	addi		r10,r10,32
1210	stvx		$inpperm,r11,$sp
1211	addi		r11,r11,32
1212	stvx		$inpperm,r10,$sp
1213	addi		r10,r10,32
1214	stvx		$inpperm,r11,$sp
1215	addi		r11,r11,32
1216	stvx		$inpperm,r10,$sp
1217	addi		r10,r10,32
1218	stvx		$inpperm,r11,$sp
1219	addi		r11,r11,32
1220
1221	mtspr		256,$vrsave
1222	lvx		v20,r10,$sp		# ABI says so
1223	addi		r10,r10,32
1224	lvx		v21,r11,$sp
1225	addi		r11,r11,32
1226	lvx		v22,r10,$sp
1227	addi		r10,r10,32
1228	lvx		v23,r11,$sp
1229	addi		r11,r11,32
1230	lvx		v24,r10,$sp
1231	addi		r10,r10,32
1232	lvx		v25,r11,$sp
1233	addi		r11,r11,32
1234	lvx		v26,r10,$sp
1235	addi		r10,r10,32
1236	lvx		v27,r11,$sp
1237	addi		r11,r11,32
1238	lvx		v28,r10,$sp
1239	addi		r10,r10,32
1240	lvx		v29,r11,$sp
1241	addi		r11,r11,32
1242	lvx		v30,r10,$sp
1243	lvx		v31,r11,$sp
1244	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1245	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1246	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1247	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1248	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1249	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1250	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1251	blr
1252	.long		0
1253	.byte		0,12,0x04,0,0x80,6,6,0
1254	.long		0
1255.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1256___
1257}}	}}}
1258
1259#########################################################################
1260{{{	# CTR procedure[s]						#
# ${prefix}_ctr32_encrypt_blocks -- AES counter mode, generic entry point
# and one-block-at-a-time path.
#
# Register assignment made by the declarations below:
#   r3=$inp  r4=$out  r5=$len  r6=$key  r7=$ivp  r8=$x10  r9=$rounds
#   r10=$idx
#   v0=$rndkey0  v1=$rndkey1  v2=$inout  v3=$tmp (also used as $dat)
#   v4=$ivec  v5=$inptail  v6=$inpperm  v7=$outhead  v8=$outperm
#   v9=$outmask  v10=$keyperm  v11=$one
#
# Outline of the emitted code:
#  * $len counts 16-byte blocks (it is decremented by one per block in
#    the loop); the routine returns immediately when $len is below 1.
#  * VRSAVE is copied to $vrsave, loaded from "lis r0,0xfff0" while the
#    routine runs, and restored just before the final blr of this path.
#  * The counter block is read from $ivp with the usual lvx/lvsl/vperm
#    sequence, so $ivp need not be aligned.  Nothing visible here stores
#    the counter back to $ivp.
#  * $one is built as fifteen zero bytes followed by 0x01 (vspltisb
#    plus vsldoi against the zeroed $rndkey0), so "vadduwm $ivec,$ivec,$one"
#    increments only the last 32-bit word of the counter.
#  * The round count is read from offset 240 of the key structure and
#    turned into a loop count of rounds/2-1; Loop_ctr32_enc performs two
#    vcipher rounds per iteration and the remaining vcipher/vcipherlast
#    pair follows the loop.  Round keys are realigned on the fly with
#    $keyperm, so $key need not be aligned either.
#  * When $len is 8 or more, control is transferred to
#    _aesp8_ctr32_encrypt8x (defined in a later heredoc) before any of
#    the output-merging state is set up.
#  * In the one-block path the input block is XORed with the last round
#    key and the result is handed to vcipherlast as its key operand, so
#    the final round produces the output block directly.
#  * Output is written with aligned stvx only: each result is rotated by
#    $outperm and merged with the previous one through $outhead/$outmask,
#    and one extra load/merge/store after the loop flushes what is left
#    in $outhead.
#  * NOTE(review): the .long/.byte/.long trailer after blr looks like a
#    traceback table -- confirm before touching it.
1261my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1262my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1263my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1264						map("v$_",(4..11));
# $dat shares v3 with $tmp.
1265my $dat=$tmp;
1266
1267$code.=<<___;
1268.globl	.${prefix}_ctr32_encrypt_blocks
1269.align	5
1270.${prefix}_ctr32_encrypt_blocks:
1271	${UCMP}i	$len,1
1272	bltlr-
1273
1274	lis		r0,0xfff0
1275	mfspr		$vrsave,256
1276	mtspr		256,r0
1277
1278	li		$idx,15
1279	vxor		$rndkey0,$rndkey0,$rndkey0
1280	le?vspltisb	$tmp,0x0f
1281
1282	lvx		$ivec,0,$ivp		# load [unaligned] iv
1283	lvsl		$inpperm,0,$ivp
1284	lvx		$inptail,$idx,$ivp
1285	 vspltisb	$one,1
1286	le?vxor		$inpperm,$inpperm,$tmp
1287	vperm		$ivec,$ivec,$inptail,$inpperm
1288	 vsldoi		$one,$rndkey0,$one,1
1289
1290	neg		r11,$inp
1291	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1292	lwz		$rounds,240($key)
1293
1294	lvsr		$inpperm,0,r11		# prepare for unaligned load
1295	lvx		$inptail,0,$inp
1296	addi		$inp,$inp,15		# 15 is not typo
1297	le?vxor		$inpperm,$inpperm,$tmp
1298
1299	srwi		$rounds,$rounds,1
1300	li		$idx,16
1301	subi		$rounds,$rounds,1
1302
1303	${UCMP}i	$len,8
1304	bge		_aesp8_ctr32_encrypt8x
1305
1306	?lvsr		$outperm,0,$out		# prepare for unaligned store
1307	vspltisb	$outmask,-1
1308	lvx		$outhead,0,$out
1309	?vperm		$outmask,$rndkey0,$outmask,$outperm
1310	le?vxor		$outperm,$outperm,$tmp
1311
1312	lvx		$rndkey0,0,$key
1313	mtctr		$rounds
1314	lvx		$rndkey1,$idx,$key
1315	addi		$idx,$idx,16
1316	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1317	vxor		$inout,$ivec,$rndkey0
1318	lvx		$rndkey0,$idx,$key
1319	addi		$idx,$idx,16
1320	b		Loop_ctr32_enc
1321
1322.align	5
1323Loop_ctr32_enc:
1324	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1325	vcipher		$inout,$inout,$rndkey1
1326	lvx		$rndkey1,$idx,$key
1327	addi		$idx,$idx,16
1328	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1329	vcipher		$inout,$inout,$rndkey0
1330	lvx		$rndkey0,$idx,$key
1331	addi		$idx,$idx,16
1332	bdnz		Loop_ctr32_enc
1333
1334	vadduwm		$ivec,$ivec,$one
1335	 vmr		$dat,$inptail
1336	 lvx		$inptail,0,$inp
1337	 addi		$inp,$inp,16
1338	 subic.		$len,$len,1		# blocks--
1339
1340	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1341	vcipher		$inout,$inout,$rndkey1
1342	lvx		$rndkey1,$idx,$key
1343	 vperm		$dat,$dat,$inptail,$inpperm
1344	 li		$idx,16
1345	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1346	 lvx		$rndkey0,0,$key
1347	vxor		$dat,$dat,$rndkey1	# last round key
1348	vcipherlast	$inout,$inout,$dat
1349
1350	 lvx		$rndkey1,$idx,$key
1351	 addi		$idx,$idx,16
1352	vperm		$inout,$inout,$inout,$outperm
1353	vsel		$dat,$outhead,$inout,$outmask
1354	 mtctr		$rounds
1355	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1356	vmr		$outhead,$inout
1357	 vxor		$inout,$ivec,$rndkey0
1358	 lvx		$rndkey0,$idx,$key
1359	 addi		$idx,$idx,16
1360	stvx		$dat,0,$out
1361	addi		$out,$out,16
1362	bne		Loop_ctr32_enc
1363
1364	addi		$out,$out,-1
1365	lvx		$inout,0,$out		# redundant in aligned case
1366	vsel		$inout,$outhead,$inout,$outmask
1367	stvx		$inout,0,$out
1368
1369	mtspr		256,$vrsave
1370	blr
1371	.long		0
1372	.byte		0,12,0x14,0,0,0,6,0
1373	.long		0
1374___
1375#########################################################################
1376{{	# Optimized CTR procedure					#
# _aesp8_ctr32_encrypt8x -- CTR path that processes eight blocks per pass.
#
# It is reached by a branch from ${prefix}_ctr32_encrypt_blocks when $len
# (in blocks) is 8 or more, and depends on state that entry code has
# already prepared: $ivec, $one, $keyperm, $rounds (rounds/2-1),
# $vrsave, and $inp advanced by 15 (undone below).
#
# Register assignment made by the declarations below:
#   $key_ = r11 (pointer into the on-stack copy of the round keys)
#   $x00..$x70 = r0, r8, r26..r31 (index registers holding 0x00..0x70;
#                $x00 becomes the literal 0 for the osx flavour)
#   $in0..$in7 = v0..v3, v10, v12..v14     $out0..$out7 = v15..v22
#   $rndkey0 = v23, v24-v25 = rotating key pair, v26-v31 = last keys
#   $tmp/$keyperm alias $in3/$in4; $two aliases $outhead.
#   $three and $four are declared but not referenced in this heredoc.
#
# Stack frame ($FRAME+21*16+6*$SIZE_T bytes), as used by the code:
#   $FRAME+15 ...            eight 16-byte slots for round-key copies
#   $FRAME+8*16+15 ...       v20-v31 save area
#   $FRAME+21*16-4           word holding the caller's VRSAVE value
#   $FRAME+21*16 ...         r26-r31 save area
# NOTE(review): the VRSAVE word is written in the prologue but the
# epilogue restores VRSAVE from the $vrsave register, not from the stack.
#
# Outline of the emitted code:
#  1. Prologue: allocate the frame, save v20-v31 and r26-r31, load the
#     constants 0x10..0x70 into $x10..$x70, set VRSAVE to all ones.
#  2. Key schedule: round key 0 stays in $rndkey0; the following keys
#     are realigned with $keyperm and off-loaded to the stack, two per
#     Load_ctr32_enc_key iteration plus two more after the loop; the
#     remaining keys are kept in v26-v31, v31 being the last round key.
#  3. $len is converted from blocks to bytes (shift left by 4) and eight
#     consecutive counter values are formed from $ivec with $one/$two,
#     each XORed with $rndkey0; $ivec is advanced by eight.  On
#     little-endian builds $inpperm is rebuilt for lvx_u/stvx_u use.
#  4. Loop_ctr32_enc8x: the stack-resident keys are streamed through
#     v24/v25, then v26-v30 are applied.  Interleaved with the rounds,
#     eight input blocks are loaded and XORed with v31 so that
#     vcipherlast yields the output blocks directly.  The same pass
#     subtracts 128 from $len and adjusts $inp by a borrow-derived mask
#     so that, on the final pass, the loaded blocks end with the last
#     input block.  If "$len-129" did not borrow, the eight results are
#     stored, the next eight counters are prepared, and the loop is
#     re-entered at Loop_ctr32_enc8x_middle.
#  5. Lctr32_enc8x_break: $len now holds 16*k-128 for the k (1..8)
#     blocks that remain.  The compare chain dispatches -0x70 to _one,
#     -0x60 to _two, -0x50 to _three, -0x40 to _four, -0x30 to _five,
#     -0x20 to _six, -0x10 to _seven and 0 to _eight.  Handler k pairs
#     $out0..$out(k-1) with the LAST k input registers and stores k
#     blocks.
#  6. Lctr32_enc8x_done: overwrite the eight round-key slots with
#     $inpperm, restore VRSAVE, v20-v31 and r26-r31, release the frame
#     and return.
1377my $key_="r11";
1378my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1379    $x00=0 if ($flavour =~ /osx/);
1380my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1381my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1382my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
1383			# v26-v31 last 6 round keys
1384my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
1385my ($two,$three,$four)=($outhead,$outperm,$outmask);
1386
1387$code.=<<___;
1388.align	5
1389_aesp8_ctr32_encrypt8x:
1390	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1391	li		r10,`$FRAME+8*16+15`
1392	li		r11,`$FRAME+8*16+31`
1393	stvx		v20,r10,$sp		# ABI says so
1394	addi		r10,r10,32
1395	stvx		v21,r11,$sp
1396	addi		r11,r11,32
1397	stvx		v22,r10,$sp
1398	addi		r10,r10,32
1399	stvx		v23,r11,$sp
1400	addi		r11,r11,32
1401	stvx		v24,r10,$sp
1402	addi		r10,r10,32
1403	stvx		v25,r11,$sp
1404	addi		r11,r11,32
1405	stvx		v26,r10,$sp
1406	addi		r10,r10,32
1407	stvx		v27,r11,$sp
1408	addi		r11,r11,32
1409	stvx		v28,r10,$sp
1410	addi		r10,r10,32
1411	stvx		v29,r11,$sp
1412	addi		r11,r11,32
1413	stvx		v30,r10,$sp
1414	stvx		v31,r11,$sp
1415	li		r0,-1
1416	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1417	li		$x10,0x10
1418	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1419	li		$x20,0x20
1420	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1421	li		$x30,0x30
1422	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1423	li		$x40,0x40
1424	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1425	li		$x50,0x50
1426	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1427	li		$x60,0x60
1428	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1429	li		$x70,0x70
1430	mtspr		256,r0
1431
1432	subi		$rounds,$rounds,3	# -4 in total
1433
1434	lvx		$rndkey0,$x00,$key	# load key schedule
1435	lvx		v30,$x10,$key
1436	addi		$key,$key,0x20
1437	lvx		v31,$x00,$key
1438	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1439	addi		$key_,$sp,$FRAME+15
1440	mtctr		$rounds
1441
1442Load_ctr32_enc_key:
1443	?vperm		v24,v30,v31,$keyperm
1444	lvx		v30,$x10,$key
1445	addi		$key,$key,0x20
1446	stvx		v24,$x00,$key_		# off-load round[1]
1447	?vperm		v25,v31,v30,$keyperm
1448	lvx		v31,$x00,$key
1449	stvx		v25,$x10,$key_		# off-load round[2]
1450	addi		$key_,$key_,0x20
1451	bdnz		Load_ctr32_enc_key
1452
1453	lvx		v26,$x10,$key
1454	?vperm		v24,v30,v31,$keyperm
1455	lvx		v27,$x20,$key
1456	stvx		v24,$x00,$key_		# off-load round[3]
1457	?vperm		v25,v31,v26,$keyperm
1458	lvx		v28,$x30,$key
1459	stvx		v25,$x10,$key_		# off-load round[4]
1460	addi		$key_,$sp,$FRAME+15	# rewind $key_
1461	?vperm		v26,v26,v27,$keyperm
1462	lvx		v29,$x40,$key
1463	?vperm		v27,v27,v28,$keyperm
1464	lvx		v30,$x50,$key
1465	?vperm		v28,v28,v29,$keyperm
1466	lvx		v31,$x60,$key
1467	?vperm		v29,v29,v30,$keyperm
1468	lvx		$out0,$x70,$key		# borrow $out0
1469	?vperm		v30,v30,v31,$keyperm
1470	lvx		v24,$x00,$key_		# pre-load round[1]
1471	?vperm		v31,v31,$out0,$keyperm
1472	lvx		v25,$x10,$key_		# pre-load round[2]
1473
1474	vadduwm		$two,$one,$one
1475	subi		$inp,$inp,15		# undo "caller"
1476	$SHL		$len,$len,4
1477
1478	vadduwm		$out1,$ivec,$one	# counter values ...
1479	vadduwm		$out2,$ivec,$two
1480	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1481	 le?li		$idx,8
1482	vadduwm		$out3,$out1,$two
1483	vxor		$out1,$out1,$rndkey0
1484	 le?lvsl	$inpperm,0,$idx
1485	vadduwm		$out4,$out2,$two
1486	vxor		$out2,$out2,$rndkey0
1487	 le?vspltisb	$tmp,0x0f
1488	vadduwm		$out5,$out3,$two
1489	vxor		$out3,$out3,$rndkey0
1490	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1491	vadduwm		$out6,$out4,$two
1492	vxor		$out4,$out4,$rndkey0
1493	vadduwm		$out7,$out5,$two
1494	vxor		$out5,$out5,$rndkey0
1495	vadduwm		$ivec,$out6,$two	# next counter value
1496	vxor		$out6,$out6,$rndkey0
1497	vxor		$out7,$out7,$rndkey0
1498
1499	mtctr		$rounds
1500	b		Loop_ctr32_enc8x
1501.align	5
1502Loop_ctr32_enc8x:
1503	vcipher 	$out0,$out0,v24
1504	vcipher 	$out1,$out1,v24
1505	vcipher 	$out2,$out2,v24
1506	vcipher 	$out3,$out3,v24
1507	vcipher 	$out4,$out4,v24
1508	vcipher 	$out5,$out5,v24
1509	vcipher 	$out6,$out6,v24
1510	vcipher 	$out7,$out7,v24
1511Loop_ctr32_enc8x_middle:
1512	lvx		v24,$x20,$key_		# round[3]
1513	addi		$key_,$key_,0x20
1514
1515	vcipher 	$out0,$out0,v25
1516	vcipher 	$out1,$out1,v25
1517	vcipher 	$out2,$out2,v25
1518	vcipher 	$out3,$out3,v25
1519	vcipher 	$out4,$out4,v25
1520	vcipher 	$out5,$out5,v25
1521	vcipher 	$out6,$out6,v25
1522	vcipher 	$out7,$out7,v25
1523	lvx		v25,$x10,$key_		# round[4]
1524	bdnz		Loop_ctr32_enc8x
1525
1526	subic		r11,$len,256		# $len-256, borrow $key_
1527	vcipher 	$out0,$out0,v24
1528	vcipher 	$out1,$out1,v24
1529	vcipher 	$out2,$out2,v24
1530	vcipher 	$out3,$out3,v24
1531	vcipher 	$out4,$out4,v24
1532	vcipher 	$out5,$out5,v24
1533	vcipher 	$out6,$out6,v24
1534	vcipher 	$out7,$out7,v24
1535
1536	subfe		r0,r0,r0		# borrow?-1:0
1537	vcipher 	$out0,$out0,v25
1538	vcipher 	$out1,$out1,v25
1539	vcipher 	$out2,$out2,v25
1540	vcipher 	$out3,$out3,v25
1541	vcipher 	$out4,$out4,v25
1542	vcipher		$out5,$out5,v25
1543	vcipher		$out6,$out6,v25
1544	vcipher		$out7,$out7,v25
1545
1546	and		r0,r0,r11
1547	addi		$key_,$sp,$FRAME+15	# rewind $key_
1548	vcipher		$out0,$out0,v26
1549	vcipher		$out1,$out1,v26
1550	vcipher		$out2,$out2,v26
1551	vcipher		$out3,$out3,v26
1552	vcipher		$out4,$out4,v26
1553	vcipher		$out5,$out5,v26
1554	vcipher		$out6,$out6,v26
1555	vcipher		$out7,$out7,v26
1556	lvx		v24,$x00,$key_		# re-pre-load round[1]
1557
1558	subic		$len,$len,129		# $len-=129
1559	vcipher		$out0,$out0,v27
1560	addi		$len,$len,1		# $len-=128 really
1561	vcipher		$out1,$out1,v27
1562	vcipher		$out2,$out2,v27
1563	vcipher		$out3,$out3,v27
1564	vcipher		$out4,$out4,v27
1565	vcipher		$out5,$out5,v27
1566	vcipher		$out6,$out6,v27
1567	vcipher		$out7,$out7,v27
1568	lvx		v25,$x10,$key_		# re-pre-load round[2]
1569
1570	vcipher		$out0,$out0,v28
1571	 lvx_u		$in0,$x00,$inp		# load input
1572	vcipher		$out1,$out1,v28
1573	 lvx_u		$in1,$x10,$inp
1574	vcipher		$out2,$out2,v28
1575	 lvx_u		$in2,$x20,$inp
1576	vcipher		$out3,$out3,v28
1577	 lvx_u		$in3,$x30,$inp
1578	vcipher		$out4,$out4,v28
1579	 lvx_u		$in4,$x40,$inp
1580	vcipher		$out5,$out5,v28
1581	 lvx_u		$in5,$x50,$inp
1582	vcipher		$out6,$out6,v28
1583	 lvx_u		$in6,$x60,$inp
1584	vcipher		$out7,$out7,v28
1585	 lvx_u		$in7,$x70,$inp
1586	 addi		$inp,$inp,0x80
1587
1588	vcipher		$out0,$out0,v29
1589	 le?vperm	$in0,$in0,$in0,$inpperm
1590	vcipher		$out1,$out1,v29
1591	 le?vperm	$in1,$in1,$in1,$inpperm
1592	vcipher		$out2,$out2,v29
1593	 le?vperm	$in2,$in2,$in2,$inpperm
1594	vcipher		$out3,$out3,v29
1595	 le?vperm	$in3,$in3,$in3,$inpperm
1596	vcipher		$out4,$out4,v29
1597	 le?vperm	$in4,$in4,$in4,$inpperm
1598	vcipher		$out5,$out5,v29
1599	 le?vperm	$in5,$in5,$in5,$inpperm
1600	vcipher		$out6,$out6,v29
1601	 le?vperm	$in6,$in6,$in6,$inpperm
1602	vcipher		$out7,$out7,v29
1603	 le?vperm	$in7,$in7,$in7,$inpperm
1604
1605	add		$inp,$inp,r0		# $inp is adjusted in such
1606						# way that at exit from the
1607						# loop inX-in7 are loaded
1608						# with last "words"
1609	subfe.		r0,r0,r0		# borrow?-1:0
1610	vcipher		$out0,$out0,v30
1611	 vxor		$in0,$in0,v31		# xor with last round key
1612	vcipher		$out1,$out1,v30
1613	 vxor		$in1,$in1,v31
1614	vcipher		$out2,$out2,v30
1615	 vxor		$in2,$in2,v31
1616	vcipher		$out3,$out3,v30
1617	 vxor		$in3,$in3,v31
1618	vcipher		$out4,$out4,v30
1619	 vxor		$in4,$in4,v31
1620	vcipher		$out5,$out5,v30
1621	 vxor		$in5,$in5,v31
1622	vcipher		$out6,$out6,v30
1623	 vxor		$in6,$in6,v31
1624	vcipher		$out7,$out7,v30
1625	 vxor		$in7,$in7,v31
1626
1627	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1628
1629	vcipherlast	$in0,$out0,$in0
1630	vcipherlast	$in1,$out1,$in1
1631	 vadduwm	$out1,$ivec,$one	# counter values ...
1632	vcipherlast	$in2,$out2,$in2
1633	 vadduwm	$out2,$ivec,$two
1634	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1635	vcipherlast	$in3,$out3,$in3
1636	 vadduwm	$out3,$out1,$two
1637	 vxor		$out1,$out1,$rndkey0
1638	vcipherlast	$in4,$out4,$in4
1639	 vadduwm	$out4,$out2,$two
1640	 vxor		$out2,$out2,$rndkey0
1641	vcipherlast	$in5,$out5,$in5
1642	 vadduwm	$out5,$out3,$two
1643	 vxor		$out3,$out3,$rndkey0
1644	vcipherlast	$in6,$out6,$in6
1645	 vadduwm	$out6,$out4,$two
1646	 vxor		$out4,$out4,$rndkey0
1647	vcipherlast	$in7,$out7,$in7
1648	 vadduwm	$out7,$out5,$two
1649	 vxor		$out5,$out5,$rndkey0
1650	le?vperm	$in0,$in0,$in0,$inpperm
1651	 vadduwm	$ivec,$out6,$two	# next counter value
1652	 vxor		$out6,$out6,$rndkey0
1653	le?vperm	$in1,$in1,$in1,$inpperm
1654	 vxor		$out7,$out7,$rndkey0
1655	mtctr		$rounds
1656
1657	 vcipher	$out0,$out0,v24
1658	stvx_u		$in0,$x00,$out
1659	le?vperm	$in2,$in2,$in2,$inpperm
1660	 vcipher	$out1,$out1,v24
1661	stvx_u		$in1,$x10,$out
1662	le?vperm	$in3,$in3,$in3,$inpperm
1663	 vcipher	$out2,$out2,v24
1664	stvx_u		$in2,$x20,$out
1665	le?vperm	$in4,$in4,$in4,$inpperm
1666	 vcipher	$out3,$out3,v24
1667	stvx_u		$in3,$x30,$out
1668	le?vperm	$in5,$in5,$in5,$inpperm
1669	 vcipher	$out4,$out4,v24
1670	stvx_u		$in4,$x40,$out
1671	le?vperm	$in6,$in6,$in6,$inpperm
1672	 vcipher	$out5,$out5,v24
1673	stvx_u		$in5,$x50,$out
1674	le?vperm	$in7,$in7,$in7,$inpperm
1675	 vcipher	$out6,$out6,v24
1676	stvx_u		$in6,$x60,$out
1677	 vcipher	$out7,$out7,v24
1678	stvx_u		$in7,$x70,$out
1679	addi		$out,$out,0x80
1680
1681	b		Loop_ctr32_enc8x_middle
1682
1683.align	5
1684Lctr32_enc8x_break:
1685	cmpwi		$len,-0x60
1686	blt		Lctr32_enc8x_one
1687	nop
1688	beq		Lctr32_enc8x_two
1689	cmpwi		$len,-0x40
1690	blt		Lctr32_enc8x_three
1691	nop
1692	beq		Lctr32_enc8x_four
1693	cmpwi		$len,-0x20
1694	blt		Lctr32_enc8x_five
1695	nop
1696	beq		Lctr32_enc8x_six
1697	cmpwi		$len,0x00
1698	blt		Lctr32_enc8x_seven
1699
1700Lctr32_enc8x_eight:
1701	vcipherlast	$out0,$out0,$in0
1702	vcipherlast	$out1,$out1,$in1
1703	vcipherlast	$out2,$out2,$in2
1704	vcipherlast	$out3,$out3,$in3
1705	vcipherlast	$out4,$out4,$in4
1706	vcipherlast	$out5,$out5,$in5
1707	vcipherlast	$out6,$out6,$in6
1708	vcipherlast	$out7,$out7,$in7
1709
1710	le?vperm	$out0,$out0,$out0,$inpperm
1711	le?vperm	$out1,$out1,$out1,$inpperm
1712	stvx_u		$out0,$x00,$out
1713	le?vperm	$out2,$out2,$out2,$inpperm
1714	stvx_u		$out1,$x10,$out
1715	le?vperm	$out3,$out3,$out3,$inpperm
1716	stvx_u		$out2,$x20,$out
1717	le?vperm	$out4,$out4,$out4,$inpperm
1718	stvx_u		$out3,$x30,$out
1719	le?vperm	$out5,$out5,$out5,$inpperm
1720	stvx_u		$out4,$x40,$out
1721	le?vperm	$out6,$out6,$out6,$inpperm
1722	stvx_u		$out5,$x50,$out
1723	le?vperm	$out7,$out7,$out7,$inpperm
1724	stvx_u		$out6,$x60,$out
1725	stvx_u		$out7,$x70,$out
1726	addi		$out,$out,0x80
1727	b		Lctr32_enc8x_done
1728
1729.align	5
1730Lctr32_enc8x_seven:
1731	vcipherlast	$out0,$out0,$in1
1732	vcipherlast	$out1,$out1,$in2
1733	vcipherlast	$out2,$out2,$in3
1734	vcipherlast	$out3,$out3,$in4
1735	vcipherlast	$out4,$out4,$in5
1736	vcipherlast	$out5,$out5,$in6
1737	vcipherlast	$out6,$out6,$in7
1738
1739	le?vperm	$out0,$out0,$out0,$inpperm
1740	le?vperm	$out1,$out1,$out1,$inpperm
1741	stvx_u		$out0,$x00,$out
1742	le?vperm	$out2,$out2,$out2,$inpperm
1743	stvx_u		$out1,$x10,$out
1744	le?vperm	$out3,$out3,$out3,$inpperm
1745	stvx_u		$out2,$x20,$out
1746	le?vperm	$out4,$out4,$out4,$inpperm
1747	stvx_u		$out3,$x30,$out
1748	le?vperm	$out5,$out5,$out5,$inpperm
1749	stvx_u		$out4,$x40,$out
1750	le?vperm	$out6,$out6,$out6,$inpperm
1751	stvx_u		$out5,$x50,$out
1752	stvx_u		$out6,$x60,$out
1753	addi		$out,$out,0x70
1754	b		Lctr32_enc8x_done
1755
1756.align	5
1757Lctr32_enc8x_six:
1758	vcipherlast	$out0,$out0,$in2
1759	vcipherlast	$out1,$out1,$in3
1760	vcipherlast	$out2,$out2,$in4
1761	vcipherlast	$out3,$out3,$in5
1762	vcipherlast	$out4,$out4,$in6
1763	vcipherlast	$out5,$out5,$in7
1764
1765	le?vperm	$out0,$out0,$out0,$inpperm
1766	le?vperm	$out1,$out1,$out1,$inpperm
1767	stvx_u		$out0,$x00,$out
1768	le?vperm	$out2,$out2,$out2,$inpperm
1769	stvx_u		$out1,$x10,$out
1770	le?vperm	$out3,$out3,$out3,$inpperm
1771	stvx_u		$out2,$x20,$out
1772	le?vperm	$out4,$out4,$out4,$inpperm
1773	stvx_u		$out3,$x30,$out
1774	le?vperm	$out5,$out5,$out5,$inpperm
1775	stvx_u		$out4,$x40,$out
1776	stvx_u		$out5,$x50,$out
1777	addi		$out,$out,0x60
1778	b		Lctr32_enc8x_done
1779
1780.align	5
1781Lctr32_enc8x_five:
1782	vcipherlast	$out0,$out0,$in3
1783	vcipherlast	$out1,$out1,$in4
1784	vcipherlast	$out2,$out2,$in5
1785	vcipherlast	$out3,$out3,$in6
1786	vcipherlast	$out4,$out4,$in7
1787
1788	le?vperm	$out0,$out0,$out0,$inpperm
1789	le?vperm	$out1,$out1,$out1,$inpperm
1790	stvx_u		$out0,$x00,$out
1791	le?vperm	$out2,$out2,$out2,$inpperm
1792	stvx_u		$out1,$x10,$out
1793	le?vperm	$out3,$out3,$out3,$inpperm
1794	stvx_u		$out2,$x20,$out
1795	le?vperm	$out4,$out4,$out4,$inpperm
1796	stvx_u		$out3,$x30,$out
1797	stvx_u		$out4,$x40,$out
1798	addi		$out,$out,0x50
1799	b		Lctr32_enc8x_done
1800
1801.align	5
1802Lctr32_enc8x_four:
1803	vcipherlast	$out0,$out0,$in4
1804	vcipherlast	$out1,$out1,$in5
1805	vcipherlast	$out2,$out2,$in6
1806	vcipherlast	$out3,$out3,$in7
1807
1808	le?vperm	$out0,$out0,$out0,$inpperm
1809	le?vperm	$out1,$out1,$out1,$inpperm
1810	stvx_u		$out0,$x00,$out
1811	le?vperm	$out2,$out2,$out2,$inpperm
1812	stvx_u		$out1,$x10,$out
1813	le?vperm	$out3,$out3,$out3,$inpperm
1814	stvx_u		$out2,$x20,$out
1815	stvx_u		$out3,$x30,$out
1816	addi		$out,$out,0x40
1817	b		Lctr32_enc8x_done
1818
1819.align	5
1820Lctr32_enc8x_three:
1821	vcipherlast	$out0,$out0,$in5
1822	vcipherlast	$out1,$out1,$in6
1823	vcipherlast	$out2,$out2,$in7
1824
1825	le?vperm	$out0,$out0,$out0,$inpperm
1826	le?vperm	$out1,$out1,$out1,$inpperm
1827	stvx_u		$out0,$x00,$out
1828	le?vperm	$out2,$out2,$out2,$inpperm
1829	stvx_u		$out1,$x10,$out
1830	stvx_u		$out2,$x20,$out
1831	addi		$out,$out,0x30
1832	b		Lctr32_enc8x_done
1833
1834.align	5
1835Lctr32_enc8x_two:
1836	vcipherlast	$out0,$out0,$in6
1837	vcipherlast	$out1,$out1,$in7
1838
1839	le?vperm	$out0,$out0,$out0,$inpperm
1840	le?vperm	$out1,$out1,$out1,$inpperm
1841	stvx_u		$out0,$x00,$out
1842	stvx_u		$out1,$x10,$out
1843	addi		$out,$out,0x20
1844	b		Lctr32_enc8x_done
1845
1846.align	5
1847Lctr32_enc8x_one:
1848	vcipherlast	$out0,$out0,$in7
1849
1850	le?vperm	$out0,$out0,$out0,$inpperm
1851	stvx_u		$out0,0,$out
1852	addi		$out,$out,0x10
1853
1854Lctr32_enc8x_done:
1855	li		r10,`$FRAME+15`
1856	li		r11,`$FRAME+31`
1857	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1858	addi		r10,r10,32
1859	stvx		$inpperm,r11,$sp
1860	addi		r11,r11,32
1861	stvx		$inpperm,r10,$sp
1862	addi		r10,r10,32
1863	stvx		$inpperm,r11,$sp
1864	addi		r11,r11,32
1865	stvx		$inpperm,r10,$sp
1866	addi		r10,r10,32
1867	stvx		$inpperm,r11,$sp
1868	addi		r11,r11,32
1869	stvx		$inpperm,r10,$sp
1870	addi		r10,r10,32
1871	stvx		$inpperm,r11,$sp
1872	addi		r11,r11,32
1873
1874	mtspr		256,$vrsave
1875	lvx		v20,r10,$sp		# ABI says so
1876	addi		r10,r10,32
1877	lvx		v21,r11,$sp
1878	addi		r11,r11,32
1879	lvx		v22,r10,$sp
1880	addi		r10,r10,32
1881	lvx		v23,r11,$sp
1882	addi		r11,r11,32
1883	lvx		v24,r10,$sp
1884	addi		r10,r10,32
1885	lvx		v25,r11,$sp
1886	addi		r11,r11,32
1887	lvx		v26,r10,$sp
1888	addi		r10,r10,32
1889	lvx		v27,r11,$sp
1890	addi		r11,r11,32
1891	lvx		v28,r10,$sp
1892	addi		r10,r10,32
1893	lvx		v29,r11,$sp
1894	addi		r11,r11,32
1895	lvx		v30,r10,$sp
1896	lvx		v31,r11,$sp
1897	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1898	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1899	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1900	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1901	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1902	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1903	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1904	blr
1905	.long		0
1906	.byte		0,12,0x04,0,0x80,6,6,0
1907	.long		0
1908.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1909___
1910}}	}}}
1911
1912#########################################################################
1913{{{	# XTS procedures						#
1914# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1915#                             const AES_KEY *key1, const AES_KEY *key2,	#
1916#                             [const] unsigned char iv[16]);		#
1917# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1918# input tweak value is assumed to be encrypted already, and last tweak	#
1919# value, one suitable for consecutive call on same chunk of data, is	#
1920# written back to original buffer. In addition, in "tweak chaining"	#
1921# mode only complete input blocks are processed.			#
1922
# Register map for the XTS routines emitted below.
#
# The first list assigns r3..r10 in argument order to
# $inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx.  The swap at the end of
# this group then exchanges $inp and $idx, leaving $inp = r10 and
# $idx = r3.  The emitted code copies the incoming r3 into $inp
# ("mr $inp,r3") and afterwards loads r3 with -1 or 0 before returning.
#
# Vector registers: v0-v2 = $rndkey0,$rndkey1,$inout;
# v3-v7 = $output,$inptail,$inpperm,$leperm,$keyperm;
# v8-v12 = $tweak,$seven,$eighty7,$tmp,$tweak1.
1923my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1924my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1925my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1926my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
# $taillen shares its register with $key2; the encrypt code first writes
# $taillen only after the tweak has been derived with $key2.
1927my $taillen = $key2;
1928
1929   ($inp,$idx) = ($idx,$inp);				# reassign
1930
1931$code.=<<___;
1932.globl	.${prefix}_xts_encrypt
1933.align	5
1934.${prefix}_xts_encrypt:
1935	mr		$inp,r3				# reassign
1936	li		r3,-1
1937	${UCMP}i	$len,16
1938	bltlr-
1939
1940	lis		r0,0xfff0
1941	mfspr		r12,256				# save vrsave
1942	li		r11,0
1943	mtspr		256,r0
1944
1945	vspltisb	$seven,0x07			# 0x070707..07
1946	le?lvsl		$leperm,r11,r11
1947	le?vspltisb	$tmp,0x0f
1948	le?vxor		$leperm,$leperm,$seven
1949
1950	li		$idx,15
1951	lvx		$tweak,0,$ivp			# load [unaligned] iv
1952	lvsl		$inpperm,0,$ivp
1953	lvx		$inptail,$idx,$ivp
1954	le?vxor		$inpperm,$inpperm,$tmp
1955	vperm		$tweak,$tweak,$inptail,$inpperm
1956
1957	neg		r11,$inp
1958	lvsr		$inpperm,0,r11			# prepare for unaligned load
1959	lvx		$inout,0,$inp
1960	addi		$inp,$inp,15			# 15 is not typo
1961	le?vxor		$inpperm,$inpperm,$tmp
1962
1963	${UCMP}i	$key2,0				# key2==NULL?
1964	beq		Lxts_enc_no_key2
1965
1966	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
1967	lwz		$rounds,240($key2)
1968	srwi		$rounds,$rounds,1
1969	subi		$rounds,$rounds,1
1970	li		$idx,16
1971
1972	lvx		$rndkey0,0,$key2
1973	lvx		$rndkey1,$idx,$key2
1974	addi		$idx,$idx,16
1975	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1976	vxor		$tweak,$tweak,$rndkey0
1977	lvx		$rndkey0,$idx,$key2
1978	addi		$idx,$idx,16
1979	mtctr		$rounds
1980
1981Ltweak_xts_enc:
1982	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1983	vcipher		$tweak,$tweak,$rndkey1
1984	lvx		$rndkey1,$idx,$key2
1985	addi		$idx,$idx,16
1986	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1987	vcipher		$tweak,$tweak,$rndkey0
1988	lvx		$rndkey0,$idx,$key2
1989	addi		$idx,$idx,16
1990	bdnz		Ltweak_xts_enc
1991
1992	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1993	vcipher		$tweak,$tweak,$rndkey1
1994	lvx		$rndkey1,$idx,$key2
1995	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1996	vcipherlast	$tweak,$tweak,$rndkey0
1997
1998	li		$ivp,0				# don't chain the tweak
1999	b		Lxts_enc
2000
2001Lxts_enc_no_key2:
2002	li		$idx,-16
2003	and		$len,$len,$idx			# in "tweak chaining"
2004							# mode only complete
2005							# blocks are processed
2006Lxts_enc:
2007	lvx		$inptail,0,$inp
2008	addi		$inp,$inp,16
2009
2010	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2011	lwz		$rounds,240($key1)
2012	srwi		$rounds,$rounds,1
2013	subi		$rounds,$rounds,1
2014	li		$idx,16
2015
2016	vslb		$eighty7,$seven,$seven		# 0x808080..80
2017	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2018	vspltisb	$tmp,1				# 0x010101..01
2019	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2020
2021	${UCMP}i	$len,96
2022	bge		_aesp8_xts_encrypt6x
2023
2024	andi.		$taillen,$len,15
2025	subic		r0,$len,32
2026	subi		$taillen,$taillen,16
2027	subfe		r0,r0,r0
2028	and		r0,r0,$taillen
2029	add		$inp,$inp,r0
2030
2031	lvx		$rndkey0,0,$key1
2032	lvx		$rndkey1,$idx,$key1
2033	addi		$idx,$idx,16
2034	vperm		$inout,$inout,$inptail,$inpperm
2035	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2036	vxor		$inout,$inout,$tweak
2037	vxor		$inout,$inout,$rndkey0
2038	lvx		$rndkey0,$idx,$key1
2039	addi		$idx,$idx,16
2040	mtctr		$rounds
2041	b		Loop_xts_enc
2042
2043.align	5
2044Loop_xts_enc:
2045	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2046	vcipher		$inout,$inout,$rndkey1
2047	lvx		$rndkey1,$idx,$key1
2048	addi		$idx,$idx,16
2049	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2050	vcipher		$inout,$inout,$rndkey0
2051	lvx		$rndkey0,$idx,$key1
2052	addi		$idx,$idx,16
2053	bdnz		Loop_xts_enc
2054
2055	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2056	vcipher		$inout,$inout,$rndkey1
2057	lvx		$rndkey1,$idx,$key1
2058	li		$idx,16
2059	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2060	vxor		$rndkey0,$rndkey0,$tweak
2061	vcipherlast	$output,$inout,$rndkey0
2062
2063	le?vperm	$tmp,$output,$output,$leperm
2064	be?nop
2065	le?stvx_u	$tmp,0,$out
2066	be?stvx_u	$output,0,$out
2067	addi		$out,$out,16
2068
2069	subic.		$len,$len,16
2070	beq		Lxts_enc_done
2071
2072	vmr		$inout,$inptail
2073	lvx		$inptail,0,$inp
2074	addi		$inp,$inp,16
2075	lvx		$rndkey0,0,$key1
2076	lvx		$rndkey1,$idx,$key1
2077	addi		$idx,$idx,16
2078
2079	subic		r0,$len,32
2080	subfe		r0,r0,r0
2081	and		r0,r0,$taillen
2082	add		$inp,$inp,r0
2083
2084	vsrab		$tmp,$tweak,$seven		# next tweak value
2085	vaddubm		$tweak,$tweak,$tweak
2086	vsldoi		$tmp,$tmp,$tmp,15
2087	vand		$tmp,$tmp,$eighty7
2088	vxor		$tweak,$tweak,$tmp
2089
2090	vperm		$inout,$inout,$inptail,$inpperm
2091	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2092	vxor		$inout,$inout,$tweak
2093	vxor		$output,$output,$rndkey0	# just in case $len<16
2094	vxor		$inout,$inout,$rndkey0
2095	lvx		$rndkey0,$idx,$key1
2096	addi		$idx,$idx,16
2097
2098	mtctr		$rounds
2099	${UCMP}i	$len,16
2100	bge		Loop_xts_enc
2101
2102	vxor		$output,$output,$tweak
2103	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2104	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2105	vspltisb	$tmp,-1
2106	vperm		$inptail,$inptail,$tmp,$inpperm
2107	vsel		$inout,$inout,$output,$inptail
2108
2109	subi		r11,$out,17
2110	subi		$out,$out,16
2111	mtctr		$len
2112	li		$len,16
2113Loop_xts_enc_steal:
2114	lbzu		r0,1(r11)
2115	stb		r0,16(r11)
2116	bdnz		Loop_xts_enc_steal
2117
2118	mtctr		$rounds
2119	b		Loop_xts_enc			# one more time...
2120
2121Lxts_enc_done:
2122	${UCMP}i	$ivp,0
2123	beq		Lxts_enc_ret
2124
2125	vsrab		$tmp,$tweak,$seven		# next tweak value
2126	vaddubm		$tweak,$tweak,$tweak
2127	vsldoi		$tmp,$tmp,$tmp,15
2128	vand		$tmp,$tmp,$eighty7
2129	vxor		$tweak,$tweak,$tmp
2130
2131	le?vperm	$tweak,$tweak,$tweak,$leperm
2132	stvx_u		$tweak,0,$ivp
2133
2134Lxts_enc_ret:
2135	mtspr		256,r12				# restore vrsave
2136	li		r3,0
2137	blr
2138	.long		0
2139	.byte		0,12,0x04,0,0x80,6,6,0
2140	.long		0
2141.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2142
2143.globl	.${prefix}_xts_decrypt
2144.align	5
2145.${prefix}_xts_decrypt:
2146	mr		$inp,r3				# reassign
2147	li		r3,-1
2148	${UCMP}i	$len,16
2149	bltlr-
2150
2151	lis		r0,0xfff8
2152	mfspr		r12,256				# save vrsave
2153	li		r11,0
2154	mtspr		256,r0
2155
2156	andi.		r0,$len,15
2157	neg		r0,r0
2158	andi.		r0,r0,16
2159	sub		$len,$len,r0
2160
2161	vspltisb	$seven,0x07			# 0x070707..07
2162	le?lvsl		$leperm,r11,r11
2163	le?vspltisb	$tmp,0x0f
2164	le?vxor		$leperm,$leperm,$seven
2165
2166	li		$idx,15
2167	lvx		$tweak,0,$ivp			# load [unaligned] iv
2168	lvsl		$inpperm,0,$ivp
2169	lvx		$inptail,$idx,$ivp
2170	le?vxor		$inpperm,$inpperm,$tmp
2171	vperm		$tweak,$tweak,$inptail,$inpperm
2172
2173	neg		r11,$inp
2174	lvsr		$inpperm,0,r11			# prepare for unaligned load
2175	lvx		$inout,0,$inp
2176	addi		$inp,$inp,15			# 15 is not typo
2177	le?vxor		$inpperm,$inpperm,$tmp
2178
2179	${UCMP}i	$key2,0				# key2==NULL?
2180	beq		Lxts_dec_no_key2
2181
2182	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2183	lwz		$rounds,240($key2)
2184	srwi		$rounds,$rounds,1
2185	subi		$rounds,$rounds,1
2186	li		$idx,16
2187
2188	lvx		$rndkey0,0,$key2
2189	lvx		$rndkey1,$idx,$key2
2190	addi		$idx,$idx,16
2191	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2192	vxor		$tweak,$tweak,$rndkey0
2193	lvx		$rndkey0,$idx,$key2
2194	addi		$idx,$idx,16
2195	mtctr		$rounds
2196
2197Ltweak_xts_dec:
2198	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2199	vcipher		$tweak,$tweak,$rndkey1
2200	lvx		$rndkey1,$idx,$key2
2201	addi		$idx,$idx,16
2202	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2203	vcipher		$tweak,$tweak,$rndkey0
2204	lvx		$rndkey0,$idx,$key2
2205	addi		$idx,$idx,16
2206	bdnz		Ltweak_xts_dec
2207
2208	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2209	vcipher		$tweak,$tweak,$rndkey1
2210	lvx		$rndkey1,$idx,$key2
2211	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2212	vcipherlast	$tweak,$tweak,$rndkey0
2213
2214	li		$ivp,0				# don't chain the tweak
2215	b		Lxts_dec
2216
2217Lxts_dec_no_key2:
2218	neg		$idx,$len
2219	andi.		$idx,$idx,15
2220	add		$len,$len,$idx			# in "tweak chaining"
2221							# mode only complete
2222							# blocks are processed
2223Lxts_dec:
2224	lvx		$inptail,0,$inp
2225	addi		$inp,$inp,16
2226
2227	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2228	lwz		$rounds,240($key1)
2229	srwi		$rounds,$rounds,1
2230	subi		$rounds,$rounds,1
2231	li		$idx,16
2232
2233	vslb		$eighty7,$seven,$seven		# 0x808080..80
2234	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2235	vspltisb	$tmp,1				# 0x010101..01
2236	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2237
2238	${UCMP}i	$len,96
2239	bge		_aesp8_xts_decrypt6x
2240
2241	lvx		$rndkey0,0,$key1
2242	lvx		$rndkey1,$idx,$key1
2243	addi		$idx,$idx,16
2244	vperm		$inout,$inout,$inptail,$inpperm
2245	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2246	vxor		$inout,$inout,$tweak
2247	vxor		$inout,$inout,$rndkey0
2248	lvx		$rndkey0,$idx,$key1
2249	addi		$idx,$idx,16
2250	mtctr		$rounds
2251
2252	${UCMP}i	$len,16
2253	blt		Ltail_xts_dec
2254	be?b		Loop_xts_dec
2255
2256.align	5
2257Loop_xts_dec:
2258	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2259	vncipher	$inout,$inout,$rndkey1
2260	lvx		$rndkey1,$idx,$key1
2261	addi		$idx,$idx,16
2262	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2263	vncipher	$inout,$inout,$rndkey0
2264	lvx		$rndkey0,$idx,$key1
2265	addi		$idx,$idx,16
2266	bdnz		Loop_xts_dec
2267
2268	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2269	vncipher	$inout,$inout,$rndkey1
2270	lvx		$rndkey1,$idx,$key1
2271	li		$idx,16
2272	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2273	vxor		$rndkey0,$rndkey0,$tweak
2274	vncipherlast	$output,$inout,$rndkey0
2275
2276	le?vperm	$tmp,$output,$output,$leperm
2277	be?nop
2278	le?stvx_u	$tmp,0,$out
2279	be?stvx_u	$output,0,$out
2280	addi		$out,$out,16
2281
2282	subic.		$len,$len,16
2283	beq		Lxts_dec_done
2284
2285	vmr		$inout,$inptail
2286	lvx		$inptail,0,$inp
2287	addi		$inp,$inp,16
2288	lvx		$rndkey0,0,$key1
2289	lvx		$rndkey1,$idx,$key1
2290	addi		$idx,$idx,16
2291
2292	vsrab		$tmp,$tweak,$seven		# next tweak value
2293	vaddubm		$tweak,$tweak,$tweak
2294	vsldoi		$tmp,$tmp,$tmp,15
2295	vand		$tmp,$tmp,$eighty7
2296	vxor		$tweak,$tweak,$tmp
2297
2298	vperm		$inout,$inout,$inptail,$inpperm
2299	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2300	vxor		$inout,$inout,$tweak
2301	vxor		$inout,$inout,$rndkey0
2302	lvx		$rndkey0,$idx,$key1
2303	addi		$idx,$idx,16
2304
2305	mtctr		$rounds
2306	${UCMP}i	$len,16
2307	bge		Loop_xts_dec
2308
2309Ltail_xts_dec:
2310	vsrab		$tmp,$tweak,$seven		# next tweak value
2311	vaddubm		$tweak1,$tweak,$tweak
2312	vsldoi		$tmp,$tmp,$tmp,15
2313	vand		$tmp,$tmp,$eighty7
2314	vxor		$tweak1,$tweak1,$tmp
2315
2316	subi		$inp,$inp,16
2317	add		$inp,$inp,$len
2318
2319	vxor		$inout,$inout,$tweak		# :-(
2320	vxor		$inout,$inout,$tweak1		# :-)
2321
2322Loop_xts_dec_short:
2323	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2324	vncipher	$inout,$inout,$rndkey1
2325	lvx		$rndkey1,$idx,$key1
2326	addi		$idx,$idx,16
2327	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2328	vncipher	$inout,$inout,$rndkey0
2329	lvx		$rndkey0,$idx,$key1
2330	addi		$idx,$idx,16
2331	bdnz		Loop_xts_dec_short
2332
2333	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2334	vncipher	$inout,$inout,$rndkey1
2335	lvx		$rndkey1,$idx,$key1
2336	li		$idx,16
2337	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2338	vxor		$rndkey0,$rndkey0,$tweak1
2339	vncipherlast	$output,$inout,$rndkey0
2340
2341	le?vperm	$tmp,$output,$output,$leperm
2342	be?nop
2343	le?stvx_u	$tmp,0,$out
2344	be?stvx_u	$output,0,$out
2345
2346	vmr		$inout,$inptail
2347	lvx		$inptail,0,$inp
2348	#addi		$inp,$inp,16
2349	lvx		$rndkey0,0,$key1
2350	lvx		$rndkey1,$idx,$key1
2351	addi		$idx,$idx,16
2352	vperm		$inout,$inout,$inptail,$inpperm
2353	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2354
2355	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2356	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2357	vspltisb	$tmp,-1
2358	vperm		$inptail,$inptail,$tmp,$inpperm
2359	vsel		$inout,$inout,$output,$inptail
2360
2361	vxor		$rndkey0,$rndkey0,$tweak
2362	vxor		$inout,$inout,$rndkey0
2363	lvx		$rndkey0,$idx,$key1
2364	addi		$idx,$idx,16
2365
2366	subi		r11,$out,1
2367	mtctr		$len
2368	li		$len,16
2369Loop_xts_dec_steal:
2370	lbzu		r0,1(r11)
2371	stb		r0,16(r11)
2372	bdnz		Loop_xts_dec_steal
2373
2374	mtctr		$rounds
2375	b		Loop_xts_dec			# one more time...
2376
2377Lxts_dec_done:
2378	${UCMP}i	$ivp,0
2379	beq		Lxts_dec_ret
2380
2381	vsrab		$tmp,$tweak,$seven		# next tweak value
2382	vaddubm		$tweak,$tweak,$tweak
2383	vsldoi		$tmp,$tmp,$tmp,15
2384	vand		$tmp,$tmp,$eighty7
2385	vxor		$tweak,$tweak,$tmp
2386
2387	le?vperm	$tweak,$tweak,$tweak,$leperm
2388	stvx_u		$tweak,0,$ivp
2389
2390Lxts_dec_ret:
2391	mtspr		256,r12				# restore vrsave
2392	li		r3,0
2393	blr
2394	.long		0
2395	.byte		0,12,0x04,0,0x80,6,6,0
2396	.long		0
2397.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2398___
2399#########################################################################
2400{{	# Optimized XTS procedures					#
# Register allocation for the 6x-interleaved XTS routines generated below.
#
# $key_ reuses the register of $key2; the routines below point it at the
# stack copy of the round keys (addi $key_,$sp,$FRAME+15).
2401my $key_=$key2;
# $x10..$x70 are GPRs that the prologue loads with the byte offsets
# 0x10..0x70 for indexed vector loads/stores.  $x00 maps to r0, which the
# prologue never loads with an offset; NOTE(review): this presumably relies
# on register 0 reading as zero in the base-register slot of indexed
# loads/stores - confirm against the ISA.  The osx flavour gets a literal 0.
2402my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2403    $x00=0 if ($flavour =~ /osx/);
# Six input blocks, six blocks in flight and six per-block tweaks.
2404my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2405my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2406my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2407my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
2408			# v26-v31 last 6 round keys
2409my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
# $taillen shares r31 with $x70: $x70 is last used as an offset while the
# key schedule is loaded, after which the register holds len mod 16.
2410my $taillen=$x70;
2411
2412$code.=<<___;
2413.align	5
# 6x-interleaved XTS encryption path: prologue.
# NOTE(review): presumably entered by a plain branch from the public
# xts_encrypt entry point (the decrypt twin is reached with bge), so the blr
# in the epilogue would return to that function's caller - confirm.
#
# Frame layout relative to FRAME, as used by this routine:
#   +0     .. +8*16    stack copies of round keys (written by the key loader)
#   +8*16  .. +20*16   saved v20-v31
#   +21*16-4           saved vrsave word
#   +21*16 ..          saved r26-r31 (six GPR slots)
# The vector offsets carry a +15/+31 bias; NOTE(review): this presumably
# relies on stvx/lvx ignoring the low four address bits - confirm.
2414_aesp8_xts_encrypt6x:
2415	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2416	mflr		r11
2417	li		r7,`$FRAME+8*16+15`
2418	li		r3,`$FRAME+8*16+31`
# LR is kept in r11 for the whole routine (the tail paths use bl) and is also
# stored at frame size + LRSAVE, i.e. above the frame allocated here.
2419	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
# Save v20-v31 using two offset registers 16 bytes apart, each stepped by 32.
# r3 is only a scratch offset here; it becomes x10 further down.
2420	stvx		v20,r7,$sp		# ABI says so
2421	addi		r7,r7,32
2422	stvx		v21,r3,$sp
2423	addi		r3,r3,32
2424	stvx		v22,r7,$sp
2425	addi		r7,r7,32
2426	stvx		v23,r3,$sp
2427	addi		r3,r3,32
2428	stvx		v24,r7,$sp
2429	addi		r7,r7,32
2430	stvx		v25,r3,$sp
2431	addi		r3,r3,32
2432	stvx		v26,r7,$sp
2433	addi		r7,r7,32
2434	stvx		v27,r3,$sp
2435	addi		r3,r3,32
2436	stvx		v28,r7,$sp
2437	addi		r7,r7,32
2438	stvx		v29,r3,$sp
2439	addi		r3,r3,32
2440	stvx		v30,r7,$sp
2441	stvx		v31,r3,$sp
# r0 = -1 is written to SPR 256 (vrsave) at the end of the prologue, after
# the incoming vrsave value has been stored in the frame.
2442	li		r0,-1
2443	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
# Load the constant offsets 0x10..0x70 into x10..x70, interleaved with the
# saves of r26-r31 (the registers backing x20..x70).
2444	li		$x10,0x10
2445	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2446	li		$x20,0x20
2447	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2448	li		$x30,0x30
2449	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2450	li		$x40,0x40
2451	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2452	li		$x50,0x50
2453	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2454	li		$x60,0x60
2455	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2456	li		$x70,0x70
2457	mtspr		256,r0
2458
# Stage the key schedule and set up the first batch of six blocks.
#
# ctr is loaded with the adjusted round counter.  Each pass of the loop below
# aligns two round keys through keyperm and off-loads them to the stack copy
# at key_.  After the loop two more keys are off-loaded and the last six round
# keys stay resident in v26-v31.
# NOTE(review): "-4 in total" presumably counts a decrement already made by
# the caller (the decrypt entry visible earlier in the file does one).
# NOTE(review): the ?-prefixed mnemonics are endian-sensitive forms resolved
# by the perlasm translator, which is not part of this file.
2459	subi		$rounds,$rounds,3	# -4 in total
2460
2461	lvx		$rndkey0,$x00,$key1	# load key schedule
2462	lvx		v30,$x10,$key1
2463	addi		$key1,$key1,0x20
2464	lvx		v31,$x00,$key1
2465	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2466	addi		$key_,$sp,$FRAME+15
2467	mtctr		$rounds
2468
2469Load_xts_enc_key:
2470	?vperm		v24,v30,v31,$keyperm
2471	lvx		v30,$x10,$key1
2472	addi		$key1,$key1,0x20
2473	stvx		v24,$x00,$key_		# off-load round[1]
2474	?vperm		v25,v31,v30,$keyperm
2475	lvx		v31,$x00,$key1
2476	stvx		v25,$x10,$key_		# off-load round[2]
2477	addi		$key_,$key_,0x20
2478	bdnz		Load_xts_enc_key
2479
# Off-load the final streamed pair, then align the last six round keys into
# v26-v31.  twk5 is borrowed as scratch for the extra load needed to align
# v31.  key_ is rewound and v24/v25 are pre-loaded with the first pair.
2480	lvx		v26,$x10,$key1
2481	?vperm		v24,v30,v31,$keyperm
2482	lvx		v27,$x20,$key1
2483	stvx		v24,$x00,$key_		# off-load round[3]
2484	?vperm		v25,v31,v26,$keyperm
2485	lvx		v28,$x30,$key1
2486	stvx		v25,$x10,$key_		# off-load round[4]
2487	addi		$key_,$sp,$FRAME+15	# rewind $key_
2488	?vperm		v26,v26,v27,$keyperm
2489	lvx		v29,$x40,$key1
2490	?vperm		v27,v27,v28,$keyperm
2491	lvx		v30,$x50,$key1
2492	?vperm		v28,v28,v29,$keyperm
2493	lvx		v31,$x60,$key1
2494	?vperm		v29,v29,v30,$keyperm
2495	lvx		$twk5,$x70,$key1	# borrow $twk5
2496	?vperm		v30,v30,v31,$keyperm
2497	lvx		v24,$x00,$key_		# pre-load round[1]
2498	?vperm		v31,v31,$twk5,$keyperm
2499	lvx		v25,$x10,$key_		# pre-load round[2]
2500
# First batch.  Block 0 is merged from the two vectors the caller already
# loaded (inout/inptail through inpperm); blocks 1-5 are fetched with lvx_u.
# twkN = tweak N xor round-0 key, so the single vxor into outN applies both
# the XTS whitening and the initial AddRoundKey.
#
# Each "next tweak value" group doubles the tweak: vsrab yields a per-byte
# sign mask, vaddubm doubles every byte, vsldoi ...,15 rotates the masks by
# one byte position, vand with eighty7 reduces them to the bits to carry in,
# and vxor merges those into the doubled tweak.
#
# Interleaved scalar bookkeeping: taillen = len and 15, len is reduced to
# whole blocks minus the 0x60 bytes of this batch, and inp is advanced past
# the batch.
2501	 vperm		$in0,$inout,$inptail,$inpperm
2502	 subi		$inp,$inp,31		# undo "caller"
2503	vxor		$twk0,$tweak,$rndkey0
2504	vsrab		$tmp,$tweak,$seven	# next tweak value
2505	vaddubm		$tweak,$tweak,$tweak
2506	vsldoi		$tmp,$tmp,$tmp,15
2507	vand		$tmp,$tmp,$eighty7
2508	 vxor		$out0,$in0,$twk0
2509	vxor		$tweak,$tweak,$tmp
2510
2511	 lvx_u		$in1,$x10,$inp
2512	vxor		$twk1,$tweak,$rndkey0
2513	vsrab		$tmp,$tweak,$seven	# next tweak value
2514	vaddubm		$tweak,$tweak,$tweak
2515	vsldoi		$tmp,$tmp,$tmp,15
2516	 le?vperm	$in1,$in1,$in1,$leperm
2517	vand		$tmp,$tmp,$eighty7
2518	 vxor		$out1,$in1,$twk1
2519	vxor		$tweak,$tweak,$tmp
2520
2521	 lvx_u		$in2,$x20,$inp
2522	 andi.		$taillen,$len,15
2523	vxor		$twk2,$tweak,$rndkey0
2524	vsrab		$tmp,$tweak,$seven	# next tweak value
2525	vaddubm		$tweak,$tweak,$tweak
2526	vsldoi		$tmp,$tmp,$tmp,15
2527	 le?vperm	$in2,$in2,$in2,$leperm
2528	vand		$tmp,$tmp,$eighty7
2529	 vxor		$out2,$in2,$twk2
2530	vxor		$tweak,$tweak,$tmp
2531
2532	 lvx_u		$in3,$x30,$inp
2533	 sub		$len,$len,$taillen
2534	vxor		$twk3,$tweak,$rndkey0
2535	vsrab		$tmp,$tweak,$seven	# next tweak value
2536	vaddubm		$tweak,$tweak,$tweak
2537	vsldoi		$tmp,$tmp,$tmp,15
2538	 le?vperm	$in3,$in3,$in3,$leperm
2539	vand		$tmp,$tmp,$eighty7
2540	 vxor		$out3,$in3,$twk3
2541	vxor		$tweak,$tweak,$tmp
2542
2543	 lvx_u		$in4,$x40,$inp
2544	 subi		$len,$len,0x60
2545	vxor		$twk4,$tweak,$rndkey0
2546	vsrab		$tmp,$tweak,$seven	# next tweak value
2547	vaddubm		$tweak,$tweak,$tweak
2548	vsldoi		$tmp,$tmp,$tmp,15
2549	 le?vperm	$in4,$in4,$in4,$leperm
2550	vand		$tmp,$tmp,$eighty7
2551	 vxor		$out4,$in4,$twk4
2552	vxor		$tweak,$tweak,$tmp
2553
2554	 lvx_u		$in5,$x50,$inp
2555	 addi		$inp,$inp,0x60
2556	vxor		$twk5,$tweak,$rndkey0
2557	vsrab		$tmp,$tweak,$seven	# next tweak value
2558	vaddubm		$tweak,$tweak,$tweak
2559	vsldoi		$tmp,$tmp,$tmp,15
2560	 le?vperm	$in5,$in5,$in5,$leperm
2561	vand		$tmp,$tmp,$eighty7
2562	 vxor		$out5,$in5,$twk5
2563	vxor		$tweak,$tweak,$tmp
2564
# Fold the round-0 key into the last round key.  Because every twkN already
# contains the round-0 key, twkN xor v31 equals tweak N xor last round key,
# which is the operand later handed to vcipherlast.
2565	vxor		v31,v31,$rndkey0
2566	mtctr		$rounds
2567	b		Loop_xts_enc6x
2568
2569.align	5
# Main loop.  Each pass through the bdnz loop applies two vcipher rounds to
# all six blocks with the keys in v24/v25, then refills v24/v25 from the
# stack copy at key_ for the following pair.
2570Loop_xts_enc6x:
2571	vcipher		$out0,$out0,v24
2572	vcipher		$out1,$out1,v24
2573	vcipher		$out2,$out2,v24
2574	vcipher		$out3,$out3,v24
2575	vcipher		$out4,$out4,v24
2576	vcipher		$out5,$out5,v24
2577	lvx		v24,$x20,$key_		# round[3]
2578	addi		$key_,$key_,0x20
2579
2580	vcipher		$out0,$out0,v25
2581	vcipher		$out1,$out1,v25
2582	vcipher		$out2,$out2,v25
2583	vcipher		$out3,$out3,v25
2584	vcipher		$out4,$out4,v25
2585	vcipher		$out5,$out5,v25
2586	lvx		v25,$x10,$key_		# round[4]
2587	bdnz		Loop_xts_enc6x
2588
# Remaining rounds (v24, v25, then the resident v26-v30) interleaved with:
#  - inN = twkN xor v31, the per-block operand for vcipherlast;
#  - the six tweaks for the next batch (twkN = tweak xor rndkey0, followed by
#    the same doubling sequence used everywhere in this file);
#  - scalar bookkeeping: subic sets the carry bit, subfe. turns a borrow into
#    r0 = -1 (else 0) and records that in CR0, and r0 is then masked with the
#    already negative len and added to inp.  On a borrow inp therefore steps
#    back so that the six loads at the end of this pass finish on the last
#    whole blocks.  No instruction between subfe. and the closing beq writes
#    CR0.
2589	subic		$len,$len,96		# $len-=96
2590	 vxor		$in0,$twk0,v31		# xor with last round key
2591	vcipher		$out0,$out0,v24
2592	vcipher		$out1,$out1,v24
2593	 vsrab		$tmp,$tweak,$seven	# next tweak value
2594	 vxor		$twk0,$tweak,$rndkey0
2595	 vaddubm	$tweak,$tweak,$tweak
2596	vcipher		$out2,$out2,v24
2597	vcipher		$out3,$out3,v24
2598	 vsldoi		$tmp,$tmp,$tmp,15
2599	vcipher		$out4,$out4,v24
2600	vcipher		$out5,$out5,v24
2601
2602	subfe.		r0,r0,r0		# borrow?-1:0
2603	 vand		$tmp,$tmp,$eighty7
2604	vcipher		$out0,$out0,v25
2605	vcipher		$out1,$out1,v25
2606	 vxor		$tweak,$tweak,$tmp
2607	vcipher		$out2,$out2,v25
2608	vcipher		$out3,$out3,v25
2609	 vxor		$in1,$twk1,v31
2610	 vsrab		$tmp,$tweak,$seven	# next tweak value
2611	 vxor		$twk1,$tweak,$rndkey0
2612	vcipher		$out4,$out4,v25
2613	vcipher		$out5,$out5,v25
2614
2615	and		r0,r0,$len
2616	 vaddubm	$tweak,$tweak,$tweak
2617	 vsldoi		$tmp,$tmp,$tmp,15
2618	vcipher		$out0,$out0,v26
2619	vcipher		$out1,$out1,v26
2620	 vand		$tmp,$tmp,$eighty7
2621	vcipher		$out2,$out2,v26
2622	vcipher		$out3,$out3,v26
2623	 vxor		$tweak,$tweak,$tmp
2624	vcipher		$out4,$out4,v26
2625	vcipher		$out5,$out5,v26
2626
2627	add		$inp,$inp,r0		# $inp is adjusted in such
2628						# way that at exit from the
2629						# loop inX-in5 are loaded
2630						# with last "words"
2631	 vxor		$in2,$twk2,v31
2632	 vsrab		$tmp,$tweak,$seven	# next tweak value
2633	 vxor		$twk2,$tweak,$rndkey0
2634	 vaddubm	$tweak,$tweak,$tweak
2635	vcipher		$out0,$out0,v27
2636	vcipher		$out1,$out1,v27
2637	 vsldoi		$tmp,$tmp,$tmp,15
2638	vcipher		$out2,$out2,v27
2639	vcipher		$out3,$out3,v27
2640	 vand		$tmp,$tmp,$eighty7
2641	vcipher		$out4,$out4,v27
2642	vcipher		$out5,$out5,v27
2643
2644	addi		$key_,$sp,$FRAME+15	# rewind $key_
2645	 vxor		$tweak,$tweak,$tmp
2646	vcipher		$out0,$out0,v28
2647	vcipher		$out1,$out1,v28
2648	 vxor		$in3,$twk3,v31
2649	 vsrab		$tmp,$tweak,$seven	# next tweak value
2650	 vxor		$twk3,$tweak,$rndkey0
2651	vcipher		$out2,$out2,v28
2652	vcipher		$out3,$out3,v28
2653	 vaddubm	$tweak,$tweak,$tweak
2654	 vsldoi		$tmp,$tmp,$tmp,15
2655	vcipher		$out4,$out4,v28
2656	vcipher		$out5,$out5,v28
2657	lvx		v24,$x00,$key_		# re-pre-load round[1]
2658	 vand		$tmp,$tmp,$eighty7
2659
2660	vcipher		$out0,$out0,v29
2661	vcipher		$out1,$out1,v29
2662	 vxor		$tweak,$tweak,$tmp
2663	vcipher		$out2,$out2,v29
2664	vcipher		$out3,$out3,v29
2665	 vxor		$in4,$twk4,v31
2666	 vsrab		$tmp,$tweak,$seven	# next tweak value
2667	 vxor		$twk4,$tweak,$rndkey0
2668	vcipher		$out4,$out4,v29
2669	vcipher		$out5,$out5,v29
2670	lvx		v25,$x10,$key_		# re-pre-load round[2]
2671	 vaddubm	$tweak,$tweak,$tweak
2672	 vsldoi		$tmp,$tmp,$tmp,15
2673
2674	vcipher		$out0,$out0,v30
2675	vcipher		$out1,$out1,v30
2676	 vand		$tmp,$tmp,$eighty7
2677	vcipher		$out2,$out2,v30
2678	vcipher		$out3,$out3,v30
2679	 vxor		$tweak,$tweak,$tmp
2680	vcipher		$out4,$out4,v30
2681	vcipher		$out5,$out5,v30
2682	 vxor		$in5,$twk5,v31
2683	 vsrab		$tmp,$tweak,$seven	# next tweak value
2684	 vxor		$twk5,$tweak,$rndkey0
2685
# Final round.  Each inN is consumed by vcipherlast and immediately reused as
# the destination of the load for the next batch.  The sixth result goes to
# tmp rather than out5 so that an unpermuted copy of the last ciphertext block
# survives for the stealing path.
2686	vcipherlast	$out0,$out0,$in0
2687	 lvx_u		$in0,$x00,$inp		# load next input block
2688	 vaddubm	$tweak,$tweak,$tweak
2689	 vsldoi		$tmp,$tmp,$tmp,15
2690	vcipherlast	$out1,$out1,$in1
2691	 lvx_u		$in1,$x10,$inp
2692	vcipherlast	$out2,$out2,$in2
2693	 le?vperm	$in0,$in0,$in0,$leperm
2694	 lvx_u		$in2,$x20,$inp
2695	 vand		$tmp,$tmp,$eighty7
2696	vcipherlast	$out3,$out3,$in3
2697	 le?vperm	$in1,$in1,$in1,$leperm
2698	 lvx_u		$in3,$x30,$inp
2699	vcipherlast	$out4,$out4,$in4
2700	 le?vperm	$in2,$in2,$in2,$leperm
2701	 lvx_u		$in4,$x40,$inp
2702	 vxor		$tweak,$tweak,$tmp
2703	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2704						# in stealing mode
2705	 le?vperm	$in3,$in3,$in3,$leperm
2706	 lvx_u		$in5,$x50,$inp
2707	 addi		$inp,$inp,0x60
2708	 le?vperm	$in4,$in4,$in4,$leperm
2709	 le?vperm	$in5,$in5,$in5,$leperm
2710
# Store the six ciphertext blocks and start the next batch by whitening the
# freshly loaded inputs with the new tweaks.
# NOTE(review): le?/be? lines are presumably assembled only for the matching
# endianness by the perlasm translator; here they pick between storing the
# permuted copy in out5 and the raw value in tmp.
2711	le?vperm	$out0,$out0,$out0,$leperm
2712	le?vperm	$out1,$out1,$out1,$leperm
2713	stvx_u		$out0,$x00,$out		# store output
2714	 vxor		$out0,$in0,$twk0
2715	le?vperm	$out2,$out2,$out2,$leperm
2716	stvx_u		$out1,$x10,$out
2717	 vxor		$out1,$in1,$twk1
2718	le?vperm	$out3,$out3,$out3,$leperm
2719	stvx_u		$out2,$x20,$out
2720	 vxor		$out2,$in2,$twk2
2721	le?vperm	$out4,$out4,$out4,$leperm
2722	stvx_u		$out3,$x30,$out
2723	 vxor		$out3,$in3,$twk3
2724	le?vperm	$out5,$tmp,$tmp,$leperm
2725	stvx_u		$out4,$x40,$out
2726	 vxor		$out4,$in4,$twk4
2727	le?stvx_u	$out5,$x50,$out
2728	be?stvx_u	$tmp, $x50,$out
2729	 vxor		$out5,$in5,$twk5
2730	addi		$out,$out,0x60
2731
# ctr is reloaded for whichever round loop runs next.  The beq tests the CR0
# written by subfe. above: equal means r0 was 0, i.e. no borrow, so another
# full batch of six blocks is available.
2732	mtctr		$rounds
2733	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2734
# Tail dispatch.  Adding 0x60 back turns len into the number of bytes of
# whole blocks still to be processed (a multiple of 16 below 0x60) and sets
# CR0.  The compare/branch ladder picks the handler for 0, 1, 2, 3 or 4
# blocks and falls through for 5.
2735	addic.		$len,$len,0x60
2736	beq		Lxts_enc6x_zero
2737	cmpwi		$len,0x20
2738	blt		Lxts_enc6x_one
2739	nop
2740	beq		Lxts_enc6x_two
2741	cmpwi		$len,0x40
2742	blt		Lxts_enc6x_three
2743	nop
2744	beq		Lxts_enc6x_four
2745
# Five blocks left.  They sit in in1-in5 because the last loads of the main
# loop were positioned to end on the final whole block.  They are whitened
# with twk0-twk4 and handed to the five-lane helper.
2746Lxts_enc6x_five:
2747	vxor		$out0,$in1,$twk0
2748	vxor		$out1,$in2,$twk1
2749	vxor		$out2,$in3,$twk2
2750	vxor		$out3,$in4,$twk3
2751	vxor		$out4,$in5,$twk4
2752
2753	bl		_aesp8_xts_enc5x
2754
# After the helper returns, twk5 is the first unused tweak: it is moved to
# twk0 (the register read by the steal and done code) and xored into a copy
# of the last ciphertext block in tmp before that block is byte-permuted for
# the store.  The closing bne tests CR0 as left by the cmpwi of taillen
# against 0 inside the helper.
2755	le?vperm	$out0,$out0,$out0,$leperm
2756	vmr		$twk0,$twk5		# unused tweak
2757	le?vperm	$out1,$out1,$out1,$leperm
2758	stvx_u		$out0,$x00,$out		# store output
2759	le?vperm	$out2,$out2,$out2,$leperm
2760	stvx_u		$out1,$x10,$out
2761	le?vperm	$out3,$out3,$out3,$leperm
2762	stvx_u		$out2,$x20,$out
2763	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2764	le?vperm	$out4,$out4,$out4,$leperm
2765	stvx_u		$out3,$x30,$out
2766	stvx_u		$out4,$x40,$out
2767	addi		$out,$out,0x50
2768	bne		Lxts_enc6x_steal
2769	b		Lxts_enc6x_done
2770
2771.align	4
# Four blocks left (in in2-in5).  The fifth lane of the helper is fed an
# all-zero vector and its result is never stored.
2772Lxts_enc6x_four:
2773	vxor		$out0,$in2,$twk0
2774	vxor		$out1,$in3,$twk1
2775	vxor		$out2,$in4,$twk2
2776	vxor		$out3,$in5,$twk3
2777	vxor		$out4,$out4,$out4
2778
2779	bl		_aesp8_xts_enc5x
2780
# twk4 is the first unused tweak; tmp keeps the last ciphertext block xored
# with it for the stealing path.  bne tests the taillen compare made inside
# the helper.
2781	le?vperm	$out0,$out0,$out0,$leperm
2782	vmr		$twk0,$twk4		# unused tweak
2783	le?vperm	$out1,$out1,$out1,$leperm
2784	stvx_u		$out0,$x00,$out		# store output
2785	le?vperm	$out2,$out2,$out2,$leperm
2786	stvx_u		$out1,$x10,$out
2787	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2788	le?vperm	$out3,$out3,$out3,$leperm
2789	stvx_u		$out2,$x20,$out
2790	stvx_u		$out3,$x30,$out
2791	addi		$out,$out,0x40
2792	bne		Lxts_enc6x_steal
2793	b		Lxts_enc6x_done
2794
2795.align	4
# Three blocks left (in in3-in5).  The two spare helper lanes are fed
# all-zero vectors and their results are never stored.
2796Lxts_enc6x_three:
2797	vxor		$out0,$in3,$twk0
2798	vxor		$out1,$in4,$twk1
2799	vxor		$out2,$in5,$twk2
2800	vxor		$out3,$out3,$out3
2801	vxor		$out4,$out4,$out4
2802
2803	bl		_aesp8_xts_enc5x
2804
# twk3 is the first unused tweak; tmp keeps the last ciphertext block xored
# with it for the stealing path.  bne tests the taillen compare made inside
# the helper.
2805	le?vperm	$out0,$out0,$out0,$leperm
2806	vmr		$twk0,$twk3		# unused tweak
2807	le?vperm	$out1,$out1,$out1,$leperm
2808	stvx_u		$out0,$x00,$out		# store output
2809	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2810	le?vperm	$out2,$out2,$out2,$leperm
2811	stvx_u		$out1,$x10,$out
2812	stvx_u		$out2,$x20,$out
2813	addi		$out,$out,0x30
2814	bne		Lxts_enc6x_steal
2815	b		Lxts_enc6x_done
2816
2817.align	4
# Two blocks left (in in4 and in5).  The three spare helper lanes are fed
# all-zero vectors and their results are never stored.
2818Lxts_enc6x_two:
2819	vxor		$out0,$in4,$twk0
2820	vxor		$out1,$in5,$twk1
2821	vxor		$out2,$out2,$out2
2822	vxor		$out3,$out3,$out3
2823	vxor		$out4,$out4,$out4
2824
2825	bl		_aesp8_xts_enc5x
2826
# twk2 is the first unused tweak; tmp keeps the last ciphertext block xored
# with it for the stealing path.  bne tests the taillen compare made inside
# the helper.
2827	le?vperm	$out0,$out0,$out0,$leperm
2828	vmr		$twk0,$twk2		# unused tweak
2829	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2830	le?vperm	$out1,$out1,$out1,$leperm
2831	stvx_u		$out0,$x00,$out		# store output
2832	stvx_u		$out1,$x10,$out
2833	addi		$out,$out,0x20
2834	bne		Lxts_enc6x_steal
2835	b		Lxts_enc6x_done
2836
2837.align	4
# One block left (in in5): encrypt it with a single-lane copy of the round
# sequence.  Loop_xts_enc1x is also the re-entry point of the stealing path,
# which arrives with out0 already prepared and taillen forced to 0.
2838Lxts_enc6x_one:
2839	vxor		$out0,$in5,$twk0
2840	nop
2841Loop_xts_enc1x:
2842	vcipher		$out0,$out0,v24
2843	lvx		v24,$x20,$key_		# round[3]
2844	addi		$key_,$key_,0x20
2845
2846	vcipher		$out0,$out0,v25
2847	lvx		v25,$x10,$key_		# round[4]
2848	bdnz		Loop_xts_enc1x
2849
# Remaining rounds, interleaved with preparation for a possible steal:
#  - inp is moved to 16 bytes before the end of the input (inp + taillen - 16)
#    and those 16 bytes are loaded into in0;
#  - cmpwi sets CR0 for the bne at the bottom of this section (nothing in
#    between writes CR0);
#  - inpperm is derived from taillen with lvsr and used to rotate in0.
#    NOTE(review): the rotation presumably brings the trailing taillen bytes
#    (the partial block) to the front - confirm against lvsr/vperm semantics.
2850	add		$inp,$inp,$taillen
2851	cmpwi		$taillen,0
2852	vcipher		$out0,$out0,v24
2853
2854	subi		$inp,$inp,16
2855	vcipher		$out0,$out0,v25
2856
2857	lvsr		$inpperm,0,$taillen
2858	vcipher		$out0,$out0,v26
2859
2860	lvx_u		$in0,0,$inp
2861	vcipher		$out0,$out0,v27
2862
2863	addi		$key_,$sp,$FRAME+15	# rewind $key_
2864	vcipher		$out0,$out0,v28
2865	lvx		v24,$x00,$key_		# re-pre-load round[1]
2866
2867	vcipher		$out0,$out0,v29
2868	lvx		v25,$x10,$key_		# re-pre-load round[2]
# twk0 xor v31 is the combined "tweak xor last round key" operand for
# vcipherlast (v31 already has the round-0 key folded in).
2869	 vxor		$twk0,$twk0,v31
2870
2871	le?vperm	$in0,$in0,$in0,$leperm
2872	vcipher		$out0,$out0,v30
2873
2874	vperm		$in0,$in0,$in0,$inpperm
2875	vcipherlast	$out0,$out0,$twk0
2876
# twk1 becomes the next unused tweak; tmp keeps the fresh ciphertext block
# xored with it in case a partial block still has to be stolen.
2877	vmr		$twk0,$twk1		# unused tweak
2878	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2879	le?vperm	$out0,$out0,$out0,$leperm
2880	stvx_u		$out0,$x00,$out		# store output
2881	addi		$out,$out,0x10
2882	bne		Lxts_enc6x_steal
2883	b		Lxts_enc6x_done
2884
2885.align	4
# No whole block is left after the 6x loop.  With taillen equal to 0 the job
# is finished.  Otherwise the last 16 input bytes are loaded and rotated
# through inpperm, and tmp (the last ciphertext block produced by the main
# loop) is xored with twk0, the first unused tweak, before joining the common
# stealing code.
2886Lxts_enc6x_zero:
2887	cmpwi		$taillen,0
2888	beq		Lxts_enc6x_done
2889
2890	add		$inp,$inp,$taillen
2891	subi		$inp,$inp,16
2892	lvx_u		$in0,0,$inp
2893	lvsr		$inpperm,0,$taillen	# $in5 is no more
2894	le?vperm	$in0,$in0,$in0,$leperm
2895	vperm		$in0,$in0,$in0,$inpperm
2896	vxor		$tmp,$tmp,$twk0
# Ciphertext stealing.  A byte mask is built by permuting an all-zero and an
# all-one vector through inpperm; vsel then takes tmp where the mask is set
# and the whitened in0 elsewhere, giving the final block to encrypt.
2897Lxts_enc6x_steal:
2898	vxor		$in0,$in0,$twk0
2899	vxor		$out0,$out0,$out0
2900	vspltisb	$out1,-1
2901	vperm		$out0,$out0,$out1,$inpperm
2902	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2903
# Copy the first taillen bytes of the last stored ciphertext block 16 bytes
# forward, into the partial output block.  out is stepped back by 16 so the
# re-entered single-block code overwrites that last full block.
# r30 (x60 in the register map) and r0 serve as scratch here.
2904	subi		r30,$out,17
2905	subi		$out,$out,16
2906	mtctr		$taillen
2907Loop_xts_enc6x_steal:
2908	lbzu		r0,1(r30)
2909	stb		r0,16(r30)
2910	bdnz		Loop_xts_enc6x_steal
2911
# taillen is cleared so that the second pass through the single-block code
# ends at Lxts_enc6x_done instead of stealing again.
2912	li		$taillen,0
2913	mtctr		$rounds
2914	b		Loop_xts_enc1x		# one more time...
2915
2916.align	4
# Common exit.  When ivp is non-zero the next unused tweak is written back:
# twk0 still has the round-0 key folded in, so it is xored with rndkey0
# first, then byte-permuted on little-endian builds and stored at ivp.
2917Lxts_enc6x_done:
2918	${UCMP}i	$ivp,0
2919	beq		Lxts_enc6x_ret
2920
2921	vxor		$tweak,$twk0,$rndkey0
2922	le?vperm	$tweak,$tweak,$tweak,$leperm
2923	stvx_u		$tweak,0,$ivp
2924
# Epilogue.  LR is restored from r11, which is then reused as an offset
# register.  Eight 16-byte slots starting at FRAME+15 (the stack copies of
# the round keys) are overwritten with the contents of the seven register.
# After those eight stores r10/r11 point at the v20/v21 save slots, so the
# same pair of offsets continues straight into the vector restores.
2925Lxts_enc6x_ret:
2926	mtlr		r11
2927	li		r10,`$FRAME+15`
2928	li		r11,`$FRAME+31`
2929	stvx		$seven,r10,$sp		# wipe copies of round keys
2930	addi		r10,r10,32
2931	stvx		$seven,r11,$sp
2932	addi		r11,r11,32
2933	stvx		$seven,r10,$sp
2934	addi		r10,r10,32
2935	stvx		$seven,r11,$sp
2936	addi		r11,r11,32
2937	stvx		$seven,r10,$sp
2938	addi		r10,r10,32
2939	stvx		$seven,r11,$sp
2940	addi		r11,r11,32
2941	stvx		$seven,r10,$sp
2942	addi		r10,r10,32
2943	stvx		$seven,r11,$sp
2944	addi		r11,r11,32
2945
# vrsave is restored from the register that still holds the incoming value;
# the copy stored in the frame by the prologue is not read back here.
2946	mtspr		256,$vrsave
2947	lvx		v20,r10,$sp		# ABI says so
2948	addi		r10,r10,32
2949	lvx		v21,r11,$sp
2950	addi		r11,r11,32
2951	lvx		v22,r10,$sp
2952	addi		r10,r10,32
2953	lvx		v23,r11,$sp
2954	addi		r11,r11,32
2955	lvx		v24,r10,$sp
2956	addi		r10,r10,32
2957	lvx		v25,r11,$sp
2958	addi		r11,r11,32
2959	lvx		v26,r10,$sp
2960	addi		r10,r10,32
2961	lvx		v27,r11,$sp
2962	addi		r11,r11,32
2963	lvx		v28,r10,$sp
2964	addi		r10,r10,32
2965	lvx		v29,r11,$sp
2966	addi		r11,r11,32
2967	lvx		v30,r10,$sp
2968	lvx		v31,r11,$sp
# Restore r26-r31, release the frame and return.
2969	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2970	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2971	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2972	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2973	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2974	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2975	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
2976	blr
# NOTE(review): the data words below look like a traceback table describing
# this routine - confirm the field meanings against the target ABI.
2977	.long		0
2978	.byte		0,12,0x04,1,0x80,6,6,0
2979	.long		0
2980
2981.align	5
# Five-lane helper for the tail of the 6x path, called with bl.
# In:  out0-out4 hold whitened blocks, twk0-twk4 the matching tweaks (with
#      the round-0 key folded in), ctr the round-loop count, v24/v25 the
#      first key pair, and key_ points at the stack copy of the round keys.
# Out: out0-out4 are encrypted; CR0 reflects taillen compared with 0; in0
#      holds the last 16 input bytes rotated through inpperm; key_, v24 and
#      v25 are rewound for another pass; twk0 and in1-in4 are overwritten
#      with the vcipherlast operands.
2982_aesp8_xts_enc5x:
2983	vcipher		$out0,$out0,v24
2984	vcipher		$out1,$out1,v24
2985	vcipher		$out2,$out2,v24
2986	vcipher		$out3,$out3,v24
2987	vcipher		$out4,$out4,v24
2988	lvx		v24,$x20,$key_		# round[3]
2989	addi		$key_,$key_,0x20
2990
2991	vcipher		$out0,$out0,v25
2992	vcipher		$out1,$out1,v25
2993	vcipher		$out2,$out2,v25
2994	vcipher		$out3,$out3,v25
2995	vcipher		$out4,$out4,v25
2996	lvx		v25,$x10,$key_		# round[4]
2997	bdnz		_aesp8_xts_enc5x
2998
# Remaining rounds.  Interleaved scalar work moves inp to 16 bytes before the
# end of the input and sets CR0 from taillen for the branch the caller takes
# after the return.  Interleaved vector work builds "tweak xor last round
# key" for each lane: twk0 is updated in place and in1-in4 are used for lanes
# 1-4 (v31 already has the round-0 key folded in).
2999	add		$inp,$inp,$taillen
3000	cmpwi		$taillen,0
3001	vcipher		$out0,$out0,v24
3002	vcipher		$out1,$out1,v24
3003	vcipher		$out2,$out2,v24
3004	vcipher		$out3,$out3,v24
3005	vcipher		$out4,$out4,v24
3006
3007	subi		$inp,$inp,16
3008	vcipher		$out0,$out0,v25
3009	vcipher		$out1,$out1,v25
3010	vcipher		$out2,$out2,v25
3011	vcipher		$out3,$out3,v25
3012	vcipher		$out4,$out4,v25
3013	 vxor		$twk0,$twk0,v31
3014
3015	vcipher		$out0,$out0,v26
3016	lvsr		$inpperm,0,$taillen	# $in5 is no more
3017	vcipher		$out1,$out1,v26
3018	vcipher		$out2,$out2,v26
3019	vcipher		$out3,$out3,v26
3020	vcipher		$out4,$out4,v26
3021	 vxor		$in1,$twk1,v31
3022
3023	vcipher		$out0,$out0,v27
3024	lvx_u		$in0,0,$inp
3025	vcipher		$out1,$out1,v27
3026	vcipher		$out2,$out2,v27
3027	vcipher		$out3,$out3,v27
3028	vcipher		$out4,$out4,v27
3029	 vxor		$in2,$twk2,v31
3030
3031	addi		$key_,$sp,$FRAME+15	# rewind $key_
3032	vcipher		$out0,$out0,v28
3033	vcipher		$out1,$out1,v28
3034	vcipher		$out2,$out2,v28
3035	vcipher		$out3,$out3,v28
3036	vcipher		$out4,$out4,v28
3037	lvx		v24,$x00,$key_		# re-pre-load round[1]
3038	 vxor		$in3,$twk3,v31
3039
3040	vcipher		$out0,$out0,v29
3041	le?vperm	$in0,$in0,$in0,$leperm
3042	vcipher		$out1,$out1,v29
3043	vcipher		$out2,$out2,v29
3044	vcipher		$out3,$out3,v29
3045	vcipher		$out4,$out4,v29
3046	lvx		v25,$x10,$key_		# re-pre-load round[2]
3047	 vxor		$in4,$twk4,v31
3048
3049	vcipher		$out0,$out0,v30
3050	vperm		$in0,$in0,$in0,$inpperm
3051	vcipher		$out1,$out1,v30
3052	vcipher		$out2,$out2,v30
3053	vcipher		$out3,$out3,v30
3054	vcipher		$out4,$out4,v30
3055
# The final round applies the last AddRoundKey and the output whitening in
# one step per lane.
3056	vcipherlast	$out0,$out0,$twk0
3057	vcipherlast	$out1,$out1,$in1
3058	vcipherlast	$out2,$out2,$in2
3059	vcipherlast	$out3,$out3,$in3
3060	vcipherlast	$out4,$out4,$in4
3061	blr
# NOTE(review): the data words below look like a traceback table describing
# this routine - confirm the field meanings against the target ABI.
3062        .long   	0
3063        .byte   	0,12,0x14,0,0,0,0,0
3064
3065.align	5
3066_aesp8_xts_decrypt6x:
3067	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3068	mflr		r11
3069	li		r7,`$FRAME+8*16+15`
3070	li		r3,`$FRAME+8*16+31`
3071	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3072	stvx		v20,r7,$sp		# ABI says so
3073	addi		r7,r7,32
3074	stvx		v21,r3,$sp
3075	addi		r3,r3,32
3076	stvx		v22,r7,$sp
3077	addi		r7,r7,32
3078	stvx		v23,r3,$sp
3079	addi		r3,r3,32
3080	stvx		v24,r7,$sp
3081	addi		r7,r7,32
3082	stvx		v25,r3,$sp
3083	addi		r3,r3,32
3084	stvx		v26,r7,$sp
3085	addi		r7,r7,32
3086	stvx		v27,r3,$sp
3087	addi		r3,r3,32
3088	stvx		v28,r7,$sp
3089	addi		r7,r7,32
3090	stvx		v29,r3,$sp
3091	addi		r3,r3,32
3092	stvx		v30,r7,$sp
3093	stvx		v31,r3,$sp
3094	li		r0,-1
3095	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3096	li		$x10,0x10
3097	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3098	li		$x20,0x20
3099	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3100	li		$x30,0x30
3101	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3102	li		$x40,0x40
3103	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3104	li		$x50,0x50
3105	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3106	li		$x60,0x60
3107	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3108	li		$x70,0x70
3109	mtspr		256,r0
3110
3111	subi		$rounds,$rounds,3	# -4 in total
3112
3113	lvx		$rndkey0,$x00,$key1	# load key schedule
3114	lvx		v30,$x10,$key1
3115	addi		$key1,$key1,0x20
3116	lvx		v31,$x00,$key1
3117	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3118	addi		$key_,$sp,$FRAME+15
3119	mtctr		$rounds
3120
3121Load_xts_dec_key:
3122	?vperm		v24,v30,v31,$keyperm
3123	lvx		v30,$x10,$key1
3124	addi		$key1,$key1,0x20
3125	stvx		v24,$x00,$key_		# off-load round[1]
3126	?vperm		v25,v31,v30,$keyperm
3127	lvx		v31,$x00,$key1
3128	stvx		v25,$x10,$key_		# off-load round[2]
3129	addi		$key_,$key_,0x20
3130	bdnz		Load_xts_dec_key
3131
3132	lvx		v26,$x10,$key1
3133	?vperm		v24,v30,v31,$keyperm
3134	lvx		v27,$x20,$key1
3135	stvx		v24,$x00,$key_		# off-load round[3]
3136	?vperm		v25,v31,v26,$keyperm
3137	lvx		v28,$x30,$key1
3138	stvx		v25,$x10,$key_		# off-load round[4]
3139	addi		$key_,$sp,$FRAME+15	# rewind $key_
3140	?vperm		v26,v26,v27,$keyperm
3141	lvx		v29,$x40,$key1
3142	?vperm		v27,v27,v28,$keyperm
3143	lvx		v30,$x50,$key1
3144	?vperm		v28,v28,v29,$keyperm
3145	lvx		v31,$x60,$key1
3146	?vperm		v29,v29,v30,$keyperm
3147	lvx		$twk5,$x70,$key1	# borrow $twk5
3148	?vperm		v30,v30,v31,$keyperm
3149	lvx		v24,$x00,$key_		# pre-load round[1]
3150	?vperm		v31,v31,$twk5,$keyperm
3151	lvx		v25,$x10,$key_		# pre-load round[2]
3152
3153	 vperm		$in0,$inout,$inptail,$inpperm
3154	 subi		$inp,$inp,31		# undo "caller"
3155	vxor		$twk0,$tweak,$rndkey0
3156	vsrab		$tmp,$tweak,$seven	# next tweak value
3157	vaddubm		$tweak,$tweak,$tweak
3158	vsldoi		$tmp,$tmp,$tmp,15
3159	vand		$tmp,$tmp,$eighty7
3160	 vxor		$out0,$in0,$twk0
3161	vxor		$tweak,$tweak,$tmp
3162
3163	 lvx_u		$in1,$x10,$inp
3164	vxor		$twk1,$tweak,$rndkey0
3165	vsrab		$tmp,$tweak,$seven	# next tweak value
3166	vaddubm		$tweak,$tweak,$tweak
3167	vsldoi		$tmp,$tmp,$tmp,15
3168	 le?vperm	$in1,$in1,$in1,$leperm
3169	vand		$tmp,$tmp,$eighty7
3170	 vxor		$out1,$in1,$twk1
3171	vxor		$tweak,$tweak,$tmp
3172
3173	 lvx_u		$in2,$x20,$inp
3174	 andi.		$taillen,$len,15
3175	vxor		$twk2,$tweak,$rndkey0
3176	vsrab		$tmp,$tweak,$seven	# next tweak value
3177	vaddubm		$tweak,$tweak,$tweak
3178	vsldoi		$tmp,$tmp,$tmp,15
3179	 le?vperm	$in2,$in2,$in2,$leperm
3180	vand		$tmp,$tmp,$eighty7
3181	 vxor		$out2,$in2,$twk2
3182	vxor		$tweak,$tweak,$tmp
3183
3184	 lvx_u		$in3,$x30,$inp
3185	 sub		$len,$len,$taillen
3186	vxor		$twk3,$tweak,$rndkey0
3187	vsrab		$tmp,$tweak,$seven	# next tweak value
3188	vaddubm		$tweak,$tweak,$tweak
3189	vsldoi		$tmp,$tmp,$tmp,15
3190	 le?vperm	$in3,$in3,$in3,$leperm
3191	vand		$tmp,$tmp,$eighty7
3192	 vxor		$out3,$in3,$twk3
3193	vxor		$tweak,$tweak,$tmp
3194
3195	 lvx_u		$in4,$x40,$inp
3196	 subi		$len,$len,0x60
3197	vxor		$twk4,$tweak,$rndkey0
3198	vsrab		$tmp,$tweak,$seven	# next tweak value
3199	vaddubm		$tweak,$tweak,$tweak
3200	vsldoi		$tmp,$tmp,$tmp,15
3201	 le?vperm	$in4,$in4,$in4,$leperm
3202	vand		$tmp,$tmp,$eighty7
3203	 vxor		$out4,$in4,$twk4
3204	vxor		$tweak,$tweak,$tmp
3205
3206	 lvx_u		$in5,$x50,$inp
3207	 addi		$inp,$inp,0x60
3208	vxor		$twk5,$tweak,$rndkey0
3209	vsrab		$tmp,$tweak,$seven	# next tweak value
3210	vaddubm		$tweak,$tweak,$tweak
3211	vsldoi		$tmp,$tmp,$tmp,15
3212	 le?vperm	$in5,$in5,$in5,$leperm
3213	vand		$tmp,$tmp,$eighty7
3214	 vxor		$out5,$in5,$twk5
3215	vxor		$tweak,$tweak,$tmp
3216
3217	vxor		v31,v31,$rndkey0
3218	mtctr		$rounds
3219	b		Loop_xts_dec6x
3220
3221.align	5
3222Loop_xts_dec6x:
3223	vncipher	$out0,$out0,v24
3224	vncipher	$out1,$out1,v24
3225	vncipher	$out2,$out2,v24
3226	vncipher	$out3,$out3,v24
3227	vncipher	$out4,$out4,v24
3228	vncipher	$out5,$out5,v24
3229	lvx		v24,$x20,$key_		# round[3]
3230	addi		$key_,$key_,0x20
3231
3232	vncipher	$out0,$out0,v25
3233	vncipher	$out1,$out1,v25
3234	vncipher	$out2,$out2,v25
3235	vncipher	$out3,$out3,v25
3236	vncipher	$out4,$out4,v25
3237	vncipher	$out5,$out5,v25
3238	lvx		v25,$x10,$key_		# round[4]
3239	bdnz		Loop_xts_dec6x
3240
3241	subic		$len,$len,96		# $len-=96
3242	 vxor		$in0,$twk0,v31		# xor with last round key
3243	vncipher	$out0,$out0,v24
3244	vncipher	$out1,$out1,v24
3245	 vsrab		$tmp,$tweak,$seven	# next tweak value
3246	 vxor		$twk0,$tweak,$rndkey0
3247	 vaddubm	$tweak,$tweak,$tweak
3248	vncipher	$out2,$out2,v24
3249	vncipher	$out3,$out3,v24
3250	 vsldoi		$tmp,$tmp,$tmp,15
3251	vncipher	$out4,$out4,v24
3252	vncipher	$out5,$out5,v24
3253
3254	subfe.		r0,r0,r0		# borrow?-1:0
3255	 vand		$tmp,$tmp,$eighty7
3256	vncipher	$out0,$out0,v25
3257	vncipher	$out1,$out1,v25
3258	 vxor		$tweak,$tweak,$tmp
3259	vncipher	$out2,$out2,v25
3260	vncipher	$out3,$out3,v25
3261	 vxor		$in1,$twk1,v31
3262	 vsrab		$tmp,$tweak,$seven	# next tweak value
3263	 vxor		$twk1,$tweak,$rndkey0
3264	vncipher	$out4,$out4,v25
3265	vncipher	$out5,$out5,v25
3266
3267	and		r0,r0,$len
3268	 vaddubm	$tweak,$tweak,$tweak
3269	 vsldoi		$tmp,$tmp,$tmp,15
3270	vncipher	$out0,$out0,v26
3271	vncipher	$out1,$out1,v26
3272	 vand		$tmp,$tmp,$eighty7
3273	vncipher	$out2,$out2,v26
3274	vncipher	$out3,$out3,v26
3275	 vxor		$tweak,$tweak,$tmp
3276	vncipher	$out4,$out4,v26
3277	vncipher	$out5,$out5,v26
3278
3279	add		$inp,$inp,r0		# $inp is adjusted in such
3280						# way that at exit from the
3281						# loop inX-in5 are loaded
3282						# with last "words"
3283	 vxor		$in2,$twk2,v31
3284	 vsrab		$tmp,$tweak,$seven	# next tweak value
3285	 vxor		$twk2,$tweak,$rndkey0
3286	 vaddubm	$tweak,$tweak,$tweak
3287	vncipher	$out0,$out0,v27
3288	vncipher	$out1,$out1,v27
3289	 vsldoi		$tmp,$tmp,$tmp,15
3290	vncipher	$out2,$out2,v27
3291	vncipher	$out3,$out3,v27
3292	 vand		$tmp,$tmp,$eighty7
3293	vncipher	$out4,$out4,v27
3294	vncipher	$out5,$out5,v27
3295
3296	addi		$key_,$sp,$FRAME+15	# rewind $key_
3297	 vxor		$tweak,$tweak,$tmp
3298	vncipher	$out0,$out0,v28
3299	vncipher	$out1,$out1,v28
3300	 vxor		$in3,$twk3,v31
3301	 vsrab		$tmp,$tweak,$seven	# next tweak value
3302	 vxor		$twk3,$tweak,$rndkey0
3303	vncipher	$out2,$out2,v28
3304	vncipher	$out3,$out3,v28
3305	 vaddubm	$tweak,$tweak,$tweak
3306	 vsldoi		$tmp,$tmp,$tmp,15
3307	vncipher	$out4,$out4,v28
3308	vncipher	$out5,$out5,v28
3309	lvx		v24,$x00,$key_		# re-pre-load round[1]
3310	 vand		$tmp,$tmp,$eighty7
3311
3312	vncipher	$out0,$out0,v29
3313	vncipher	$out1,$out1,v29
3314	 vxor		$tweak,$tweak,$tmp
3315	vncipher	$out2,$out2,v29
3316	vncipher	$out3,$out3,v29
3317	 vxor		$in4,$twk4,v31
3318	 vsrab		$tmp,$tweak,$seven	# next tweak value
3319	 vxor		$twk4,$tweak,$rndkey0
3320	vncipher	$out4,$out4,v29
3321	vncipher	$out5,$out5,v29
3322	lvx		v25,$x10,$key_		# re-pre-load round[2]
3323	 vaddubm	$tweak,$tweak,$tweak
3324	 vsldoi		$tmp,$tmp,$tmp,15
3325
3326	vncipher	$out0,$out0,v30
3327	vncipher	$out1,$out1,v30
3328	 vand		$tmp,$tmp,$eighty7
3329	vncipher	$out2,$out2,v30
3330	vncipher	$out3,$out3,v30
3331	 vxor		$tweak,$tweak,$tmp
3332	vncipher	$out4,$out4,v30
3333	vncipher	$out5,$out5,v30
3334	 vxor		$in5,$twk5,v31
3335	 vsrab		$tmp,$tweak,$seven	# next tweak value
3336	 vxor		$twk5,$tweak,$rndkey0
3337
3338	vncipherlast	$out0,$out0,$in0
3339	 lvx_u		$in0,$x00,$inp		# load next input block
3340	 vaddubm	$tweak,$tweak,$tweak
3341	 vsldoi		$tmp,$tmp,$tmp,15
3342	vncipherlast	$out1,$out1,$in1
3343	 lvx_u		$in1,$x10,$inp
3344	vncipherlast	$out2,$out2,$in2
3345	 le?vperm	$in0,$in0,$in0,$leperm
3346	 lvx_u		$in2,$x20,$inp
3347	 vand		$tmp,$tmp,$eighty7
3348	vncipherlast	$out3,$out3,$in3
3349	 le?vperm	$in1,$in1,$in1,$leperm
3350	 lvx_u		$in3,$x30,$inp
3351	vncipherlast	$out4,$out4,$in4
3352	 le?vperm	$in2,$in2,$in2,$leperm
3353	 lvx_u		$in4,$x40,$inp
3354	 vxor		$tweak,$tweak,$tmp
3355	vncipherlast	$out5,$out5,$in5
3356	 le?vperm	$in3,$in3,$in3,$leperm
3357	 lvx_u		$in5,$x50,$inp
3358	 addi		$inp,$inp,0x60
3359	 le?vperm	$in4,$in4,$in4,$leperm
3360	 le?vperm	$in5,$in5,$in5,$leperm
3361
3362	le?vperm	$out0,$out0,$out0,$leperm
3363	le?vperm	$out1,$out1,$out1,$leperm
3364	stvx_u		$out0,$x00,$out		# store output
3365	 vxor		$out0,$in0,$twk0
3366	le?vperm	$out2,$out2,$out2,$leperm
3367	stvx_u		$out1,$x10,$out
3368	 vxor		$out1,$in1,$twk1
3369	le?vperm	$out3,$out3,$out3,$leperm
3370	stvx_u		$out2,$x20,$out
3371	 vxor		$out2,$in2,$twk2
3372	le?vperm	$out4,$out4,$out4,$leperm
3373	stvx_u		$out3,$x30,$out
3374	 vxor		$out3,$in3,$twk3
3375	le?vperm	$out5,$out5,$out5,$leperm
3376	stvx_u		$out4,$x40,$out
3377	 vxor		$out4,$in4,$twk4
3378	stvx_u		$out5,$x50,$out
3379	 vxor		$out5,$in5,$twk5
3380	addi		$out,$out,0x60
3381
3382	mtctr		$rounds
3383	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3384
3385	addic.		$len,$len,0x60
3386	beq		Lxts_dec6x_zero
3387	cmpwi		$len,0x20
3388	blt		Lxts_dec6x_one
3389	nop
3390	beq		Lxts_dec6x_two
3391	cmpwi		$len,0x40
3392	blt		Lxts_dec6x_three
3393	nop
3394	beq		Lxts_dec6x_four
3395
3396Lxts_dec6x_five:
3397	vxor		$out0,$in1,$twk0
3398	vxor		$out1,$in2,$twk1
3399	vxor		$out2,$in3,$twk2
3400	vxor		$out3,$in4,$twk3
3401	vxor		$out4,$in5,$twk4
3402
3403	bl		_aesp8_xts_dec5x
3404
3405	le?vperm	$out0,$out0,$out0,$leperm
3406	vmr		$twk0,$twk5		# unused tweak
3407	vxor		$twk1,$tweak,$rndkey0
3408	le?vperm	$out1,$out1,$out1,$leperm
3409	stvx_u		$out0,$x00,$out		# store output
3410	vxor		$out0,$in0,$twk1
3411	le?vperm	$out2,$out2,$out2,$leperm
3412	stvx_u		$out1,$x10,$out
3413	le?vperm	$out3,$out3,$out3,$leperm
3414	stvx_u		$out2,$x20,$out
3415	le?vperm	$out4,$out4,$out4,$leperm
3416	stvx_u		$out3,$x30,$out
3417	stvx_u		$out4,$x40,$out
3418	addi		$out,$out,0x50
3419	bne		Lxts_dec6x_steal
3420	b		Lxts_dec6x_done
3421
3422.align	4
3423Lxts_dec6x_four:
3424	vxor		$out0,$in2,$twk0
3425	vxor		$out1,$in3,$twk1
3426	vxor		$out2,$in4,$twk2
3427	vxor		$out3,$in5,$twk3
3428	vxor		$out4,$out4,$out4
3429
3430	bl		_aesp8_xts_dec5x
3431
3432	le?vperm	$out0,$out0,$out0,$leperm
3433	vmr		$twk0,$twk4		# unused tweak
3434	vmr		$twk1,$twk5
3435	le?vperm	$out1,$out1,$out1,$leperm
3436	stvx_u		$out0,$x00,$out		# store output
3437	vxor		$out0,$in0,$twk5
3438	le?vperm	$out2,$out2,$out2,$leperm
3439	stvx_u		$out1,$x10,$out
3440	le?vperm	$out3,$out3,$out3,$leperm
3441	stvx_u		$out2,$x20,$out
3442	stvx_u		$out3,$x30,$out
3443	addi		$out,$out,0x40
3444	bne		Lxts_dec6x_steal
3445	b		Lxts_dec6x_done
3446
3447.align	4
3448Lxts_dec6x_three:
3449	vxor		$out0,$in3,$twk0
3450	vxor		$out1,$in4,$twk1
3451	vxor		$out2,$in5,$twk2
3452	vxor		$out3,$out3,$out3
3453	vxor		$out4,$out4,$out4
3454
3455	bl		_aesp8_xts_dec5x
3456
3457	le?vperm	$out0,$out0,$out0,$leperm
3458	vmr		$twk0,$twk3		# unused tweak
3459	vmr		$twk1,$twk4
3460	le?vperm	$out1,$out1,$out1,$leperm
3461	stvx_u		$out0,$x00,$out		# store output
3462	vxor		$out0,$in0,$twk4
3463	le?vperm	$out2,$out2,$out2,$leperm
3464	stvx_u		$out1,$x10,$out
3465	stvx_u		$out2,$x20,$out
3466	addi		$out,$out,0x30
3467	bne		Lxts_dec6x_steal
3468	b		Lxts_dec6x_done
3469
3470.align	4
3471Lxts_dec6x_two:
3472	vxor		$out0,$in4,$twk0
3473	vxor		$out1,$in5,$twk1
3474	vxor		$out2,$out2,$out2
3475	vxor		$out3,$out3,$out3
3476	vxor		$out4,$out4,$out4
3477
3478	bl		_aesp8_xts_dec5x
3479
3480	le?vperm	$out0,$out0,$out0,$leperm
3481	vmr		$twk0,$twk2		# unused tweak
3482	vmr		$twk1,$twk3
3483	le?vperm	$out1,$out1,$out1,$leperm
3484	stvx_u		$out0,$x00,$out		# store output
3485	vxor		$out0,$in0,$twk3
3486	stvx_u		$out1,$x10,$out
3487	addi		$out,$out,0x20
3488	bne		Lxts_dec6x_steal
3489	b		Lxts_dec6x_done
3490
3491.align	4
3492Lxts_dec6x_one:
3493	vxor		$out0,$in5,$twk0
3494	nop
3495Loop_xts_dec1x:
3496	vncipher	$out0,$out0,v24
3497	lvx		v24,$x20,$key_		# round[3]
3498	addi		$key_,$key_,0x20
3499
3500	vncipher	$out0,$out0,v25
3501	lvx		v25,$x10,$key_		# round[4]
3502	bdnz		Loop_xts_dec1x
3503
3504	subi		r0,$taillen,1
3505	vncipher	$out0,$out0,v24
3506
3507	andi.		r0,r0,16
3508	cmpwi		$taillen,0
3509	vncipher	$out0,$out0,v25
3510
3511	sub		$inp,$inp,r0
3512	vncipher	$out0,$out0,v26
3513
3514	lvx_u		$in0,0,$inp
3515	vncipher	$out0,$out0,v27
3516
3517	addi		$key_,$sp,$FRAME+15	# rewind $key_
3518	vncipher	$out0,$out0,v28
3519	lvx		v24,$x00,$key_		# re-pre-load round[1]
3520
3521	vncipher	$out0,$out0,v29
3522	lvx		v25,$x10,$key_		# re-pre-load round[2]
3523	 vxor		$twk0,$twk0,v31
3524
3525	le?vperm	$in0,$in0,$in0,$leperm
3526	vncipher	$out0,$out0,v30
3527
3528	mtctr		$rounds
3529	vncipherlast	$out0,$out0,$twk0
3530
3531	vmr		$twk0,$twk1		# unused tweak
3532	vmr		$twk1,$twk2
3533	le?vperm	$out0,$out0,$out0,$leperm
3534	stvx_u		$out0,$x00,$out		# store output
3535	addi		$out,$out,0x10
3536	vxor		$out0,$in0,$twk2
3537	bne		Lxts_dec6x_steal
3538	b		Lxts_dec6x_done
3539
3540.align	4
3541Lxts_dec6x_zero:
3542	cmpwi		$taillen,0
3543	beq		Lxts_dec6x_done
3544
3545	lvx_u		$in0,0,$inp
3546	le?vperm	$in0,$in0,$in0,$leperm
3547	vxor		$out0,$in0,$twk1
3548Lxts_dec6x_steal:
3549	vncipher	$out0,$out0,v24
3550	lvx		v24,$x20,$key_		# round[3]
3551	addi		$key_,$key_,0x20
3552
3553	vncipher	$out0,$out0,v25
3554	lvx		v25,$x10,$key_		# round[4]
3555	bdnz		Lxts_dec6x_steal
3556
3557	add		$inp,$inp,$taillen
3558	vncipher	$out0,$out0,v24
3559
3560	cmpwi		$taillen,0
3561	vncipher	$out0,$out0,v25
3562
3563	lvx_u		$in0,0,$inp
3564	vncipher	$out0,$out0,v26
3565
3566	lvsr		$inpperm,0,$taillen	# $in5 is no more
3567	vncipher	$out0,$out0,v27
3568
3569	addi		$key_,$sp,$FRAME+15	# rewind $key_
3570	vncipher	$out0,$out0,v28
3571	lvx		v24,$x00,$key_		# re-pre-load round[1]
3572
3573	vncipher	$out0,$out0,v29
3574	lvx		v25,$x10,$key_		# re-pre-load round[2]
3575	 vxor		$twk1,$twk1,v31
3576
3577	le?vperm	$in0,$in0,$in0,$leperm
3578	vncipher	$out0,$out0,v30
3579
3580	vperm		$in0,$in0,$in0,$inpperm
3581	vncipherlast	$tmp,$out0,$twk1
3582
3583	le?vperm	$out0,$tmp,$tmp,$leperm
3584	le?stvx_u	$out0,0,$out
3585	be?stvx_u	$tmp,0,$out
3586
3587	vxor		$out0,$out0,$out0
3588	vspltisb	$out1,-1
3589	vperm		$out0,$out0,$out1,$inpperm
3590	vsel		$out0,$in0,$tmp,$out0
3591	vxor		$out0,$out0,$twk0
3592
3593	subi		r30,$out,1
3594	mtctr		$taillen
3595Loop_xts_dec6x_steal:
3596	lbzu		r0,1(r30)
3597	stb		r0,16(r30)
3598	bdnz		Loop_xts_dec6x_steal
3599
3600	li		$taillen,0
3601	mtctr		$rounds
3602	b		Loop_xts_dec1x		# one more time...
3603
3604.align	4
3605Lxts_dec6x_done:
3606	${UCMP}i	$ivp,0
3607	beq		Lxts_dec6x_ret
3608
3609	vxor		$tweak,$twk0,$rndkey0
3610	le?vperm	$tweak,$tweak,$tweak,$leperm
3611	stvx_u		$tweak,0,$ivp
3612
3613Lxts_dec6x_ret:
3614	mtlr		r11
3615	li		r10,`$FRAME+15`
3616	li		r11,`$FRAME+31`
3617	stvx		$seven,r10,$sp		# wipe copies of round keys
3618	addi		r10,r10,32
3619	stvx		$seven,r11,$sp
3620	addi		r11,r11,32
3621	stvx		$seven,r10,$sp
3622	addi		r10,r10,32
3623	stvx		$seven,r11,$sp
3624	addi		r11,r11,32
3625	stvx		$seven,r10,$sp
3626	addi		r10,r10,32
3627	stvx		$seven,r11,$sp
3628	addi		r11,r11,32
3629	stvx		$seven,r10,$sp
3630	addi		r10,r10,32
3631	stvx		$seven,r11,$sp
3632	addi		r11,r11,32
3633
3634	mtspr		256,$vrsave
3635	lvx		v20,r10,$sp		# ABI says so
3636	addi		r10,r10,32
3637	lvx		v21,r11,$sp
3638	addi		r11,r11,32
3639	lvx		v22,r10,$sp
3640	addi		r10,r10,32
3641	lvx		v23,r11,$sp
3642	addi		r11,r11,32
3643	lvx		v24,r10,$sp
3644	addi		r10,r10,32
3645	lvx		v25,r11,$sp
3646	addi		r11,r11,32
3647	lvx		v26,r10,$sp
3648	addi		r10,r10,32
3649	lvx		v27,r11,$sp
3650	addi		r11,r11,32
3651	lvx		v28,r10,$sp
3652	addi		r10,r10,32
3653	lvx		v29,r11,$sp
3654	addi		r11,r11,32
3655	lvx		v30,r10,$sp
3656	lvx		v31,r11,$sp
3657	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3658	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3659	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3660	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3661	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3662	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3663	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3664	blr
3665	.long		0
3666	.byte		0,12,0x04,1,0x80,6,6,0
3667	.long		0
3668
3669.align	5
3670_aesp8_xts_dec5x:
3671	vncipher	$out0,$out0,v24
3672	vncipher	$out1,$out1,v24
3673	vncipher	$out2,$out2,v24
3674	vncipher	$out3,$out3,v24
3675	vncipher	$out4,$out4,v24
3676	lvx		v24,$x20,$key_		# round[3]
3677	addi		$key_,$key_,0x20
3678
3679	vncipher	$out0,$out0,v25
3680	vncipher	$out1,$out1,v25
3681	vncipher	$out2,$out2,v25
3682	vncipher	$out3,$out3,v25
3683	vncipher	$out4,$out4,v25
3684	lvx		v25,$x10,$key_		# round[4]
3685	bdnz		_aesp8_xts_dec5x
3686
3687	subi		r0,$taillen,1
3688	vncipher	$out0,$out0,v24
3689	vncipher	$out1,$out1,v24
3690	vncipher	$out2,$out2,v24
3691	vncipher	$out3,$out3,v24
3692	vncipher	$out4,$out4,v24
3693
3694	andi.		r0,r0,16
3695	cmpwi		$taillen,0
3696	vncipher	$out0,$out0,v25
3697	vncipher	$out1,$out1,v25
3698	vncipher	$out2,$out2,v25
3699	vncipher	$out3,$out3,v25
3700	vncipher	$out4,$out4,v25
3701	 vxor		$twk0,$twk0,v31
3702
3703	sub		$inp,$inp,r0
3704	vncipher	$out0,$out0,v26
3705	vncipher	$out1,$out1,v26
3706	vncipher	$out2,$out2,v26
3707	vncipher	$out3,$out3,v26
3708	vncipher	$out4,$out4,v26
3709	 vxor		$in1,$twk1,v31
3710
3711	vncipher	$out0,$out0,v27
3712	lvx_u		$in0,0,$inp
3713	vncipher	$out1,$out1,v27
3714	vncipher	$out2,$out2,v27
3715	vncipher	$out3,$out3,v27
3716	vncipher	$out4,$out4,v27
3717	 vxor		$in2,$twk2,v31
3718
3719	addi		$key_,$sp,$FRAME+15	# rewind $key_
3720	vncipher	$out0,$out0,v28
3721	vncipher	$out1,$out1,v28
3722	vncipher	$out2,$out2,v28
3723	vncipher	$out3,$out3,v28
3724	vncipher	$out4,$out4,v28
3725	lvx		v24,$x00,$key_		# re-pre-load round[1]
3726	 vxor		$in3,$twk3,v31
3727
3728	vncipher	$out0,$out0,v29
3729	le?vperm	$in0,$in0,$in0,$leperm
3730	vncipher	$out1,$out1,v29
3731	vncipher	$out2,$out2,v29
3732	vncipher	$out3,$out3,v29
3733	vncipher	$out4,$out4,v29
3734	lvx		v25,$x10,$key_		# re-pre-load round[2]
3735	 vxor		$in4,$twk4,v31
3736
3737	vncipher	$out0,$out0,v30
3738	vncipher	$out1,$out1,v30
3739	vncipher	$out2,$out2,v30
3740	vncipher	$out3,$out3,v30
3741	vncipher	$out4,$out4,v30
3742
3743	vncipherlast	$out0,$out0,$twk0
3744	vncipherlast	$out1,$out1,$in1
3745	vncipherlast	$out2,$out2,$in2
3746	vncipherlast	$out3,$out3,$in3
3747	vncipherlast	$out4,$out4,$in4
3748	mtctr		$rounds
3749	blr
3750        .long   	0
3751        .byte   	0,12,0x14,0,0,0,0,0
3752___
3753}}	}}}
3754
# ---------------------------------------------------------------------
# Post-process the assembly text accumulated in $code and print it to
# STDOUT one line at a time:
#   1. every `expr` span is replaced by the value of the Perl
#      expression it encloses;
#   2. until the Lconsts: label has been seen, .long/.byte rows that
#      end in a ?tag are re-emitted as plain .byte rows, with the byte
#      list adjusted for little-endian flavours;
#   3. le?/be? markers and ?-prefixed mnemonics are resolved for the
#      selected flavour.
# ---------------------------------------------------------------------
my $little_endian = ($flavour =~ /le$/o) ? 1 : 0;
my $consts = 1;		# stays set until the Lconsts: line is reached

foreach my $line (split("\n", $code)) {
    # evaluate compile-time expressions written between backticks
    $line =~ s/\`([^\`]*)\`/eval($1)/geo;

    # constants table: endian-specific conversion of tagged rows
    if ($consts && $line =~ m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
        my ($directive, $operands, $conv) = ($1, $2, $3);

        # operands starting with 0 go through oct() (which also accepts
        # the 0x form), everything else through int()
        my @values = map { /^0/ ? oct($_) : int($_) }
                     split(/,\s*/, $operands);

        # bring the row into a flat list of bytes
        my @bytes;
        if ($directive eq "long") {
            # each word contributes four bytes, most significant first
            foreach my $word (@values) {
                push @bytes, map { ($word >> $_) & 0xff } (24, 16, 8, 0);
            }
        } else {
            @bytes = @values;
        }

        # little-endian flavours: ?inv flips the low four bits of every
        # byte, ?rev reverses the byte order; any other tag is a no-op
        if ($little_endian) {
            if ($conv =~ /\?inv/) {
                @bytes = map { $_ ^ 0xf } @bytes;
            } elsif ($conv =~ /\?rev/) {
                @bytes = reverse(@bytes);
            }
        }

        # emit the row and move on; it takes no part in the instruction
        # rewriting below
        my @formatted = map { sprintf("0x%02x", $_) } @bytes;
        print ".byte\t", join(',', @formatted), "\n";
        next;
    }

    # end of the constants table
    $consts = 0 if ($line =~ m/Lconsts:/o);

    # Endian-specific instructions.  At most one rewrite is applied per
    # line: the chain stops at the first substitution that succeeds.
    if ($little_endian) {
        # keep le? lines (marker dropped)
        $line =~ s/le\?//o or
        # be? lines get the marker replaced by #be#
        $line =~ s/be\?/#be#/o or
        # ?lvsr and ?lvsl trade places
        $line =~ s/\?lvsr/lvsl/o or
        $line =~ s/\?lvsl/lvsr/o or
        # ?vperm: exchange the second and third register operands
        $line =~ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
        # ?vsldoi: exchange the two source registers, count N becomes 16-N
        $line =~ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
        # ?vspltw: word index N becomes 3-N
        $line =~ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
    } else {
        # big-endian: le? lines get the marker replaced by #le#, be?
        # lines are kept, and a leading ? on a mnemonic is dropped
        $line =~ s/le\?/#le#/o or
        $line =~ s/be\?//o or
        $line =~ s/\?([a-z]+)/$1/o;
    }

    print $line, "\n";
}

# a failed close is fatal rather than silently ignored
close(STDOUT) or die "error closing STDOUT: $!";
3808