1#! /usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from CRYPTOGAMs[1] and is included here using the option
5# in the license to distribute the code under the GPL. Therefore this program
6# is free software; you can redistribute it and/or modify it under the terms of
7# the GNU General Public License version 2 as published by the Free Software
8# Foundation.
9#
10# [1] https://www.openssl.org/~appro/cryptogams/
11
12# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
13# All rights reserved.
14#
15# Redistribution and use in source and binary forms, with or without
16# modification, are permitted provided that the following conditions
17# are met:
18#
19#       * Redistributions of source code must retain copyright notices,
20#         this list of conditions and the following disclaimer.
21#
22#       * Redistributions in binary form must reproduce the above
23#         copyright notice, this list of conditions and the following
24#         disclaimer in the documentation and/or other materials
25#         provided with the distribution.
26#
27#       * Neither the name of the CRYPTOGAMS nor the names of its
28#         copyright holder and contributors may be used to endorse or
29#         promote products derived from this software without specific
30#         prior written permission.
31#
32# ALTERNATIVELY, provided that this notice is retained in full, this
33# product may be distributed under the terms of the GNU General Public
34# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
35# those given above.
36#
37# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
38# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48
49# ====================================================================
50# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
51# project. The module is, however, dual licensed under OpenSSL and
52# CRYPTOGAMS licenses depending on where you obtain it. For further
53# details see https://www.openssl.org/~appro/cryptogams/.
54# ====================================================================
55#
56# This module implements support for AES instructions as per PowerISA
57# specification version 2.07, first implemented by the POWER8 processor.
58# The module is endian-agnostic in the sense that it supports both big-
59# and little-endian cases. Data alignment in parallelizable modes is
60# handled with VSX loads and stores, which implies the MSR.VSX flag being
61# set. It should also be noted that the ISA specification doesn't prohibit
62# alignment exceptions for these instructions on page boundaries.
63# Initially alignment was handled in a pure AltiVec/VMX way [when data
64# is aligned programmatically, which in turn guarantees exception-
65# free execution], but that turned out to hamper performance when vcipher
66# instructions are interleaved. It's reckoned that eventual
67# misalignment penalties at page boundaries are on average lower
68# than the additional overhead of the pure AltiVec approach.
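#
# As a rough illustration of that trade-off, here is a minimal C-intrinsics
# sketch of the two load strategies (assuming GCC/Clang's altivec.h; not
# part of this module):
#
#	#include <altivec.h>
#
#	/* VSX: a single unaligned vector load */
#	vector unsigned char load_vsx(const unsigned char *p)
#	{
#		return vec_xl(0, p);
#	}
#
#	/* pure VMX: two aligned loads plus a permute, exception-free */
#	vector unsigned char load_vmx(const unsigned char *p)
#	{
#		vector unsigned char lo = vec_ld(0, p);
#		vector unsigned char hi = vec_ld(15, p);
#		return vec_perm(lo, hi, vec_lvsl(0, p));
#	}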
69#
70# May 2016
71#
72# Added an XTS subroutine; a 9x improvement on little-endian and a 12x
73# improvement on big-endian systems was measured.
74#
75######################################################################
76# Current large-block performance in cycles per byte processed with
77# 128-bit key (less is better).
78#
79#		CBC en-/decrypt	CTR	XTS
80# POWER8[le]	3.96/0.72	0.74	1.1
81# POWER8[be]	3.75/0.65	0.66	1.0
82
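# For reference, the entry points generated below are declared on the C
# side roughly as follows. This is a sketch inferred from the register
# usage in this file; the struct/typedef spelling on the kernel side is
# an assumption, not part of this file. The schedule stores its round
# count at byte offset 240.
#
#	struct aes_key;
#
#	int  aes_p8_set_encrypt_key(const unsigned char *userkey,
#				    const int bits, struct aes_key *key);
#	int  aes_p8_set_decrypt_key(const unsigned char *userkey,
#				    const int bits, struct aes_key *key);
#	void aes_p8_encrypt(const unsigned char *in, unsigned char *out,
#			    const struct aes_key *key);
#	void aes_p8_decrypt(const unsigned char *in, unsigned char *out,
#			    const struct aes_key *key);
#	void aes_p8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#				size_t len, const struct aes_key *key,
#				unsigned char *iv, const int enc);
#	void aes_p8_ctr32_encrypt_blocks(const unsigned char *in,
#				unsigned char *out, size_t blocks,
#				const struct aes_key *key,
#				const unsigned char *iv);
#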
83$flavour = shift;
84
85if ($flavour =~ /64/) {
86	$SIZE_T	=8;
87	$LRSAVE	=2*$SIZE_T;
88	$STU	="stdu";
89	$POP	="ld";
90	$PUSH	="std";
91	$UCMP	="cmpld";
92	$SHL	="sldi";
93} elsif ($flavour =~ /32/) {
94	$SIZE_T	=4;
95	$LRSAVE	=$SIZE_T;
96	$STU	="stwu";
97	$POP	="lwz";
98	$PUSH	="stw";
99	$UCMP	="cmplw";
100	$SHL	="slwi";
101} else { die "nonsense $flavour"; }
102
103$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
104
105$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
106( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
107( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
108( $xlate="${dir}../../../arch/powerpc/crypto/ppc-xlate.pl" and -f $xlate) or
109die "can't locate ppc-xlate.pl";
110
111open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
112
113$FRAME=8*$SIZE_T;
114$prefix="aes_p8";
115
116$sp="r1";
117$vrsave="r12";
118
119#########################################################################
120{{{	# Key setup procedures						#
121my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
122my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
123my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
124
125$code.=<<___;
126.machine	"any"
127
128.text
129
130.align	7
131rcon:
132.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
133.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
134.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
135.long	0,0,0,0						?asis
136.long	0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
137Lconsts:
138	mflr	r0
139	bcl	20,31,\$+4
140	mflr	$ptr		# distance between . and rcon
141	addi	$ptr,$ptr,-0x58
142	mtlr	r0
143	blr
144	.long	0
145	.byte	0,12,0x14,0,0,0,0,0
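# Lconsts computes the address of the rcon table at run time: "bcl 20,31"
# to the following instruction is the usual position-independent idiom for
# reading the program counter (this BO/BI form is recognized by hardware
# and does not disturb the link-register prediction stack); mflr then
# yields the address of the instruction after the bcl, and the -0x58
# adjustment (0x50 bytes of table plus two instructions) points back at
# rcon.  The first rows of the table are the AES round-constant seeds
# (doubled in-loop with vadduwm, with the 0x1b row reloaded for the last
# AES-128 rounds), followed by the permute mask used by the
# rotate-n-splat step below.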
146.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
147
148.globl	.${prefix}_set_encrypt_key
149Lset_encrypt_key:
150	mflr		r11
151	$PUSH		r11,$LRSAVE($sp)
152
153	li		$ptr,-1
154	${UCMP}i	$inp,0
155	beq-		Lenc_key_abort		# if ($inp==0) return -1;
156	${UCMP}i	$out,0
157	beq-		Lenc_key_abort		# if ($out==0) return -1;
158	li		$ptr,-2
159	cmpwi		$bits,128
160	blt-		Lenc_key_abort
161	cmpwi		$bits,256
162	bgt-		Lenc_key_abort
163	andi.		r0,$bits,0x3f
164	bne-		Lenc_key_abort
165
166	lis		r0,0xfff0
167	mfspr		$vrsave,256
168	mtspr		256,r0
169
170	bl		Lconsts
171	mtlr		r11
172
173	neg		r9,$inp
174	lvx		$in0,0,$inp
175	addi		$inp,$inp,15		# 15 is not a typo
176	lvsr		$key,0,r9		# borrow $key
177	li		r8,0x20
178	cmpwi		$bits,192
179	lvx		$in1,0,$inp
180	le?vspltisb	$mask,0x0f		# borrow $mask
181	lvx		$rcon,0,$ptr
182	le?vxor		$key,$key,$mask		# adjust for byte swap
183	lvx		$mask,r8,$ptr
184	addi		$ptr,$ptr,0x10
185	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
186	li		$cnt,8
187	vxor		$zero,$zero,$zero
188	mtctr		$cnt
189
190	?lvsr		$outperm,0,$out
191	vspltisb	$outmask,-1
192	lvx		$outhead,0,$out
193	?vperm		$outmask,$zero,$outmask,$outperm
194
195	blt		Loop128
196	addi		$inp,$inp,8
197	beq		L192
198	addi		$inp,$inp,8
199	b		L256
200
201.align	4
202Loop128:
203	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
204	vsldoi		$tmp,$zero,$in0,12	# >>32
205	 vperm		$outtail,$in0,$in0,$outperm	# rotate
206	 vsel		$stage,$outhead,$outtail,$outmask
207	 vmr		$outhead,$outtail
208	vcipherlast	$key,$key,$rcon
209	 stvx		$stage,0,$out
210	 addi		$out,$out,16
211
212	vxor		$in0,$in0,$tmp
213	vsldoi		$tmp,$zero,$tmp,12	# >>32
214	vxor		$in0,$in0,$tmp
215	vsldoi		$tmp,$zero,$tmp,12	# >>32
216	vxor		$in0,$in0,$tmp
217	 vadduwm	$rcon,$rcon,$rcon
218	vxor		$in0,$in0,$key
219	bdnz		Loop128
220
221	lvx		$rcon,0,$ptr		# last two round keys
222
223	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
224	vsldoi		$tmp,$zero,$in0,12	# >>32
225	 vperm		$outtail,$in0,$in0,$outperm	# rotate
226	 vsel		$stage,$outhead,$outtail,$outmask
227	 vmr		$outhead,$outtail
228	vcipherlast	$key,$key,$rcon
229	 stvx		$stage,0,$out
230	 addi		$out,$out,16
231
232	vxor		$in0,$in0,$tmp
233	vsldoi		$tmp,$zero,$tmp,12	# >>32
234	vxor		$in0,$in0,$tmp
235	vsldoi		$tmp,$zero,$tmp,12	# >>32
236	vxor		$in0,$in0,$tmp
237	 vadduwm	$rcon,$rcon,$rcon
238	vxor		$in0,$in0,$key
239
240	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
241	vsldoi		$tmp,$zero,$in0,12	# >>32
242	 vperm		$outtail,$in0,$in0,$outperm	# rotate
243	 vsel		$stage,$outhead,$outtail,$outmask
244	 vmr		$outhead,$outtail
245	vcipherlast	$key,$key,$rcon
246	 stvx		$stage,0,$out
247	 addi		$out,$out,16
248
249	vxor		$in0,$in0,$tmp
250	vsldoi		$tmp,$zero,$tmp,12	# >>32
251	vxor		$in0,$in0,$tmp
252	vsldoi		$tmp,$zero,$tmp,12	# >>32
253	vxor		$in0,$in0,$tmp
254	vxor		$in0,$in0,$key
255	 vperm		$outtail,$in0,$in0,$outperm	# rotate
256	 vsel		$stage,$outhead,$outtail,$outmask
257	 vmr		$outhead,$outtail
258	 stvx		$stage,0,$out
259
260	addi		$inp,$out,15		# 15 is not a typo
261	addi		$out,$out,0x50
262
263	li		$rounds,10
264	b		Ldone
265
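# Loop128 above is the FIPS-197 AES-128 key schedule: the rotate-n-splat
# vperm plus vcipherlast against the round constant implement
# RotWord/SubWord/Rcon, and the vsldoi/vxor ladder produces the running
# xor of the previous round key.  A byte-wise C sketch of the same
# computation (the S-box table is assumed):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	extern const uint8_t sbox[256];		/* AES S-box */
#
#	void aes128_expand(const uint8_t key[16], uint8_t rk[176])
#	{
#		uint8_t rcon = 0x01;
#
#		memcpy(rk, key, 16);
#		for (int i = 16; i < 176; i += 4) {
#			uint8_t t[4];
#
#			memcpy(t, rk + i - 4, 4);
#			if (i % 16 == 0) {	/* RotWord+SubWord+Rcon */
#				uint8_t hd = t[0];
#
#				t[0] = sbox[t[1]] ^ rcon;
#				t[1] = sbox[t[2]];
#				t[2] = sbox[t[3]];
#				t[3] = sbox[hd];
#				rcon = (rcon << 1) ^ ((rcon & 0x80) ? 0x1b : 0);
#			}
#			for (int j = 0; j < 4; j++)
#				rk[i + j] = rk[i + j - 16] ^ t[j];
#		}
#	}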
266.align	4
267L192:
268	lvx		$tmp,0,$inp
269	li		$cnt,4
270	 vperm		$outtail,$in0,$in0,$outperm	# rotate
271	 vsel		$stage,$outhead,$outtail,$outmask
272	 vmr		$outhead,$outtail
273	 stvx		$stage,0,$out
274	 addi		$out,$out,16
275	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
276	vspltisb	$key,8			# borrow $key
277	mtctr		$cnt
278	vsububm		$mask,$mask,$key	# adjust the mask
279
280Loop192:
281	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
282	vsldoi		$tmp,$zero,$in0,12	# >>32
283	vcipherlast	$key,$key,$rcon
284
285	vxor		$in0,$in0,$tmp
286	vsldoi		$tmp,$zero,$tmp,12	# >>32
287	vxor		$in0,$in0,$tmp
288	vsldoi		$tmp,$zero,$tmp,12	# >>32
289	vxor		$in0,$in0,$tmp
290
291	 vsldoi		$stage,$zero,$in1,8
292	vspltw		$tmp,$in0,3
293	vxor		$tmp,$tmp,$in1
294	vsldoi		$in1,$zero,$in1,12	# >>32
295	 vadduwm	$rcon,$rcon,$rcon
296	vxor		$in1,$in1,$tmp
297	vxor		$in0,$in0,$key
298	vxor		$in1,$in1,$key
299	 vsldoi		$stage,$stage,$in0,8
300
301	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
302	vsldoi		$tmp,$zero,$in0,12	# >>32
303	 vperm		$outtail,$stage,$stage,$outperm	# rotate
304	 vsel		$stage,$outhead,$outtail,$outmask
305	 vmr		$outhead,$outtail
306	vcipherlast	$key,$key,$rcon
307	 stvx		$stage,0,$out
308	 addi		$out,$out,16
309
310	 vsldoi		$stage,$in0,$in1,8
311	vxor		$in0,$in0,$tmp
312	vsldoi		$tmp,$zero,$tmp,12	# >>32
313	 vperm		$outtail,$stage,$stage,$outperm	# rotate
314	 vsel		$stage,$outhead,$outtail,$outmask
315	 vmr		$outhead,$outtail
316	vxor		$in0,$in0,$tmp
317	vsldoi		$tmp,$zero,$tmp,12	# >>32
318	vxor		$in0,$in0,$tmp
319	 stvx		$stage,0,$out
320	 addi		$out,$out,16
321
322	vspltw		$tmp,$in0,3
323	vxor		$tmp,$tmp,$in1
324	vsldoi		$in1,$zero,$in1,12	# >>32
325	 vadduwm	$rcon,$rcon,$rcon
326	vxor		$in1,$in1,$tmp
327	vxor		$in0,$in0,$key
328	vxor		$in1,$in1,$key
329	 vperm		$outtail,$in0,$in0,$outperm	# rotate
330	 vsel		$stage,$outhead,$outtail,$outmask
331	 vmr		$outhead,$outtail
332	 stvx		$stage,0,$out
333	 addi		$inp,$out,15		# 15 is not a typo
334	 addi		$out,$out,16
335	bdnz		Loop192
336
337	li		$rounds,12
338	addi		$out,$out,0x20
339	b		Ldone
340
341.align	4
342L256:
343	lvx		$tmp,0,$inp
344	li		$cnt,7
345	li		$rounds,14
346	 vperm		$outtail,$in0,$in0,$outperm	# rotate
347	 vsel		$stage,$outhead,$outtail,$outmask
348	 vmr		$outhead,$outtail
349	 stvx		$stage,0,$out
350	 addi		$out,$out,16
351	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
352	mtctr		$cnt
353
354Loop256:
355	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
356	vsldoi		$tmp,$zero,$in0,12	# >>32
357	 vperm		$outtail,$in1,$in1,$outperm	# rotate
358	 vsel		$stage,$outhead,$outtail,$outmask
359	 vmr		$outhead,$outtail
360	vcipherlast	$key,$key,$rcon
361	 stvx		$stage,0,$out
362	 addi		$out,$out,16
363
364	vxor		$in0,$in0,$tmp
365	vsldoi		$tmp,$zero,$tmp,12	# >>32
366	vxor		$in0,$in0,$tmp
367	vsldoi		$tmp,$zero,$tmp,12	# >>32
368	vxor		$in0,$in0,$tmp
369	 vadduwm	$rcon,$rcon,$rcon
370	vxor		$in0,$in0,$key
371	 vperm		$outtail,$in0,$in0,$outperm	# rotate
372	 vsel		$stage,$outhead,$outtail,$outmask
373	 vmr		$outhead,$outtail
374	 stvx		$stage,0,$out
375	 addi		$inp,$out,15		# 15 is not a typo
376	 addi		$out,$out,16
377	bdz		Ldone
378
379	vspltw		$key,$in0,3		# just splat
380	vsldoi		$tmp,$zero,$in1,12	# >>32
381	vsbox		$key,$key
382
383	vxor		$in1,$in1,$tmp
384	vsldoi		$tmp,$zero,$tmp,12	# >>32
385	vxor		$in1,$in1,$tmp
386	vsldoi		$tmp,$zero,$tmp,12	# >>32
387	vxor		$in1,$in1,$tmp
388
389	vxor		$in1,$in1,$key
390	b		Loop256
391
392.align	4
393Ldone:
394	lvx		$in1,0,$inp		# redundant in aligned case
395	vsel		$in1,$outhead,$in1,$outmask
396	stvx		$in1,0,$inp
397	li		$ptr,0
398	mtspr		256,$vrsave
399	stw		$rounds,0($out)
400
401Lenc_key_abort:
402	mr		r3,$ptr
403	blr
404	.long		0
405	.byte		0,12,0x14,1,0,0,3,0
406	.long		0
407.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
408
409.globl	.${prefix}_set_decrypt_key
410	$STU		$sp,-$FRAME($sp)
411	mflr		r10
412	$PUSH		r10,$FRAME+$LRSAVE($sp)
413	bl		Lset_encrypt_key
414	mtlr		r10
415
416	cmpwi		r3,0
417	bne-		Ldec_key_abort
418
419	slwi		$cnt,$rounds,4
420	subi		$inp,$out,240		# first round key
421	srwi		$rounds,$rounds,1
422	add		$out,$inp,$cnt		# last round key
423	mtctr		$rounds
424
425Ldeckey:
426	lwz		r0, 0($inp)
427	lwz		r6, 4($inp)
428	lwz		r7, 8($inp)
429	lwz		r8, 12($inp)
430	addi		$inp,$inp,16
431	lwz		r9, 0($out)
432	lwz		r10,4($out)
433	lwz		r11,8($out)
434	lwz		r12,12($out)
435	stw		r0, 0($out)
436	stw		r6, 4($out)
437	stw		r7, 8($out)
438	stw		r8, 12($out)
439	subi		$out,$out,16
440	stw		r9, -16($inp)
441	stw		r10,-12($inp)
442	stw		r11,-8($inp)
443	stw		r12,-4($inp)
444	bdnz		Ldeckey
445
446	xor		r3,r3,r3		# return value
447Ldec_key_abort:
448	addi		$sp,$sp,$FRAME
449	blr
450	.long		0
451	.byte		0,12,4,1,0x80,0,3,0
452	.long		0
453.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
454___
455}}}
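# ${prefix}_set_decrypt_key runs the encrypt-key expansion and then swaps
# the schedule end-for-end, 16 bytes at a time: vncipher applies the
# round-key xor before InvMixColumns, so the plain encryption round keys
# can be reused in reverse order with no further transformation.  A C
# sketch of the Ldeckey loop:
#
#	#include <string.h>
#
#	void reverse_round_keys(unsigned char *rk, int rounds)
#	{
#		unsigned char *lo = rk, *hi = rk + 16 * rounds;
#		unsigned char tmp[16];
#
#		for (; lo < hi; lo += 16, hi -= 16) {
#			memcpy(tmp, lo, 16);
#			memcpy(lo, hi, 16);
#			memcpy(hi, tmp, 16);
#		}
#	}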
456#########################################################################
457{{{	# Single block en- and decrypt procedures			#
458sub gen_block () {
459my $dir = shift;
460my $n   = $dir eq "de" ? "n" : "";
461my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
462
463$code.=<<___;
464.globl	.${prefix}_${dir}crypt
465	lwz		$rounds,240($key)
466	lis		r0,0xfc00
467	mfspr		$vrsave,256
468	li		$idx,15			# 15 is not a typo
469	mtspr		256,r0
470
471	lvx		v0,0,$inp
472	neg		r11,$out
473	lvx		v1,$idx,$inp
474	lvsl		v2,0,$inp		# inpperm
475	le?vspltisb	v4,0x0f
476	?lvsl		v3,0,r11		# outperm
477	le?vxor		v2,v2,v4
478	li		$idx,16
479	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
480	lvx		v1,0,$key
481	?lvsl		v5,0,$key		# keyperm
482	srwi		$rounds,$rounds,1
483	lvx		v2,$idx,$key
484	addi		$idx,$idx,16
485	subi		$rounds,$rounds,1
486	?vperm		v1,v1,v2,v5		# align round key
487
488	vxor		v0,v0,v1
489	lvx		v1,$idx,$key
490	addi		$idx,$idx,16
491	mtctr		$rounds
492
493Loop_${dir}c:
494	?vperm		v2,v2,v1,v5
495	v${n}cipher	v0,v0,v2
496	lvx		v2,$idx,$key
497	addi		$idx,$idx,16
498	?vperm		v1,v1,v2,v5
499	v${n}cipher	v0,v0,v1
500	lvx		v1,$idx,$key
501	addi		$idx,$idx,16
502	bdnz		Loop_${dir}c
503
504	?vperm		v2,v2,v1,v5
505	v${n}cipher	v0,v0,v2
506	lvx		v2,$idx,$key
507	?vperm		v1,v1,v2,v5
508	v${n}cipherlast	v0,v0,v1
509
510	vspltisb	v2,-1
511	vxor		v1,v1,v1
512	li		$idx,15			# 15 is not a typo
513	?vperm		v2,v1,v2,v3		# outmask
514	le?vxor		v3,v3,v4
515	lvx		v1,0,$out		# outhead
516	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
517	vsel		v1,v1,v0,v2
518	lvx		v4,$idx,$out
519	stvx		v1,0,$out
520	vsel		v0,v0,v4,v2
521	stvx		v0,$idx,$out
522
523	mtspr		256,$vrsave
524	blr
525	.long		0
526	.byte		0,12,0x14,0,0,0,3,0
527	.long		0
528.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
529___
530}
531&gen_block("en");
532&gen_block("de");
533}}}
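# Typical single-block usage from the C side (prototypes as sketched near
# the top of this file; error handling elided):
#
#	struct aes_key key;
#	unsigned char in[16], out[16];
#
#	if (aes_p8_set_encrypt_key(userkey, 128, &key) == 0)
#		aes_p8_encrypt(in, out, &key);		/* one 16-byte block */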
534#########################################################################
535{{{	# CBC en- and decrypt procedures				#
536my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
537my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
538my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
539						map("v$_",(4..10));
540$code.=<<___;
541.globl	.${prefix}_cbc_encrypt
542	${UCMP}i	$len,16
543	bltlr-
544
545	cmpwi		$enc,0			# test direction
546	lis		r0,0xffe0
547	mfspr		$vrsave,256
548	mtspr		256,r0
549
550	li		$idx,15
551	vxor		$rndkey0,$rndkey0,$rndkey0
552	le?vspltisb	$tmp,0x0f
553
554	lvx		$ivec,0,$ivp		# load [unaligned] iv
555	lvsl		$inpperm,0,$ivp
556	lvx		$inptail,$idx,$ivp
557	le?vxor		$inpperm,$inpperm,$tmp
558	vperm		$ivec,$ivec,$inptail,$inpperm
559
560	neg		r11,$inp
561	?lvsl		$keyperm,0,$key		# prepare for unaligned key
562	lwz		$rounds,240($key)
563
564	lvsr		$inpperm,0,r11		# prepare for unaligned load
565	lvx		$inptail,0,$inp
566	addi		$inp,$inp,15		# 15 is not a typo
567	le?vxor		$inpperm,$inpperm,$tmp
568
569	?lvsr		$outperm,0,$out		# prepare for unaligned store
570	vspltisb	$outmask,-1
571	lvx		$outhead,0,$out
572	?vperm		$outmask,$rndkey0,$outmask,$outperm
573	le?vxor		$outperm,$outperm,$tmp
574
575	srwi		$rounds,$rounds,1
576	li		$idx,16
577	subi		$rounds,$rounds,1
578	beq		Lcbc_dec
579
580Lcbc_enc:
581	vmr		$inout,$inptail
582	lvx		$inptail,0,$inp
583	addi		$inp,$inp,16
584	mtctr		$rounds
585	subi		$len,$len,16		# len-=16
586
587	lvx		$rndkey0,0,$key
588	 vperm		$inout,$inout,$inptail,$inpperm
589	lvx		$rndkey1,$idx,$key
590	addi		$idx,$idx,16
591	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
592	vxor		$inout,$inout,$rndkey0
593	lvx		$rndkey0,$idx,$key
594	addi		$idx,$idx,16
595	vxor		$inout,$inout,$ivec
596
597Loop_cbc_enc:
598	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
599	vcipher		$inout,$inout,$rndkey1
600	lvx		$rndkey1,$idx,$key
601	addi		$idx,$idx,16
602	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
603	vcipher		$inout,$inout,$rndkey0
604	lvx		$rndkey0,$idx,$key
605	addi		$idx,$idx,16
606	bdnz		Loop_cbc_enc
607
608	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
609	vcipher		$inout,$inout,$rndkey1
610	lvx		$rndkey1,$idx,$key
611	li		$idx,16
612	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
613	vcipherlast	$ivec,$inout,$rndkey0
614	${UCMP}i	$len,16
615
616	vperm		$tmp,$ivec,$ivec,$outperm
617	vsel		$inout,$outhead,$tmp,$outmask
618	vmr		$outhead,$tmp
619	stvx		$inout,0,$out
620	addi		$out,$out,16
621	bge		Lcbc_enc
622
623	b		Lcbc_done
624
625.align	4
626Lcbc_dec:
627	${UCMP}i	$len,128
628	bge		_aesp8_cbc_decrypt8x
629	vmr		$tmp,$inptail
630	lvx		$inptail,0,$inp
631	addi		$inp,$inp,16
632	mtctr		$rounds
633	subi		$len,$len,16		# len-=16
634
635	lvx		$rndkey0,0,$key
636	 vperm		$tmp,$tmp,$inptail,$inpperm
637	lvx		$rndkey1,$idx,$key
638	addi		$idx,$idx,16
639	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
640	vxor		$inout,$tmp,$rndkey0
641	lvx		$rndkey0,$idx,$key
642	addi		$idx,$idx,16
643
644Loop_cbc_dec:
645	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
646	vncipher	$inout,$inout,$rndkey1
647	lvx		$rndkey1,$idx,$key
648	addi		$idx,$idx,16
649	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
650	vncipher	$inout,$inout,$rndkey0
651	lvx		$rndkey0,$idx,$key
652	addi		$idx,$idx,16
653	bdnz		Loop_cbc_dec
654
655	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
656	vncipher	$inout,$inout,$rndkey1
657	lvx		$rndkey1,$idx,$key
658	li		$idx,16
659	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
660	vncipherlast	$inout,$inout,$rndkey0
661	${UCMP}i	$len,16
662
663	vxor		$inout,$inout,$ivec
664	vmr		$ivec,$tmp
665	vperm		$tmp,$inout,$inout,$outperm
666	vsel		$inout,$outhead,$tmp,$outmask
667	vmr		$outhead,$tmp
668	stvx		$inout,0,$out
669	addi		$out,$out,16
670	bge		Lcbc_dec
671
672Lcbc_done:
673	addi		$out,$out,-1
674	lvx		$inout,0,$out		# redundant in aligned case
675	vsel		$inout,$outhead,$inout,$outmask
676	stvx		$inout,0,$out
677
678	neg		$enc,$ivp		# write [unaligned] iv
679	li		$idx,15			# 15 is not a typo
680	vxor		$rndkey0,$rndkey0,$rndkey0
681	vspltisb	$outmask,-1
682	le?vspltisb	$tmp,0x0f
683	?lvsl		$outperm,0,$enc
684	?vperm		$outmask,$rndkey0,$outmask,$outperm
685	le?vxor		$outperm,$outperm,$tmp
686	lvx		$outhead,0,$ivp
687	vperm		$ivec,$ivec,$ivec,$outperm
688	vsel		$inout,$outhead,$ivec,$outmask
689	lvx		$inptail,$idx,$ivp
690	stvx		$inout,0,$ivp
691	vsel		$inout,$ivec,$inptail,$outmask
692	stvx		$inout,$idx,$ivp
693
694	mtspr		256,$vrsave
695	blr
696	.long		0
697	.byte		0,12,0x14,0,0,0,6,0
698	.long		0
699___
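# The scalar paths above are textbook CBC, one block per iteration.  CBC
# encryption is inherently serial, but decryption only needs ciphertext
# as chaining input and therefore parallelizes, which is what the 8x path
# below exploits.  A C sketch of the decrypt direction (in-place safe;
# names as in the prototype sketch near the top of this file):
#
#	typedef unsigned char block[16];
#
#	void cbc_decrypt(block *out, const block *in, size_t n,
#			 const struct aes_key *key, block iv)
#	{
#		block prev, cur, tmp;
#
#		memcpy(prev, iv, 16);
#		for (size_t i = 0; i < n; i++) {
#			memcpy(cur, in[i], 16);
#			aes_p8_decrypt(cur, tmp, key);
#			for (int j = 0; j < 16; j++)
#				out[i][j] = tmp[j] ^ prev[j];
#			memcpy(prev, cur, 16);
#		}
#		memcpy(iv, prev, 16);
#	}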
700#########################################################################
701{{	# Optimized CBC decrypt procedure				#
702my $key_="r11";
703my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
704my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
705my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
706my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
707			# v26-v31 last 6 round keys
708my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
709
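# The 8x path keeps eight blocks in flight to hide vncipher latency.  With
# most of the vector file occupied by data and permutation masks, only
# v24-v31 remain for round keys, so all but the last six round keys are
# copied to an aligned stack area below and streamed through the v24/v25
# pair while the rounds loop runs.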
710$code.=<<___;
711.align	5
712_aesp8_cbc_decrypt8x:
713	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
714	li		r10,`$FRAME+8*16+15`
715	li		r11,`$FRAME+8*16+31`
716	stvx		v20,r10,$sp		# ABI says so
717	addi		r10,r10,32
718	stvx		v21,r11,$sp
719	addi		r11,r11,32
720	stvx		v22,r10,$sp
721	addi		r10,r10,32
722	stvx		v23,r11,$sp
723	addi		r11,r11,32
724	stvx		v24,r10,$sp
725	addi		r10,r10,32
726	stvx		v25,r11,$sp
727	addi		r11,r11,32
728	stvx		v26,r10,$sp
729	addi		r10,r10,32
730	stvx		v27,r11,$sp
731	addi		r11,r11,32
732	stvx		v28,r10,$sp
733	addi		r10,r10,32
734	stvx		v29,r11,$sp
735	addi		r11,r11,32
736	stvx		v30,r10,$sp
737	stvx		v31,r11,$sp
738	li		r0,-1
739	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
740	li		$x10,0x10
741	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
742	li		$x20,0x20
743	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
744	li		$x30,0x30
745	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
746	li		$x40,0x40
747	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
748	li		$x50,0x50
749	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
750	li		$x60,0x60
751	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
752	li		$x70,0x70
753	mtspr		256,r0
754
755	subi		$rounds,$rounds,3	# -4 in total
756	subi		$len,$len,128		# bias
757
758	lvx		$rndkey0,$x00,$key	# load key schedule
759	lvx		v30,$x10,$key
760	addi		$key,$key,0x20
761	lvx		v31,$x00,$key
762	?vperm		$rndkey0,$rndkey0,v30,$keyperm
763	addi		$key_,$sp,$FRAME+15
764	mtctr		$rounds
765
766Load_cbc_dec_key:
767	?vperm		v24,v30,v31,$keyperm
768	lvx		v30,$x10,$key
769	addi		$key,$key,0x20
770	stvx		v24,$x00,$key_		# off-load round[1]
771	?vperm		v25,v31,v30,$keyperm
772	lvx		v31,$x00,$key
773	stvx		v25,$x10,$key_		# off-load round[2]
774	addi		$key_,$key_,0x20
775	bdnz		Load_cbc_dec_key
776
777	lvx		v26,$x10,$key
778	?vperm		v24,v30,v31,$keyperm
779	lvx		v27,$x20,$key
780	stvx		v24,$x00,$key_		# off-load round[3]
781	?vperm		v25,v31,v26,$keyperm
782	lvx		v28,$x30,$key
783	stvx		v25,$x10,$key_		# off-load round[4]
784	addi		$key_,$sp,$FRAME+15	# rewind $key_
785	?vperm		v26,v26,v27,$keyperm
786	lvx		v29,$x40,$key
787	?vperm		v27,v27,v28,$keyperm
788	lvx		v30,$x50,$key
789	?vperm		v28,v28,v29,$keyperm
790	lvx		v31,$x60,$key
791	?vperm		v29,v29,v30,$keyperm
792	lvx		$out0,$x70,$key		# borrow $out0
793	?vperm		v30,v30,v31,$keyperm
794	lvx		v24,$x00,$key_		# pre-load round[1]
795	?vperm		v31,v31,$out0,$keyperm
796	lvx		v25,$x10,$key_		# pre-load round[2]
797
798	#lvx		$inptail,0,$inp		# "caller" already did this
799	#addi		$inp,$inp,15		# 15 is not a typo
800	subi		$inp,$inp,15		# undo "caller"
801
802	 le?li		$idx,8
803	lvx_u		$in0,$x00,$inp		# load first 8 "words"
804	 le?lvsl	$inpperm,0,$idx
805	 le?vspltisb	$tmp,0x0f
806	lvx_u		$in1,$x10,$inp
807	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
808	lvx_u		$in2,$x20,$inp
809	 le?vperm	$in0,$in0,$in0,$inpperm
810	lvx_u		$in3,$x30,$inp
811	 le?vperm	$in1,$in1,$in1,$inpperm
812	lvx_u		$in4,$x40,$inp
813	 le?vperm	$in2,$in2,$in2,$inpperm
814	vxor		$out0,$in0,$rndkey0
815	lvx_u		$in5,$x50,$inp
816	 le?vperm	$in3,$in3,$in3,$inpperm
817	vxor		$out1,$in1,$rndkey0
818	lvx_u		$in6,$x60,$inp
819	 le?vperm	$in4,$in4,$in4,$inpperm
820	vxor		$out2,$in2,$rndkey0
821	lvx_u		$in7,$x70,$inp
822	addi		$inp,$inp,0x80
823	 le?vperm	$in5,$in5,$in5,$inpperm
824	vxor		$out3,$in3,$rndkey0
825	 le?vperm	$in6,$in6,$in6,$inpperm
826	vxor		$out4,$in4,$rndkey0
827	 le?vperm	$in7,$in7,$in7,$inpperm
828	vxor		$out5,$in5,$rndkey0
829	vxor		$out6,$in6,$rndkey0
830	vxor		$out7,$in7,$rndkey0
831
832	mtctr		$rounds
833	b		Loop_cbc_dec8x
834.align	5
835Loop_cbc_dec8x:
836	vncipher	$out0,$out0,v24
837	vncipher	$out1,$out1,v24
838	vncipher	$out2,$out2,v24
839	vncipher	$out3,$out3,v24
840	vncipher	$out4,$out4,v24
841	vncipher	$out5,$out5,v24
842	vncipher	$out6,$out6,v24
843	vncipher	$out7,$out7,v24
844	lvx		v24,$x20,$key_		# round[3]
845	addi		$key_,$key_,0x20
846
847	vncipher	$out0,$out0,v25
848	vncipher	$out1,$out1,v25
849	vncipher	$out2,$out2,v25
850	vncipher	$out3,$out3,v25
851	vncipher	$out4,$out4,v25
852	vncipher	$out5,$out5,v25
853	vncipher	$out6,$out6,v25
854	vncipher	$out7,$out7,v25
855	lvx		v25,$x10,$key_		# round[4]
856	bdnz		Loop_cbc_dec8x
857
858	subic		$len,$len,128		# $len-=128
859	vncipher	$out0,$out0,v24
860	vncipher	$out1,$out1,v24
861	vncipher	$out2,$out2,v24
862	vncipher	$out3,$out3,v24
863	vncipher	$out4,$out4,v24
864	vncipher	$out5,$out5,v24
865	vncipher	$out6,$out6,v24
866	vncipher	$out7,$out7,v24
867
868	subfe.		r0,r0,r0		# borrow?-1:0
869	vncipher	$out0,$out0,v25
870	vncipher	$out1,$out1,v25
871	vncipher	$out2,$out2,v25
872	vncipher	$out3,$out3,v25
873	vncipher	$out4,$out4,v25
874	vncipher	$out5,$out5,v25
875	vncipher	$out6,$out6,v25
876	vncipher	$out7,$out7,v25
877
878	and		r0,r0,$len
879	vncipher	$out0,$out0,v26
880	vncipher	$out1,$out1,v26
881	vncipher	$out2,$out2,v26
882	vncipher	$out3,$out3,v26
883	vncipher	$out4,$out4,v26
884	vncipher	$out5,$out5,v26
885	vncipher	$out6,$out6,v26
886	vncipher	$out7,$out7,v26
887
888	add		$inp,$inp,r0		# $inp is adjusted in such
889						# a way that at exit from the
890						# loop inX-in7 are loaded
891						# with the last "words"
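	# The subic/subfe./and/add sequence above is branchless tail handling:
	# subic sets CA iff there was no borrow (at least 128 bytes remain),
	# subfe. r0,r0,r0 then yields 0 (no borrow) or -1 (borrow), and the
	# "and" with the updated length turns that into 0 or the negative
	# byte deficit.  Adding it to the input pointer rewinds the final
	# loads so they pick up the last complete blocks instead of reading
	# past the end of the buffer.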
892	vncipher	$out0,$out0,v27
893	vncipher	$out1,$out1,v27
894	vncipher	$out2,$out2,v27
895	vncipher	$out3,$out3,v27
896	vncipher	$out4,$out4,v27
897	vncipher	$out5,$out5,v27
898	vncipher	$out6,$out6,v27
899	vncipher	$out7,$out7,v27
900
901	addi		$key_,$sp,$FRAME+15	# rewind $key_
902	vncipher	$out0,$out0,v28
903	vncipher	$out1,$out1,v28
904	vncipher	$out2,$out2,v28
905	vncipher	$out3,$out3,v28
906	vncipher	$out4,$out4,v28
907	vncipher	$out5,$out5,v28
908	vncipher	$out6,$out6,v28
909	vncipher	$out7,$out7,v28
910	lvx		v24,$x00,$key_		# re-pre-load round[1]
911
912	vncipher	$out0,$out0,v29
913	vncipher	$out1,$out1,v29
914	vncipher	$out2,$out2,v29
915	vncipher	$out3,$out3,v29
916	vncipher	$out4,$out4,v29
917	vncipher	$out5,$out5,v29
918	vncipher	$out6,$out6,v29
919	vncipher	$out7,$out7,v29
920	lvx		v25,$x10,$key_		# re-pre-load round[2]
921
922	vncipher	$out0,$out0,v30
923	 vxor		$ivec,$ivec,v31		# xor with last round key
924	vncipher	$out1,$out1,v30
925	 vxor		$in0,$in0,v31
926	vncipher	$out2,$out2,v30
927	 vxor		$in1,$in1,v31
928	vncipher	$out3,$out3,v30
929	 vxor		$in2,$in2,v31
930	vncipher	$out4,$out4,v30
931	 vxor		$in3,$in3,v31
932	vncipher	$out5,$out5,v30
933	 vxor		$in4,$in4,v31
934	vncipher	$out6,$out6,v30
935	 vxor		$in5,$in5,v31
936	vncipher	$out7,$out7,v30
937	 vxor		$in6,$in6,v31
938
939	vncipherlast	$out0,$out0,$ivec
940	vncipherlast	$out1,$out1,$in0
941	 lvx_u		$in0,$x00,$inp		# load next input block
942	vncipherlast	$out2,$out2,$in1
943	 lvx_u		$in1,$x10,$inp
944	vncipherlast	$out3,$out3,$in2
945	 le?vperm	$in0,$in0,$in0,$inpperm
946	 lvx_u		$in2,$x20,$inp
947	vncipherlast	$out4,$out4,$in3
948	 le?vperm	$in1,$in1,$in1,$inpperm
949	 lvx_u		$in3,$x30,$inp
950	vncipherlast	$out5,$out5,$in4
951	 le?vperm	$in2,$in2,$in2,$inpperm
952	 lvx_u		$in4,$x40,$inp
953	vncipherlast	$out6,$out6,$in5
954	 le?vperm	$in3,$in3,$in3,$inpperm
955	 lvx_u		$in5,$x50,$inp
956	vncipherlast	$out7,$out7,$in6
957	 le?vperm	$in4,$in4,$in4,$inpperm
958	 lvx_u		$in6,$x60,$inp
959	vmr		$ivec,$in7
960	 le?vperm	$in5,$in5,$in5,$inpperm
961	 lvx_u		$in7,$x70,$inp
962	 addi		$inp,$inp,0x80
963
964	le?vperm	$out0,$out0,$out0,$inpperm
965	le?vperm	$out1,$out1,$out1,$inpperm
966	stvx_u		$out0,$x00,$out
967	 le?vperm	$in6,$in6,$in6,$inpperm
968	 vxor		$out0,$in0,$rndkey0
969	le?vperm	$out2,$out2,$out2,$inpperm
970	stvx_u		$out1,$x10,$out
971	 le?vperm	$in7,$in7,$in7,$inpperm
972	 vxor		$out1,$in1,$rndkey0
973	le?vperm	$out3,$out3,$out3,$inpperm
974	stvx_u		$out2,$x20,$out
975	 vxor		$out2,$in2,$rndkey0
976	le?vperm	$out4,$out4,$out4,$inpperm
977	stvx_u		$out3,$x30,$out
978	 vxor		$out3,$in3,$rndkey0
979	le?vperm	$out5,$out5,$out5,$inpperm
980	stvx_u		$out4,$x40,$out
981	 vxor		$out4,$in4,$rndkey0
982	le?vperm	$out6,$out6,$out6,$inpperm
983	stvx_u		$out5,$x50,$out
984	 vxor		$out5,$in5,$rndkey0
985	le?vperm	$out7,$out7,$out7,$inpperm
986	stvx_u		$out6,$x60,$out
987	 vxor		$out6,$in6,$rndkey0
988	stvx_u		$out7,$x70,$out
989	addi		$out,$out,0x80
990	 vxor		$out7,$in7,$rndkey0
991
992	mtctr		$rounds
993	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
994
995	addic.		$len,$len,128
996	beq		Lcbc_dec8x_done
997	nop
998	nop
999
1000Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
1001	vncipher	$out1,$out1,v24
1002	vncipher	$out2,$out2,v24
1003	vncipher	$out3,$out3,v24
1004	vncipher	$out4,$out4,v24
1005	vncipher	$out5,$out5,v24
1006	vncipher	$out6,$out6,v24
1007	vncipher	$out7,$out7,v24
1008	lvx		v24,$x20,$key_		# round[3]
1009	addi		$key_,$key_,0x20
1010
1011	vncipher	$out1,$out1,v25
1012	vncipher	$out2,$out2,v25
1013	vncipher	$out3,$out3,v25
1014	vncipher	$out4,$out4,v25
1015	vncipher	$out5,$out5,v25
1016	vncipher	$out6,$out6,v25
1017	vncipher	$out7,$out7,v25
1018	lvx		v25,$x10,$key_		# round[4]
1019	bdnz		Loop_cbc_dec8x_tail
1020
1021	vncipher	$out1,$out1,v24
1022	vncipher	$out2,$out2,v24
1023	vncipher	$out3,$out3,v24
1024	vncipher	$out4,$out4,v24
1025	vncipher	$out5,$out5,v24
1026	vncipher	$out6,$out6,v24
1027	vncipher	$out7,$out7,v24
1028
1029	vncipher	$out1,$out1,v25
1030	vncipher	$out2,$out2,v25
1031	vncipher	$out3,$out3,v25
1032	vncipher	$out4,$out4,v25
1033	vncipher	$out5,$out5,v25
1034	vncipher	$out6,$out6,v25
1035	vncipher	$out7,$out7,v25
1036
1037	vncipher	$out1,$out1,v26
1038	vncipher	$out2,$out2,v26
1039	vncipher	$out3,$out3,v26
1040	vncipher	$out4,$out4,v26
1041	vncipher	$out5,$out5,v26
1042	vncipher	$out6,$out6,v26
1043	vncipher	$out7,$out7,v26
1044
1045	vncipher	$out1,$out1,v27
1046	vncipher	$out2,$out2,v27
1047	vncipher	$out3,$out3,v27
1048	vncipher	$out4,$out4,v27
1049	vncipher	$out5,$out5,v27
1050	vncipher	$out6,$out6,v27
1051	vncipher	$out7,$out7,v27
1052
1053	vncipher	$out1,$out1,v28
1054	vncipher	$out2,$out2,v28
1055	vncipher	$out3,$out3,v28
1056	vncipher	$out4,$out4,v28
1057	vncipher	$out5,$out5,v28
1058	vncipher	$out6,$out6,v28
1059	vncipher	$out7,$out7,v28
1060
1061	vncipher	$out1,$out1,v29
1062	vncipher	$out2,$out2,v29
1063	vncipher	$out3,$out3,v29
1064	vncipher	$out4,$out4,v29
1065	vncipher	$out5,$out5,v29
1066	vncipher	$out6,$out6,v29
1067	vncipher	$out7,$out7,v29
1068
1069	vncipher	$out1,$out1,v30
1070	 vxor		$ivec,$ivec,v31		# last round key
1071	vncipher	$out2,$out2,v30
1072	 vxor		$in1,$in1,v31
1073	vncipher	$out3,$out3,v30
1074	 vxor		$in2,$in2,v31
1075	vncipher	$out4,$out4,v30
1076	 vxor		$in3,$in3,v31
1077	vncipher	$out5,$out5,v30
1078	 vxor		$in4,$in4,v31
1079	vncipher	$out6,$out6,v30
1080	 vxor		$in5,$in5,v31
1081	vncipher	$out7,$out7,v30
1082	 vxor		$in6,$in6,v31
1083
1084	cmplwi		$len,32			# switch($len)
1085	blt		Lcbc_dec8x_one
1086	nop
1087	beq		Lcbc_dec8x_two
1088	cmplwi		$len,64
1089	blt		Lcbc_dec8x_three
1090	nop
1091	beq		Lcbc_dec8x_four
1092	cmplwi		$len,96
1093	blt		Lcbc_dec8x_five
1094	nop
1095	beq		Lcbc_dec8x_six
1096
1097Lcbc_dec8x_seven:
1098	vncipherlast	$out1,$out1,$ivec
1099	vncipherlast	$out2,$out2,$in1
1100	vncipherlast	$out3,$out3,$in2
1101	vncipherlast	$out4,$out4,$in3
1102	vncipherlast	$out5,$out5,$in4
1103	vncipherlast	$out6,$out6,$in5
1104	vncipherlast	$out7,$out7,$in6
1105	vmr		$ivec,$in7
1106
1107	le?vperm	$out1,$out1,$out1,$inpperm
1108	le?vperm	$out2,$out2,$out2,$inpperm
1109	stvx_u		$out1,$x00,$out
1110	le?vperm	$out3,$out3,$out3,$inpperm
1111	stvx_u		$out2,$x10,$out
1112	le?vperm	$out4,$out4,$out4,$inpperm
1113	stvx_u		$out3,$x20,$out
1114	le?vperm	$out5,$out5,$out5,$inpperm
1115	stvx_u		$out4,$x30,$out
1116	le?vperm	$out6,$out6,$out6,$inpperm
1117	stvx_u		$out5,$x40,$out
1118	le?vperm	$out7,$out7,$out7,$inpperm
1119	stvx_u		$out6,$x50,$out
1120	stvx_u		$out7,$x60,$out
1121	addi		$out,$out,0x70
1122	b		Lcbc_dec8x_done
1123
1124.align	5
1125Lcbc_dec8x_six:
1126	vncipherlast	$out2,$out2,$ivec
1127	vncipherlast	$out3,$out3,$in2
1128	vncipherlast	$out4,$out4,$in3
1129	vncipherlast	$out5,$out5,$in4
1130	vncipherlast	$out6,$out6,$in5
1131	vncipherlast	$out7,$out7,$in6
1132	vmr		$ivec,$in7
1133
1134	le?vperm	$out2,$out2,$out2,$inpperm
1135	le?vperm	$out3,$out3,$out3,$inpperm
1136	stvx_u		$out2,$x00,$out
1137	le?vperm	$out4,$out4,$out4,$inpperm
1138	stvx_u		$out3,$x10,$out
1139	le?vperm	$out5,$out5,$out5,$inpperm
1140	stvx_u		$out4,$x20,$out
1141	le?vperm	$out6,$out6,$out6,$inpperm
1142	stvx_u		$out5,$x30,$out
1143	le?vperm	$out7,$out7,$out7,$inpperm
1144	stvx_u		$out6,$x40,$out
1145	stvx_u		$out7,$x50,$out
1146	addi		$out,$out,0x60
1147	b		Lcbc_dec8x_done
1148
1149.align	5
1150Lcbc_dec8x_five:
1151	vncipherlast	$out3,$out3,$ivec
1152	vncipherlast	$out4,$out4,$in3
1153	vncipherlast	$out5,$out5,$in4
1154	vncipherlast	$out6,$out6,$in5
1155	vncipherlast	$out7,$out7,$in6
1156	vmr		$ivec,$in7
1157
1158	le?vperm	$out3,$out3,$out3,$inpperm
1159	le?vperm	$out4,$out4,$out4,$inpperm
1160	stvx_u		$out3,$x00,$out
1161	le?vperm	$out5,$out5,$out5,$inpperm
1162	stvx_u		$out4,$x10,$out
1163	le?vperm	$out6,$out6,$out6,$inpperm
1164	stvx_u		$out5,$x20,$out
1165	le?vperm	$out7,$out7,$out7,$inpperm
1166	stvx_u		$out6,$x30,$out
1167	stvx_u		$out7,$x40,$out
1168	addi		$out,$out,0x50
1169	b		Lcbc_dec8x_done
1170
1171.align	5
1172Lcbc_dec8x_four:
1173	vncipherlast	$out4,$out4,$ivec
1174	vncipherlast	$out5,$out5,$in4
1175	vncipherlast	$out6,$out6,$in5
1176	vncipherlast	$out7,$out7,$in6
1177	vmr		$ivec,$in7
1178
1179	le?vperm	$out4,$out4,$out4,$inpperm
1180	le?vperm	$out5,$out5,$out5,$inpperm
1181	stvx_u		$out4,$x00,$out
1182	le?vperm	$out6,$out6,$out6,$inpperm
1183	stvx_u		$out5,$x10,$out
1184	le?vperm	$out7,$out7,$out7,$inpperm
1185	stvx_u		$out6,$x20,$out
1186	stvx_u		$out7,$x30,$out
1187	addi		$out,$out,0x40
1188	b		Lcbc_dec8x_done
1189
1190.align	5
1191Lcbc_dec8x_three:
1192	vncipherlast	$out5,$out5,$ivec
1193	vncipherlast	$out6,$out6,$in5
1194	vncipherlast	$out7,$out7,$in6
1195	vmr		$ivec,$in7
1196
1197	le?vperm	$out5,$out5,$out5,$inpperm
1198	le?vperm	$out6,$out6,$out6,$inpperm
1199	stvx_u		$out5,$x00,$out
1200	le?vperm	$out7,$out7,$out7,$inpperm
1201	stvx_u		$out6,$x10,$out
1202	stvx_u		$out7,$x20,$out
1203	addi		$out,$out,0x30
1204	b		Lcbc_dec8x_done
1205
1206.align	5
1207Lcbc_dec8x_two:
1208	vncipherlast	$out6,$out6,$ivec
1209	vncipherlast	$out7,$out7,$in6
1210	vmr		$ivec,$in7
1211
1212	le?vperm	$out6,$out6,$out6,$inpperm
1213	le?vperm	$out7,$out7,$out7,$inpperm
1214	stvx_u		$out6,$x00,$out
1215	stvx_u		$out7,$x10,$out
1216	addi		$out,$out,0x20
1217	b		Lcbc_dec8x_done
1218
1219.align	5
1220Lcbc_dec8x_one:
1221	vncipherlast	$out7,$out7,$ivec
1222	vmr		$ivec,$in7
1223
1224	le?vperm	$out7,$out7,$out7,$inpperm
1225	stvx_u		$out7,0,$out
1226	addi		$out,$out,0x10
1227
1228Lcbc_dec8x_done:
1229	le?vperm	$ivec,$ivec,$ivec,$inpperm
1230	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1231
1232	li		r10,`$FRAME+15`
1233	li		r11,`$FRAME+31`
1234	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1235	addi		r10,r10,32
1236	stvx		$inpperm,r11,$sp
1237	addi		r11,r11,32
1238	stvx		$inpperm,r10,$sp
1239	addi		r10,r10,32
1240	stvx		$inpperm,r11,$sp
1241	addi		r11,r11,32
1242	stvx		$inpperm,r10,$sp
1243	addi		r10,r10,32
1244	stvx		$inpperm,r11,$sp
1245	addi		r11,r11,32
1246	stvx		$inpperm,r10,$sp
1247	addi		r10,r10,32
1248	stvx		$inpperm,r11,$sp
1249	addi		r11,r11,32
1250
1251	mtspr		256,$vrsave
1252	lvx		v20,r10,$sp		# ABI says so
1253	addi		r10,r10,32
1254	lvx		v21,r11,$sp
1255	addi		r11,r11,32
1256	lvx		v22,r10,$sp
1257	addi		r10,r10,32
1258	lvx		v23,r11,$sp
1259	addi		r11,r11,32
1260	lvx		v24,r10,$sp
1261	addi		r10,r10,32
1262	lvx		v25,r11,$sp
1263	addi		r11,r11,32
1264	lvx		v26,r10,$sp
1265	addi		r10,r10,32
1266	lvx		v27,r11,$sp
1267	addi		r11,r11,32
1268	lvx		v28,r10,$sp
1269	addi		r10,r10,32
1270	lvx		v29,r11,$sp
1271	addi		r11,r11,32
1272	lvx		v30,r10,$sp
1273	lvx		v31,r11,$sp
1274	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1275	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1276	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1277	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1278	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1279	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1280	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1281	blr
1282	.long		0
1283	.byte		0,12,0x14,0,0x80,6,6,0
1284	.long		0
1285.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1286___
1287}}	}}}
1288
1289#########################################################################
1290{{{	# CTR procedure[s]						#
1291
1292####################### WARNING: Here be dragons! #######################
1293#
1294# This code is written as 'ctr32', based on a 32-bit counter used
1295# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
1296# a 128-bit counter.
1297#
1298# This leads to subtle changes from the upstream code: the counter
1299# is incremented with vadduqm rather than vadduwm. This occurs in
1300# both the bulk (8 blocks at a time) path, and in the individual block
1301# path. Be aware of this when doing updates.
1302#
1303# See:
1304# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
1305# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
1306# https://github.com/openssl/openssl/pull/8942
1307#
1308#########################################################################
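# In C terms, the kernel-facing behaviour is plain CTR with a 128-bit
# big-endian counter.  A sketch (E() stands for the single-block encrypt):
#
#	static void ctr128_inc(unsigned char ctr[16])
#	{
#		for (int i = 15; i >= 0; i--)
#			if (++ctr[i] != 0)
#				break;		/* stop once no carry-out */
#	}
#
#	/* an upstream-style 'ctr32' would stop at i == 12 instead,
#	 * wrapping only the low 32 bits of the counter */
#
#	for (size_t i = 0; i < blocks; i++) {
#		unsigned char ks[16];
#
#		E(ks, ctr, key);		/* keystream block */
#		for (int j = 0; j < 16; j++)
#			out[16 * i + j] = in[16 * i + j] ^ ks[j];
#		ctr128_inc(ctr);
#	}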
1309my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1310my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1311my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1312						map("v$_",(4..11));
1313my $dat=$tmp;
1314
1315$code.=<<___;
1316.globl	.${prefix}_ctr32_encrypt_blocks
1317	${UCMP}i	$len,1
1318	bltlr-
1319
1320	lis		r0,0xfff0
1321	mfspr		$vrsave,256
1322	mtspr		256,r0
1323
1324	li		$idx,15
1325	vxor		$rndkey0,$rndkey0,$rndkey0
1326	le?vspltisb	$tmp,0x0f
1327
1328	lvx		$ivec,0,$ivp		# load [unaligned] iv
1329	lvsl		$inpperm,0,$ivp
1330	lvx		$inptail,$idx,$ivp
1331	 vspltisb	$one,1
1332	le?vxor		$inpperm,$inpperm,$tmp
1333	vperm		$ivec,$ivec,$inptail,$inpperm
1334	 vsldoi		$one,$rndkey0,$one,1
1335
1336	neg		r11,$inp
1337	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1338	lwz		$rounds,240($key)
1339
1340	lvsr		$inpperm,0,r11		# prepare for unaligned load
1341	lvx		$inptail,0,$inp
1342	addi		$inp,$inp,15		# 15 is not a typo
1343	le?vxor		$inpperm,$inpperm,$tmp
1344
1345	srwi		$rounds,$rounds,1
1346	li		$idx,16
1347	subi		$rounds,$rounds,1
1348
1349	${UCMP}i	$len,8
1350	bge		_aesp8_ctr32_encrypt8x
1351
1352	?lvsr		$outperm,0,$out		# prepare for unaligned store
1353	vspltisb	$outmask,-1
1354	lvx		$outhead,0,$out
1355	?vperm		$outmask,$rndkey0,$outmask,$outperm
1356	le?vxor		$outperm,$outperm,$tmp
1357
1358	lvx		$rndkey0,0,$key
1359	mtctr		$rounds
1360	lvx		$rndkey1,$idx,$key
1361	addi		$idx,$idx,16
1362	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1363	vxor		$inout,$ivec,$rndkey0
1364	lvx		$rndkey0,$idx,$key
1365	addi		$idx,$idx,16
1366	b		Loop_ctr32_enc
1367
1368.align	5
1369Loop_ctr32_enc:
1370	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1371	vcipher		$inout,$inout,$rndkey1
1372	lvx		$rndkey1,$idx,$key
1373	addi		$idx,$idx,16
1374	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1375	vcipher		$inout,$inout,$rndkey0
1376	lvx		$rndkey0,$idx,$key
1377	addi		$idx,$idx,16
1378	bdnz		Loop_ctr32_enc
1379
1380	vadduqm		$ivec,$ivec,$one	# Kernel change for 128-bit
1381	 vmr		$dat,$inptail
1382	 lvx		$inptail,0,$inp
1383	 addi		$inp,$inp,16
1384	 subic.		$len,$len,1		# blocks--
1385
1386	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1387	vcipher		$inout,$inout,$rndkey1
1388	lvx		$rndkey1,$idx,$key
1389	 vperm		$dat,$dat,$inptail,$inpperm
1390	 li		$idx,16
1391	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1392	 lvx		$rndkey0,0,$key
1393	vxor		$dat,$dat,$rndkey1	# last round key
1394	vcipherlast	$inout,$inout,$dat
1395
1396	 lvx		$rndkey1,$idx,$key
1397	 addi		$idx,$idx,16
1398	vperm		$inout,$inout,$inout,$outperm
1399	vsel		$dat,$outhead,$inout,$outmask
1400	 mtctr		$rounds
1401	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1402	vmr		$outhead,$inout
1403	 vxor		$inout,$ivec,$rndkey0
1404	 lvx		$rndkey0,$idx,$key
1405	 addi		$idx,$idx,16
1406	stvx		$dat,0,$out
1407	addi		$out,$out,16
1408	bne		Loop_ctr32_enc
1409
1410	addi		$out,$out,-1
1411	lvx		$inout,0,$out		# redundant in aligned case
1412	vsel		$inout,$outhead,$inout,$outmask
1413	stvx		$inout,0,$out
1414
1415	mtspr		256,$vrsave
1416	blr
1417	.long		0
1418	.byte		0,12,0x14,0,0,0,6,0
1419	.long		0
1420___
1421#########################################################################
1422{{	# Optimized CTR procedure					#
1423my $key_="r11";
1424my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1425my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1426my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1427my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
1428			# v26-v31 last 6 round keys
1429my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
1430my ($two,$three,$four)=($outhead,$outperm,$outmask);
1431
1432$code.=<<___;
1433.align	5
1434_aesp8_ctr32_encrypt8x:
1435	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1436	li		r10,`$FRAME+8*16+15`
1437	li		r11,`$FRAME+8*16+31`
1438	stvx		v20,r10,$sp		# ABI says so
1439	addi		r10,r10,32
1440	stvx		v21,r11,$sp
1441	addi		r11,r11,32
1442	stvx		v22,r10,$sp
1443	addi		r10,r10,32
1444	stvx		v23,r11,$sp
1445	addi		r11,r11,32
1446	stvx		v24,r10,$sp
1447	addi		r10,r10,32
1448	stvx		v25,r11,$sp
1449	addi		r11,r11,32
1450	stvx		v26,r10,$sp
1451	addi		r10,r10,32
1452	stvx		v27,r11,$sp
1453	addi		r11,r11,32
1454	stvx		v28,r10,$sp
1455	addi		r10,r10,32
1456	stvx		v29,r11,$sp
1457	addi		r11,r11,32
1458	stvx		v30,r10,$sp
1459	stvx		v31,r11,$sp
1460	li		r0,-1
1461	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1462	li		$x10,0x10
1463	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1464	li		$x20,0x20
1465	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1466	li		$x30,0x30
1467	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1468	li		$x40,0x40
1469	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1470	li		$x50,0x50
1471	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1472	li		$x60,0x60
1473	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1474	li		$x70,0x70
1475	mtspr		256,r0
1476
1477	subi		$rounds,$rounds,3	# -4 in total
1478
1479	lvx		$rndkey0,$x00,$key	# load key schedule
1480	lvx		v30,$x10,$key
1481	addi		$key,$key,0x20
1482	lvx		v31,$x00,$key
1483	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1484	addi		$key_,$sp,$FRAME+15
1485	mtctr		$rounds
1486
1487Load_ctr32_enc_key:
1488	?vperm		v24,v30,v31,$keyperm
1489	lvx		v30,$x10,$key
1490	addi		$key,$key,0x20
1491	stvx		v24,$x00,$key_		# off-load round[1]
1492	?vperm		v25,v31,v30,$keyperm
1493	lvx		v31,$x00,$key
1494	stvx		v25,$x10,$key_		# off-load round[2]
1495	addi		$key_,$key_,0x20
1496	bdnz		Load_ctr32_enc_key
1497
1498	lvx		v26,$x10,$key
1499	?vperm		v24,v30,v31,$keyperm
1500	lvx		v27,$x20,$key
1501	stvx		v24,$x00,$key_		# off-load round[3]
1502	?vperm		v25,v31,v26,$keyperm
1503	lvx		v28,$x30,$key
1504	stvx		v25,$x10,$key_		# off-load round[4]
1505	addi		$key_,$sp,$FRAME+15	# rewind $key_
1506	?vperm		v26,v26,v27,$keyperm
1507	lvx		v29,$x40,$key
1508	?vperm		v27,v27,v28,$keyperm
1509	lvx		v30,$x50,$key
1510	?vperm		v28,v28,v29,$keyperm
1511	lvx		v31,$x60,$key
1512	?vperm		v29,v29,v30,$keyperm
1513	lvx		$out0,$x70,$key		# borrow $out0
1514	?vperm		v30,v30,v31,$keyperm
1515	lvx		v24,$x00,$key_		# pre-load round[1]
1516	?vperm		v31,v31,$out0,$keyperm
1517	lvx		v25,$x10,$key_		# pre-load round[2]
1518
1519	vadduqm		$two,$one,$one
1520	subi		$inp,$inp,15		# undo "caller"
1521	$SHL		$len,$len,4
1522
1523	vadduqm		$out1,$ivec,$one	# counter values ...
1524	vadduqm		$out2,$ivec,$two	# (do all ctr adds as 128-bit)
1525	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1526	 le?li		$idx,8
1527	vadduqm		$out3,$out1,$two
1528	vxor		$out1,$out1,$rndkey0
1529	 le?lvsl	$inpperm,0,$idx
1530	vadduqm		$out4,$out2,$two
1531	vxor		$out2,$out2,$rndkey0
1532	 le?vspltisb	$tmp,0x0f
1533	vadduqm		$out5,$out3,$two
1534	vxor		$out3,$out3,$rndkey0
1535	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1536	vadduqm		$out6,$out4,$two
1537	vxor		$out4,$out4,$rndkey0
1538	vadduqm		$out7,$out5,$two
1539	vxor		$out5,$out5,$rndkey0
1540	vadduqm		$ivec,$out6,$two	# next counter value
1541	vxor		$out6,$out6,$rndkey0
1542	vxor		$out7,$out7,$rndkey0
1543
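	# At this point the eight "out" registers hold counter+0 through
	# counter+7, each already xored with round key zero (the initial
	# AddRoundKey), and the counter for the next iteration (counter+8)
	# has been computed up front.  All counter arithmetic is 128-bit
	# vadduqm; see the "Here be dragons" warning above.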
1544	mtctr		$rounds
1545	b		Loop_ctr32_enc8x
1546.align	5
1547Loop_ctr32_enc8x:
1548	vcipher 	$out0,$out0,v24
1549	vcipher 	$out1,$out1,v24
1550	vcipher 	$out2,$out2,v24
1551	vcipher 	$out3,$out3,v24
1552	vcipher 	$out4,$out4,v24
1553	vcipher 	$out5,$out5,v24
1554	vcipher 	$out6,$out6,v24
1555	vcipher 	$out7,$out7,v24
1556Loop_ctr32_enc8x_middle:
1557	lvx		v24,$x20,$key_		# round[3]
1558	addi		$key_,$key_,0x20
1559
1560	vcipher 	$out0,$out0,v25
1561	vcipher 	$out1,$out1,v25
1562	vcipher 	$out2,$out2,v25
1563	vcipher 	$out3,$out3,v25
1564	vcipher 	$out4,$out4,v25
1565	vcipher 	$out5,$out5,v25
1566	vcipher 	$out6,$out6,v25
1567	vcipher 	$out7,$out7,v25
1568	lvx		v25,$x10,$key_		# round[4]
1569	bdnz		Loop_ctr32_enc8x
1570
1571	subic		r11,$len,256		# $len-256, borrow $key_
1572	vcipher 	$out0,$out0,v24
1573	vcipher 	$out1,$out1,v24
1574	vcipher 	$out2,$out2,v24
1575	vcipher 	$out3,$out3,v24
1576	vcipher 	$out4,$out4,v24
1577	vcipher 	$out5,$out5,v24
1578	vcipher 	$out6,$out6,v24
1579	vcipher 	$out7,$out7,v24
1580
1581	subfe		r0,r0,r0		# borrow?-1:0
1582	vcipher 	$out0,$out0,v25
1583	vcipher 	$out1,$out1,v25
1584	vcipher 	$out2,$out2,v25
1585	vcipher 	$out3,$out3,v25
1586	vcipher 	$out4,$out4,v25
1587	vcipher		$out5,$out5,v25
1588	vcipher		$out6,$out6,v25
1589	vcipher		$out7,$out7,v25
1590
1591	and		r0,r0,r11
1592	addi		$key_,$sp,$FRAME+15	# rewind $key_
1593	vcipher		$out0,$out0,v26
1594	vcipher		$out1,$out1,v26
1595	vcipher		$out2,$out2,v26
1596	vcipher		$out3,$out3,v26
1597	vcipher		$out4,$out4,v26
1598	vcipher		$out5,$out5,v26
1599	vcipher		$out6,$out6,v26
1600	vcipher		$out7,$out7,v26
1601	lvx		v24,$x00,$key_		# re-pre-load round[1]
1602
1603	subic		$len,$len,129		# $len-=129
1604	vcipher		$out0,$out0,v27
1605	addi		$len,$len,1		# $len-=128 really
1606	vcipher		$out1,$out1,v27
1607	vcipher		$out2,$out2,v27
1608	vcipher		$out3,$out3,v27
1609	vcipher		$out4,$out4,v27
1610	vcipher		$out5,$out5,v27
1611	vcipher		$out6,$out6,v27
1612	vcipher		$out7,$out7,v27
1613	lvx		v25,$x10,$key_		# re-pre-load round[2]
1614
1615	vcipher		$out0,$out0,v28
1616	 lvx_u		$in0,$x00,$inp		# load input
1617	vcipher		$out1,$out1,v28
1618	 lvx_u		$in1,$x10,$inp
1619	vcipher		$out2,$out2,v28
1620	 lvx_u		$in2,$x20,$inp
1621	vcipher		$out3,$out3,v28
1622	 lvx_u		$in3,$x30,$inp
1623	vcipher		$out4,$out4,v28
1624	 lvx_u		$in4,$x40,$inp
1625	vcipher		$out5,$out5,v28
1626	 lvx_u		$in5,$x50,$inp
1627	vcipher		$out6,$out6,v28
1628	 lvx_u		$in6,$x60,$inp
1629	vcipher		$out7,$out7,v28
1630	 lvx_u		$in7,$x70,$inp
1631	 addi		$inp,$inp,0x80
1632
1633	vcipher		$out0,$out0,v29
1634	 le?vperm	$in0,$in0,$in0,$inpperm
1635	vcipher		$out1,$out1,v29
1636	 le?vperm	$in1,$in1,$in1,$inpperm
1637	vcipher		$out2,$out2,v29
1638	 le?vperm	$in2,$in2,$in2,$inpperm
1639	vcipher		$out3,$out3,v29
1640	 le?vperm	$in3,$in3,$in3,$inpperm
1641	vcipher		$out4,$out4,v29
1642	 le?vperm	$in4,$in4,$in4,$inpperm
1643	vcipher		$out5,$out5,v29
1644	 le?vperm	$in5,$in5,$in5,$inpperm
1645	vcipher		$out6,$out6,v29
1646	 le?vperm	$in6,$in6,$in6,$inpperm
1647	vcipher		$out7,$out7,v29
1648	 le?vperm	$in7,$in7,$in7,$inpperm
1649
1650	add		$inp,$inp,r0		# $inp is adjusted in such
1651						# a way that at exit from the
1652						# loop inX-in7 are loaded
1653						# with the last "words"
1654	subfe.		r0,r0,r0		# borrow?-1:0
1655	vcipher		$out0,$out0,v30
1656	 vxor		$in0,$in0,v31		# xor with last round key
1657	vcipher		$out1,$out1,v30
1658	 vxor		$in1,$in1,v31
1659	vcipher		$out2,$out2,v30
1660	 vxor		$in2,$in2,v31
1661	vcipher		$out3,$out3,v30
1662	 vxor		$in3,$in3,v31
1663	vcipher		$out4,$out4,v30
1664	 vxor		$in4,$in4,v31
1665	vcipher		$out5,$out5,v30
1666	 vxor		$in5,$in5,v31
1667	vcipher		$out6,$out6,v30
1668	 vxor		$in6,$in6,v31
1669	vcipher		$out7,$out7,v30
1670	 vxor		$in7,$in7,v31
1671
1672	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1673
1674	vcipherlast	$in0,$out0,$in0
1675	vcipherlast	$in1,$out1,$in1
1676	 vadduqm	$out1,$ivec,$one	# counter values ...
1677	vcipherlast	$in2,$out2,$in2
1678	 vadduqm	$out2,$ivec,$two
1679	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1680	vcipherlast	$in3,$out3,$in3
1681	 vadduqm	$out3,$out1,$two
1682	 vxor		$out1,$out1,$rndkey0
1683	vcipherlast	$in4,$out4,$in4
1684	 vadduqm	$out4,$out2,$two
1685	 vxor		$out2,$out2,$rndkey0
1686	vcipherlast	$in5,$out5,$in5
1687	 vadduqm	$out5,$out3,$two
1688	 vxor		$out3,$out3,$rndkey0
1689	vcipherlast	$in6,$out6,$in6
1690	 vadduqm	$out6,$out4,$two
1691	 vxor		$out4,$out4,$rndkey0
1692	vcipherlast	$in7,$out7,$in7
1693	 vadduqm	$out7,$out5,$two
1694	 vxor		$out5,$out5,$rndkey0
1695	le?vperm	$in0,$in0,$in0,$inpperm
1696	 vadduqm	$ivec,$out6,$two	# next counter value
1697	 vxor		$out6,$out6,$rndkey0
1698	le?vperm	$in1,$in1,$in1,$inpperm
1699	 vxor		$out7,$out7,$rndkey0
1700	mtctr		$rounds
1701
1702	 vcipher	$out0,$out0,v24
1703	stvx_u		$in0,$x00,$out
1704	le?vperm	$in2,$in2,$in2,$inpperm
1705	 vcipher	$out1,$out1,v24
1706	stvx_u		$in1,$x10,$out
1707	le?vperm	$in3,$in3,$in3,$inpperm
1708	 vcipher	$out2,$out2,v24
1709	stvx_u		$in2,$x20,$out
1710	le?vperm	$in4,$in4,$in4,$inpperm
1711	 vcipher	$out3,$out3,v24
1712	stvx_u		$in3,$x30,$out
1713	le?vperm	$in5,$in5,$in5,$inpperm
1714	 vcipher	$out4,$out4,v24
1715	stvx_u		$in4,$x40,$out
1716	le?vperm	$in6,$in6,$in6,$inpperm
1717	 vcipher	$out5,$out5,v24
1718	stvx_u		$in5,$x50,$out
1719	le?vperm	$in7,$in7,$in7,$inpperm
1720	 vcipher	$out6,$out6,v24
1721	stvx_u		$in6,$x60,$out
1722	 vcipher	$out7,$out7,v24
1723	stvx_u		$in7,$x70,$out
1724	addi		$out,$out,0x80
1725
1726	b		Loop_ctr32_enc8x_middle
1727
1728.align	5
1729Lctr32_enc8x_break:
1730	cmpwi		$len,-0x60
1731	blt		Lctr32_enc8x_one
1732	nop
1733	beq		Lctr32_enc8x_two
1734	cmpwi		$len,-0x40
1735	blt		Lctr32_enc8x_three
1736	nop
1737	beq		Lctr32_enc8x_four
1738	cmpwi		$len,-0x20
1739	blt		Lctr32_enc8x_five
1740	nop
1741	beq		Lctr32_enc8x_six
1742	cmpwi		$len,0x00
1743	blt		Lctr32_enc8x_seven
1744
1745Lctr32_enc8x_eight:
1746	vcipherlast	$out0,$out0,$in0
1747	vcipherlast	$out1,$out1,$in1
1748	vcipherlast	$out2,$out2,$in2
1749	vcipherlast	$out3,$out3,$in3
1750	vcipherlast	$out4,$out4,$in4
1751	vcipherlast	$out5,$out5,$in5
1752	vcipherlast	$out6,$out6,$in6
1753	vcipherlast	$out7,$out7,$in7
1754
1755	le?vperm	$out0,$out0,$out0,$inpperm
1756	le?vperm	$out1,$out1,$out1,$inpperm
1757	stvx_u		$out0,$x00,$out
1758	le?vperm	$out2,$out2,$out2,$inpperm
1759	stvx_u		$out1,$x10,$out
1760	le?vperm	$out3,$out3,$out3,$inpperm
1761	stvx_u		$out2,$x20,$out
1762	le?vperm	$out4,$out4,$out4,$inpperm
1763	stvx_u		$out3,$x30,$out
1764	le?vperm	$out5,$out5,$out5,$inpperm
1765	stvx_u		$out4,$x40,$out
1766	le?vperm	$out6,$out6,$out6,$inpperm
1767	stvx_u		$out5,$x50,$out
1768	le?vperm	$out7,$out7,$out7,$inpperm
1769	stvx_u		$out6,$x60,$out
1770	stvx_u		$out7,$x70,$out
1771	addi		$out,$out,0x80
1772	b		Lctr32_enc8x_done
1773
1774.align	5
1775Lctr32_enc8x_seven:
1776	vcipherlast	$out0,$out0,$in1
1777	vcipherlast	$out1,$out1,$in2
1778	vcipherlast	$out2,$out2,$in3
1779	vcipherlast	$out3,$out3,$in4
1780	vcipherlast	$out4,$out4,$in5
1781	vcipherlast	$out5,$out5,$in6
1782	vcipherlast	$out6,$out6,$in7
1783
1784	le?vperm	$out0,$out0,$out0,$inpperm
1785	le?vperm	$out1,$out1,$out1,$inpperm
1786	stvx_u		$out0,$x00,$out
1787	le?vperm	$out2,$out2,$out2,$inpperm
1788	stvx_u		$out1,$x10,$out
1789	le?vperm	$out3,$out3,$out3,$inpperm
1790	stvx_u		$out2,$x20,$out
1791	le?vperm	$out4,$out4,$out4,$inpperm
1792	stvx_u		$out3,$x30,$out
1793	le?vperm	$out5,$out5,$out5,$inpperm
1794	stvx_u		$out4,$x40,$out
1795	le?vperm	$out6,$out6,$out6,$inpperm
1796	stvx_u		$out5,$x50,$out
1797	stvx_u		$out6,$x60,$out
1798	addi		$out,$out,0x70
1799	b		Lctr32_enc8x_done
1800
1801.align	5
1802Lctr32_enc8x_six:
1803	vcipherlast	$out0,$out0,$in2
1804	vcipherlast	$out1,$out1,$in3
1805	vcipherlast	$out2,$out2,$in4
1806	vcipherlast	$out3,$out3,$in5
1807	vcipherlast	$out4,$out4,$in6
1808	vcipherlast	$out5,$out5,$in7
1809
1810	le?vperm	$out0,$out0,$out0,$inpperm
1811	le?vperm	$out1,$out1,$out1,$inpperm
1812	stvx_u		$out0,$x00,$out
1813	le?vperm	$out2,$out2,$out2,$inpperm
1814	stvx_u		$out1,$x10,$out
1815	le?vperm	$out3,$out3,$out3,$inpperm
1816	stvx_u		$out2,$x20,$out
1817	le?vperm	$out4,$out4,$out4,$inpperm
1818	stvx_u		$out3,$x30,$out
1819	le?vperm	$out5,$out5,$out5,$inpperm
1820	stvx_u		$out4,$x40,$out
1821	stvx_u		$out5,$x50,$out
1822	addi		$out,$out,0x60
1823	b		Lctr32_enc8x_done
1824
1825.align	5
1826Lctr32_enc8x_five:
1827	vcipherlast	$out0,$out0,$in3
1828	vcipherlast	$out1,$out1,$in4
1829	vcipherlast	$out2,$out2,$in5
1830	vcipherlast	$out3,$out3,$in6
1831	vcipherlast	$out4,$out4,$in7
1832
1833	le?vperm	$out0,$out0,$out0,$inpperm
1834	le?vperm	$out1,$out1,$out1,$inpperm
1835	stvx_u		$out0,$x00,$out
1836	le?vperm	$out2,$out2,$out2,$inpperm
1837	stvx_u		$out1,$x10,$out
1838	le?vperm	$out3,$out3,$out3,$inpperm
1839	stvx_u		$out2,$x20,$out
1840	le?vperm	$out4,$out4,$out4,$inpperm
1841	stvx_u		$out3,$x30,$out
1842	stvx_u		$out4,$x40,$out
1843	addi		$out,$out,0x50
1844	b		Lctr32_enc8x_done
1845
1846.align	5
1847Lctr32_enc8x_four:
1848	vcipherlast	$out0,$out0,$in4
1849	vcipherlast	$out1,$out1,$in5
1850	vcipherlast	$out2,$out2,$in6
1851	vcipherlast	$out3,$out3,$in7
1852
1853	le?vperm	$out0,$out0,$out0,$inpperm
1854	le?vperm	$out1,$out1,$out1,$inpperm
1855	stvx_u		$out0,$x00,$out
1856	le?vperm	$out2,$out2,$out2,$inpperm
1857	stvx_u		$out1,$x10,$out
1858	le?vperm	$out3,$out3,$out3,$inpperm
1859	stvx_u		$out2,$x20,$out
1860	stvx_u		$out3,$x30,$out
1861	addi		$out,$out,0x40
1862	b		Lctr32_enc8x_done
1863
1864.align	5
1865Lctr32_enc8x_three:
1866	vcipherlast	$out0,$out0,$in5
1867	vcipherlast	$out1,$out1,$in6
1868	vcipherlast	$out2,$out2,$in7
1869
1870	le?vperm	$out0,$out0,$out0,$inpperm
1871	le?vperm	$out1,$out1,$out1,$inpperm
1872	stvx_u		$out0,$x00,$out
1873	le?vperm	$out2,$out2,$out2,$inpperm
1874	stvx_u		$out1,$x10,$out
1875	stvx_u		$out2,$x20,$out
1876	addi		$out,$out,0x30
1877	b		Lctr32_enc8x_done
1878
1879.align	5
1880Lctr32_enc8x_two:
1881	vcipherlast	$out0,$out0,$in6
1882	vcipherlast	$out1,$out1,$in7
1883
1884	le?vperm	$out0,$out0,$out0,$inpperm
1885	le?vperm	$out1,$out1,$out1,$inpperm
1886	stvx_u		$out0,$x00,$out
1887	stvx_u		$out1,$x10,$out
1888	addi		$out,$out,0x20
1889	b		Lctr32_enc8x_done
1890
1891.align	5
1892Lctr32_enc8x_one:
1893	vcipherlast	$out0,$out0,$in7
1894
1895	le?vperm	$out0,$out0,$out0,$inpperm
1896	stvx_u		$out0,0,$out
1897	addi		$out,$out,0x10
1898
1899Lctr32_enc8x_done:
1900	li		r10,`$FRAME+15`
1901	li		r11,`$FRAME+31`
1902	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1903	addi		r10,r10,32
1904	stvx		$inpperm,r11,$sp
1905	addi		r11,r11,32
1906	stvx		$inpperm,r10,$sp
1907	addi		r10,r10,32
1908	stvx		$inpperm,r11,$sp
1909	addi		r11,r11,32
1910	stvx		$inpperm,r10,$sp
1911	addi		r10,r10,32
1912	stvx		$inpperm,r11,$sp
1913	addi		r11,r11,32
1914	stvx		$inpperm,r10,$sp
1915	addi		r10,r10,32
1916	stvx		$inpperm,r11,$sp
1917	addi		r11,r11,32
1918
1919	mtspr		256,$vrsave
1920	lvx		v20,r10,$sp		# ABI says so
1921	addi		r10,r10,32
1922	lvx		v21,r11,$sp
1923	addi		r11,r11,32
1924	lvx		v22,r10,$sp
1925	addi		r10,r10,32
1926	lvx		v23,r11,$sp
1927	addi		r11,r11,32
1928	lvx		v24,r10,$sp
1929	addi		r10,r10,32
1930	lvx		v25,r11,$sp
1931	addi		r11,r11,32
1932	lvx		v26,r10,$sp
1933	addi		r10,r10,32
1934	lvx		v27,r11,$sp
1935	addi		r11,r11,32
1936	lvx		v28,r10,$sp
1937	addi		r10,r10,32
1938	lvx		v29,r11,$sp
1939	addi		r11,r11,32
1940	lvx		v30,r10,$sp
1941	lvx		v31,r11,$sp
1942	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1943	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1944	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1945	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1946	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1947	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1948	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1949	blr
1950	.long		0
1951	.byte		0,12,0x14,0,0x80,6,6,0
1952	.long		0
1953.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1954___
1955}}	}}}
1956
1957#########################################################################
1958{{{	# XTS procedures						#
1959# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1960#                             const AES_KEY *key1, const AES_KEY *key2,	#
1961#                             [const] unsigned char iv[16]);		#
# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
# the input tweak value is assumed to be encrypted already, and the	#
# last tweak value, suitable for a consecutive call on the same	#
# chunk of data, is written back to the original buffer. In addition,	#
# in "tweak chaining" mode only complete input blocks are processed.	#
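#
# A hedged caller-side sketch of "tweak chaining" (C-like pseudocode;
# the single-block helper aes_p8_encrypt and the two-chunk split are
# assumptions of this note, not mandated by the code):
#
#	aes_p8_encrypt(iv, iv, key2);	/* iv = E_key2(sector tweak) */
#	aes_p8_xts_encrypt(in,     out,     n, key1, NULL, iv);
#	aes_p8_xts_encrypt(in + n, out + n, m, key1, NULL, iv);
#
# n and m must be multiples of 16; iv carries the running tweak from
# call to call.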
1967
1968my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1969my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1970my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1971my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
1972my $taillen = $key2;
1973
1974   ($inp,$idx) = ($idx,$inp);				# reassign
1975
1976$code.=<<___;
1977.globl	.${prefix}_xts_encrypt
1978	mr		$inp,r3				# reassign
1979	li		r3,-1
1980	${UCMP}i	$len,16
1981	bltlr-
1982
1983	lis		r0,0xfff0
1984	mfspr		r12,256				# save vrsave
1985	li		r11,0
1986	mtspr		256,r0
1987
1988	vspltisb	$seven,0x07			# 0x070707..07
1989	le?lvsl		$leperm,r11,r11
1990	le?vspltisb	$tmp,0x0f
1991	le?vxor		$leperm,$leperm,$seven
1992
1993	li		$idx,15
1994	lvx		$tweak,0,$ivp			# load [unaligned] iv
1995	lvsl		$inpperm,0,$ivp
1996	lvx		$inptail,$idx,$ivp
1997	le?vxor		$inpperm,$inpperm,$tmp
1998	vperm		$tweak,$tweak,$inptail,$inpperm
1999
2000	neg		r11,$inp
2001	lvsr		$inpperm,0,r11			# prepare for unaligned load
2002	lvx		$inout,0,$inp
	addi		$inp,$inp,15			# 15 is not a typo
2004	le?vxor		$inpperm,$inpperm,$tmp
2005
2006	${UCMP}i	$key2,0				# key2==NULL?
2007	beq		Lxts_enc_no_key2
2008
2009	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2010	lwz		$rounds,240($key2)
2011	srwi		$rounds,$rounds,1
2012	subi		$rounds,$rounds,1
2013	li		$idx,16
2014
2015	lvx		$rndkey0,0,$key2
2016	lvx		$rndkey1,$idx,$key2
2017	addi		$idx,$idx,16
2018	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2019	vxor		$tweak,$tweak,$rndkey0
2020	lvx		$rndkey0,$idx,$key2
2021	addi		$idx,$idx,16
2022	mtctr		$rounds
2023
2024Ltweak_xts_enc:
2025	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2026	vcipher		$tweak,$tweak,$rndkey1
2027	lvx		$rndkey1,$idx,$key2
2028	addi		$idx,$idx,16
2029	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2030	vcipher		$tweak,$tweak,$rndkey0
2031	lvx		$rndkey0,$idx,$key2
2032	addi		$idx,$idx,16
2033	bdnz		Ltweak_xts_enc
2034
2035	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2036	vcipher		$tweak,$tweak,$rndkey1
2037	lvx		$rndkey1,$idx,$key2
2038	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2039	vcipherlast	$tweak,$tweak,$rndkey0
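	# ($tweak now holds E_key2(iv), the initial XTS tweak; the loop
	# above ran two AES rounds per iteration with interleaved key
	# loads)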
2040
2041	li		$ivp,0				# don't chain the tweak
2042	b		Lxts_enc
2043
2044Lxts_enc_no_key2:
2045	li		$idx,-16
2046	and		$len,$len,$idx			# in "tweak chaining"
2047							# mode only complete
2048							# blocks are processed
2049Lxts_enc:
2050	lvx		$inptail,0,$inp
2051	addi		$inp,$inp,16
2052
2053	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2054	lwz		$rounds,240($key1)
2055	srwi		$rounds,$rounds,1
2056	subi		$rounds,$rounds,1
2057	li		$idx,16
2058
2059	vslb		$eighty7,$seven,$seven		# 0x808080..80
2060	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2061	vspltisb	$tmp,1				# 0x010101..01
2062	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2063
2064	${UCMP}i	$len,96
2065	bge		_aesp8_xts_encrypt6x
2066
2067	andi.		$taillen,$len,15
2068	subic		r0,$len,32
2069	subi		$taillen,$taillen,16
2070	subfe		r0,r0,r0
2071	and		r0,r0,$taillen
2072	add		$inp,$inp,r0
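	# The andi./subic/subfe/and sequence above is, branch-free (a
	# sketch):
	#	taillen = (len & 15) - 16;	/* negative */
	#	if (len < 32) inp += taillen;
	# i.e. $inp is backed up so the tail bytes are covered by the
	# next 16-byte block load (consumed by the stealing code below).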
2073
2074	lvx		$rndkey0,0,$key1
2075	lvx		$rndkey1,$idx,$key1
2076	addi		$idx,$idx,16
2077	vperm		$inout,$inout,$inptail,$inpperm
2078	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2079	vxor		$inout,$inout,$tweak
2080	vxor		$inout,$inout,$rndkey0
2081	lvx		$rndkey0,$idx,$key1
2082	addi		$idx,$idx,16
2083	mtctr		$rounds
2084	b		Loop_xts_enc
2085
2086.align	5
2087Loop_xts_enc:
2088	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2089	vcipher		$inout,$inout,$rndkey1
2090	lvx		$rndkey1,$idx,$key1
2091	addi		$idx,$idx,16
2092	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2093	vcipher		$inout,$inout,$rndkey0
2094	lvx		$rndkey0,$idx,$key1
2095	addi		$idx,$idx,16
2096	bdnz		Loop_xts_enc
2097
2098	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2099	vcipher		$inout,$inout,$rndkey1
2100	lvx		$rndkey1,$idx,$key1
2101	li		$idx,16
2102	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2103	vxor		$rndkey0,$rndkey0,$tweak
2104	vcipherlast	$output,$inout,$rndkey0
2105
2106	le?vperm	$tmp,$output,$output,$leperm
2107	be?nop
2108	le?stvx_u	$tmp,0,$out
2109	be?stvx_u	$output,0,$out
2110	addi		$out,$out,16
2111
2112	subic.		$len,$len,16
2113	beq		Lxts_enc_done
2114
2115	vmr		$inout,$inptail
2116	lvx		$inptail,0,$inp
2117	addi		$inp,$inp,16
2118	lvx		$rndkey0,0,$key1
2119	lvx		$rndkey1,$idx,$key1
2120	addi		$idx,$idx,16
2121
2122	subic		r0,$len,32
2123	subfe		r0,r0,r0
2124	and		r0,r0,$taillen
2125	add		$inp,$inp,r0
2126
2127	vsrab		$tmp,$tweak,$seven		# next tweak value
2128	vaddubm		$tweak,$tweak,$tweak
2129	vsldoi		$tmp,$tmp,$tmp,15
2130	vand		$tmp,$tmp,$eighty7
2131	vxor		$tweak,$tweak,$tmp
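	# The five instructions above multiply $tweak by x in GF(2^128)
	# mod x^128+x^7+x^2+x+1, all bytes in parallel: vsrab smears
	# each byte's top bit into a mask, vsldoi rotates each mask to
	# the byte receiving the carry, vand reduces it to 0x01 (inter-
	# byte carry) or 0x87 (polynomial fold-back), vaddubm doubles
	# every byte, and the final vxor applies the carries.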
2132
2133	vperm		$inout,$inout,$inptail,$inpperm
2134	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2135	vxor		$inout,$inout,$tweak
2136	vxor		$output,$output,$rndkey0	# just in case $len<16
2137	vxor		$inout,$inout,$rndkey0
2138	lvx		$rndkey0,$idx,$key1
2139	addi		$idx,$idx,16
2140
2141	mtctr		$rounds
2142	${UCMP}i	$len,16
2143	bge		Loop_xts_enc
2144
2145	vxor		$output,$output,$tweak
2146	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2147	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2148	vspltisb	$tmp,-1
2149	vperm		$inptail,$inptail,$tmp,$inpperm
2150	vsel		$inout,$inout,$output,$inptail
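	# lvsr/vspltisb/vperm above build a byte mask, 0x00 for the
	# first $len lanes and 0xff for the rest; vsel then keeps $len
	# bytes of the new (pre-whitened) input and fills the tail from
	# the previous ciphertext block, forming the block that gets
	# re-encrypted for ciphertext stealing.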
2151
2152	subi		r11,$out,17
2153	subi		$out,$out,16
2154	mtctr		$len
2155	li		$len,16
2156Loop_xts_enc_steal:
2157	lbzu		r0,1(r11)
2158	stb		r0,16(r11)
2159	bdnz		Loop_xts_enc_steal
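	# The byte loop above performs the "stealing" copy: the first
	# $len bytes of the last full ciphertext block move out to the
	# tail position, and the merged block is encrypted once more
	# into the last full block's slot.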
2160
2161	mtctr		$rounds
2162	b		Loop_xts_enc			# one more time...
2163
2164Lxts_enc_done:
2165	${UCMP}i	$ivp,0
2166	beq		Lxts_enc_ret
2167
2168	vsrab		$tmp,$tweak,$seven		# next tweak value
2169	vaddubm		$tweak,$tweak,$tweak
2170	vsldoi		$tmp,$tmp,$tmp,15
2171	vand		$tmp,$tmp,$eighty7
2172	vxor		$tweak,$tweak,$tmp
2173
2174	le?vperm	$tweak,$tweak,$tweak,$leperm
2175	stvx_u		$tweak,0,$ivp
2176
2177Lxts_enc_ret:
2178	mtspr		256,r12				# restore vrsave
2179	li		r3,0
2180	blr
2181	.long		0
2182	.byte		0,12,0x04,0,0x80,6,6,0
2183	.long		0
2184.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2185
2186.globl	.${prefix}_xts_decrypt
2187	mr		$inp,r3				# reassign
2188	li		r3,-1
2189	${UCMP}i	$len,16
2190	bltlr-
2191
2192	lis		r0,0xfff8
2193	mfspr		r12,256				# save vrsave
2194	li		r11,0
2195	mtspr		256,r0
2196
2197	andi.		r0,$len,15
2198	neg		r0,r0
2199	andi.		r0,r0,16
2200	sub		$len,$len,r0
2201
2202	vspltisb	$seven,0x07			# 0x070707..07
2203	le?lvsl		$leperm,r11,r11
2204	le?vspltisb	$tmp,0x0f
2205	le?vxor		$leperm,$leperm,$seven
2206
2207	li		$idx,15
2208	lvx		$tweak,0,$ivp			# load [unaligned] iv
2209	lvsl		$inpperm,0,$ivp
2210	lvx		$inptail,$idx,$ivp
2211	le?vxor		$inpperm,$inpperm,$tmp
2212	vperm		$tweak,$tweak,$inptail,$inpperm
2213
2214	neg		r11,$inp
2215	lvsr		$inpperm,0,r11			# prepare for unaligned load
2216	lvx		$inout,0,$inp
	addi		$inp,$inp,15			# 15 is not a typo
2218	le?vxor		$inpperm,$inpperm,$tmp
2219
2220	${UCMP}i	$key2,0				# key2==NULL?
2221	beq		Lxts_dec_no_key2
2222
2223	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2224	lwz		$rounds,240($key2)
2225	srwi		$rounds,$rounds,1
2226	subi		$rounds,$rounds,1
2227	li		$idx,16
2228
2229	lvx		$rndkey0,0,$key2
2230	lvx		$rndkey1,$idx,$key2
2231	addi		$idx,$idx,16
2232	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2233	vxor		$tweak,$tweak,$rndkey0
2234	lvx		$rndkey0,$idx,$key2
2235	addi		$idx,$idx,16
2236	mtctr		$rounds
2237
2238Ltweak_xts_dec:
2239	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2240	vcipher		$tweak,$tweak,$rndkey1
2241	lvx		$rndkey1,$idx,$key2
2242	addi		$idx,$idx,16
2243	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2244	vcipher		$tweak,$tweak,$rndkey0
2245	lvx		$rndkey0,$idx,$key2
2246	addi		$idx,$idx,16
2247	bdnz		Ltweak_xts_dec
2248
2249	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2250	vcipher		$tweak,$tweak,$rndkey1
2251	lvx		$rndkey1,$idx,$key2
2252	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2253	vcipherlast	$tweak,$tweak,$rndkey0
2254
2255	li		$ivp,0				# don't chain the tweak
2256	b		Lxts_dec
2257
2258Lxts_dec_no_key2:
2259	neg		$idx,$len
2260	andi.		$idx,$idx,15
2261	add		$len,$len,$idx			# in "tweak chaining"
2262							# mode only complete
2263							# blocks are processed
2264Lxts_dec:
2265	lvx		$inptail,0,$inp
2266	addi		$inp,$inp,16
2267
2268	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2269	lwz		$rounds,240($key1)
2270	srwi		$rounds,$rounds,1
2271	subi		$rounds,$rounds,1
2272	li		$idx,16
2273
2274	vslb		$eighty7,$seven,$seven		# 0x808080..80
2275	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2276	vspltisb	$tmp,1				# 0x010101..01
2277	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2278
2279	${UCMP}i	$len,96
2280	bge		_aesp8_xts_decrypt6x
2281
2282	lvx		$rndkey0,0,$key1
2283	lvx		$rndkey1,$idx,$key1
2284	addi		$idx,$idx,16
2285	vperm		$inout,$inout,$inptail,$inpperm
2286	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2287	vxor		$inout,$inout,$tweak
2288	vxor		$inout,$inout,$rndkey0
2289	lvx		$rndkey0,$idx,$key1
2290	addi		$idx,$idx,16
2291	mtctr		$rounds
2292
2293	${UCMP}i	$len,16
2294	blt		Ltail_xts_dec
2295	be?b		Loop_xts_dec
2296
2297.align	5
2298Loop_xts_dec:
2299	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2300	vncipher	$inout,$inout,$rndkey1
2301	lvx		$rndkey1,$idx,$key1
2302	addi		$idx,$idx,16
2303	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2304	vncipher	$inout,$inout,$rndkey0
2305	lvx		$rndkey0,$idx,$key1
2306	addi		$idx,$idx,16
2307	bdnz		Loop_xts_dec
2308
2309	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2310	vncipher	$inout,$inout,$rndkey1
2311	lvx		$rndkey1,$idx,$key1
2312	li		$idx,16
2313	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2314	vxor		$rndkey0,$rndkey0,$tweak
2315	vncipherlast	$output,$inout,$rndkey0
2316
2317	le?vperm	$tmp,$output,$output,$leperm
2318	be?nop
2319	le?stvx_u	$tmp,0,$out
2320	be?stvx_u	$output,0,$out
2321	addi		$out,$out,16
2322
2323	subic.		$len,$len,16
2324	beq		Lxts_dec_done
2325
2326	vmr		$inout,$inptail
2327	lvx		$inptail,0,$inp
2328	addi		$inp,$inp,16
2329	lvx		$rndkey0,0,$key1
2330	lvx		$rndkey1,$idx,$key1
2331	addi		$idx,$idx,16
2332
2333	vsrab		$tmp,$tweak,$seven		# next tweak value
2334	vaddubm		$tweak,$tweak,$tweak
2335	vsldoi		$tmp,$tmp,$tmp,15
2336	vand		$tmp,$tmp,$eighty7
2337	vxor		$tweak,$tweak,$tmp
2338
2339	vperm		$inout,$inout,$inptail,$inpperm
2340	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2341	vxor		$inout,$inout,$tweak
2342	vxor		$inout,$inout,$rndkey0
2343	lvx		$rndkey0,$idx,$key1
2344	addi		$idx,$idx,16
2345
2346	mtctr		$rounds
2347	${UCMP}i	$len,16
2348	bge		Loop_xts_dec
2349
2350Ltail_xts_dec:
2351	vsrab		$tmp,$tweak,$seven		# next tweak value
2352	vaddubm		$tweak1,$tweak,$tweak
2353	vsldoi		$tmp,$tmp,$tmp,15
2354	vand		$tmp,$tmp,$eighty7
2355	vxor		$tweak1,$tweak1,$tmp
2356
2357	subi		$inp,$inp,16
2358	add		$inp,$inp,$len
2359
	vxor		$inout,$inout,$tweak		# undo current tweak
	vxor		$inout,$inout,$tweak1		# apply next tweak instead
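	# (XTS decrypt with stealing: the last full block is decrypted
	# under the *next* tweak, $tweak1, while the final partial block
	# will reuse the current $tweak, hence the swap above)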
2362
2363Loop_xts_dec_short:
2364	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2365	vncipher	$inout,$inout,$rndkey1
2366	lvx		$rndkey1,$idx,$key1
2367	addi		$idx,$idx,16
2368	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2369	vncipher	$inout,$inout,$rndkey0
2370	lvx		$rndkey0,$idx,$key1
2371	addi		$idx,$idx,16
2372	bdnz		Loop_xts_dec_short
2373
2374	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2375	vncipher	$inout,$inout,$rndkey1
2376	lvx		$rndkey1,$idx,$key1
2377	li		$idx,16
2378	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2379	vxor		$rndkey0,$rndkey0,$tweak1
2380	vncipherlast	$output,$inout,$rndkey0
2381
2382	le?vperm	$tmp,$output,$output,$leperm
2383	be?nop
2384	le?stvx_u	$tmp,0,$out
2385	be?stvx_u	$output,0,$out
2386
2387	vmr		$inout,$inptail
2388	lvx		$inptail,0,$inp
2389	#addi		$inp,$inp,16
2390	lvx		$rndkey0,0,$key1
2391	lvx		$rndkey1,$idx,$key1
2392	addi		$idx,$idx,16
2393	vperm		$inout,$inout,$inptail,$inpperm
2394	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2395
2396	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2397	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2398	vspltisb	$tmp,-1
2399	vperm		$inptail,$inptail,$tmp,$inpperm
2400	vsel		$inout,$inout,$output,$inptail
2401
2402	vxor		$rndkey0,$rndkey0,$tweak
2403	vxor		$inout,$inout,$rndkey0
2404	lvx		$rndkey0,$idx,$key1
2405	addi		$idx,$idx,16
2406
2407	subi		r11,$out,1
2408	mtctr		$len
2409	li		$len,16
2410Loop_xts_dec_steal:
2411	lbzu		r0,1(r11)
2412	stb		r0,16(r11)
2413	bdnz		Loop_xts_dec_steal
2414
2415	mtctr		$rounds
2416	b		Loop_xts_dec			# one more time...
2417
2418Lxts_dec_done:
2419	${UCMP}i	$ivp,0
2420	beq		Lxts_dec_ret
2421
2422	vsrab		$tmp,$tweak,$seven		# next tweak value
2423	vaddubm		$tweak,$tweak,$tweak
2424	vsldoi		$tmp,$tmp,$tmp,15
2425	vand		$tmp,$tmp,$eighty7
2426	vxor		$tweak,$tweak,$tmp
2427
2428	le?vperm	$tweak,$tweak,$tweak,$leperm
2429	stvx_u		$tweak,0,$ivp
2430
2431Lxts_dec_ret:
2432	mtspr		256,r12				# restore vrsave
2433	li		r3,0
2434	blr
2435	.long		0
2436	.byte		0,12,0x04,0,0x80,6,6,0
2437	.long		0
2438.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2439___
2440#########################################################################
2441{{	# Optimized XTS procedures					#
2442my $key_=$key2;
2443my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2444    $x00=0 if ($flavour =~ /osx/);
2445my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2446my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2447my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for the first round keys
			# v26-v31 hold the last 6 round keys
my ($keyperm)=($out0);	# same register as the "caller's" $keyperm, hence redundant
2451my $taillen=$x70;
2452
2453$code.=<<___;
2454.align	5
2455_aesp8_xts_encrypt6x:
2456	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2457	mflr		r11
2458	li		r7,`$FRAME+8*16+15`
2459	li		r3,`$FRAME+8*16+31`
2460	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2461	stvx		v20,r7,$sp		# ABI says so
2462	addi		r7,r7,32
2463	stvx		v21,r3,$sp
2464	addi		r3,r3,32
2465	stvx		v22,r7,$sp
2466	addi		r7,r7,32
2467	stvx		v23,r3,$sp
2468	addi		r3,r3,32
2469	stvx		v24,r7,$sp
2470	addi		r7,r7,32
2471	stvx		v25,r3,$sp
2472	addi		r3,r3,32
2473	stvx		v26,r7,$sp
2474	addi		r7,r7,32
2475	stvx		v27,r3,$sp
2476	addi		r3,r3,32
2477	stvx		v28,r7,$sp
2478	addi		r7,r7,32
2479	stvx		v29,r3,$sp
2480	addi		r3,r3,32
2481	stvx		v30,r7,$sp
2482	stvx		v31,r3,$sp
2483	li		r0,-1
2484	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2485	li		$x10,0x10
2486	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2487	li		$x20,0x20
2488	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2489	li		$x30,0x30
2490	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2491	li		$x40,0x40
2492	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2493	li		$x50,0x50
2494	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2495	li		$x60,0x60
2496	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2497	li		$x70,0x70
2498	mtspr		256,r0
2499
2500	xxlor		2, 32+$eighty7, 32+$eighty7
2501	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
2502	xxlor		1, 32+$eighty7, 32+$eighty7
2503
	# Load the vpermxor constant from Lconsts (r6 saved/restored around the call).
2505	mr		$x70, r6
2506	bl		Lconsts
2507	lxvw4x		0, $x40, r6		# load XOR contents
2508	mr		r6, $x70
2509	li		$x70,0x70
2510
2511	subi		$rounds,$rounds,3	# -4 in total
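	# $rounds arrives as rounds/2-1; subtracting 3 more leaves the
	# iteration count for Load_xts_enc_key below, which off-loads
	# two round keys per pass (the last eight keys are handled
	# explicitly: two more off-loads plus v26-v31).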
2512
2513	lvx		$rndkey0,$x00,$key1	# load key schedule
2514	lvx		v30,$x10,$key1
2515	addi		$key1,$key1,0x20
2516	lvx		v31,$x00,$key1
2517	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2518	addi		$key_,$sp,$FRAME+15
2519	mtctr		$rounds
2520
2521Load_xts_enc_key:
2522	?vperm		v24,v30,v31,$keyperm
2523	lvx		v30,$x10,$key1
2524	addi		$key1,$key1,0x20
2525	stvx		v24,$x00,$key_		# off-load round[1]
2526	?vperm		v25,v31,v30,$keyperm
2527	lvx		v31,$x00,$key1
2528	stvx		v25,$x10,$key_		# off-load round[2]
2529	addi		$key_,$key_,0x20
2530	bdnz		Load_xts_enc_key
2531
2532	lvx		v26,$x10,$key1
2533	?vperm		v24,v30,v31,$keyperm
2534	lvx		v27,$x20,$key1
2535	stvx		v24,$x00,$key_		# off-load round[3]
2536	?vperm		v25,v31,v26,$keyperm
2537	lvx		v28,$x30,$key1
2538	stvx		v25,$x10,$key_		# off-load round[4]
2539	addi		$key_,$sp,$FRAME+15	# rewind $key_
2540	?vperm		v26,v26,v27,$keyperm
2541	lvx		v29,$x40,$key1
2542	?vperm		v27,v27,v28,$keyperm
2543	lvx		v30,$x50,$key1
2544	?vperm		v28,v28,v29,$keyperm
2545	lvx		v31,$x60,$key1
2546	?vperm		v29,v29,v30,$keyperm
2547	lvx		$twk5,$x70,$key1	# borrow $twk5
2548	?vperm		v30,v30,v31,$keyperm
2549	lvx		v24,$x00,$key_		# pre-load round[1]
2550	?vperm		v31,v31,$twk5,$keyperm
2551	lvx		v25,$x10,$key_		# pre-load round[2]
2552
	# From here on, the following sequence (with eighty7 = 0x010101..87)
	# is used to generate the next tweak value; vpermxor folds the
	# scalar path's rotate-and-xor into a single instruction:
	# vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
	# vand          tmp, tmp, eighty7       # last byte with carry
	# vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
	# xxlor         vsx, 0, 0               # permute constant from Lconsts
	# vpermxor      tweak, tweak, tmp, vsx
2560
2561	 vperm		$in0,$inout,$inptail,$inpperm
2562	 subi		$inp,$inp,31		# undo "caller"
2563	vxor		$twk0,$tweak,$rndkey0
2564	vsrab		$tmp,$tweak,$seven	# next tweak value
2565	vaddubm		$tweak,$tweak,$tweak
2566	vand		$tmp,$tmp,$eighty7
2567	 vxor		$out0,$in0,$twk0
2568	xxlor		32+$in1, 0, 0
2569	vpermxor	$tweak, $tweak, $tmp, $in1
2570
2571	 lvx_u		$in1,$x10,$inp
2572	vxor		$twk1,$tweak,$rndkey0
2573	vsrab		$tmp,$tweak,$seven	# next tweak value
2574	vaddubm		$tweak,$tweak,$tweak
2575	 le?vperm	$in1,$in1,$in1,$leperm
2576	vand		$tmp,$tmp,$eighty7
2577	 vxor		$out1,$in1,$twk1
2578	xxlor		32+$in2, 0, 0
2579	vpermxor	$tweak, $tweak, $tmp, $in2
2580
2581	 lvx_u		$in2,$x20,$inp
2582	 andi.		$taillen,$len,15
2583	vxor		$twk2,$tweak,$rndkey0
2584	vsrab		$tmp,$tweak,$seven	# next tweak value
2585	vaddubm		$tweak,$tweak,$tweak
2586	 le?vperm	$in2,$in2,$in2,$leperm
2587	vand		$tmp,$tmp,$eighty7
2588	 vxor		$out2,$in2,$twk2
2589	xxlor		32+$in3, 0, 0
2590	vpermxor	$tweak, $tweak, $tmp, $in3
2591
2592	 lvx_u		$in3,$x30,$inp
2593	 sub		$len,$len,$taillen
2594	vxor		$twk3,$tweak,$rndkey0
2595	vsrab		$tmp,$tweak,$seven	# next tweak value
2596	vaddubm		$tweak,$tweak,$tweak
2597	 le?vperm	$in3,$in3,$in3,$leperm
2598	vand		$tmp,$tmp,$eighty7
2599	 vxor		$out3,$in3,$twk3
2600	xxlor		32+$in4, 0, 0
2601	vpermxor	$tweak, $tweak, $tmp, $in4
2602
2603	 lvx_u		$in4,$x40,$inp
2604	 subi		$len,$len,0x60
2605	vxor		$twk4,$tweak,$rndkey0
2606	vsrab		$tmp,$tweak,$seven	# next tweak value
2607	vaddubm		$tweak,$tweak,$tweak
2608	 le?vperm	$in4,$in4,$in4,$leperm
2609	vand		$tmp,$tmp,$eighty7
2610	 vxor		$out4,$in4,$twk4
2611	xxlor		32+$in5, 0, 0
2612	vpermxor	$tweak, $tweak, $tmp, $in5
2613
2614	 lvx_u		$in5,$x50,$inp
2615	 addi		$inp,$inp,0x60
2616	vxor		$twk5,$tweak,$rndkey0
2617	vsrab		$tmp,$tweak,$seven	# next tweak value
2618	vaddubm		$tweak,$tweak,$tweak
2619	 le?vperm	$in5,$in5,$in5,$leperm
2620	vand		$tmp,$tmp,$eighty7
2621	 vxor		$out5,$in5,$twk5
2622	xxlor		32+$in0, 0, 0
2623	vpermxor	$tweak, $tweak, $tmp, $in0
2624
2625	vxor		v31,v31,$rndkey0
2626	mtctr		$rounds
2627	b		Loop_xts_enc6x
2628
2629.align	5
2630Loop_xts_enc6x:
2631	vcipher		$out0,$out0,v24
2632	vcipher		$out1,$out1,v24
2633	vcipher		$out2,$out2,v24
2634	vcipher		$out3,$out3,v24
2635	vcipher		$out4,$out4,v24
2636	vcipher		$out5,$out5,v24
2637	lvx		v24,$x20,$key_		# round[3]
2638	addi		$key_,$key_,0x20
2639
2640	vcipher		$out0,$out0,v25
2641	vcipher		$out1,$out1,v25
2642	vcipher		$out2,$out2,v25
2643	vcipher		$out3,$out3,v25
2644	vcipher		$out4,$out4,v25
2645	vcipher		$out5,$out5,v25
2646	lvx		v25,$x10,$key_		# round[4]
2647	bdnz		Loop_xts_enc6x
2648
2649	xxlor		32+$eighty7, 1, 1	# 0x010101..87
2650
2651	subic		$len,$len,96		# $len-=96
2652	 vxor		$in0,$twk0,v31		# xor with last round key
2653	vcipher		$out0,$out0,v24
2654	vcipher		$out1,$out1,v24
2655	 vsrab		$tmp,$tweak,$seven	# next tweak value
2656	 vxor		$twk0,$tweak,$rndkey0
2657	 vaddubm	$tweak,$tweak,$tweak
2658	vcipher		$out2,$out2,v24
2659	vcipher		$out3,$out3,v24
2660	vcipher		$out4,$out4,v24
2661	vcipher		$out5,$out5,v24
2662
2663	subfe.		r0,r0,r0		# borrow?-1:0
2664	 vand		$tmp,$tmp,$eighty7
2665	vcipher		$out0,$out0,v25
2666	vcipher		$out1,$out1,v25
2667	 xxlor		32+$in1, 0, 0
2668	 vpermxor	$tweak, $tweak, $tmp, $in1
2669	vcipher		$out2,$out2,v25
2670	vcipher		$out3,$out3,v25
2671	 vxor		$in1,$twk1,v31
2672	 vsrab		$tmp,$tweak,$seven	# next tweak value
2673	 vxor		$twk1,$tweak,$rndkey0
2674	vcipher		$out4,$out4,v25
2675	vcipher		$out5,$out5,v25
2676
2677	and		r0,r0,$len
2678	 vaddubm	$tweak,$tweak,$tweak
2679	vcipher		$out0,$out0,v26
2680	vcipher		$out1,$out1,v26
2681	 vand		$tmp,$tmp,$eighty7
2682	vcipher		$out2,$out2,v26
2683	vcipher		$out3,$out3,v26
2684	 xxlor		32+$in2, 0, 0
2685	 vpermxor	$tweak, $tweak, $tmp, $in2
2686	vcipher		$out4,$out4,v26
2687	vcipher		$out5,$out5,v26
2688
	add		$inp,$inp,r0		# $inp is adjusted so that
						# at exit from the loop
						# inX-in5 are loaded with
						# the last "words"
2693	 vxor		$in2,$twk2,v31
2694	 vsrab		$tmp,$tweak,$seven	# next tweak value
2695	 vxor		$twk2,$tweak,$rndkey0
2696	 vaddubm	$tweak,$tweak,$tweak
2697	vcipher		$out0,$out0,v27
2698	vcipher		$out1,$out1,v27
2699	vcipher		$out2,$out2,v27
2700	vcipher		$out3,$out3,v27
2701	 vand		$tmp,$tmp,$eighty7
2702	vcipher		$out4,$out4,v27
2703	vcipher		$out5,$out5,v27
2704
2705	addi		$key_,$sp,$FRAME+15	# rewind $key_
2706	 xxlor		32+$in3, 0, 0
2707	 vpermxor	$tweak, $tweak, $tmp, $in3
2708	vcipher		$out0,$out0,v28
2709	vcipher		$out1,$out1,v28
2710	 vxor		$in3,$twk3,v31
2711	 vsrab		$tmp,$tweak,$seven	# next tweak value
2712	 vxor		$twk3,$tweak,$rndkey0
2713	vcipher		$out2,$out2,v28
2714	vcipher		$out3,$out3,v28
2715	 vaddubm	$tweak,$tweak,$tweak
2716	vcipher		$out4,$out4,v28
2717	vcipher		$out5,$out5,v28
2718	lvx		v24,$x00,$key_		# re-pre-load round[1]
2719	 vand		$tmp,$tmp,$eighty7
2720
2721	vcipher		$out0,$out0,v29
2722	vcipher		$out1,$out1,v29
2723	 xxlor		32+$in4, 0, 0
2724	 vpermxor	$tweak, $tweak, $tmp, $in4
2725	vcipher		$out2,$out2,v29
2726	vcipher		$out3,$out3,v29
2727	 vxor		$in4,$twk4,v31
2728	 vsrab		$tmp,$tweak,$seven	# next tweak value
2729	 vxor		$twk4,$tweak,$rndkey0
2730	vcipher		$out4,$out4,v29
2731	vcipher		$out5,$out5,v29
2732	lvx		v25,$x10,$key_		# re-pre-load round[2]
2733	 vaddubm	$tweak,$tweak,$tweak
2734
2735	vcipher		$out0,$out0,v30
2736	vcipher		$out1,$out1,v30
2737	 vand		$tmp,$tmp,$eighty7
2738	vcipher		$out2,$out2,v30
2739	vcipher		$out3,$out3,v30
2740	 xxlor		32+$in5, 0, 0
2741	 vpermxor	$tweak, $tweak, $tmp, $in5
2742	vcipher		$out4,$out4,v30
2743	vcipher		$out5,$out5,v30
2744	 vxor		$in5,$twk5,v31
2745	 vsrab		$tmp,$tweak,$seven	# next tweak value
2746	 vxor		$twk5,$tweak,$rndkey0
2747
2748	vcipherlast	$out0,$out0,$in0
2749	 lvx_u		$in0,$x00,$inp		# load next input block
2750	 vaddubm	$tweak,$tweak,$tweak
2751	vcipherlast	$out1,$out1,$in1
2752	 lvx_u		$in1,$x10,$inp
2753	vcipherlast	$out2,$out2,$in2
2754	 le?vperm	$in0,$in0,$in0,$leperm
2755	 lvx_u		$in2,$x20,$inp
2756	 vand		$tmp,$tmp,$eighty7
2757	vcipherlast	$out3,$out3,$in3
2758	 le?vperm	$in1,$in1,$in1,$leperm
2759	 lvx_u		$in3,$x30,$inp
2760	vcipherlast	$out4,$out4,$in4
2761	 le?vperm	$in2,$in2,$in2,$leperm
2762	 lvx_u		$in4,$x40,$inp
2763	 xxlor		10, 32+$in0, 32+$in0
2764	 xxlor		32+$in0, 0, 0
2765	 vpermxor	$tweak, $tweak, $tmp, $in0
2766	 xxlor		32+$in0, 10, 10
2767	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2768						# in stealing mode
2769	 le?vperm	$in3,$in3,$in3,$leperm
2770	 lvx_u		$in5,$x50,$inp
2771	 addi		$inp,$inp,0x60
2772	 le?vperm	$in4,$in4,$in4,$leperm
2773	 le?vperm	$in5,$in5,$in5,$leperm
2774
2775	le?vperm	$out0,$out0,$out0,$leperm
2776	le?vperm	$out1,$out1,$out1,$leperm
2777	stvx_u		$out0,$x00,$out		# store output
2778	 vxor		$out0,$in0,$twk0
2779	le?vperm	$out2,$out2,$out2,$leperm
2780	stvx_u		$out1,$x10,$out
2781	 vxor		$out1,$in1,$twk1
2782	le?vperm	$out3,$out3,$out3,$leperm
2783	stvx_u		$out2,$x20,$out
2784	 vxor		$out2,$in2,$twk2
2785	le?vperm	$out4,$out4,$out4,$leperm
2786	stvx_u		$out3,$x30,$out
2787	 vxor		$out3,$in3,$twk3
2788	le?vperm	$out5,$tmp,$tmp,$leperm
2789	stvx_u		$out4,$x40,$out
2790	 vxor		$out4,$in4,$twk4
2791	le?stvx_u	$out5,$x50,$out
2792	be?stvx_u	$tmp, $x50,$out
2793	 vxor		$out5,$in5,$twk5
2794	addi		$out,$out,0x60
2795
2796	mtctr		$rounds
2797	beq		Loop_xts_enc6x		# did $len-=96 borrow?
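	# (subfe. set cr0 from the borrow of $len-=96: EQ means at
	# least six more blocks remain; on borrow $inp has already been
	# pulled back so in0-in5 were loaded with the input's final
	# blocks)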
2798
2799	xxlor		32+$eighty7, 2, 2	# 0x010101..87
2800
2801	addic.		$len,$len,0x60
2802	beq		Lxts_enc6x_zero
2803	cmpwi		$len,0x20
2804	blt		Lxts_enc6x_one
2805	nop
2806	beq		Lxts_enc6x_two
2807	cmpwi		$len,0x40
2808	blt		Lxts_enc6x_three
2809	nop
2810	beq		Lxts_enc6x_four
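	# $len+0x60 is now one of 16,32,...,80 bytes; the ladder above
	# dispatches to the matching 1..5-block epilogue (zero bytes
	# went to Lxts_enc6x_zero).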
2811
2812Lxts_enc6x_five:
2813	vxor		$out0,$in1,$twk0
2814	vxor		$out1,$in2,$twk1
2815	vxor		$out2,$in3,$twk2
2816	vxor		$out3,$in4,$twk3
2817	vxor		$out4,$in5,$twk4
2818
2819	bl		_aesp8_xts_enc5x
2820
2821	le?vperm	$out0,$out0,$out0,$leperm
2822	vmr		$twk0,$twk5		# unused tweak
2823	le?vperm	$out1,$out1,$out1,$leperm
2824	stvx_u		$out0,$x00,$out		# store output
2825	le?vperm	$out2,$out2,$out2,$leperm
2826	stvx_u		$out1,$x10,$out
2827	le?vperm	$out3,$out3,$out3,$leperm
2828	stvx_u		$out2,$x20,$out
2829	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2830	le?vperm	$out4,$out4,$out4,$leperm
2831	stvx_u		$out3,$x30,$out
2832	stvx_u		$out4,$x40,$out
2833	addi		$out,$out,0x50
2834	bne		Lxts_enc6x_steal
2835	b		Lxts_enc6x_done
2836
2837.align	4
2838Lxts_enc6x_four:
2839	vxor		$out0,$in2,$twk0
2840	vxor		$out1,$in3,$twk1
2841	vxor		$out2,$in4,$twk2
2842	vxor		$out3,$in5,$twk3
2843	vxor		$out4,$out4,$out4
2844
2845	bl		_aesp8_xts_enc5x
2846
2847	le?vperm	$out0,$out0,$out0,$leperm
2848	vmr		$twk0,$twk4		# unused tweak
2849	le?vperm	$out1,$out1,$out1,$leperm
2850	stvx_u		$out0,$x00,$out		# store output
2851	le?vperm	$out2,$out2,$out2,$leperm
2852	stvx_u		$out1,$x10,$out
2853	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2854	le?vperm	$out3,$out3,$out3,$leperm
2855	stvx_u		$out2,$x20,$out
2856	stvx_u		$out3,$x30,$out
2857	addi		$out,$out,0x40
2858	bne		Lxts_enc6x_steal
2859	b		Lxts_enc6x_done
2860
2861.align	4
2862Lxts_enc6x_three:
2863	vxor		$out0,$in3,$twk0
2864	vxor		$out1,$in4,$twk1
2865	vxor		$out2,$in5,$twk2
2866	vxor		$out3,$out3,$out3
2867	vxor		$out4,$out4,$out4
2868
2869	bl		_aesp8_xts_enc5x
2870
2871	le?vperm	$out0,$out0,$out0,$leperm
2872	vmr		$twk0,$twk3		# unused tweak
2873	le?vperm	$out1,$out1,$out1,$leperm
2874	stvx_u		$out0,$x00,$out		# store output
2875	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2876	le?vperm	$out2,$out2,$out2,$leperm
2877	stvx_u		$out1,$x10,$out
2878	stvx_u		$out2,$x20,$out
2879	addi		$out,$out,0x30
2880	bne		Lxts_enc6x_steal
2881	b		Lxts_enc6x_done
2882
2883.align	4
2884Lxts_enc6x_two:
2885	vxor		$out0,$in4,$twk0
2886	vxor		$out1,$in5,$twk1
2887	vxor		$out2,$out2,$out2
2888	vxor		$out3,$out3,$out3
2889	vxor		$out4,$out4,$out4
2890
2891	bl		_aesp8_xts_enc5x
2892
2893	le?vperm	$out0,$out0,$out0,$leperm
2894	vmr		$twk0,$twk2		# unused tweak
2895	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2896	le?vperm	$out1,$out1,$out1,$leperm
2897	stvx_u		$out0,$x00,$out		# store output
2898	stvx_u		$out1,$x10,$out
2899	addi		$out,$out,0x20
2900	bne		Lxts_enc6x_steal
2901	b		Lxts_enc6x_done
2902
2903.align	4
2904Lxts_enc6x_one:
2905	vxor		$out0,$in5,$twk0
2906	nop
2907Loop_xts_enc1x:
2908	vcipher		$out0,$out0,v24
2909	lvx		v24,$x20,$key_		# round[3]
2910	addi		$key_,$key_,0x20
2911
2912	vcipher		$out0,$out0,v25
2913	lvx		v25,$x10,$key_		# round[4]
2914	bdnz		Loop_xts_enc1x
2915
2916	add		$inp,$inp,$taillen
2917	cmpwi		$taillen,0
2918	vcipher		$out0,$out0,v24
2919
2920	subi		$inp,$inp,16
2921	vcipher		$out0,$out0,v25
2922
2923	lvsr		$inpperm,0,$taillen
2924	vcipher		$out0,$out0,v26
2925
2926	lvx_u		$in0,0,$inp
2927	vcipher		$out0,$out0,v27
2928
2929	addi		$key_,$sp,$FRAME+15	# rewind $key_
2930	vcipher		$out0,$out0,v28
2931	lvx		v24,$x00,$key_		# re-pre-load round[1]
2932
2933	vcipher		$out0,$out0,v29
2934	lvx		v25,$x10,$key_		# re-pre-load round[2]
2935	 vxor		$twk0,$twk0,v31
2936
2937	le?vperm	$in0,$in0,$in0,$leperm
2938	vcipher		$out0,$out0,v30
2939
2940	vperm		$in0,$in0,$in0,$inpperm
2941	vcipherlast	$out0,$out0,$twk0
2942
2943	vmr		$twk0,$twk1		# unused tweak
2944	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2945	le?vperm	$out0,$out0,$out0,$leperm
2946	stvx_u		$out0,$x00,$out		# store output
2947	addi		$out,$out,0x10
2948	bne		Lxts_enc6x_steal
2949	b		Lxts_enc6x_done
2950
2951.align	4
2952Lxts_enc6x_zero:
2953	cmpwi		$taillen,0
2954	beq		Lxts_enc6x_done
2955
2956	add		$inp,$inp,$taillen
2957	subi		$inp,$inp,16
2958	lvx_u		$in0,0,$inp
	lvsr		$inpperm,0,$taillen	# clobbers $in5 (no longer needed)
2960	le?vperm	$in0,$in0,$in0,$leperm
2961	vperm		$in0,$in0,$in0,$inpperm
2962	vxor		$tmp,$tmp,$twk0
2963Lxts_enc6x_steal:
2964	vxor		$in0,$in0,$twk0
2965	vxor		$out0,$out0,$out0
2966	vspltisb	$out1,-1
2967	vperm		$out0,$out0,$out1,$inpperm
2968	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2969
2970	subi		r30,$out,17
2971	subi		$out,$out,16
2972	mtctr		$taillen
2973Loop_xts_enc6x_steal:
2974	lbzu		r0,1(r30)
2975	stb		r0,16(r30)
2976	bdnz		Loop_xts_enc6x_steal
2977
2978	li		$taillen,0
2979	mtctr		$rounds
2980	b		Loop_xts_enc1x		# one more time...
2981
2982.align	4
2983Lxts_enc6x_done:
2984	${UCMP}i	$ivp,0
2985	beq		Lxts_enc6x_ret
2986
2987	vxor		$tweak,$twk0,$rndkey0
2988	le?vperm	$tweak,$tweak,$tweak,$leperm
2989	stvx_u		$tweak,0,$ivp
2990
2991Lxts_enc6x_ret:
2992	mtlr		r11
2993	li		r10,`$FRAME+15`
2994	li		r11,`$FRAME+31`
2995	stvx		$seven,r10,$sp		# wipe copies of round keys
2996	addi		r10,r10,32
2997	stvx		$seven,r11,$sp
2998	addi		r11,r11,32
2999	stvx		$seven,r10,$sp
3000	addi		r10,r10,32
3001	stvx		$seven,r11,$sp
3002	addi		r11,r11,32
3003	stvx		$seven,r10,$sp
3004	addi		r10,r10,32
3005	stvx		$seven,r11,$sp
3006	addi		r11,r11,32
3007	stvx		$seven,r10,$sp
3008	addi		r10,r10,32
3009	stvx		$seven,r11,$sp
3010	addi		r11,r11,32
3011
3012	mtspr		256,$vrsave
3013	lvx		v20,r10,$sp		# ABI says so
3014	addi		r10,r10,32
3015	lvx		v21,r11,$sp
3016	addi		r11,r11,32
3017	lvx		v22,r10,$sp
3018	addi		r10,r10,32
3019	lvx		v23,r11,$sp
3020	addi		r11,r11,32
3021	lvx		v24,r10,$sp
3022	addi		r10,r10,32
3023	lvx		v25,r11,$sp
3024	addi		r11,r11,32
3025	lvx		v26,r10,$sp
3026	addi		r10,r10,32
3027	lvx		v27,r11,$sp
3028	addi		r11,r11,32
3029	lvx		v28,r10,$sp
3030	addi		r10,r10,32
3031	lvx		v29,r11,$sp
3032	addi		r11,r11,32
3033	lvx		v30,r10,$sp
3034	lvx		v31,r11,$sp
3035	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3036	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3037	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3038	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3039	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3040	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3041	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3042	blr
3043	.long		0
3044	.byte		0,12,0x04,1,0x80,6,6,0
3045	.long		0
3046
3047.align	5
3048_aesp8_xts_enc5x:
3049	vcipher		$out0,$out0,v24
3050	vcipher		$out1,$out1,v24
3051	vcipher		$out2,$out2,v24
3052	vcipher		$out3,$out3,v24
3053	vcipher		$out4,$out4,v24
3054	lvx		v24,$x20,$key_		# round[3]
3055	addi		$key_,$key_,0x20
3056
3057	vcipher		$out0,$out0,v25
3058	vcipher		$out1,$out1,v25
3059	vcipher		$out2,$out2,v25
3060	vcipher		$out3,$out3,v25
3061	vcipher		$out4,$out4,v25
3062	lvx		v25,$x10,$key_		# round[4]
3063	bdnz		_aesp8_xts_enc5x
3064
3065	add		$inp,$inp,$taillen
3066	cmpwi		$taillen,0
3067	vcipher		$out0,$out0,v24
3068	vcipher		$out1,$out1,v24
3069	vcipher		$out2,$out2,v24
3070	vcipher		$out3,$out3,v24
3071	vcipher		$out4,$out4,v24
3072
3073	subi		$inp,$inp,16
3074	vcipher		$out0,$out0,v25
3075	vcipher		$out1,$out1,v25
3076	vcipher		$out2,$out2,v25
3077	vcipher		$out3,$out3,v25
3078	vcipher		$out4,$out4,v25
3079	 vxor		$twk0,$twk0,v31
3080
3081	vcipher		$out0,$out0,v26
	lvsr		$inpperm,r0,$taillen	# clobbers $in5 (no longer needed)
3083	vcipher		$out1,$out1,v26
3084	vcipher		$out2,$out2,v26
3085	vcipher		$out3,$out3,v26
3086	vcipher		$out4,$out4,v26
3087	 vxor		$in1,$twk1,v31
3088
3089	vcipher		$out0,$out0,v27
3090	lvx_u		$in0,0,$inp
3091	vcipher		$out1,$out1,v27
3092	vcipher		$out2,$out2,v27
3093	vcipher		$out3,$out3,v27
3094	vcipher		$out4,$out4,v27
3095	 vxor		$in2,$twk2,v31
3096
3097	addi		$key_,$sp,$FRAME+15	# rewind $key_
3098	vcipher		$out0,$out0,v28
3099	vcipher		$out1,$out1,v28
3100	vcipher		$out2,$out2,v28
3101	vcipher		$out3,$out3,v28
3102	vcipher		$out4,$out4,v28
3103	lvx		v24,$x00,$key_		# re-pre-load round[1]
3104	 vxor		$in3,$twk3,v31
3105
3106	vcipher		$out0,$out0,v29
3107	le?vperm	$in0,$in0,$in0,$leperm
3108	vcipher		$out1,$out1,v29
3109	vcipher		$out2,$out2,v29
3110	vcipher		$out3,$out3,v29
3111	vcipher		$out4,$out4,v29
3112	lvx		v25,$x10,$key_		# re-pre-load round[2]
3113	 vxor		$in4,$twk4,v31
3114
3115	vcipher		$out0,$out0,v30
3116	vperm		$in0,$in0,$in0,$inpperm
3117	vcipher		$out1,$out1,v30
3118	vcipher		$out2,$out2,v30
3119	vcipher		$out3,$out3,v30
3120	vcipher		$out4,$out4,v30
3121
3122	vcipherlast	$out0,$out0,$twk0
3123	vcipherlast	$out1,$out1,$in1
3124	vcipherlast	$out2,$out2,$in2
3125	vcipherlast	$out3,$out3,$in3
3126	vcipherlast	$out4,$out4,$in4
3127	blr
	.long		0
	.byte		0,12,0x14,0,0,0,0,0
3130
3131.align	5
3132_aesp8_xts_decrypt6x:
3133	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3134	mflr		r11
3135	li		r7,`$FRAME+8*16+15`
3136	li		r3,`$FRAME+8*16+31`
3137	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3138	stvx		v20,r7,$sp		# ABI says so
3139	addi		r7,r7,32
3140	stvx		v21,r3,$sp
3141	addi		r3,r3,32
3142	stvx		v22,r7,$sp
3143	addi		r7,r7,32
3144	stvx		v23,r3,$sp
3145	addi		r3,r3,32
3146	stvx		v24,r7,$sp
3147	addi		r7,r7,32
3148	stvx		v25,r3,$sp
3149	addi		r3,r3,32
3150	stvx		v26,r7,$sp
3151	addi		r7,r7,32
3152	stvx		v27,r3,$sp
3153	addi		r3,r3,32
3154	stvx		v28,r7,$sp
3155	addi		r7,r7,32
3156	stvx		v29,r3,$sp
3157	addi		r3,r3,32
3158	stvx		v30,r7,$sp
3159	stvx		v31,r3,$sp
3160	li		r0,-1
3161	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3162	li		$x10,0x10
3163	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3164	li		$x20,0x20
3165	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3166	li		$x30,0x30
3167	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3168	li		$x40,0x40
3169	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3170	li		$x50,0x50
3171	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3172	li		$x60,0x60
3173	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3174	li		$x70,0x70
3175	mtspr		256,r0
3176
3177	xxlor		2, 32+$eighty7, 32+$eighty7
3178	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
3179	xxlor		1, 32+$eighty7, 32+$eighty7
3180
	# Load the vpermxor constant from Lconsts (r6 saved/restored around the call).
3182	mr		$x70, r6
3183	bl		Lconsts
3184	lxvw4x		0, $x40, r6		# load XOR contents
3185	mr		r6, $x70
3186	li		$x70,0x70
3187
3188	subi		$rounds,$rounds,3	# -4 in total
3189
3190	lvx		$rndkey0,$x00,$key1	# load key schedule
3191	lvx		v30,$x10,$key1
3192	addi		$key1,$key1,0x20
3193	lvx		v31,$x00,$key1
3194	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3195	addi		$key_,$sp,$FRAME+15
3196	mtctr		$rounds
3197
3198Load_xts_dec_key:
3199	?vperm		v24,v30,v31,$keyperm
3200	lvx		v30,$x10,$key1
3201	addi		$key1,$key1,0x20
3202	stvx		v24,$x00,$key_		# off-load round[1]
3203	?vperm		v25,v31,v30,$keyperm
3204	lvx		v31,$x00,$key1
3205	stvx		v25,$x10,$key_		# off-load round[2]
3206	addi		$key_,$key_,0x20
3207	bdnz		Load_xts_dec_key
3208
3209	lvx		v26,$x10,$key1
3210	?vperm		v24,v30,v31,$keyperm
3211	lvx		v27,$x20,$key1
3212	stvx		v24,$x00,$key_		# off-load round[3]
3213	?vperm		v25,v31,v26,$keyperm
3214	lvx		v28,$x30,$key1
3215	stvx		v25,$x10,$key_		# off-load round[4]
3216	addi		$key_,$sp,$FRAME+15	# rewind $key_
3217	?vperm		v26,v26,v27,$keyperm
3218	lvx		v29,$x40,$key1
3219	?vperm		v27,v27,v28,$keyperm
3220	lvx		v30,$x50,$key1
3221	?vperm		v28,v28,v29,$keyperm
3222	lvx		v31,$x60,$key1
3223	?vperm		v29,v29,v30,$keyperm
3224	lvx		$twk5,$x70,$key1	# borrow $twk5
3225	?vperm		v30,v30,v31,$keyperm
3226	lvx		v24,$x00,$key_		# pre-load round[1]
3227	?vperm		v31,v31,$twk5,$keyperm
3228	lvx		v25,$x10,$key_		# pre-load round[2]
3229
3230	 vperm		$in0,$inout,$inptail,$inpperm
3231	 subi		$inp,$inp,31		# undo "caller"
3232	vxor		$twk0,$tweak,$rndkey0
3233	vsrab		$tmp,$tweak,$seven	# next tweak value
3234	vaddubm		$tweak,$tweak,$tweak
3235	vand		$tmp,$tmp,$eighty7
3236	 vxor		$out0,$in0,$twk0
3237	xxlor		32+$in1, 0, 0
3238	vpermxor	$tweak, $tweak, $tmp, $in1
3239
3240	 lvx_u		$in1,$x10,$inp
3241	vxor		$twk1,$tweak,$rndkey0
3242	vsrab		$tmp,$tweak,$seven	# next tweak value
3243	vaddubm		$tweak,$tweak,$tweak
3244	 le?vperm	$in1,$in1,$in1,$leperm
3245	vand		$tmp,$tmp,$eighty7
3246	 vxor		$out1,$in1,$twk1
3247	xxlor		32+$in2, 0, 0
3248	vpermxor	$tweak, $tweak, $tmp, $in2
3249
3250	 lvx_u		$in2,$x20,$inp
3251	 andi.		$taillen,$len,15
3252	vxor		$twk2,$tweak,$rndkey0
3253	vsrab		$tmp,$tweak,$seven	# next tweak value
3254	vaddubm		$tweak,$tweak,$tweak
3255	 le?vperm	$in2,$in2,$in2,$leperm
3256	vand		$tmp,$tmp,$eighty7
3257	 vxor		$out2,$in2,$twk2
3258	xxlor		32+$in3, 0, 0
3259	vpermxor	$tweak, $tweak, $tmp, $in3
3260
3261	 lvx_u		$in3,$x30,$inp
3262	 sub		$len,$len,$taillen
3263	vxor		$twk3,$tweak,$rndkey0
3264	vsrab		$tmp,$tweak,$seven	# next tweak value
3265	vaddubm		$tweak,$tweak,$tweak
3266	 le?vperm	$in3,$in3,$in3,$leperm
3267	vand		$tmp,$tmp,$eighty7
3268	 vxor		$out3,$in3,$twk3
3269	xxlor		32+$in4, 0, 0
3270	vpermxor	$tweak, $tweak, $tmp, $in4
3271
3272	 lvx_u		$in4,$x40,$inp
3273	 subi		$len,$len,0x60
3274	vxor		$twk4,$tweak,$rndkey0
3275	vsrab		$tmp,$tweak,$seven	# next tweak value
3276	vaddubm		$tweak,$tweak,$tweak
3277	 le?vperm	$in4,$in4,$in4,$leperm
3278	vand		$tmp,$tmp,$eighty7
3279	 vxor		$out4,$in4,$twk4
3280	xxlor		32+$in5, 0, 0
3281	vpermxor	$tweak, $tweak, $tmp, $in5
3282
3283	 lvx_u		$in5,$x50,$inp
3284	 addi		$inp,$inp,0x60
3285	vxor		$twk5,$tweak,$rndkey0
3286	vsrab		$tmp,$tweak,$seven	# next tweak value
3287	vaddubm		$tweak,$tweak,$tweak
3288	 le?vperm	$in5,$in5,$in5,$leperm
3289	vand		$tmp,$tmp,$eighty7
3290	 vxor		$out5,$in5,$twk5
3291	xxlor		32+$in0, 0, 0
3292	vpermxor	$tweak, $tweak, $tmp, $in0
3293
3294	vxor		v31,v31,$rndkey0
3295	mtctr		$rounds
3296	b		Loop_xts_dec6x
3297
3298.align	5
3299Loop_xts_dec6x:
3300	vncipher	$out0,$out0,v24
3301	vncipher	$out1,$out1,v24
3302	vncipher	$out2,$out2,v24
3303	vncipher	$out3,$out3,v24
3304	vncipher	$out4,$out4,v24
3305	vncipher	$out5,$out5,v24
3306	lvx		v24,$x20,$key_		# round[3]
3307	addi		$key_,$key_,0x20
3308
3309	vncipher	$out0,$out0,v25
3310	vncipher	$out1,$out1,v25
3311	vncipher	$out2,$out2,v25
3312	vncipher	$out3,$out3,v25
3313	vncipher	$out4,$out4,v25
3314	vncipher	$out5,$out5,v25
3315	lvx		v25,$x10,$key_		# round[4]
3316	bdnz		Loop_xts_dec6x
3317
3318	xxlor		32+$eighty7, 1, 1	# 0x010101..87
3319
3320	subic		$len,$len,96		# $len-=96
3321	 vxor		$in0,$twk0,v31		# xor with last round key
3322	vncipher	$out0,$out0,v24
3323	vncipher	$out1,$out1,v24
3324	 vsrab		$tmp,$tweak,$seven	# next tweak value
3325	 vxor		$twk0,$tweak,$rndkey0
3326	 vaddubm	$tweak,$tweak,$tweak
3327	vncipher	$out2,$out2,v24
3328	vncipher	$out3,$out3,v24
3329	vncipher	$out4,$out4,v24
3330	vncipher	$out5,$out5,v24
3331
3332	subfe.		r0,r0,r0		# borrow?-1:0
3333	 vand		$tmp,$tmp,$eighty7
3334	vncipher	$out0,$out0,v25
3335	vncipher	$out1,$out1,v25
3336	 xxlor		32+$in1, 0, 0
3337	 vpermxor	$tweak, $tweak, $tmp, $in1
3338	vncipher	$out2,$out2,v25
3339	vncipher	$out3,$out3,v25
3340	 vxor		$in1,$twk1,v31
3341	 vsrab		$tmp,$tweak,$seven	# next tweak value
3342	 vxor		$twk1,$tweak,$rndkey0
3343	vncipher	$out4,$out4,v25
3344	vncipher	$out5,$out5,v25
3345
3346	and		r0,r0,$len
3347	 vaddubm	$tweak,$tweak,$tweak
3348	vncipher	$out0,$out0,v26
3349	vncipher	$out1,$out1,v26
3350	 vand		$tmp,$tmp,$eighty7
3351	vncipher	$out2,$out2,v26
3352	vncipher	$out3,$out3,v26
3353	 xxlor		32+$in2, 0, 0
3354	 vpermxor	$tweak, $tweak, $tmp, $in2
3355	vncipher	$out4,$out4,v26
3356	vncipher	$out5,$out5,v26
3357
	add		$inp,$inp,r0		# $inp is adjusted so that
						# at exit from the loop
						# inX-in5 are loaded with
						# the last "words"
3362	 vxor		$in2,$twk2,v31
3363	 vsrab		$tmp,$tweak,$seven	# next tweak value
3364	 vxor		$twk2,$tweak,$rndkey0
3365	 vaddubm	$tweak,$tweak,$tweak
3366	vncipher	$out0,$out0,v27
3367	vncipher	$out1,$out1,v27
3368	vncipher	$out2,$out2,v27
3369	vncipher	$out3,$out3,v27
3370	 vand		$tmp,$tmp,$eighty7
3371	vncipher	$out4,$out4,v27
3372	vncipher	$out5,$out5,v27
3373
3374	addi		$key_,$sp,$FRAME+15	# rewind $key_
3375	 xxlor		32+$in3, 0, 0
3376	 vpermxor	$tweak, $tweak, $tmp, $in3
3377	vncipher	$out0,$out0,v28
3378	vncipher	$out1,$out1,v28
3379	 vxor		$in3,$twk3,v31
3380	 vsrab		$tmp,$tweak,$seven	# next tweak value
3381	 vxor		$twk3,$tweak,$rndkey0
3382	vncipher	$out2,$out2,v28
3383	vncipher	$out3,$out3,v28
3384	 vaddubm	$tweak,$tweak,$tweak
3385	vncipher	$out4,$out4,v28
3386	vncipher	$out5,$out5,v28
3387	lvx		v24,$x00,$key_		# re-pre-load round[1]
3388	 vand		$tmp,$tmp,$eighty7
3389
3390	vncipher	$out0,$out0,v29
3391	vncipher	$out1,$out1,v29
3392	 xxlor		32+$in4, 0, 0
3393	 vpermxor	$tweak, $tweak, $tmp, $in4
3394	vncipher	$out2,$out2,v29
3395	vncipher	$out3,$out3,v29
3396	 vxor		$in4,$twk4,v31
3397	 vsrab		$tmp,$tweak,$seven	# next tweak value
3398	 vxor		$twk4,$tweak,$rndkey0
3399	vncipher	$out4,$out4,v29
3400	vncipher	$out5,$out5,v29
3401	lvx		v25,$x10,$key_		# re-pre-load round[2]
3402	 vaddubm	$tweak,$tweak,$tweak
3403
3404	vncipher	$out0,$out0,v30
3405	vncipher	$out1,$out1,v30
3406	 vand		$tmp,$tmp,$eighty7
3407	vncipher	$out2,$out2,v30
3408	vncipher	$out3,$out3,v30
3409	 xxlor		32+$in5, 0, 0
3410	 vpermxor	$tweak, $tweak, $tmp, $in5
3411	vncipher	$out4,$out4,v30
3412	vncipher	$out5,$out5,v30
3413	 vxor		$in5,$twk5,v31
3414	 vsrab		$tmp,$tweak,$seven	# next tweak value
3415	 vxor		$twk5,$tweak,$rndkey0
3416
3417	vncipherlast	$out0,$out0,$in0
3418	 lvx_u		$in0,$x00,$inp		# load next input block
3419	 vaddubm	$tweak,$tweak,$tweak
3420	vncipherlast	$out1,$out1,$in1
3421	 lvx_u		$in1,$x10,$inp
3422	vncipherlast	$out2,$out2,$in2
3423	 le?vperm	$in0,$in0,$in0,$leperm
3424	 lvx_u		$in2,$x20,$inp
3425	 vand		$tmp,$tmp,$eighty7
3426	vncipherlast	$out3,$out3,$in3
3427	 le?vperm	$in1,$in1,$in1,$leperm
3428	 lvx_u		$in3,$x30,$inp
3429	vncipherlast	$out4,$out4,$in4
3430	 le?vperm	$in2,$in2,$in2,$leperm
3431	 lvx_u		$in4,$x40,$inp
3432	 xxlor		10, 32+$in0, 32+$in0
3433	 xxlor		32+$in0, 0, 0
3434	 vpermxor	$tweak, $tweak, $tmp, $in0
3435	 xxlor		32+$in0, 10, 10
3436	vncipherlast	$out5,$out5,$in5
3437	 le?vperm	$in3,$in3,$in3,$leperm
3438	 lvx_u		$in5,$x50,$inp
3439	 addi		$inp,$inp,0x60
3440	 le?vperm	$in4,$in4,$in4,$leperm
3441	 le?vperm	$in5,$in5,$in5,$leperm
3442
3443	le?vperm	$out0,$out0,$out0,$leperm
3444	le?vperm	$out1,$out1,$out1,$leperm
3445	stvx_u		$out0,$x00,$out		# store output
3446	 vxor		$out0,$in0,$twk0
3447	le?vperm	$out2,$out2,$out2,$leperm
3448	stvx_u		$out1,$x10,$out
3449	 vxor		$out1,$in1,$twk1
3450	le?vperm	$out3,$out3,$out3,$leperm
3451	stvx_u		$out2,$x20,$out
3452	 vxor		$out2,$in2,$twk2
3453	le?vperm	$out4,$out4,$out4,$leperm
3454	stvx_u		$out3,$x30,$out
3455	 vxor		$out3,$in3,$twk3
3456	le?vperm	$out5,$out5,$out5,$leperm
3457	stvx_u		$out4,$x40,$out
3458	 vxor		$out4,$in4,$twk4
3459	stvx_u		$out5,$x50,$out
3460	 vxor		$out5,$in5,$twk5
3461	addi		$out,$out,0x60
3462
3463	mtctr		$rounds
3464	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3465
3466	xxlor		32+$eighty7, 2, 2	# 0x010101..87
3467
3468	addic.		$len,$len,0x60
3469	beq		Lxts_dec6x_zero
3470	cmpwi		$len,0x20
3471	blt		Lxts_dec6x_one
3472	nop
3473	beq		Lxts_dec6x_two
3474	cmpwi		$len,0x40
3475	blt		Lxts_dec6x_three
3476	nop
3477	beq		Lxts_dec6x_four
3478
3479Lxts_dec6x_five:
3480	vxor		$out0,$in1,$twk0
3481	vxor		$out1,$in2,$twk1
3482	vxor		$out2,$in3,$twk2
3483	vxor		$out3,$in4,$twk3
3484	vxor		$out4,$in5,$twk4
3485
3486	bl		_aesp8_xts_dec5x
3487
3488	le?vperm	$out0,$out0,$out0,$leperm
3489	vmr		$twk0,$twk5		# unused tweak
3490	vxor		$twk1,$tweak,$rndkey0
3491	le?vperm	$out1,$out1,$out1,$leperm
3492	stvx_u		$out0,$x00,$out		# store output
3493	vxor		$out0,$in0,$twk1
3494	le?vperm	$out2,$out2,$out2,$leperm
3495	stvx_u		$out1,$x10,$out
3496	le?vperm	$out3,$out3,$out3,$leperm
3497	stvx_u		$out2,$x20,$out
3498	le?vperm	$out4,$out4,$out4,$leperm
3499	stvx_u		$out3,$x30,$out
3500	stvx_u		$out4,$x40,$out
3501	addi		$out,$out,0x50
3502	bne		Lxts_dec6x_steal
3503	b		Lxts_dec6x_done
3504
3505.align	4
3506Lxts_dec6x_four:
3507	vxor		$out0,$in2,$twk0
3508	vxor		$out1,$in3,$twk1
3509	vxor		$out2,$in4,$twk2
3510	vxor		$out3,$in5,$twk3
3511	vxor		$out4,$out4,$out4
3512
3513	bl		_aesp8_xts_dec5x
3514
3515	le?vperm	$out0,$out0,$out0,$leperm
3516	vmr		$twk0,$twk4		# unused tweak
3517	vmr		$twk1,$twk5
3518	le?vperm	$out1,$out1,$out1,$leperm
3519	stvx_u		$out0,$x00,$out		# store output
3520	vxor		$out0,$in0,$twk5
3521	le?vperm	$out2,$out2,$out2,$leperm
3522	stvx_u		$out1,$x10,$out
3523	le?vperm	$out3,$out3,$out3,$leperm
3524	stvx_u		$out2,$x20,$out
3525	stvx_u		$out3,$x30,$out
3526	addi		$out,$out,0x40
3527	bne		Lxts_dec6x_steal
3528	b		Lxts_dec6x_done
3529
3530.align	4
3531Lxts_dec6x_three:
3532	vxor		$out0,$in3,$twk0
3533	vxor		$out1,$in4,$twk1
3534	vxor		$out2,$in5,$twk2
3535	vxor		$out3,$out3,$out3
3536	vxor		$out4,$out4,$out4
3537
3538	bl		_aesp8_xts_dec5x
3539
3540	le?vperm	$out0,$out0,$out0,$leperm
3541	vmr		$twk0,$twk3		# unused tweak
3542	vmr		$twk1,$twk4
3543	le?vperm	$out1,$out1,$out1,$leperm
3544	stvx_u		$out0,$x00,$out		# store output
3545	vxor		$out0,$in0,$twk4
3546	le?vperm	$out2,$out2,$out2,$leperm
3547	stvx_u		$out1,$x10,$out
3548	stvx_u		$out2,$x20,$out
3549	addi		$out,$out,0x30
3550	bne		Lxts_dec6x_steal
3551	b		Lxts_dec6x_done
3552
3553.align	4
3554Lxts_dec6x_two:
3555	vxor		$out0,$in4,$twk0
3556	vxor		$out1,$in5,$twk1
3557	vxor		$out2,$out2,$out2
3558	vxor		$out3,$out3,$out3
3559	vxor		$out4,$out4,$out4
3560
3561	bl		_aesp8_xts_dec5x
3562
3563	le?vperm	$out0,$out0,$out0,$leperm
3564	vmr		$twk0,$twk2		# unused tweak
3565	vmr		$twk1,$twk3
3566	le?vperm	$out1,$out1,$out1,$leperm
3567	stvx_u		$out0,$x00,$out		# store output
3568	vxor		$out0,$in0,$twk3
3569	stvx_u		$out1,$x10,$out
3570	addi		$out,$out,0x20
3571	bne		Lxts_dec6x_steal
3572	b		Lxts_dec6x_done
3573
3574.align	4
3575Lxts_dec6x_one:
3576	vxor		$out0,$in5,$twk0
3577	nop
3578Loop_xts_dec1x:
3579	vncipher	$out0,$out0,v24
3580	lvx		v24,$x20,$key_		# round[3]
3581	addi		$key_,$key_,0x20
3582
3583	vncipher	$out0,$out0,v25
3584	lvx		v25,$x10,$key_		# round[4]
3585	bdnz		Loop_xts_dec1x
3586
3587	subi		r0,$taillen,1
3588	vncipher	$out0,$out0,v24
3589
3590	andi.		r0,r0,16
3591	cmpwi		$taillen,0
3592	vncipher	$out0,$out0,v25
3593
3594	sub		$inp,$inp,r0
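	# (subi/andi./sub above: r0 = (taillen == 0) ? 16 : 0, a
	# branch-free adjustment so the lvx_u below reloads the block
	# needed for the tail processing)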
3595	vncipher	$out0,$out0,v26
3596
3597	lvx_u		$in0,0,$inp
3598	vncipher	$out0,$out0,v27
3599
3600	addi		$key_,$sp,$FRAME+15	# rewind $key_
3601	vncipher	$out0,$out0,v28
3602	lvx		v24,$x00,$key_		# re-pre-load round[1]
3603
3604	vncipher	$out0,$out0,v29
3605	lvx		v25,$x10,$key_		# re-pre-load round[2]
3606	 vxor		$twk0,$twk0,v31
3607
3608	le?vperm	$in0,$in0,$in0,$leperm
3609	vncipher	$out0,$out0,v30
3610
3611	mtctr		$rounds
3612	vncipherlast	$out0,$out0,$twk0
3613
3614	vmr		$twk0,$twk1		# unused tweak
3615	vmr		$twk1,$twk2
3616	le?vperm	$out0,$out0,$out0,$leperm
3617	stvx_u		$out0,$x00,$out		# store output
3618	addi		$out,$out,0x10
3619	vxor		$out0,$in0,$twk2
3620	bne		Lxts_dec6x_steal
3621	b		Lxts_dec6x_done
3622
3623.align	4
3624Lxts_dec6x_zero:
3625	cmpwi		$taillen,0
3626	beq		Lxts_dec6x_done
3627
3628	lvx_u		$in0,0,$inp
3629	le?vperm	$in0,$in0,$in0,$leperm
3630	vxor		$out0,$in0,$twk1
3631Lxts_dec6x_steal:
3632	vncipher	$out0,$out0,v24
3633	lvx		v24,$x20,$key_		# round[3]
3634	addi		$key_,$key_,0x20
3635
3636	vncipher	$out0,$out0,v25
3637	lvx		v25,$x10,$key_		# round[4]
3638	bdnz		Lxts_dec6x_steal
3639
3640	add		$inp,$inp,$taillen
3641	vncipher	$out0,$out0,v24
3642
3643	cmpwi		$taillen,0
3644	vncipher	$out0,$out0,v25
3645
3646	lvx_u		$in0,0,$inp
3647	vncipher	$out0,$out0,v26
3648
	lvsr		$inpperm,0,$taillen	# clobbers $in5 (no longer needed)
3650	vncipher	$out0,$out0,v27
3651
3652	addi		$key_,$sp,$FRAME+15	# rewind $key_
3653	vncipher	$out0,$out0,v28
3654	lvx		v24,$x00,$key_		# re-pre-load round[1]
3655
3656	vncipher	$out0,$out0,v29
3657	lvx		v25,$x10,$key_		# re-pre-load round[2]
3658	 vxor		$twk1,$twk1,v31
3659
3660	le?vperm	$in0,$in0,$in0,$leperm
3661	vncipher	$out0,$out0,v30
3662
3663	vperm		$in0,$in0,$in0,$inpperm
3664	vncipherlast	$tmp,$out0,$twk1
3665
3666	le?vperm	$out0,$tmp,$tmp,$leperm
3667	le?stvx_u	$out0,0,$out
3668	be?stvx_u	$tmp,0,$out
3669
3670	vxor		$out0,$out0,$out0
3671	vspltisb	$out1,-1
3672	vperm		$out0,$out0,$out1,$inpperm
3673	vsel		$out0,$in0,$tmp,$out0
3674	vxor		$out0,$out0,$twk0
3675
3676	subi		r30,$out,1
3677	mtctr		$taillen
3678Loop_xts_dec6x_steal:
3679	lbzu		r0,1(r30)
3680	stb		r0,16(r30)
3681	bdnz		Loop_xts_dec6x_steal
3682
3683	li		$taillen,0
3684	mtctr		$rounds
3685	b		Loop_xts_dec1x		# one more time...
3686
3687.align	4
3688Lxts_dec6x_done:
3689	${UCMP}i	$ivp,0
3690	beq		Lxts_dec6x_ret
3691
3692	vxor		$tweak,$twk0,$rndkey0
3693	le?vperm	$tweak,$tweak,$tweak,$leperm
3694	stvx_u		$tweak,0,$ivp
3695
3696Lxts_dec6x_ret:
3697	mtlr		r11
3698	li		r10,`$FRAME+15`
3699	li		r11,`$FRAME+31`
3700	stvx		$seven,r10,$sp		# wipe copies of round keys
3701	addi		r10,r10,32
3702	stvx		$seven,r11,$sp
3703	addi		r11,r11,32
3704	stvx		$seven,r10,$sp
3705	addi		r10,r10,32
3706	stvx		$seven,r11,$sp
3707	addi		r11,r11,32
3708	stvx		$seven,r10,$sp
3709	addi		r10,r10,32
3710	stvx		$seven,r11,$sp
3711	addi		r11,r11,32
3712	stvx		$seven,r10,$sp
3713	addi		r10,r10,32
3714	stvx		$seven,r11,$sp
3715	addi		r11,r11,32
3716
3717	mtspr		256,$vrsave
3718	lvx		v20,r10,$sp		# ABI says so
3719	addi		r10,r10,32
3720	lvx		v21,r11,$sp
3721	addi		r11,r11,32
3722	lvx		v22,r10,$sp
3723	addi		r10,r10,32
3724	lvx		v23,r11,$sp
3725	addi		r11,r11,32
3726	lvx		v24,r10,$sp
3727	addi		r10,r10,32
3728	lvx		v25,r11,$sp
3729	addi		r11,r11,32
3730	lvx		v26,r10,$sp
3731	addi		r10,r10,32
3732	lvx		v27,r11,$sp
3733	addi		r11,r11,32
3734	lvx		v28,r10,$sp
3735	addi		r10,r10,32
3736	lvx		v29,r11,$sp
3737	addi		r11,r11,32
3738	lvx		v30,r10,$sp
3739	lvx		v31,r11,$sp
3740	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3741	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3742	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3743	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3744	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3745	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3746	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3747	blr
3748	.long		0
3749	.byte		0,12,0x04,1,0x80,6,6,0
3750	.long		0
3751
3752.align	5
3753_aesp8_xts_dec5x:
3754	vncipher	$out0,$out0,v24
3755	vncipher	$out1,$out1,v24
3756	vncipher	$out2,$out2,v24
3757	vncipher	$out3,$out3,v24
3758	vncipher	$out4,$out4,v24
3759	lvx		v24,$x20,$key_		# round[3]
3760	addi		$key_,$key_,0x20
3761
3762	vncipher	$out0,$out0,v25
3763	vncipher	$out1,$out1,v25
3764	vncipher	$out2,$out2,v25
3765	vncipher	$out3,$out3,v25
3766	vncipher	$out4,$out4,v25
3767	lvx		v25,$x10,$key_		# round[4]
3768	bdnz		_aesp8_xts_dec5x
3769
3770	subi		r0,$taillen,1
3771	vncipher	$out0,$out0,v24
3772	vncipher	$out1,$out1,v24
3773	vncipher	$out2,$out2,v24
3774	vncipher	$out3,$out3,v24
3775	vncipher	$out4,$out4,v24
3776
3777	andi.		r0,r0,16
3778	cmpwi		$taillen,0
3779	vncipher	$out0,$out0,v25
3780	vncipher	$out1,$out1,v25
3781	vncipher	$out2,$out2,v25
3782	vncipher	$out3,$out3,v25
3783	vncipher	$out4,$out4,v25
3784	 vxor		$twk0,$twk0,v31
3785
3786	sub		$inp,$inp,r0
3787	vncipher	$out0,$out0,v26
3788	vncipher	$out1,$out1,v26
3789	vncipher	$out2,$out2,v26
3790	vncipher	$out3,$out3,v26
3791	vncipher	$out4,$out4,v26
3792	 vxor		$in1,$twk1,v31
3793
3794	vncipher	$out0,$out0,v27
3795	lvx_u		$in0,0,$inp
3796	vncipher	$out1,$out1,v27
3797	vncipher	$out2,$out2,v27
3798	vncipher	$out3,$out3,v27
3799	vncipher	$out4,$out4,v27
3800	 vxor		$in2,$twk2,v31
3801
3802	addi		$key_,$sp,$FRAME+15	# rewind $key_
3803	vncipher	$out0,$out0,v28
3804	vncipher	$out1,$out1,v28
3805	vncipher	$out2,$out2,v28
3806	vncipher	$out3,$out3,v28
3807	vncipher	$out4,$out4,v28
3808	lvx		v24,$x00,$key_		# re-pre-load round[1]
3809	 vxor		$in3,$twk3,v31
3810
3811	vncipher	$out0,$out0,v29
3812	le?vperm	$in0,$in0,$in0,$leperm
3813	vncipher	$out1,$out1,v29
3814	vncipher	$out2,$out2,v29
3815	vncipher	$out3,$out3,v29
3816	vncipher	$out4,$out4,v29
3817	lvx		v25,$x10,$key_		# re-pre-load round[2]
3818	 vxor		$in4,$twk4,v31
3819
3820	vncipher	$out0,$out0,v30
3821	vncipher	$out1,$out1,v30
3822	vncipher	$out2,$out2,v30
3823	vncipher	$out3,$out3,v30
3824	vncipher	$out4,$out4,v30
3825
3826	vncipherlast	$out0,$out0,$twk0
3827	vncipherlast	$out1,$out1,$in1
3828	vncipherlast	$out2,$out2,$in2
3829	vncipherlast	$out3,$out3,$in3
3830	vncipherlast	$out4,$out4,$in4
3831	mtctr		$rounds
3832	blr
	.long		0
	.byte		0,12,0x14,0,0,0,0,0
3835___
3836}}	}}}
3837
3838my $consts=1;
3839foreach(split("\n",$code)) {
3840        s/\`([^\`]*)\`/eval($1)/geo;
3841
3842	# constants table endian-specific conversion
3843	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3844	    my $conv=$3;
3845	    my @bytes=();
3846
3847	    # convert to endian-agnostic format
3848	    if ($1 eq "long") {
3849	      foreach (split(/,\s*/,$2)) {
3850		my $l = /^0/?oct:int;
3851		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3852	      }
3853	    } else {
3854		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3855	    }
3856
3857	    # little-endian conversion
3858	    if ($flavour =~ /le$/o) {
3859		SWITCH: for($conv)  {
3860		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
3861		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
3862		}
3863	    }
3864
	    # emit
3866	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3867	    next;
3868	}
3869	$consts=0 if (m/Lconsts:/o);	# end of table
3870
3871	# instructions prefixed with '?' are endian-specific and need
3872	# to be adjusted accordingly...
3873	if ($flavour =~ /le$/o) {	# little-endian
3874	    s/le\?//o		or
3875	    s/be\?/#be#/o	or
3876	    s/\?lvsr/lvsl/o	or
3877	    s/\?lvsl/lvsr/o	or
3878	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3879	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3880	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3881	} else {			# big-endian
3882	    s/le\?/#le#/o	or
3883	    s/be\?//o		or
3884	    s/\?([a-z]+)/$1/o;
3885	}
3886
3887        print $_,"\n";
3888}
3889
close STDOUT or die "error closing STDOUT: $!";
3891