xref: /freebsd/crypto/openssl/crypto/sm4/asm/vpsm4-armv8.pl (revision e7be843b4a162e68651d3911f0357ed464915629)
1*e7be843bSPierre Pronchery#! /usr/bin/env perl
2*e7be843bSPierre Pronchery# Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
3*e7be843bSPierre Pronchery#
4*e7be843bSPierre Pronchery# Licensed under the Apache License 2.0 (the "License").  You may not use
5*e7be843bSPierre Pronchery# this file except in compliance with the License.  You can obtain a copy
6*e7be843bSPierre Pronchery# in the file LICENSE in the source distribution or at
7*e7be843bSPierre Pronchery# https://www.openssl.org/source/license.html
8*e7be843bSPierre Pronchery
9*e7be843bSPierre Pronchery#
10*e7be843bSPierre Pronchery# This module implements SM4 with ASIMD on aarch64
11*e7be843bSPierre Pronchery#
12*e7be843bSPierre Pronchery# Feb 2022
13*e7be843bSPierre Pronchery#
14*e7be843bSPierre Pronchery
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# $output is popped before $flavour is examined, so the two tests never
# consume the same argument.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# $dir receives the directory part of this script's path, including the
# trailing separator ("/" or "\"). If $0 contains no separator the match
# fails and $dir is not set from this pattern.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for the arm-xlate.pl translator next to this script first, then in
# ../../perlasm relative to it; give up if neither file exists.
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Start arm-xlate.pl with the running perl interpreter ($^X), passing the
# flavour and the quoted output name, and make STDOUT an alias of the pipe so
# that everything printed afterwards is fed to the translator.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
28*e7be843bSPierre Pronchery
# Symbol prefix used for the names emitted into the assembly output.
$prefix = "vpsm4";

# Vector register names. Note the deliberate overlaps: @qtmp names the same
# register numbers as @vtmp (q-form instead of v-form), and @vtmpx covers the
# same registers as $rk0/$rk1/$rka/$rkb (v12-v15).
my @vtmp  = map { "v$_" } 0 .. 3;
my @qtmp  = map { "q$_" } 0 .. 3;
my @data  = map { "v$_" } 4 .. 7;
my @datax = map { "v$_" } 8 .. 11;
my ($rk0, $rk1) = qw(v12 v13);
my ($rka, $rkb) = qw(v14 v15);
my @vtmpx = map { "v$_" } 12 .. 15;
# v16-v31 are filled with the 256-byte .Lsbox table by load_sbox.
my @sbox  = map { "v$_" } 16 .. 31;

# General-purpose register names. $tmpw/$tmp are the 32/64-bit views of
# register 6; $wtmp1/$xtmp1 and $wtmp2/$xtmp2 likewise share registers 8 and 9.
my ($inp, $outp, $blocks, $rks) = qw(x0 x1 w2 x3);
my ($tmpw, $tmp, $wtmp0, $wtmp1, $wtmp2) = qw(w6 x6 w7 w8 w9);
my ($xtmp1, $xtmp2) = qw(x8 x9);
my ($ptr, $counter) = qw(x10 w11);
my ($word0, $word1, $word2, $word3) = qw(w12 w13 w14 w15);
43*e7be843bSPierre Pronchery
# Append a byte reversal within each 32-bit word of $src into $dst, active
# only when the output is assembled without __AARCH64EB__ defined.
#   $dst - destination vector register name
#   $src - optional source vector register name
# When $src is missing or names the same register as $dst the swap is done
# in place and nothing is emitted for the __AARCH64EB__ case; otherwise the
# __AARCH64EB__ case gets a plain register copy.
sub rev32() {
	my ($dst, $src) = @_;

	if (!$src or ("$src" eq "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	} else {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	}
}
64*e7be843bSPierre Pronchery
# Mirror image of rev32: the per-word byte reversal is emitted for the
# __AARCH64EB__ case instead.
#   $dst - destination vector register name
#   $src - optional source vector register name
# When $src is missing or equals $dst the swap is in place and nothing is
# emitted when __AARCH64EB__ is not defined; otherwise that case gets a
# plain register copy.
sub rev32_armeb() {
	my ($dst, $src) = @_;

	if (!$src or ("$src" eq "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	} else {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	}
}
85*e7be843bSPierre Pronchery
# Append a per-byte bit reversal of $src into $dst when $std is "_gb".
#   $dst - destination vector register name
#   $src - optional source vector register name; defaults to $dst
#   $std - variant selector; only the exact string "_gb" requests the rbit
# For any other $std the bits are left as they are: a distinct $src is just
# copied into $dst, and an in-place call emits nothing.
sub rbit() {
	my ($dst, $src, $std) = @_;

	# An omitted source means "operate in place". Interpolating the missing
	# $src directly would produce an operand with no register name.
	$src = $dst unless $src;

	if ($std eq "_gb") {
$code.=<<___;
			rbit $dst.16b,$src.16b
___
	} elsif ("$src" ne "$dst") {
$code.=<<___;
			mov $dst.16b,$src.16b
___
	}
}
109*e7be843bSPierre Pronchery
# Append a 4x4 transpose of 32-bit words across four vector registers.
# Arguments, in order: the four data registers (transposed in place), then
# four scratch registers that receive the intermediate zip results.
sub transpose() {
	my @dat = @_[0..3];
	my @vt = @_[4..7];

	# First interleave 32-bit words of row pairs, then interleave the
	# resulting 64-bit halves back into the data registers.
$code.=<<___;
	zip1	$vt[0].4s,$dat[0].4s,$dat[1].4s
	zip2	$vt[1].4s,$dat[0].4s,$dat[1].4s
	zip1	$vt[2].4s,$dat[2].4s,$dat[3].4s
	zip2	$vt[3].4s,$dat[2].4s,$dat[3].4s
	zip1	$dat[0].2d,$vt[0].2d,$vt[2].2d
	zip2	$dat[1].2d,$vt[0].2d,$vt[2].2d
	zip1	$dat[2].2d,$vt[1].2d,$vt[3].2d
	zip2	$dat[3].2d,$vt[1].2d,$vt[3].2d
___
}
124*e7be843bSPierre Pronchery
# sbox operations for 4-lane of words
#
# Appends code that transforms the vector register named by $dat in place:
#  1. Byte substitution through the table held in @sbox[0..15]: four 64-byte
#     tbl lookups indexed by $dat, $dat-64, $dat-128 and $dat-192, whose
#     results are then added together. This relies on tbl yielding zero for
#     an out-of-range index, so only one lookup contributes per byte.
#  2. Per 32-bit lane x: x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24),
#     where each rotate is built from a ushr/sli pair.
# Expects @sbox to have been loaded beforehand (see load_sbox).
# Clobbers @vtmp[0], @vtmp[1] and @vtmp[2].
sub sbox() {
	my $dat = shift;

$code.=<<___;
	movi	@vtmp[0].16b,#64
	movi	@vtmp[1].16b,#128
	movi	@vtmp[2].16b,#192
	sub	@vtmp[0].16b,$dat.16b,@vtmp[0].16b
	sub	@vtmp[1].16b,$dat.16b,@vtmp[1].16b
	sub	@vtmp[2].16b,$dat.16b,@vtmp[2].16b
	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add	@vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
	add	@vtmp[2].2d,@vtmp[2].2d,$dat.2d
	add	$dat.2d,@vtmp[0].2d,@vtmp[2].2d

	ushr	@vtmp[0].4s,$dat.4s,32-2
	sli	@vtmp[0].4s,$dat.4s,2
	ushr	@vtmp[2].4s,$dat.4s,32-10
	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
	sli	@vtmp[2].4s,$dat.4s,10
	eor	@vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
	ushr	@vtmp[0].4s,$dat.4s,32-18
	sli	@vtmp[0].4s,$dat.4s,18
	ushr	@vtmp[2].4s,$dat.4s,32-24
	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli	@vtmp[2].4s,$dat.4s,24
	eor	$dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}
158*e7be843bSPierre Pronchery
# sbox operation for 8-lane of words
#
# Same transformation as sbox(), applied in place to two vector registers,
# $dat and $datx:
#  1. Byte substitution through @sbox[0..15]. The three offset index vectors
#     are derived by repeatedly subtracting the single constant 64 kept in
#     @vtmp[3] (giving x-64, x-128, x-192), first for $dat, then for $datx.
#  2. Per 32-bit lane x: x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24),
#     with the instructions for the two registers interleaved. @vtmp[1]
#     accumulates the result for $dat and @vtmp[3] the one for $datx
#     (@vtmp[3] is reused once the constant 64 is no longer needed).
# Expects @sbox to have been loaded beforehand (see load_sbox).
# Clobbers @vtmp[0..3].
sub sbox_double() {
	my $dat = shift;
	my $datx = shift;

$code.=<<___;
	movi	@vtmp[3].16b,#64
	sub	@vtmp[0].16b,$dat.16b,@vtmp[3].16b
	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
	add	$dat.2d,@vtmp[2].2d,$dat.2d
	add	$dat.2d,@vtmp[1].2d,$dat.2d

	sub	@vtmp[0].16b,$datx.16b,@vtmp[3].16b
	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
	tbl	$datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
	add	$datx.2d,@vtmp[2].2d,$datx.2d
	add	$datx.2d,@vtmp[1].2d,$datx.2d

	ushr	@vtmp[0].4s,$dat.4s,32-2
	sli	@vtmp[0].4s,$dat.4s,2
	ushr	@vtmp[2].4s,$datx.4s,32-2
	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
	sli	@vtmp[2].4s,$datx.4s,2

	ushr	@vtmp[0].4s,$dat.4s,32-10
	eor	@vtmp[3].16b,@vtmp[2].16b,$datx.16b
	sli	@vtmp[0].4s,$dat.4s,10
	ushr	@vtmp[2].4s,$datx.4s,32-10
	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli	@vtmp[2].4s,$datx.4s,10

	ushr	@vtmp[0].4s,$dat.4s,32-18
	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
	sli	@vtmp[0].4s,$dat.4s,18
	ushr	@vtmp[2].4s,$datx.4s,32-18
	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli	@vtmp[2].4s,$datx.4s,18

	ushr	@vtmp[0].4s,$dat.4s,32-24
	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
	sli	@vtmp[0].4s,$dat.4s,24
	ushr	@vtmp[2].4s,$datx.4s,32-24
	eor	$dat.16b,@vtmp[0].16b,@vtmp[1].16b
	sli	@vtmp[2].4s,$datx.4s,24
	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}
217*e7be843bSPierre Pronchery
# sbox operation for one single word
#
# Appends code that transforms the 32-bit general-purpose register named by
# $word in place:
#  1. $word is inserted into lane 0 of @vtmp[0] and substituted byte-wise
#     through @sbox[0..15] with four tbl lookups (indexes x, x-64, x-128,
#     x-192); lane 0 of each result is moved back to scalar registers and
#     the four values are added together in $wtmp0.
#  2. $word = s ^ rol(s,2) ^ rol(s,10) ^ rol(s,18) ^ rol(s,24), written as
#     eor with "ror #32-n" shifted operands, where s is the sum in $wtmp0.
# Only lane 0 of @vtmp[0] is written before the lookups; the other lanes are
# whatever the register held and their results are never read.
# Expects @sbox to have been loaded beforehand (see load_sbox).
# Clobbers @vtmp[0..3], $wtmp0 and $wtmp2, so $word must not name either of
# those two scalar registers.
sub sbox_1word () {
	my $word = shift;

$code.=<<___;
	movi	@vtmp[1].16b,#64
	movi	@vtmp[2].16b,#128
	movi	@vtmp[3].16b,#192
	mov	@vtmp[0].s[0],$word

	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
	sub	@vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
	sub	@vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b

	tbl	@vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
	tbl	@vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b

	mov	$word,@vtmp[0].s[0]
	mov	$wtmp0,@vtmp[1].s[0]
	mov	$wtmp2,@vtmp[2].s[0]
	add	$wtmp0,$word,$wtmp0
	mov	$word,@vtmp[3].s[0]
	add	$wtmp0,$wtmp0,$wtmp2
	add	$wtmp0,$wtmp0,$word

	eor	$word,$wtmp0,$wtmp0,ror #32-2
	eor	$word,$word,$wtmp0,ror #32-10
	eor	$word,$word,$wtmp0,ror #32-18
	eor	$word,$word,$wtmp0,ror #32-24
___
}
251*e7be843bSPierre Pronchery
# Four SM4 rounds on a single block held in the scalar registers
# $word0..$word3.
#   argument: name of the register pointing at the round keys; the emitted
#   code reads four 32-bit keys from it (two ldp, each post-incrementing by 8).
# Each round XORs three state words with a round key, runs the result
# through sbox_1word and XORs it into the remaining state word.
# The second key pair is only loaded after round two: sbox_1word overwrites
# $wtmp0, while $wtmp1 has to survive from the first load until round two.
# Uses $tmpw, $wtmp0, $wtmp1 and $wtmp2 as scratch, plus everything
# sbox_1word clobbers.
sub sm4_1blk () {
	my ($rkp) = @_;

	# round 1
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$rkp],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$wtmp0,$word1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);

	# round 2
$code.=<<___;
	eor	$word0,$word0,$tmpw
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$word0,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);

	# round 3, preceded by the load of the next two round keys
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$rkp],8
	eor	$word1,$word1,$tmpw
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$wtmp0,$word3
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);

	# round 4
$code.=<<___;
	eor	$word2,$word2,$tmpw
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$word2,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);

$code.=<<___;
	eor	$word3,$word3,$tmpw
___
}
293*e7be843bSPierre Pronchery
# Four SM4 rounds on four 32-bit lanes at once; the state lives in the
# vector registers @data[0..3].
#   argument: name of the register pointing at the round keys; the emitted
#   code reads four 32-bit keys from it (two ldp, each post-incrementing by 8).
# Each round key is broadcast to all four lanes with dup. $rka keeps the XOR
# of the two state words shared by consecutive rounds so that the second
# round of each pair needs one XOR less.
# Uses $rk0, $rk1, $rka, $wtmp0 and $wtmp1 as scratch, plus everything sbox
# clobbers.
sub sm4_4blks () {
	my ($rkp) = @_;

	# rounds 1 and 2 use the first key pair
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$rkp],8
	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rk0.16b,@data[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);

$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);

	# rounds 3 and 4 use the second key pair
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$rkp],8
	eor	@data[1].16b,@data[1].16b,$rk1.16b

	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rk0.16b,@data[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);

$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);

$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk1.16b
___
}
342*e7be843bSPierre Pronchery
# Four SM4 rounds on eight 32-bit lanes at once; the state lives in the
# vector registers @data[0..3] and @datax[0..3].
#   argument: name of the register pointing at the round keys; the emitted
#   code reads four 32-bit keys from it (two ldp, each post-incrementing by 8).
# Both register sets go through the same steps, with sbox_double doing the
# substitution for the pair. $rka and $rkb keep the XOR of the two state
# words shared by consecutive rounds, for @data and @datax respectively.
# Uses $rk0, $rk1, $rka, $rkb, @vtmp[0], @vtmp[1], $wtmp0 and $wtmp1 as
# scratch, plus everything sbox_double clobbers.
sub sm4_8blks () {
	my ($rkp) = @_;

	# round 1
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$rkp],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rkb.16b,@datax[2].16b,@datax[3].16b
	eor	@vtmp[0].16b,@data[1].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);

	# round 2
$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b
	eor	@datax[0].16b,@datax[0].16b,$rk1.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rkb.16b,$rkb.16b,@datax[0].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);

	# round 3, preceded by the load of the next two round keys
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$rkp],8
	eor	@data[1].16b,@data[1].16b,$rk0.16b
	eor	@datax[1].16b,@datax[1].16b,$rk1.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rkb.16b,@datax[0].16b,@datax[1].16b
	eor	@vtmp[0].16b,@data[3].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);

	# round 4
$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b
	eor	@datax[2].16b,@datax[2].16b,$rk1.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rkb.16b,$rkb.16b,@datax[2].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);

$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk0.16b
	eor	@datax[3].16b,@datax[3].16b,$rk1.16b
___
}
404*e7be843bSPierre Pronchery
# Run one block through all rounds using the scalar path, without the final
# per-word byte swap.
#   argument: name of the vector register holding the block; it also
#   receives the result.
# The four words are moved to $word0..$word3, sm4_1blk (four rounds) is
# executed eight times in a loop over the keys at $rks, and the words are
# written back in reverse order (word3 into lane 0 ... word0 into lane 3).
# Uses $ptr as the running key pointer, $counter as the loop counter and the
# local numeric label 10.
sub encrypt_1blk_norev() {
	my ($blk) = @_;

$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
	mov	$word0,$blk.s[0]
	mov	$word1,$blk.s[1]
	mov	$word2,$blk.s[2]
	mov	$word3,$blk.s[3]
10:
___
	&sm4_1blk($ptr);

$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
	mov	$blk.s[0],$word3
	mov	$blk.s[1],$word2
	mov	$blk.s[2],$word1
	mov	$blk.s[3],$word0
___
}
427*e7be843bSPierre Pronchery
# Single-block path: encrypt_1blk_norev followed by an in-place rev32 of the
# same vector register.
#   argument: name of the vector register holding the block (in and out).
sub encrypt_1blk() {
	my ($blk) = @_;

	&encrypt_1blk_norev($blk);
	&rev32($blk,$blk);
}
434*e7be843bSPierre Pronchery
# Run the four lanes held in @data[0..3] through all rounds: eight loop
# iterations of sm4_4blks (four rounds each) over the keys at $rks.
# The result is delivered in @vtmp, in reverse order and passed through
# rev32: @vtmp[3] <- @data[0], @vtmp[2] <- @data[1], @vtmp[1] <- @data[2],
# @vtmp[0] <- @data[3].
# Uses $ptr as the running key pointer, $counter as the loop counter and the
# local numeric label 10.
sub encrypt_4blks() {
$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_4blks($ptr);

$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	for my $i (0..3) {
		&rev32($vtmp[3-$i],$data[$i]);
	}
}
451*e7be843bSPierre Pronchery
# Run the eight lanes held in @data[0..3] and @datax[0..3] through all
# rounds: eight loop iterations of sm4_8blks (four rounds each) over the keys
# at $rks.
# Results are delivered in reverse register order and passed through rev32:
#   @vtmp[3..0] <- @data[0..3]   (first set)
#   @data[3..0] <- @datax[0..3]  (second set)
# The first set must be moved out before @data is overwritten by the second.
# Uses $ptr as the running key pointer, $counter as the loop counter and the
# local numeric label 10.
sub encrypt_8blks() {
$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_8blks($ptr);

$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	for my $i (0..3) {
		&rev32($vtmp[3-$i],$data[$i]);
	}
	for my $i (0..3) {
		&rev32($data[3-$i],$datax[$i]);
	}
}
472*e7be843bSPierre Pronchery
# Append code that loads the 256-byte table at .Lsbox into the sixteen vector
# registers @sbox[0..15], 64 bytes per ld1.
# Takes no arguments. Overwrites $ptr, which ends up pointing at the last
# 64-byte quarter of the table (the final ld1 does not post-increment).
sub load_sbox () {
$code.=<<___;
	adrp	$ptr,.Lsbox
	add	$ptr,$ptr,#:lo12:.Lsbox
	ld1	{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
	ld1	{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
	ld1	{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
	ld1	{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
___
}
485*e7be843bSPierre Pronchery
486*e7be843bSPierre Pronchery
# Append code that assembles a vector register from two 64-bit
# general-purpose registers.
#   1st argument: register copied into doubleword lane 0
#   2nd argument: register copied into doubleword lane 1
#   3rd argument: destination vector register
# The vector is then passed through rev32_armeb in place, which only emits
# an instruction for the __AARCH64EB__ case.
sub mov_reg_to_vec() {
	my ($lo, $hi, $vec) = @_;

$code.=<<___;
	mov $vec.d[0],$lo
	mov $vec.d[1],$hi
___
	&rev32_armeb($vec,$vec);
}
497*e7be843bSPierre Pronchery
# Append code that splits a vector register into two 64-bit general-purpose
# registers.
#   1st argument: source vector register
#   2nd argument: register receiving doubleword lane 0
#   3rd argument: register receiving doubleword lane 1
# Unlike mov_reg_to_vec, no byte swap is applied here.
sub mov_vec_to_reg() {
	my ($vec, $lo, $hi) = @_;

$code.=<<___;
	mov $lo,$vec.d[0]
	mov $hi,$vec.d[1]
___
}
507*e7be843bSPierre Pronchery
# Append scalar code that doubles a 128-bit value held in two 64-bit
# registers, folding the bit shifted out at the top back in as 0x87.
#   1st/2nd argument: low/high 64-bit source registers
#   3rd/4th argument: low/high 64-bit destination registers
# Emitted computation:
#   high_out = (high_in << 1) | (low_in >> 63)
#   low_out  = (low_in << 1) ^ (0x87 if bit 63 of high_in is set, else 0)
# The high destination is written before the low source is read for the last
# time, so it must not name the same register as the low source.
# Uses $wtmp0, $wtmp1/$xtmp1 and $wtmp2/$xtmp2 as scratch.
sub compute_tweak() {
	my ($lo_in, $hi_in, $lo_out, $hi_out) = @_;

$code.=<<___;
	mov $wtmp0,0x87
	extr	$xtmp2,$hi_in,$hi_in,#32
	extr	$hi_out,$hi_in,$lo_in,#63
	and	$wtmp1,$wtmp0,$wtmp2,asr#31
	eor	$lo_out,$xtmp1,$lo_in,lsl#1
___
}
521*e7be843bSPierre Pronchery
# Vector counterpart of compute_tweak: append code that derives the next
# tweak from the one in a vector register.
#   1st argument: source vector register
#   2nd argument: destination vector register
#   3rd argument: variant selector handed to rbit ("_gb" requests the
#                 per-byte bit reversal before and after the computation)
# Steps: copy/bit-reverse the source into @vtmp[2]; shift every byte left by
# one; rotate the bytes by one position (ext #15) and take each byte's top
# bit as the carry into its neighbour; multiply the carries by the
# .Lxts_magic bytes; XOR the result into the shifted value.
# The constant is loaded through @qtmp[0] and read back as @vtmp[0], which
# carry the same register number. Clobbers @vtmp[0..2] and $ptr.
sub compute_tweak_vec() {
	my ($tweak_in, $tweak_out, $std) = @_;

	&rbit($vtmp[2],$tweak_in,$std);
$code.=<<___;
	adrp $ptr,.Lxts_magic
	ldr  @qtmp[0], [$ptr, #:lo12:.Lxts_magic]
	shl  $tweak_out.16b, @vtmp[2].16b, #1
	ext  @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
	ushr @vtmp[1].16b, @vtmp[1].16b, #7
	mul  @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
	eor  $tweak_out.16b, $tweak_out.16b, @vtmp[1].16b
___
	&rbit($tweak_out,$tweak_out,$std);
}
538*e7be843bSPierre Pronchery
# Start of the generated assembly. This is a plain assignment (not .=), so
# it initialises $code. It emits the file header and the constant pool
# _${prefix}_consts in .rodata:
#   .Lsbox      256-byte substitution table (16 rows of 16 bytes), loaded
#               into @sbox by load_sbox
#   .Lck        32 32-bit words, read by the key-setup code with a
#               post-incremented 4-byte load
#   .Lfk        128-bit constant XORed into the user key by the key-setup code
#   .Lshuffles  128-bit constant loaded into a vector register by the
#               key-setup code
#   .Lxts_magic per-byte multipliers used by compute_tweak_vec: 0x87 in the
#               least significant byte of the first quadword, 0x01 elsewhere
# .previous then switches back to the section that was active before .rodata.
$code=<<___;
#include "arm_arch.h"
.arch	armv8-a
.text

.rodata
.type	_${prefix}_consts,%object
.align	7
_${prefix}_consts:
.Lsbox:
	.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
	.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
	.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
	.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
	.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
	.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
	.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
	.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
	.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
	.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
	.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
	.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
	.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
	.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
	.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
	.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
	.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
	.quad 0x0101010101010187,0x0101010101010101

.size	_${prefix}_consts,.-_${prefix}_consts

.previous

___
586*e7be843bSPierre Pronchery
# Emits _vpsm4_set_key, the internal key-expansion routine called by the
# public set_encrypt_key/set_decrypt_key wrappers.
#
# Register contract of the generated code:
#   x0 - address of the 16-byte user key
#   x1 - address of the round-key output buffer
#   w2 - non-zero: round keys are written in ascending order;
#        zero: the pointer is first advanced by 124 bytes and the round
#        keys are written in descending order.
{{{
my ($ukey,$rkptr,$enc)=("x0","x1","w2");
my ($cptr,$nrounds,$wtmp,$rkw)=("x5","x6","w7","w8");
my ($vk,$vfk,$vshuf)=("v5","v6","v7");

# Prologue: fetch the user key.
$code.=<<___;
.type	_vpsm4_set_key,%function
.align	4
_vpsm4_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$vk.4s},[$ukey]
___

&load_sbox();
&rev32($vk,$vk);

# Load the .Lshuffles permutation and the .Lfk constant, fold .Lfk into
# the key, and prepare a 32-iteration loop reading from .Lck.  The byte
# vector of 64s is used to step the table index between the four
# 64-byte lookups below.
$code.=<<___;
	adrp	$cptr,.Lshuffles
	add	$cptr,$cptr,#:lo12:.Lshuffles
	ld1	{$vshuf.2d},[$cptr]
	adrp	$cptr,.Lfk
	add	$cptr,$cptr,#:lo12:.Lfk
	ld1	{$vfk.2d},[$cptr]
	eor	$vk.16b,$vk.16b,$vfk.16b
	mov	$nrounds,#32
	adrp	$cptr,.Lck
	add	$cptr,$cptr,#:lo12:.Lck
	movi	@vtmp[0].16b,#64
	cbnz	$enc,1f
	add	$rkptr,$rkptr,124
1:
___

# Round input: the next .Lck word XORed with key words 1..3.
$code.=<<___;
	mov	$wtmp,$vk.s[1]
	ldr	$rkw,[$cptr],#4
	eor	$rkw,$rkw,$wtmp
	mov	$wtmp,$vk.s[2]
	eor	$rkw,$rkw,$wtmp
	mov	$wtmp,$vk.s[3]
	eor	$rkw,$rkw,$wtmp
___

# Byte substitution through the sixteen @sbox registers (one tbl plus
# three tbx, 64 table bytes each), then x ^ ror(x,19) ^ ror(x,9), and
# finally XOR with key word 0 to form the new round key.
$code.=<<___;
	// sbox lookup
	mov	@data[0].s[0],$rkw
	tbl	@vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx	@vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
	mov	$wtmp,@vtmp[1].s[0]
	eor	$rkw,$wtmp,$wtmp,ror #19
	eor	$rkw,$rkw,$wtmp,ror #9
	mov	$wtmp,$vk.s[0]
	eor	$rkw,$rkw,$wtmp
	mov	$vk.s[0],$rkw
___

# Store the round key (forwards or backwards depending on w2), permute
# the key words with the .Lshuffles pattern and loop.
$code.=<<___;
	cbz	$enc,2f
	str	$rkw,[$rkptr],#4
	b	3f
2:
	str	$rkw,[$rkptr],#-4
3:
	tbl	$vk.16b,{$vk.16b},$vshuf.16b
	subs	$nrounds,$nrounds,#1
	b.ne	1b
	ret
.size	_vpsm4_set_key,.-_vpsm4_set_key
___
}}}
650*e7be843bSPierre Pronchery
651*e7be843bSPierre Pronchery
# Emits _vpsm4_enc_4blks: a local, callable wrapper whose body is whatever
# encrypt_4blks() generates, followed by a plain return.  Callers in this
# file reach it with "bl _vpsm4_enc_4blks".
{{{
my $fn="_vpsm4_enc_4blks";

$code.=<<"___";
.type	$fn,%function
.align	4
${fn}:
	AARCH64_VALID_CALL_TARGET
___

&encrypt_4blks();

$code.=<<"___";
	ret
.size	$fn,.-$fn
___
}}}
665*e7be843bSPierre Pronchery
# Emits _vpsm4_enc_8blks: a local, callable wrapper whose body is whatever
# encrypt_8blks() generates, followed by a plain return.  Callers in this
# file reach it with "bl _vpsm4_enc_8blks".
{{{
my $fn="_vpsm4_enc_8blks";

$code.=<<"___";
.type	$fn,%function
.align	4
${fn}:
	AARCH64_VALID_CALL_TARGET
___

&encrypt_8blks();

$code.=<<"___";
	ret
.size	$fn,.-$fn
___
}}}
679*e7be843bSPierre Pronchery
680*e7be843bSPierre Pronchery
# Emits the public ${prefix}_set_encrypt_key entry point.
#
# The generated function leaves x0 and x1 untouched, sets w2 to 1 and
# calls _vpsm4_set_key, which reads the user key through x0 and writes
# the round keys through x1 in ascending order when w2 is non-zero.
# x29/x30 are saved around the call because "bl" overwrites the link
# register.
#
# No Perl-side register aliases are declared here: the template below
# does not interpolate any.
{{{
$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}
698*e7be843bSPierre Pronchery
# Emits the public ${prefix}_set_decrypt_key entry point.
#
# The generated function leaves x0 and x1 untouched, sets w2 to 0 and
# calls _vpsm4_set_key, which reads the user key through x0 and, when w2
# is zero, writes the round keys through x1 in descending order (starting
# 124 bytes into the buffer).  x29/x30 are saved around the call because
# "bl" overwrites the link register.
#
# No Perl-side register aliases are declared here: the template below
# does not interpolate any.
{{{
$code.=<<___;
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
716*e7be843bSPierre Pronchery
{{{
# gen_block($dir)
#
# Appends the single-block routine ${prefix}_${dir}crypt to $code.
# $dir is the name fragment placed in front of "crypt" ("en" or "de" at
# the call sites below); it only affects the emitted symbol name, the
# instruction sequence is the same for both.
#
# Generated function: loads 16 bytes from x0, copies the round-key
# pointer from x2 into the file-level $rks register, runs the
# encrypt_1blk() sequence on the block and stores 16 bytes to x1.
#
# The sub takes one argument, so it is declared without the former empty
# "()" prototype (call sites use the "&" form and are unaffected).
sub gen_block {
	my $dir = shift;
	my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{@data[0].4s},[$inp]
___
	&load_sbox();
	&rev32(@data[0],@data[0]);
	# $rk is x2, the incoming round-key argument register.
$code.=<<___;
	mov	$rks,$rk
___
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
745*e7be843bSPierre Pronchery
# Emits ${prefix}_ecb_encrypt.
#
# The generated function converts the length in x2 into a block count
# (shift right by 4), saves d8-d15 and x29/x30 on the stack, and then
# processes the input in groups of 8 blocks, then 4, then a tail of 1, 2
# or 3 blocks.  Multi-block groups are loaded with ld4 and handed to
# _vpsm4_enc_8blks/_vpsm4_enc_4blks; a single trailing block goes through
# the encrypt_1blk() sequence.
#
# Register names ($inp, $outp, $blocks) and the @data/@datax/@vtmp
# register lists come from file-level definitions outside this block.
{{{
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
___
	&load_sbox();

	# --- 8 blocks per iteration ---
$code.=<<___;
.Lecb_8_blocks_process:
	cmp	$blocks,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	ld4	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	foreach my $reg (@data[0..3],@datax[0..3]) {
		&rev32($reg,$reg);
	}
$code.=<<___;
	bl	_vpsm4_enc_8blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	# --- one group of 4 blocks ---
	foreach my $reg (@data[0..3]) {
		&rev32($reg,$reg);
	}
$code.=<<___;
	bl	_vpsm4_enc_4blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp]
___
	# --- exactly one block left ---
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// process last 2 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
	cmp	$blocks,#2
	b.gt	1f
___
	# --- two blocks left: only lanes 0 and 1 are loaded and stored ---
	foreach my $reg (@data[0..3]) {
		&rev32($reg,$reg);
	}
$code.=<<___;
	bl	_vpsm4_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp]
	b	100f
1:	// process last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
	# --- three blocks left: lane 2 is loaded in addition to 0 and 1 ---
	foreach my $reg (@data[0..3]) {
		&rev32($reg,$reg);
	}
	# Common exit: restore the registers saved in the prologue.
$code.=<<___;
	bl	_vpsm4_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
850*e7be843bSPierre Pronchery
# Emits ${prefix}_cbc_encrypt.
#
# Generated function, as far as this block shows:
#   x2 - length, shifted right by 4 on entry
#   x4 - address of the 16-byte IV, read on entry and written back
#   w5 - zero selects the decryption path (.Ldec), non-zero encryption
# $inp, $outp, $blocks, $ptr and the @data/@datax/@vtmp/@vtmpx register
# lists come from file-level definitions outside this block.
#
# Encryption is inherently serial: each block is XORed with the previous
# ciphertext before being encrypted, four at a time and then one at a
# time.  This path uses no "bl" and saves no registers.
# Decryption saves d8-d15 and x29/x30, then handles 8, 4 and finally
# 1-3 blocks through _vpsm4_enc_8blks/_vpsm4_enc_4blks, XORing each
# result with the preceding ciphertext block (or the IV).
{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0="v3";
my $ivec1="v15";

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	$len,$len,4
___
	&load_sbox();

	# ---------------- encryption: 4 chained blocks ----------------
$code.=<<___;
	cbz	$enc,.Ldec
	ld1	{$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	eor	@data[0].16b,@data[0].16b,$ivec0.16b
___
	# The emission order 1,0,2,3 is deliberate and must be kept.
	foreach my $i (1,0,2,3) {
		&rev32($data[$i],$data[$i]);
	}
	&encrypt_1blk_norev($data[0]);

	# Chain blocks 1..3: XOR with the previous result, encrypt, and only
	# then byte-swap the previous block back for the final store.
	foreach my $i (1..3) {
		my ($cur,$prev)=($data[$i],$data[$i-1]);
$code.=<<___;
	eor	$cur.16b,$cur.16b,$prev.16b
___
		&encrypt_1blk_norev($data[$i]);
		&rev32($data[$i-1],$data[$i-1]);
	}
	&rev32($data[3],$data[3]);

	# Store the four blocks, keep the last one as the running IV, then
	# fall into the one-block-at-a-time tail loop.
$code.=<<___;
	orr	$ivec0.16b,@data[3].16b,@data[3].16b
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.ne	.Lcbc_4_blocks_enc
	b	2f
1:
	subs	$blocks,$blocks,#1
	b.lt	2f
	ld1	{@data[0].4s},[$inp],#16
	eor	$ivec0.16b,$ivec0.16b,@data[0].16b
___
	&rev32($ivec0,$ivec0);
	&encrypt_1blk($ivec0);

	# ---------------- decryption: 8 blocks per iteration ----------------
$code.=<<___;
	st1	{$ivec0.4s},[$outp],#16
	b	1b
2:
	// save back IV
	st1	{$ivec0.4s},[$ivp]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp	$blocks,#8
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
	add	$ptr,$inp,#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
	foreach my $reg (@data[0..3],@datax[0..3]) {
		&rev32($reg,$reg);
	}
$code.=<<___;
	bl	_vpsm4_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);

	# The ciphertext is re-read with ld1 (advancing $inp this time) to
	# serve as the XOR operands; the last of the eight is stored as IV.
$code.=<<___;
	ld1	{$ivec1.4s},[$ivp]
	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	// note ivec1 and vtmpx[3] are reusing the same register
	// care needs to be taken to avoid conflict
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	ld1	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor	@vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
	eor	@vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
	// save back IV
	st1	{$vtmpx[3].4s}, [$ivp]
	eor	@data[0].16b,@data[0].16b,$datax[3].16b
	eor	@data[1].16b,@data[1].16b,@vtmpx[0].16b
	eor	@data[2].16b,@data[2].16b,@vtmpx[1].16b
	eor	@data[3].16b,$data[3].16b,@vtmpx[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lcbc_8_blocks_dec
	b.eq	100f
1:
	ld1	{$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
	# ---------------- decryption: 4 blocks ----------------
	foreach my $reg (@data[0..3]) {
		&rev32($reg,$reg);
	}
$code.=<<___;
	bl	_vpsm4_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	orr	$ivec1.16b,@data[3].16b,@data[3].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	eor	@vtmp[3].16b,$vtmp[3].16b,@data[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.gt	.Lcbc_4_blocks_dec
	// save back IV
	st1	{@data[3].4s}, [$ivp]
	b	100f
1:	// last block
	subs	$blocks,$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
	// save back IV
	st1	{$data[0].4s}, [$ivp]
___
	# ---------------- decryption: single trailing block ----------------
	# The byte-swapped copy goes to datax[0]; data[0] is left untouched
	# by this call.
	&rev32($datax[0],$data[0]);
	&encrypt_1blk($datax[0]);
$code.=<<___;
	eor	@datax[0].16b,@datax[0].16b,$ivec1.16b
	st1	{@datax[0].4s},[$outp],#16
	b	100f
1:	// last two blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
	add	$ptr,$inp,#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
	subs	$blocks,$blocks,1
	b.gt	1f
___
	# ---------------- decryption: two trailing blocks ----------------
	foreach my $reg (@data[0..3]) {
		&rev32($reg,$reg);
	}
$code.=<<___;
	bl	_vpsm4_enc_4blks
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save back IV
	st1	{@data[1].4s}, [$ivp]
	b	100f
1:	// last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
	# ---------------- decryption: three trailing blocks ----------------
	foreach my $reg (@data[0..3]) {
		&rev32($reg,$reg);
	}
$code.=<<___;
	bl	_vpsm4_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&transpose(@vtmp,@datax);

	# Common exit of the decryption path: restore the saved registers.
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save back IV
	st1	{@data[2].4s}, [$ivp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
1062*e7be843bSPierre Pronchery
1063*e7be843bSPierre Pronchery{{{
1064*e7be843bSPierre Proncherymy ($ivp)=("x4");
1065*e7be843bSPierre Proncherymy ($ctr)=("w5");
1066*e7be843bSPierre Proncherymy $ivec=("v3");
1067*e7be843bSPierre Pronchery
1068*e7be843bSPierre Pronchery$code.=<<___;
1069*e7be843bSPierre Pronchery.globl	${prefix}_ctr32_encrypt_blocks
1070*e7be843bSPierre Pronchery.type	${prefix}_ctr32_encrypt_blocks,%function
1071*e7be843bSPierre Pronchery.align	5
1072*e7be843bSPierre Pronchery${prefix}_ctr32_encrypt_blocks:
1073*e7be843bSPierre Pronchery	AARCH64_VALID_CALL_TARGET
1074*e7be843bSPierre Pronchery	ld1	{$ivec.4s},[$ivp]
1075*e7be843bSPierre Pronchery___
1076*e7be843bSPierre Pronchery	&rev32($ivec,$ivec);
1077*e7be843bSPierre Pronchery	&load_sbox();
1078*e7be843bSPierre Pronchery$code.=<<___;
1079*e7be843bSPierre Pronchery	cmp	$blocks,#1
1080*e7be843bSPierre Pronchery	b.ne	1f
1081*e7be843bSPierre Pronchery	// fast processing for one single block without
1082*e7be843bSPierre Pronchery	// context saving overhead
1083*e7be843bSPierre Pronchery___
1084*e7be843bSPierre Pronchery	&encrypt_1blk($ivec);
1085*e7be843bSPierre Pronchery$code.=<<___;
1086*e7be843bSPierre Pronchery	ld1	{@data[0].4s},[$inp]
1087*e7be843bSPierre Pronchery	eor	@data[0].16b,@data[0].16b,$ivec.16b
1088*e7be843bSPierre Pronchery	st1	{@data[0].4s},[$outp]
1089*e7be843bSPierre Pronchery	ret
1090*e7be843bSPierre Pronchery1:
1091*e7be843bSPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
1092*e7be843bSPierre Pronchery	stp	d8,d9,[sp,#-80]!
1093*e7be843bSPierre Pronchery	stp	d10,d11,[sp,#16]
1094*e7be843bSPierre Pronchery	stp	d12,d13,[sp,#32]
1095*e7be843bSPierre Pronchery	stp	d14,d15,[sp,#48]
1096*e7be843bSPierre Pronchery	stp	x29,x30,[sp,#64]
1097*e7be843bSPierre Pronchery	mov	$word0,$ivec.s[0]
1098*e7be843bSPierre Pronchery	mov	$word1,$ivec.s[1]
1099*e7be843bSPierre Pronchery	mov	$word2,$ivec.s[2]
1100*e7be843bSPierre Pronchery	mov	$ctr,$ivec.s[3]
1101*e7be843bSPierre Pronchery.Lctr32_4_blocks_process:
1102*e7be843bSPierre Pronchery	cmp	$blocks,#4
1103*e7be843bSPierre Pronchery	b.lt	1f
1104*e7be843bSPierre Pronchery	dup	@data[0].4s,$word0
1105*e7be843bSPierre Pronchery	dup	@data[1].4s,$word1
1106*e7be843bSPierre Pronchery	dup	@data[2].4s,$word2
1107*e7be843bSPierre Pronchery	mov	@data[3].s[0],$ctr
1108*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1109*e7be843bSPierre Pronchery	mov	$data[3].s[1],$ctr
1110*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1111*e7be843bSPierre Pronchery	mov	@data[3].s[2],$ctr
1112*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1113*e7be843bSPierre Pronchery	mov	@data[3].s[3],$ctr
1114*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1115*e7be843bSPierre Pronchery	cmp	$blocks,#8
1116*e7be843bSPierre Pronchery	b.ge	.Lctr32_8_blocks_process
1117*e7be843bSPierre Pronchery	bl	_vpsm4_enc_4blks
1118*e7be843bSPierre Pronchery	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1119*e7be843bSPierre Pronchery	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1120*e7be843bSPierre Pronchery	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1121*e7be843bSPierre Pronchery	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1122*e7be843bSPierre Pronchery	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1123*e7be843bSPierre Pronchery	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1124*e7be843bSPierre Pronchery	subs	$blocks,$blocks,#4
1125*e7be843bSPierre Pronchery	b.ne	.Lctr32_4_blocks_process
1126*e7be843bSPierre Pronchery	b	100f
1127*e7be843bSPierre Pronchery.Lctr32_8_blocks_process:
1128*e7be843bSPierre Pronchery	dup	@datax[0].4s,$word0
1129*e7be843bSPierre Pronchery	dup	@datax[1].4s,$word1
1130*e7be843bSPierre Pronchery	dup	@datax[2].4s,$word2
1131*e7be843bSPierre Pronchery	mov	@datax[3].s[0],$ctr
1132*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1133*e7be843bSPierre Pronchery	mov	$datax[3].s[1],$ctr
1134*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1135*e7be843bSPierre Pronchery	mov	@datax[3].s[2],$ctr
1136*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1137*e7be843bSPierre Pronchery	mov	@datax[3].s[3],$ctr
1138*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1139*e7be843bSPierre Pronchery	bl	_vpsm4_enc_8blks
1140*e7be843bSPierre Pronchery	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1141*e7be843bSPierre Pronchery	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1142*e7be843bSPierre Pronchery	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1143*e7be843bSPierre Pronchery	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1144*e7be843bSPierre Pronchery	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1145*e7be843bSPierre Pronchery	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1146*e7be843bSPierre Pronchery	eor	@data[0].16b,@data[0].16b,@datax[0].16b
1147*e7be843bSPierre Pronchery	eor	@data[1].16b,@data[1].16b,@datax[1].16b
1148*e7be843bSPierre Pronchery	eor	@data[2].16b,@data[2].16b,@datax[2].16b
1149*e7be843bSPierre Pronchery	eor	@data[3].16b,@data[3].16b,@datax[3].16b
1150*e7be843bSPierre Pronchery	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1151*e7be843bSPierre Pronchery	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1152*e7be843bSPierre Pronchery	subs	$blocks,$blocks,#8
1153*e7be843bSPierre Pronchery	b.ne	.Lctr32_4_blocks_process
1154*e7be843bSPierre Pronchery	b	100f
1155*e7be843bSPierre Pronchery1:	// last block processing
1156*e7be843bSPierre Pronchery	subs	$blocks,$blocks,#1
1157*e7be843bSPierre Pronchery	b.lt	100f
1158*e7be843bSPierre Pronchery	b.gt	1f
1159*e7be843bSPierre Pronchery	mov	$ivec.s[0],$word0
1160*e7be843bSPierre Pronchery	mov	$ivec.s[1],$word1
1161*e7be843bSPierre Pronchery	mov	$ivec.s[2],$word2
1162*e7be843bSPierre Pronchery	mov	$ivec.s[3],$ctr
1163*e7be843bSPierre Pronchery___
1164*e7be843bSPierre Pronchery	&encrypt_1blk($ivec);
1165*e7be843bSPierre Pronchery$code.=<<___;
1166*e7be843bSPierre Pronchery	ld1	{@data[0].4s},[$inp]
1167*e7be843bSPierre Pronchery	eor	@data[0].16b,@data[0].16b,$ivec.16b
1168*e7be843bSPierre Pronchery	st1	{@data[0].4s},[$outp]
1169*e7be843bSPierre Pronchery	b	100f
1170*e7be843bSPierre Pronchery1:	// last 2 blocks processing
1171*e7be843bSPierre Pronchery	dup	@data[0].4s,$word0
1172*e7be843bSPierre Pronchery	dup	@data[1].4s,$word1
1173*e7be843bSPierre Pronchery	dup	@data[2].4s,$word2
1174*e7be843bSPierre Pronchery	mov	@data[3].s[0],$ctr
1175*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1176*e7be843bSPierre Pronchery	mov	@data[3].s[1],$ctr
1177*e7be843bSPierre Pronchery	subs	$blocks,$blocks,#1
1178*e7be843bSPierre Pronchery	b.ne	1f
1179*e7be843bSPierre Pronchery	bl	_vpsm4_enc_4blks
1180*e7be843bSPierre Pronchery	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1181*e7be843bSPierre Pronchery	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1182*e7be843bSPierre Pronchery	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1183*e7be843bSPierre Pronchery	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1184*e7be843bSPierre Pronchery	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1185*e7be843bSPierre Pronchery	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1186*e7be843bSPierre Pronchery	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1187*e7be843bSPierre Pronchery	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1188*e7be843bSPierre Pronchery	b	100f
1189*e7be843bSPierre Pronchery1:	// last 3 blocks processing
1190*e7be843bSPierre Pronchery	add	$ctr,$ctr,#1
1191*e7be843bSPierre Pronchery	mov	@data[3].s[2],$ctr
1192*e7be843bSPierre Pronchery	bl	_vpsm4_enc_4blks
1193*e7be843bSPierre Pronchery	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1194*e7be843bSPierre Pronchery	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1195*e7be843bSPierre Pronchery	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1196*e7be843bSPierre Pronchery	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1197*e7be843bSPierre Pronchery	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1198*e7be843bSPierre Pronchery	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1199*e7be843bSPierre Pronchery	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1200*e7be843bSPierre Pronchery	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1201*e7be843bSPierre Pronchery	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1202*e7be843bSPierre Pronchery	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1203*e7be843bSPierre Pronchery100:
1204*e7be843bSPierre Pronchery	ldp	d10,d11,[sp,#16]
1205*e7be843bSPierre Pronchery	ldp	d12,d13,[sp,#32]
1206*e7be843bSPierre Pronchery	ldp	d14,d15,[sp,#48]
1207*e7be843bSPierre Pronchery	ldp	x29,x30,[sp,#64]
1208*e7be843bSPierre Pronchery	ldp	d8,d9,[sp],#80
1209*e7be843bSPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
1210*e7be843bSPierre Pronchery	ret
1211*e7be843bSPierre Pronchery.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1212*e7be843bSPierre Pronchery___
1213*e7be843bSPierre Pronchery}}}
1214*e7be843bSPierre Pronchery
1215*e7be843bSPierre Pronchery{{{
# Register names used by the XTS code that gen_xts_cipher() emits.
#
# Several names share one register:
#   - $blocks and $len are both x2 (the byte length is turned into a block
#     count in place);
#   - $rks1/$rks2 are x26/x27, which are also @twx[14]/@twx[15];
#   - $lastBlk is x26 as well.
# NOTE(review): @twx spans x12..x27 and therefore includes x18, which AAPCS64
# reserves as the platform register on some targets, and $remain is x29, the
# frame pointer -- confirm both are acceptable for every supported platform.
my $blocks  = "x2";
my $len     = "x2";
my $ivp     = "x5";
my @twx     = map { "x$_" } 12 .. 27;
my $rks1    = "x26";
my $rks2    = "x27";
my $lastBlk = "x26";
my $enc     = "w28";
my $remain  = "x29";

# The tweak vectors reuse the registers of the second data bank.
my @tweak = @datax;
1225*e7be843bSPierre Pronchery
# gen_xts_cipher($std) appends the routine ${prefix}_xts_encrypt${std} to $code.
#
# $std is spliced into the symbol name and into every local label, and is
# passed on to rbit() and compute_tweak_vec(), which are defined elsewhere in
# this file (presumably they select the tweak bit order of the variant --
# confirm against their definitions).
#
# Registers read by the generated routine, as visible below:
#   x2  length in bytes
#   x3  round keys used for the data blocks   (copied to $rks1)
#   x4  round keys used for the initial tweak (copied to $rks2)
#   x5  pointer to the 16-byte tweak buffer; the routine also stores the last
#       tweak it used there so the tail code can reload it
#   w6  direction flag, 1 = encrypt, 0 = decrypt
# $inp, $outp, $rks, $wtmp0, $wtmp1 and the vector register arrays come from
# file scope.
#
# The sub is declared with an empty prototype although it takes an argument,
# so it has to be invoked with a leading '&'.
sub gen_xts_cipher() {
	my $std = shift;
$code.=<<___;
.globl	${prefix}_xts_encrypt${std}
.type	${prefix}_xts_encrypt${std},%function
.align	5
${prefix}_xts_encrypt${std}:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	$rks1,x3
	mov	$rks2,x4
	mov	$enc,w6
	ld1	{@tweak[0].4s}, [$ivp]
	mov	$rks,$rks2
___
	# Encrypt the initial tweak with the second key, then switch $rks to the
	# first key for all data blocks.
	&load_sbox();
	&rev32(@tweak[0],@tweak[0]);
	&encrypt_1blk(@tweak[0]);
$code.=<<___;
	mov	$rks,$rks1
	and	$remain,$len,#0x0F
	// convert length into blocks
	lsr	$blocks,$len,4
	cmp	$blocks,#1
	b.lt .return${std}

	cmp $remain,0
	// If the encryption/decryption Length is N times of 16,
	// the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	b.eq .xts_encrypt_blocks${std}

	// If the encryption/decryption length is not N times of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	subs $blocks,$blocks,#1
	b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
	# Move the first tweak into general registers and derive the next seven
	# from it, so @twx holds eight consecutive tweaks as register pairs.
	&rbit(@tweak[0],@tweak[0],$std);
	&rev32_armeb(@tweak[0],@tweak[0]);
	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
	cmp	$blocks,#8
	b.lt	.Lxts_4_blocks_process${std}
___
	# Eight-block path: materialise the eight tweaks as vectors and XOR them
	# into the input blocks.
	&mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
	&mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
	&mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
	&mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
	&mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
	&mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
	&mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
	&mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
$code.=<<___;
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@vtmp[0],@vtmp[0],$std);
	&rbit(@vtmp[1],@vtmp[1],$std);
	&rbit(@vtmp[2],@vtmp[2],$std);
	&rbit(@vtmp[3],@vtmp[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @vtmp[0].16b
	eor @data[1].16b, @data[1].16b, @vtmp[1].16b
	eor @data[2].16b, @data[2].16b, @vtmp[2].16b
	eor @data[3].16b, @data[3].16b, @vtmp[3].16b
	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rbit(@vtmpx[0],@vtmpx[0],$std);
	&rbit(@vtmpx[1],@vtmpx[1],$std);
	&rbit(@vtmpx[2],@vtmpx[2],$std);
	&rbit(@vtmpx[3],@vtmpx[3],$std);
$code.=<<___;
	eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
	eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
	eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
	eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
	&transpose(@data,@vtmp);
	&transpose(@datax,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);

	# The vector copies of the tweaks were overwritten above (@vtmp now holds
	# cipher output and @datax, i.e. @tweak, was handed to transpose), so
	# they are rebuilt from @twx.  Each rebuilt tweak must pass through the
	# same rbit() as the copy XORed in before encryption, exactly as the
	# 4/3/2/1-block paths below do by reusing their rbit-processed registers;
	# otherwise the XOR after encryption would use a different value from the
	# XOR before it, and the tweak stored to [$ivp] would not have the form
	# that the tail code reloads after the shorter paths.
	# Interleaved with that, @twx is advanced by eight tweaks; every pair is
	# copied out before compute_tweak() overwrites it.
	&mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
	&rbit(@vtmpx[0],@vtmpx[0],$std);
	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
	&mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
	&rbit(@vtmpx[1],@vtmpx[1],$std);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
	&rbit(@vtmpx[2],@vtmpx[2],$std);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
	&rbit(@vtmpx[3],@vtmpx[3],$std);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
	&rbit(@tweak[0],@tweak[0],$std);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
	&rbit(@tweak[1],@tweak[1],$std);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
	&rbit(@tweak[2],@tweak[2],$std);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
	&rbit(@tweak[3],@tweak[3],$std);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b

	// save the last tweak
	st1	{@tweak[3].4s},[$ivp]
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lxts_8_blocks_process${std}
	b	100f
.Lxts_4_blocks_process${std}:
___
	# Fewer than eight blocks left: the first four pending tweaks go into
	# @tweak, and stay there for the XOR after encryption.
	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
$code.=<<___;
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
___
	# Tweaks 5..7 for the up-to-three blocks that may still follow;
	# @tweak[3] keeps the tweak of the fourth block, which is stored next.
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
$code.=<<___;
	// save the last tweak
	st1	{@tweak[3].4s},[$ivp]
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
___
	&rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	st1	{@data[0].4s},[$outp],#16
	// save the last tweak
	st1	{@tweak[0].4s},[$ivp]
	b	100f
1:  // process last 2 blocks
	cmp	$blocks,#2
	b.gt	1f
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save the last tweak
	st1	{@tweak[1].4s},[$ivp]
	b	100f
1:  // process last 3 blocks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save the last tweak
	st1	{@tweak[2].4s},[$ivp]
100:
	cmp $remain,0
	b.eq .return${std}

// This branch calculates the last two tweaks,
// while the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
	ld1	{@tweak[0].4s},[$ivp]
___
	# Reload the tweak stored by whichever path ran last and derive the two
	# tweaks that follow it into @tweak[1] and @tweak[2].
	&rev32_armeb(@tweak[0],@tweak[0]);
	&compute_tweak_vec(@tweak[0],@tweak[1],$std);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


// This branch calculates the last two tweaks,
// while the encryption/decryption length is equal to 32, who only need two tweaks
.only_2blks_tweak${std}:
	mov @tweak[1].16b,@tweak[0].16b
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
	// encryption:1 decryption:0
	cmp $enc,1
	b.eq .process_last_2blks${std}
	mov @vtmp[0].16B,@tweak[1].16b
	mov @tweak[1].16B,@tweak[2].16b
	mov @tweak[2].16B,@vtmp[0].16b

.process_last_2blks${std}:
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
	ld1	{@data[0].4s},[$inp],#16
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
	st1	{@data[0].4s},[$outp],#16

	sub $lastBlk,$outp,16
	.loop${std}:
		subs $remain,$remain,1
		ldrb	$wtmp0,[$lastBlk,$remain]
		ldrb	$wtmp1,[$inp,$remain]
		strb	$wtmp1,[$lastBlk,$remain]
		strb	$wtmp0,[$outp,$remain]
	b.gt .loop${std}
	ld1		{@data[0].4s}, [$lastBlk]
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
	# The byte loop above swapped the trailing partial input into the block
	# just written and moved the displaced bytes to the final partial output;
	# the patched block at $lastBlk is now processed once more with @tweak[2].
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
	st1		{@data[0].4s}, [$lastBlk]
.return${std}:
	ldp		d14, d15, [sp], #0x10
	ldp		d12, d13, [sp], #0x10
	ldp		d10, d11, [sp], #0x10
	ldp		d8, d9, [sp], #0x10
	ldp		x29, x30, [sp], #0x10
	ldp		x27, x28, [sp], #0x10
	ldp		x25, x26, [sp], #0x10
	ldp		x23, x24, [sp], #0x10
	ldp		x21, x22, [sp], #0x10
	ldp		x19, x20, [sp], #0x10
	ldp		x17, x18, [sp], #0x10
	ldp		x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher

# Emit both variants: ${prefix}_xts_encrypt_gb and ${prefix}_xts_encrypt.
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
1572*e7be843bSPierre Pronchery}}}
1573*e7be843bSPierre Pronchery########################################
# Copy this script's leading comment block (the licence header) into the
# generated file, rewriting the leading '#' of each line as '//'.  The
# shebang line is skipped and copying stops at the first line that is neither
# a comment nor empty.
#
# The three-argument open keeps characters in the script path from being
# read as an open mode, and the lexical handle avoids a global bareword.
# A failure to reopen the script stays non-fatal, as it was before: the
# header is simply omitted, now with a warning.
if (open(my $self, '<', $0)) {
	while (<$self>) {
		next if (/^#!/);
		last if (!s/^#/\/\// and !/^$/);
		print;
	}
	close $self;
} else {
	warn "can't open $0 to copy the licence header: $!";
}

# Emit the accumulated assembly line by line, replacing every `...` span
# with the result of evaluating its contents as Perl.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	print $_,"\n";
}

# STDOUT was aliased to the pipe into $xlate at the top of the file, so this
# close is where a failed flush or a failing child process is reported.
close STDOUT or die "error closing STDOUT: $!";
1588