#! /usr/bin/env perl
# Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD and AESE on AARCH64
#
# Dec 2022
#
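# The file provides the usual perlasm entry points
# (${prefix}_set_encrypt_key/_set_decrypt_key, ${prefix}_encrypt/_decrypt,
# and the ecb/cbc/ctr32/xts routines below).  The SM4 S-box is not looked
# up from a table: each byte is pre-permuted, run through a pair of
# nibble-wise affine lookups, one AESE with an all-zero round key, and a
# second pair of lookups (see &sbox), i.e. the usual AESE-based SM4 S-box
# technique.
#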

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4_ex";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my ($vtmp4,$vtmp5)=("v24","v25");
my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
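# Note: @vtmpx (v12-v15) aliases ($rk0,$rk1) and ($rka,$rkb); the two sets
# must not be live at the same time.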

my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

sub rev32() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}

sub rev32_armeb() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}

sub rbit() {
	my $dst = shift;
	my $src = shift;
	my $std = shift;

	if ($src and ("$src" ne "$dst")) {
		if ($std eq "_gb") {
$code.=<<___;
			rbit $dst.16b,$src.16b
___
		} else {
$code.=<<___;
			mov $dst.16b,$src.16b
___
		}
	} else {
		if ($std eq "_gb") {
$code.=<<___;
			rbit $dst.16b,$src.16b
___
		}
	}
}

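# 4x4 32-bit matrix transpose: viewing dat0..dat3 as the rows of a 4x4
# word matrix, the zip1/zip2 pairs below leave the columns in dat0..dat3
# (vt0..vt3 are clobbered as scratch).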
sub transpose() {
	my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
	zip1	$vt0.4s,$dat0.4s,$dat1.4s
	zip2	$vt1.4s,$dat0.4s,$dat1.4s
	zip1	$vt2.4s,$dat2.4s,$dat3.4s
	zip2	$vt3.4s,$dat2.4s,$dat3.4s
	zip1	$dat0.2d,$vt0.2d,$vt2.2d
	zip2	$dat1.2d,$vt0.2d,$vt2.2d
	zip1	$dat2.2d,$vt1.2d,$vt3.2d
	zip2	$dat3.2d,$vt1.2d,$vt3.2d
___
}

# matrix multiplication Mat*x = (lowerMat * low_nibble(x)) ^ (higherMat * high_nibble(x)),
# i.e. the 8x8 bit-matrix product is evaluated as two 16-entry tbl lookups per byte
sub mul_matrix() {
	my $x = shift;
	my $higherMat = shift;
	my $lowerMat = shift;
	my $tmp = shift;
$code.=<<___;
	ushr	$tmp.16b, $x.16b, 4
	and		$x.16b, $x.16b, $ANDMaskV.16b
	tbl		$x.16b, {$lowerMat.16b}, $x.16b
	tbl		$tmp.16b, {$higherMat.16b}, $tmp.16b
	eor		$x.16b, $x.16b, $tmp.16b
___
}

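# The S-box below is computed with AESE rather than a lookup table:
#   - tbl with $MaskV pre-permutes the bytes with the inverse ShiftRows
#     pattern, so the ShiftRows step performed by AESE puts them back;
#   - mul_matrix with TAH/TAL maps each byte into the representation
#     expected by the AES SubBytes core;
#   - aese with an all-zero round key then applies SubBytes (the zero key
#     makes AddRoundKey a no-op);
#   - mul_matrix with ATAH/ATAL maps the result back to an SM4 S-box output.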
# sbox operation for 4 lanes of words
sub sbox() {
	my $dat = shift;

$code.=<<___;
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese @vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
$code.=<<___;
	mov	$dat.16b,@vtmp[0].16b

	// linear transformation
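	// each ushr #(32-n) / sli #n pair below forms a 32-bit rotate-left by n,
	// giving L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)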
	ushr	@vtmp[0].4s,$dat.4s,32-2
	ushr	@vtmp[1].4s,$dat.4s,32-10
	ushr	@vtmp[2].4s,$dat.4s,32-18
	ushr	@vtmp[3].4s,$dat.4s,32-24
	sli	@vtmp[0].4s,$dat.4s,2
	sli	@vtmp[1].4s,$dat.4s,10
	sli	@vtmp[2].4s,$dat.4s,18
	sli	@vtmp[3].4s,$dat.4s,24
	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b
	eor	$vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$dat.16b,$dat.16b,$vtmp4.16b
___
}

# sbox operation for 8 lanes of words
sub sbox_double() {
	my $dat = shift;
	my $datx = shift;

$code.=<<___;
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b
	tbl	@vtmp[1].16b, {$datx.16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
	&mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
	eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
	aese @vtmp[0].16b,$vtmp5.16b
	aese @vtmp[1].16b,$vtmp5.16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
	&mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
$code.=<<___;
	mov	$dat.16b,@vtmp[0].16b
	mov	$datx.16b,@vtmp[1].16b

	// linear transformation
	ushr	@vtmp[0].4s,$dat.4s,32-2
	ushr	$vtmp5.4s,$datx.4s,32-2
	ushr	@vtmp[1].4s,$dat.4s,32-10
	ushr	@vtmp[2].4s,$dat.4s,32-18
	ushr	@vtmp[3].4s,$dat.4s,32-24
	sli	@vtmp[0].4s,$dat.4s,2
	sli	$vtmp5.4s,$datx.4s,2
	sli	@vtmp[1].4s,$dat.4s,10
	sli	@vtmp[2].4s,$dat.4s,18
	sli	@vtmp[3].4s,$dat.4s,24
	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b
	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$dat.16b,$dat.16b,$vtmp4.16b
	ushr	@vtmp[1].4s,$datx.4s,32-10
	ushr	@vtmp[2].4s,$datx.4s,32-18
	ushr	@vtmp[3].4s,$datx.4s,32-24
	sli	@vtmp[1].4s,$datx.4s,10
	sli	@vtmp[2].4s,$datx.4s,18
	sli	@vtmp[3].4s,$datx.4s,24
	eor	$vtmp4.16b,$vtmp5.16b,$datx.16b
	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$datx.16b,$datx.16b,$vtmp4.16b
___
}

# sbox operation for a single word
sub sbox_1word () {
	my $word = shift;

$code.=<<___;
	mov	@vtmp[3].s[0],$word
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese @vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;

	mov	$wtmp0,@vtmp[0].s[0]
	eor	$word,$wtmp0,$wtmp0,ror #32-2
	eor	$word,$word,$wtmp0,ror #32-10
	eor	$word,$word,$wtmp0,ror #32-18
	eor	$word,$word,$wtmp0,ror #32-24
___
}

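# One SM4 round is X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i]),
# where T is the S-box layer followed by the linear transformation L.
# Each of the sm4_* helpers below performs four such rounds per call and
# the callers loop over them eight times for the full 32 rounds.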
# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
	my $kptr = shift;

$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$wtmp0,$word1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor	$word0,$word0,$tmpw
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$word0,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	$word1,$word1,$tmpw
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$wtmp0,$word3
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor	$word2,$word2,$tmpw
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$word2,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor	$word3,$word3,$tmpw
___
}

# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
	my $kptr = shift;

$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rk0.16b,@data[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk1.16b

	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rk0.16b,@data[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
	my $kptr = shift;

$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rkb.16b,@datax[2].16b,@datax[3].16b
	eor	@vtmp[0].16b,@data[1].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b
	eor	@datax[0].16b,@datax[0].16b,$rk1.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rkb.16b,$rkb.16b,@datax[0].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk0.16b
	eor	@datax[1].16b,@datax[1].16b,$rk1.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rkb.16b,@datax[0].16b,@datax[1].16b
	eor	@vtmp[0].16b,@data[3].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b
	eor	@datax[2].16b,@datax[2].16b,$rk1.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rkb.16b,$rkb.16b,@datax[2].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk0.16b
	eor	@datax[3].16b,@datax[3].16b,$rk1.16b
___
}

sub encrypt_1blk_norev() {
	my $dat = shift;

$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
	mov	$word0,$dat.s[0]
	mov	$word1,$dat.s[1]
	mov	$word2,$dat.s[2]
	mov	$word3,$dat.s[3]
10:
___
	&sm4_1blk($ptr);
$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
	mov	$dat.s[0],$word3
	mov	$dat.s[1],$word2
	mov	$dat.s[2],$word1
	mov	$dat.s[3],$word0
___
}

sub encrypt_1blk() {
	my $dat = shift;

	&encrypt_1blk_norev($dat);
	&rev32($dat,$dat);
}

sub encrypt_4blks() {
$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_4blks($ptr);
$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_8blks($ptr);
$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
	&rev32(@data[3],@datax[0]);
	&rev32(@data[2],@datax[1]);
	&rev32(@data[1],@datax[2]);
	&rev32(@data[0],@datax[3]);
}

sub load_sbox () {
	my $data = shift;

$code.=<<___;
	adrp $xtmp2, .Lsbox_magic
	ldr $MaskQ, [$xtmp2, #:lo12:.Lsbox_magic]
	ldr $TAHMatQ, [$xtmp2, #:lo12:.Lsbox_magic+16]
	ldr $TALMatQ, [$xtmp2, #:lo12:.Lsbox_magic+32]
	ldr $ATAHMatQ, [$xtmp2, #:lo12:.Lsbox_magic+48]
	ldr $ATALMatQ, [$xtmp2, #:lo12:.Lsbox_magic+64]
	ldr $ANDMaskQ, [$xtmp2, #:lo12:.Lsbox_magic+80]
___
}

sub mov_reg_to_vec() {
	my $src0 = shift;
	my $src1 = shift;
	my $desv = shift;
$code.=<<___;
	mov $desv.d[0],$src0
	mov $desv.d[1],$src1
___
	&rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
	my $srcv = shift;
	my $des0 = shift;
	my $des1 = shift;
$code.=<<___;
	mov $des0,$srcv.d[0]
	mov $des1,$srcv.d[1]
___
}

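# XTS tweak update: multiply the 128-bit tweak by x in GF(2^128) with the
# reduction polynomial x^128 + x^7 + x^2 + x + 1 (hence the 0x87 below).
# compute_tweak works on a tweak held in a pair of scalar registers,
# compute_tweak_vec on a tweak held in a vector register; for the "_gb"
# flavour the vector version bit-reverses the tweak before and after the
# doubling (see &rbit above).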
sub compute_tweak() {
	my $src0 = shift;
	my $src1 = shift;
	my $des0 = shift;
	my $des1 = shift;
$code.=<<___;
	mov $wtmp0,0x87
	extr	$xtmp2,$src1,$src1,#32
	extr	$des1,$src1,$src0,#63
	and	$wtmp1,$wtmp0,$wtmp2,asr#31
	eor	$des0,$xtmp1,$src0,lsl#1
___
}

sub compute_tweak_vec() {
	my $src = shift;
	my $des = shift;
	my $std = shift;
	&rbit(@vtmp[2],$src,$std);
$code.=<<___;
	adrp  $xtmp2, .Lxts_magic
	ldr  @qtmp[0], [$xtmp2, #:lo12:.Lxts_magic]
	shl  $des.16b, @vtmp[2].16b, #1
	ext  @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
	ushr @vtmp[1].16b, @vtmp[1].16b, #7
	mul  @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
	eor  $des.16b, $des.16b, @vtmp[1].16b
___
	&rbit($des,$des,$std);
}

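# Constant pool emitted below:
#   .Lck         - the 32 SM4 key-schedule constants CK
#   .Lfk         - the SM4 FK constants
#   .Lshuffles   - byte permutation rotating the key words between rounds
#   .Lxts_magic  - multiplier used by compute_tweak_vec
#   .Lsbox_magic - ShiftRows-compensation mask, the two pre- and two
#                  post-AESE affine matrices, and the 0x0f nibble mask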
$code=<<___;
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

.type	_${prefix}_consts,%object
.align	7
_${prefix}_consts:
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
	.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
	.quad 0x0101010101010187,0x0101010101010101
.Lsbox_magic:
	.quad 0x0b0e0104070a0d00,0x0306090c0f020508
	.quad 0x62185a2042387a00,0x22581a6002783a40
	.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
	.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
	.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
	.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size	_${prefix}_consts,.-_${prefix}_consts
___

{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
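# Key schedule: rk[i] = K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i]),
# where T' uses the same S-box but the lighter linear map
# L'(B) = B ^ (B <<< 13) ^ (B <<< 23) (the ror #19 / ror #9 below).
# The tbl with .Lshuffles rotates the four key words so the next round key
# can be computed from the same lane positions.  For decryption ($enc == 0)
# the schedule is written backwards, starting at $keys+124.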
$code.=<<___;
.type	_${prefix}_set_key,%function
.align	4
_${prefix}_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$vkey.4s},[$key]
___
	&load_sbox();
	&rev32($vkey,$vkey);
$code.=<<___;
	adrp	$pointer,.Lshuffles
	add	$pointer,$pointer,#:lo12:.Lshuffles
	ld1	{$vmap.2d},[$pointer]
	adrp	$pointer,.Lfk
	add	$pointer,$pointer,#:lo12:.Lfk
	ld1	{$vfk.2d},[$pointer]
	eor	$vkey.16b,$vkey.16b,$vfk.16b
	mov	$schedules,#32
	adrp	$pointer,.Lck
	add	$pointer,$pointer,#:lo12:.Lck
	movi	@vtmp[0].16b,#64
	cbnz	$enc,1f
	add	$keys,$keys,124
1:
	mov	$wtmp,$vkey.s[1]
	ldr	$roundkey,[$pointer],#4
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[2]
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[3]
	eor	$roundkey,$roundkey,$wtmp
	// optimize sbox using AESE instruction
	mov	@data[0].s[0],$roundkey
	tbl	@vtmp[0].16b, {@data[0].16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese @vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
	mov	$wtmp,@vtmp[0].s[0]
	eor	$roundkey,$wtmp,$wtmp,ror #19
	eor	$roundkey,$roundkey,$wtmp,ror #9
	mov	$wtmp,$vkey.s[0]
	eor	$roundkey,$roundkey,$wtmp
	mov	$vkey.s[0],$roundkey
	cbz	$enc,2f
	str	$roundkey,[$keys],#4
	b	3f
2:
	str	$roundkey,[$keys],#-4
3:
	tbl	$vkey.16b,{$vkey.16b},$vmap.16b
	subs	$schedules,$schedules,#1
	b.ne	1b
	ret
.size	_${prefix}_set_key,.-_${prefix}_set_key
___
}}}


{{{
$code.=<<___;
.type	_${prefix}_enc_4blks,%function
.align	4
_${prefix}_enc_4blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_4blks();
$code.=<<___;
	ret
.size	_${prefix}_enc_4blks,.-_${prefix}_enc_4blks
___
}}}

{{{
$code.=<<___;
.type	_${prefix}_enc_8blks,%function
.align	4
_${prefix}_enc_8blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_8blks();
$code.=<<___;
	ret
.size	_${prefix}_enc_8blks,.-_${prefix}_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_${prefix}_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_${prefix}_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
	my $dir = shift;
	my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{@data[0].4s},[$inp]
___
	&load_sbox();
	&rev32(@data[0],@data[0]);
$code.=<<___;
	mov	$rks,$rk
___
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

{{{
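# ECB: ld4/st4 de-interleave the input so that @data[i] holds word i of
# four consecutive blocks, which lets _${prefix}_enc_4blks/_enc_8blks run
# 4 (or 8) blocks through the rounds in parallel without a transpose.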
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
___
	&load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
	cmp	$blocks,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	ld4	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
$code.=<<___;
	bl	_${prefix}_enc_8blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// process last 2 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
	cmp	$blocks,#2
	b.gt	1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp]
	b	100f
1:	// process last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

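# CBC: encryption is inherently serial, so blocks are chained one at a time,
# overlapping the rev32 of the finished block with the next chain step.
# Decryption reuses _${prefix}_enc_4blks/_enc_8blks (running the cipher with
# the reversed key schedule is SM4 decryption) and XORs the previous
# ciphertext blocks in afterwards, so 4 or 8 blocks can run in parallel.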
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	$len,$len,4
___
	&load_sbox();
$code.=<<___;
	cbz	$enc,.Ldec
	ld1	{$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	eor	@data[0].16b,@data[0].16b,$ivec0.16b
___
	&rev32(@data[1],@data[1]);
	&rev32(@data[0],@data[0]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&encrypt_1blk_norev(@data[0]);
$code.=<<___;
	eor	@data[1].16b,@data[1].16b,@data[0].16b
___
	&encrypt_1blk_norev(@data[1]);
	&rev32(@data[0],@data[0]);

$code.=<<___;
	eor	@data[2].16b,@data[2].16b,@data[1].16b
___
	&encrypt_1blk_norev(@data[2]);
	&rev32(@data[1],@data[1]);
$code.=<<___;
	eor	@data[3].16b,@data[3].16b,@data[2].16b
___
	&encrypt_1blk_norev(@data[3]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	orr	$ivec0.16b,@data[3].16b,@data[3].16b
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.ne	.Lcbc_4_blocks_enc
	b	2f
1:
	subs	$blocks,$blocks,#1
	b.lt	2f
	ld1	{@data[0].4s},[$inp],#16
	eor	$ivec0.16b,$ivec0.16b,@data[0].16b
___
	&rev32($ivec0,$ivec0);
	&encrypt_1blk($ivec0);
$code.=<<___;
	st1	{$ivec0.4s},[$outp],#16
	b	1b
2:
	// save back IV
	st1	{$ivec0.4s},[$ivp]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp	$blocks,#8
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
	add	$ptr,$inp,#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],$data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],$datax[3]);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);
$code.=<<___;
	ld1	{$ivec1.4s},[$ivp]
	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	// note: ivec1 and vtmpx[3] refer to the same register,
	// so care must be taken to avoid a conflict
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	ld1	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor	@vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
	eor	@vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
	// save back IV
	st1	{$vtmpx[3].4s}, [$ivp]
	eor	@data[0].16b,@data[0].16b,$datax[3].16b
	eor	@data[1].16b,@data[1].16b,@vtmpx[0].16b
	eor	@data[2].16b,@data[2].16b,@vtmpx[1].16b
	eor	@data[3].16b,$data[3].16b,@vtmpx[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lcbc_8_blocks_dec
	b.eq	100f
1:
	ld1	{$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],$data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	orr	$ivec1.16b,@data[3].16b,@data[3].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	eor	@vtmp[3].16b,$vtmp[3].16b,@data[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.gt	.Lcbc_4_blocks_dec
	// save back IV
	st1	{@data[3].4s}, [$ivp]
	b	100f
1:	// last block
	subs	$blocks,$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
	// save back IV
	st1	{$data[0].4s}, [$ivp]
___
	&rev32(@datax[0],@data[0]);
	&encrypt_1blk(@datax[0]);
$code.=<<___;
	eor	@datax[0].16b,@datax[0].16b,$ivec1.16b
	st1	{@datax[0].4s},[$outp],#16
	b	100f
1:	// last two blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
	add	$ptr,$inp,#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
	subs	$blocks,$blocks,1
	b.gt	1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save back IV
	st1	{@data[1].4s}, [$ivp]
	b	100f
1:	// last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save back IV
	st1	{@data[2].4s}, [$ivp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

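# CTR32: after rev32 the IV words sit in host order; $word0..$word2 keep the
# fixed part of the IV and $ctr is the 32-bit counter word (ivec.s[3]),
# incremented once per block.  Counter blocks are built with dup/mov, run
# through the 4-/8-block encryptor and XORed onto the input.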
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1	{$ivec.4s},[$ivp]
___
	&rev32($ivec,$ivec);
	&load_sbox();
$code.=<<___;
	cmp	$blocks,#1
	b.ne	1f
	// fast path for a single block, without the
	// context-saving overhead
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1	{@data[0].4s},[$inp]
	eor	@data[0].16b,@data[0].16b,$ivec.16b
	st1	{@data[0].4s},[$outp]
	ret
1:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	mov	$word0,$ivec.s[0]
	mov	$word1,$ivec.s[1]
	mov	$word2,$ivec.s[2]
	mov	$ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	dup	@data[0].4s,$word0
	dup	@data[1].4s,$word1
	dup	@data[2].4s,$word2
	mov	@data[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	$data[3].s[1],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[2],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[3],$ctr
	add	$ctr,$ctr,#1
	cmp	$blocks,#8
	b.ge	.Lctr32_8_blocks_process
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.ne	.Lctr32_4_blocks_process
	b	100f
.Lctr32_8_blocks_process:
	dup	@datax[0].4s,$word0
	dup	@datax[1].4s,$word1
	dup	@datax[2].4s,$word2
	mov	@datax[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	$datax[3].s[1],$ctr
	add	$ctr,$ctr,#1
	mov	@datax[3].s[2],$ctr
	add	$ctr,$ctr,#1
	mov	@datax[3].s[3],$ctr
	add	$ctr,$ctr,#1
	bl	_${prefix}_enc_8blks
	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	eor	@data[0].16b,@data[0].16b,@datax[0].16b
	eor	@data[1].16b,@data[1].16b,@datax[1].16b
	eor	@data[2].16b,@data[2].16b,@datax[2].16b
	eor	@data[3].16b,@data[3].16b,@datax[3].16b
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.ne	.Lctr32_4_blocks_process
	b	100f
1:	// last block processing
	subs	$blocks,$blocks,#1
	b.lt	100f
	b.gt	1f
	mov	$ivec.s[0],$word0
	mov	$ivec.s[1],$word1
	mov	$ivec.s[2],$word2
	mov	$ivec.s[3],$ctr
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1	{@data[0].4s},[$inp]
	eor	@data[0].16b,@data[0].16b,$ivec.16b
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// last 2 blocks processing
	dup	@data[0].4s,$word0
	dup	@data[1].4s,$word1
	dup	@data[2].4s,$word2
	mov	@data[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[1],$ctr
	subs	$blocks,$blocks,#1
	b.ne	1f
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	b	100f
1:	// last 3 blocks processing
	add	$ctr,$ctr,#1
	mov	@data[3].s[2],$ctr
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}


{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=map("v$_",(16..23));
my $lastTweak=("v25");

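# XTS: the initial tweak is encrypted with the second key ($rks2), then the
# data is processed 8/4/1 blocks at a time with the first key ($rks1) while
# the tweaks are doubled ahead of time in scalar registers (@twx).  A ragged
# tail (length not a multiple of 16) is handled with ciphertext stealing in
# .last_2blks_tweak/.only_2blks_tweak and the .loop below.  The "_gb" flavour
# emits the same code but with bit-reversed tweak arithmetic (see &rbit).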
sub gen_xts_cipher() {
	my $std = shift;
$code.=<<___;
.globl	${prefix}_xts_encrypt${std}
.type	${prefix}_xts_encrypt${std},%function
.align	5
${prefix}_xts_encrypt${std}:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	$rks1,x3
	mov	$rks2,x4
	mov	$enc,w6
	ld1	{@tweak[0].4s}, [$ivp]
	mov	$rks,$rks2
___
	&load_sbox();
	&rev32(@tweak[0],@tweak[0]);
	&encrypt_1blk(@tweak[0]);
$code.=<<___;
	mov	$rks,$rks1
	and	$remain,$len,#0x0F
	// convert length into blocks
	lsr	$blocks,$len,4
	cmp	$blocks,#1
	b.lt .return${std}

	cmp $remain,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	b.eq .xts_encrypt_blocks${std}

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std},
	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	subs $blocks,$blocks,#1
	b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rev32_armeb(@tweak[0],@tweak[0]);
	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
	cmp	$blocks,#8
___
	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
	b.lt	.Lxts_4_blocks_process${std}
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b
	ld1	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rbit(@tweak[4],@tweak[4],$std);
	&rbit(@tweak[5],@tweak[5],$std);
	&rbit(@tweak[6],@tweak[6],$std);
	&rbit(@tweak[7],@tweak[7],$std);
$code.=<<___;
	eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
	eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
	eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
	eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
	&transpose(@data,@vtmp);
	&transpose(@datax,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	eor @data[0].16b, @data[0].16b, @tweak[4].16b
	eor @data[1].16b, @data[1].16b, @tweak[5].16b
	eor @data[2].16b, @data[2].16b, @tweak[6].16b
	eor @data[3].16b, @data[3].16b, @tweak[7].16b

	// save the last tweak
	mov $lastTweak.16b,@tweak[7].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lxts_8_blocks_process${std}
	b	100f
.Lxts_4_blocks_process${std}:
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
	mov @tweak[0].16b,@tweak[4].16b
	mov @tweak[1].16b,@tweak[5].16b
	mov @tweak[2].16b,@tweak[6].16b
	// save the last tweak
	mov $lastTweak.16b,@tweak[3].16b
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
___
	&rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	st1	{@data[0].4s},[$outp],#16
	// save the last tweak
	mov $lastTweak.16b,@tweak[0].16b
	b	100f
1:  // process last 2 blocks
	cmp	$blocks,#2
	b.gt	1f
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save the last tweak
	mov $lastTweak.16b,@tweak[1].16b
	b	100f
1:  // process last 3 blocks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save the last tweak
	mov $lastTweak.16b,@tweak[2].16b
100:
	cmp $remain,0
	b.eq .return${std}

// This branch calculates the last two tweaks
// when the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
___
	&rev32_armeb($lastTweak,$lastTweak);
	&compute_tweak_vec($lastTweak,@tweak[1],$std);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


// This branch calculates the last two tweaks
// when the encryption/decryption length is exactly 32, which needs only two tweaks
.only_2blks_tweak${std}:
	mov @tweak[1].16b,@tweak[0].16b
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
	// encryption:1 decryption:0
	cmp $enc,1
	b.eq .process_last_2blks${std}
	mov @vtmp[0].16b,@tweak[1].16b
	mov @tweak[1].16b,@tweak[2].16b
	mov @tweak[2].16b,@vtmp[0].16b

.process_last_2blks${std}:
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
	ld1	{@data[0].4s},[$inp],#16
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
	st1	{@data[0].4s},[$outp],#16

	sub $lastBlk,$outp,16
	.loop${std}:
		subs $remain,$remain,1
		ldrb	$wtmp0,[$lastBlk,$remain]
		ldrb	$wtmp1,[$inp,$remain]
		strb	$wtmp1,[$lastBlk,$remain]
		strb	$wtmp0,[$outp,$remain]
	b.gt .loop${std}
	ld1		{@data[0].4s}, [$lastBlk]
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
	st1		{@data[0].4s}, [$lastBlk]
.return${std}:
	ldp		d14, d15, [sp], #0x10
	ldp		d12, d13, [sp], #0x10
	ldp		d10, d11, [sp], #0x10
	ldp		d8, d9, [sp], #0x10
	ldp		x29, x30, [sp], #0x10
	ldp		x27, x28, [sp], #0x10
	ldp		x25, x26, [sp], #0x10
	ldp		x23, x24, [sp], #0x10
	ldp		x21, x22, [sp], #0x10
	ldp		x19, x20, [sp], #0x10
	ldp		x17, x18, [sp], #0x10
	ldp		x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}

########################################
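# Emit the leading comment block of this file (turning '#' into '//') so the
# licence and description end up in the generated assembly, then emit $code
# with backtick expressions evaluated.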
open SELF,$0;
while(<SELF>) {
		next if (/^#!/);
		last if (!s/^#/\/\// and !/^$/);
		print;
}
close SELF;

foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";