#! /usr/bin/env perl
# Copyright 2022-2025  The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

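# A hedged usage sketch (standard perlasm convention; the exact flavour
# strings are wired up by the build system, "linux64" here is only an
# assumption):
#
#   perl chacha-armv8-sve.pl linux64 chacha-armv8-sve.S
#
# i.e. the generated text is piped through arm-xlate.pl below.
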
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
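
# Any &mnemonic(...) call that has no explicit sub lands in the AUTOLOAD
# thunk above: the package prefix is stripped, '_' becomes '.', a numeric
# final argument gains a '#' prefix, and the instruction is appended to
# $code verbatim (e.g. &add("x0","x0","x1") emits "add x0,x0,x1").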

$prefix="chacha_sve";
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen) = ("x5");
my ($counter) = ("x6");
my ($counter_w) = ("w6");
my @xx=(7..22);
my @sxx=map("x$_",@xx);
my @sx=map("w$_",@xx);
my @K=map("x$_",(23..30));
my @elem=(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
my @KL=map("w$_",(23..30));
my @mx=map("z$_",@elem);
my @vx=map("v$_",@elem);
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");
my @tt=(17..24);
my @xt=map("z$_",@tt);
my @vt=map("v$_",@tt);
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");
my @bak=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],@xt[4],@xt[5],@xt[6],@xt[7],@xt[0],@xt[1],$zctr,@xt[2],@xt[3],$rot8);
my $debug_encoder=0;

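# When the assembly-time symbol "mixin" is 1, every vector instruction below
# is paired with a scalar twin (the ".if mixin == 1" blocks): one extra
# 64-byte block is computed in the general-purpose registers @sx while the
# SVE lanes process $veclen blocks.  At accumulation time two 32-bit words
# are packed per x-register (see the "// pack" note in ACCUM) so that block
# can be xor-ed and stored with ldp/stp.
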
sub SVE_ADD() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	add	@mx[$x].s,@mx[$x].s,@mx[$y].s
	.if mixin == 1
		add	@sx[$x],@sx[$x],@sx[$y]
	.endif
___
	if (@_) {
		&SVE_ADD(@_);
	}
}

sub SVE_EOR() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
	.if mixin == 1
		eor	@sx[$x],@sx[$x],@sx[$y]
	.endif
___
	if (@_) {
		&SVE_EOR(@_);
	}
}

sub SVE_LSL() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	lsl	@xt[$x].s,@mx[$y].s,$bits
___
	if (@_) {
		&SVE_LSL($bits,$next,@_);
	}
}

sub SVE_LSR() {
	my $bits = shift;
	my $x = shift;

$code.=<<___;
	lsr	@mx[$x].s,@mx[$x].s,$bits
	.if mixin == 1
		ror	@sx[$x],@sx[$x],$bits
	.endif
___
	if (@_) {
		&SVE_LSR($bits,@_);
	}
}

sub SVE_ORR() {
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
	if (@_) {
		&SVE_ORR($next,@_);
	}
}

sub SVE_REV16() {
	my $x = shift;

$code.=<<___;
	revh	@mx[$x].s,p0/m,@mx[$x].s
	.if mixin == 1
		ror	@sx[$x],@sx[$x],#16
	.endif
___
	if (@_) {
		&SVE_REV16(@_);
	}
}

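# Rotate each 32-bit lane left by 8 via a byte-table lookup.  $rot8 is set
# up with "index" from start 0x02010003, step 0x04040404 (see .Lrot8), so
# lane n holds byte indices (4n+3,4n,4n+1,4n+2), which tbl turns into a
# <<< 8 byte permutation.  The scalar twin's ror #24 is the same rotation.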
sub SVE_ROT8() {
	my $x = shift;

$code.=<<___;
	tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
	.if mixin == 1
		ror	@sx[$x],@sx[$x],#24
	.endif
___
	if (@_) {
		&SVE_ROT8(@_);
	}
}

sub SVE2_XAR() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $rbits = 32-$bits;

$code.=<<___;
	.if mixin == 1
		eor	@sx[$x],@sx[$x],@sx[$y]
	.endif
	xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
	.if mixin == 1
		ror	@sx[$x],@sx[$x],$rbits
	.endif
___
	if (@_) {
		&SVE2_XAR($bits,@_);
	}
}

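# Reference quarter-round (RFC 8439) that both QR groups below implement,
# four columns or diagonals at a time:
#
#   a += b; d ^= a; d <<<= 16;
#   c += d; b ^= c; b <<<= 12;
#   a += b; d ^= a; d <<<=  8;
#   c += d; b ^= c; b <<<=  7;
#
# SVE2 fuses each eor+rotate pair into a single xar (xar rotates *right*,
# hence $rbits = 32-$bits above); plain SVE spells the rotates out with
# revh (16), tbl (8) or lsl+lsr+orr (12 and 7).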
sub SVE2_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
}

sub SVE_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_REV16($d0,$d1,$d2,$d3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(12,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(20,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_ROT8($d0,$d1,$d2,$d3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(7,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(25,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);
}

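# One block permutation: 10 iterations of a column round followed by a
# diagonal round, i.e. the 20 rounds of ChaCha20.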
sub SVE_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}

sub SVE2_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE2_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE2_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}

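# Vector-length-agnostic block loads/stores: the "#n, MUL VL" addressing
# form scales the offset by the hardware vector length, and addvl advances
# the pointer by the number of vectors just transferred.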
sub load_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
	ld1w	{$reg.s},p0/z,[$inp,#$offset,MUL VL]
___
	if (@_) {
		&load_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$inp,$inp,$next_offset
___
	}
}

sub load() {
	if (@_) {
		&load_regs(0, @_);
	}
}

sub store_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
	st1w	{$reg.s},p0,[$outp,#$offset,MUL VL]
___
	if (@_) {
		&store_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$outp,$outp,$next_offset
___
	}
}

sub store() {
	if (@_) {
		&store_regs(0, @_);
	}
}

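# 4x4 transpose of 32-bit lanes in two zip passes: zip1/zip2 on .s
# interleaves single words, then zip1/zip2 on .d interleaves the resulting
# 64-bit pairs.  Two independent 4x4 blocks are transposed per call.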
sub transpose() {
	my $xa = shift;
	my $xb = shift;
	my $xc = shift;
	my $xd = shift;
	my $xa1 = shift;
	my $xb1 = shift;
	my $xc1 = shift;
	my $xd1 = shift;
$code.=<<___;
	zip1	@xt[0].s,$xa.s,$xb.s
	zip2	@xt[1].s,$xa.s,$xb.s
	zip1	@xt[2].s,$xc.s,$xd.s
	zip2	@xt[3].s,$xc.s,$xd.s

	zip1	@xt[4].s,$xa1.s,$xb1.s
	zip2	@xt[5].s,$xa1.s,$xb1.s
	zip1	@xt[6].s,$xc1.s,$xd1.s
	zip2	@xt[7].s,$xc1.s,$xd1.s

	zip1	$xa.d,@xt[0].d,@xt[2].d
	zip2	$xb.d,@xt[0].d,@xt[2].d
	zip1	$xc.d,@xt[1].d,@xt[3].d
	zip2	$xd.d,@xt[1].d,@xt[3].d

	zip1	$xa1.d,@xt[4].d,@xt[6].d
	zip2	$xb1.d,@xt[4].d,@xt[6].d
	zip1	$xc1.d,@xt[5].d,@xt[7].d
	zip2	$xd1.d,@xt[5].d,@xt[7].d
___
}

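# Add the broadcast initial state (@bak) back into the working state.  The
# scalar twin adds the @K halves and re-packs two 32-bit words into one
# x-register with lsl #32.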
sub ACCUM() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;
	my ($tmp,$tmpw) = ($counter,$counter_w);
	my $bk0 = @_ ? shift : @bak[$idx0];
	my $bk1 = @_ ? shift : @bak[$idx1];

$code.=<<___;
	.if mixin == 1
		add	@sx[$idx0],@sx[$idx0],@KL[$d]
	.endif
	add	@mx[$idx0].s,@mx[$idx0].s,$bk0.s
	.if mixin == 1
		add	@sxx[$idx1],@sxx[$idx1],@K[$d],lsr #32
	.endif
	add	@mx[$idx1].s,@mx[$idx1].s,$bk1.s
	.if mixin == 1
		add	@sxx[$idx0],@sxx[$idx0],$sxx[$idx1],lsl #32  // pack
	.endif
___
}

sub SCA_INP() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 2;
$code.=<<___;
	.if mixin == 1
		ldp	@sxx[$idx0],@sxx[$idx1],[$inp],#16
	.endif
___
}

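# Plain SVE runs short of z-registers for the backup state, so words
# 10/11/13/14/15 are re-broadcast from @K just before accumulation (the
# stand-in for bak[15] is @bak[0]); the per-lane counters live in $zctr.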
sub SVE_ACCUM_STATES() {
	my ($tmp,$tmpw) = ($counter,$counter_w);

$code.=<<___;
	lsr	$tmp,@K[5],#32
	dup	@bak[10].s,@KL[5]
	dup	@bak[11].s,$tmpw
	lsr	$tmp,@K[6],#32
	dup	@bak[13].s,$tmpw
	lsr	$tmp,@K[7],#32
___
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
$code.=<<___;
	dup	@bak[14].s,@KL[7]
	dup	@bak[0].s,$tmpw	// bak[15] not available for SVE
___
	&ACCUM(12);
	&ACCUM(14, @bak[14],@bak[0]);
	&SCA_INP(13);
}

sub SVE2_ACCUM_STATES() {
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
	&ACCUM(12);
	&ACCUM(14);
	&SCA_INP(13);
}

sub SCA_EOR() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
$code.=<<___;
	.if mixin == 1
		eor	@sxx[$idx0],@sxx[$idx0],@sxx[$idx1]
	.endif
___
}

sub SCA_SAVE() {
	my $idx0 = shift;
	my $idx1 = shift;
$code.=<<___;
	.if mixin == 1
		stp	@sxx[$idx0],@sxx[$idx1],[$outp],#16
	.endif
___
}

sub SVE_VL128_TRANSFORMS() {
	&SCA_EOR(0);
	&SCA_EOR(2);
	&SCA_EOR(4);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(6);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
$code.=<<___;
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xb0.d,$xb0.d,@xt[1].d
	eor	$xc0.d,$xc0.d,@xt[2].d
	eor	$xd0.d,$xd0.d,@xt[3].d
	eor	$xa1.d,$xa1.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xc1.d,$xc1.d,@xt[6].d
	eor	$xd1.d,$xd1.d,@xt[7].d
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
___
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xa2.d,$xa2.d,@xt[0].d
	eor	$xb2.d,$xb2.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xd2.d,$xd2.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xa3.d,$xa3.d,@xt[4].d
	eor	$xb3.d,$xb3.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xc3.d,$xc3.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
	st1	{@vx[0].4s-@vx[12].4s},[$outp],#64
	st1	{@vx[1].4s-@vx[13].4s},[$outp],#64
	st1	{@vx[2].4s-@vx[14].4s},[$outp],#64
	st1	{@vx[3].4s-@vx[15].4s},[$outp],#64
___
}

sub SVE_TRANSFORMS() {
$code.=<<___;
#ifdef	__AARCH64EB__
	rev	@sxx[0],@sxx[0]
	revb	@mx[0].s,p0/m,@mx[0].s
	revb	@mx[1].s,p0/m,@mx[1].s
	rev	@sxx[2],@sxx[2]
	revb	@mx[2].s,p0/m,@mx[2].s
	revb	@mx[3].s,p0/m,@mx[3].s
	rev	@sxx[4],@sxx[4]
	revb	@mx[4].s,p0/m,@mx[4].s
	revb	@mx[5].s,p0/m,@mx[5].s
	rev	@sxx[6],@sxx[6]
	revb	@mx[6].s,p0/m,@mx[6].s
	revb	@mx[7].s,p0/m,@mx[7].s
	rev	@sxx[8],@sxx[8]
	revb	@mx[8].s,p0/m,@mx[8].s
	revb	@mx[9].s,p0/m,@mx[9].s
	rev	@sxx[10],@sxx[10]
	revb	@mx[10].s,p0/m,@mx[10].s
	revb	@mx[11].s,p0/m,@mx[11].s
	rev	@sxx[12],@sxx[12]
	revb	@mx[12].s,p0/m,@mx[12].s
	revb	@mx[13].s,p0/m,@mx[13].s
	rev	@sxx[14],@sxx[14]
	revb	@mx[14].s,p0/m,@mx[14].s
	revb	@mx[15].s,p0/m,@mx[15].s
#endif
	.if mixin == 1
		add	@K[6],@K[6],#1
	.endif
	cmp	$veclen,4
	b.ne	200f
___
	&SVE_VL128_TRANSFORMS();
$code.=<<___;
	b	210f
200:
___
	&transpose($xa0,$xb0,$xc0,$xd0,$xa1,$xb1,$xc1,$xd1);
	&SCA_EOR(0);
	&SCA_EOR(2);
	&transpose($xa2,$xb2,$xc2,$xd2,$xa3,$xb3,$xc3,$xd3);
	&SCA_EOR(4);
	&SCA_EOR(6);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
$code.=<<___;
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xa1.d,$xa1.d,@xt[1].d
	eor	$xa2.d,$xa2.d,@xt[2].d
	eor	$xa3.d,$xa3.d,@xt[3].d
	eor	$xb0.d,$xb0.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xb2.d,$xb2.d,@xt[6].d
	eor	$xb3.d,$xb3.d,@xt[7].d
___
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xc0.d,$xc0.d,@xt[0].d
	eor	$xc1.d,$xc1.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xc3.d,$xc3.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xd0.d,$xd0.d,@xt[4].d
	eor	$xd1.d,$xd1.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xd2.d,$xd2.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
___
	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
210:
	incw	@K[6], ALL, MUL #1
___
}

sub SET_STATE_BAK() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	dup	@bak[$idx0].s,@KL[$d]
	.if mixin == 1
		mov	$x0,@KL[$d]
	.endif
	dup	@mx[$idx1].s,$x1
	dup	@bak[$idx1].s,$x1
___
}

sub SET_STATE() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	.if mixin == 1
		mov	$x0,@KL[$d]
	.endif
	dup	@mx[$idx1].s,$x1
___
}

sub SVE_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE(10);
	&SET_STATE(14);
$code.=<<___;
	.if mixin == 1
		add	@sx[13],@KL[6],#1
		mov	@sx[12],@KL[6]
		index	$zctr.s,@sx[13],1
		index	@mx[12].s,@sx[13],1
	.else
		index	$zctr.s,@KL[6],1
		index	@mx[12].s,@KL[6],1
	.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
___
}

sub SVE2_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE_BAK(10);
	&SET_STATE_BAK(14);

$code.=<<___;
	.if mixin == 1
		add	@sx[13],@KL[6],#1
		mov	@sx[12],@KL[6]
		index	$zctr.s,@sx[13],1
		index	@mx[12].s,@sx[13],1
	.else
		index	$zctr.s,@KL[6],1
		index	@mx[12].s,@KL[6],1
	.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
	dup	@bak[13].s,@sx[13]
___
}

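# Loop skeleton shared by both paths: label 100 is the main loop (mixin=1
# when at least one extra whole 64-byte block is left over for the scalar
# twin), 101 is the final pass without the scalar block, 110 the exit.
# Every pass consumes $veclen 64-byte blocks (plus one when mixin=1).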
sub chacha20_sve() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}

sub chacha20_sve2() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}


{{{
	my ($tmp,$tmpw) = ("x6", "w6");
	my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
	my ($sve2flag) = ("x7");

$code.=<<___;
#include "arm_arch.h"

.arch   armv8-a

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.text

.rodata
.align	5
.type _${prefix}_consts,%object
_${prefix}_consts:
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lrot8:
	.word 0x02010003,0x04040404,0x02010003,0x04040404
.size _${prefix}_consts,.-_${prefix}_consts

.previous

.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align	5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
	cntw	$veclen, ALL, MUL #1
	cmp	$len,$veclen,lsl #6
	b.lt	.Lreturn
	mov	$sve2flag,0
	adrp	$tmp,OPENSSL_armcap_P
	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
	tst	$tmpw,#ARMV8_SVE2
	b.eq	1f
	mov	$sve2flag,1
	b	2f
1:
	cmp	$veclen,4
	b.le	.Lreturn
	adrp	$tmp,.Lrot8
	add	$tmp,$tmp,#:lo12:.Lrot8
	ldp	$tmpw0,$tmpw1,[$tmp]
	index	$rot8.s,$tmpw0,$tmpw1
2:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,-192]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x16,x17,[sp,64]
	stp	x18,x19,[sp,80]
	stp	x20,x21,[sp,96]
	stp	x22,x23,[sp,112]
	stp	x24,x25,[sp,128]
	stp	x26,x27,[sp,144]
	stp	x28,x29,[sp,160]
	str	x30,[sp,176]

	adrp	$tmp,.Lchacha20_consts
	add	$tmp,$tmp,#:lo12:.Lchacha20_consts
	ldp	@K[0],@K[1],[$tmp]
	ldp	@K[2],@K[3],[$key]
	ldp	@K[4],@K[5],[$key, 16]
	ldp	@K[6],@K[7],[$ctr]
	ptrues	p0.s,ALL
#ifdef	__AARCH64EB__
	ror	@K[2],@K[2],#32
	ror	@K[3],@K[3],#32
	ror	@K[4],@K[4],#32
	ror	@K[5],@K[5],#32
	ror	@K[6],@K[6],#32
	ror	@K[7],@K[7],#32
#endif
	cbz	$sve2flag, 1f
___
	&chacha20_sve2();
$code.=<<___;
	b	2f
1:
___
	&chacha20_sve();
$code.=<<___;
2:
	str	@KL[6],[$ctr]
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x16,x17,[sp,64]
	ldp	x18,x19,[sp,80]
	ldp	x20,x21,[sp,96]
	ldp	x22,x23,[sp,112]
	ldp	x24,x25,[sp,128]
	ldp	x26,x27,[sp,144]
	ldp	x28,x29,[sp,160]
	ldr	x30,[sp,176]
	ldp	d8,d9,[sp],192
	AARCH64_VALIDATE_LINK_REGISTER
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___

}}}

########################################
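# What follows is a minimal hand-rolled SVE/SVE2 assembler: toolchains that
# predate SVE support cannot encode these mnemonics, so each instruction is
# pattern-matched and emitted as a raw ".inst" word instead.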
{
my  %opcode_unpred = (
	"movprfx"      => 0x0420BC00,
	"eor"          => 0x04a03000,
	"add"          => 0x04200000,
	"orr"          => 0x04603000,
	"lsl"          => 0x04209C00,
	"lsr"          => 0x04209400,
	"incw"         => 0x04B00000,
	"xar"          => 0x04203400,
	"zip1"         => 0x05206000,
	"zip2"         => 0x05206400,
	"uzp1"         => 0x05206800,
	"uzp2"         => 0x05206C00,
	"index"        => 0x04204C00,
	"mov"          => 0x05203800,
	"dup"          => 0x05203800,
	"cntw"         => 0x04A0E000,
	"tbl"          => 0x05203000);

my  %opcode_imm_unpred = (
	"dup"          => 0x2538C000,
	"index"        => 0x04204400);

my %opcode_scalar_pred = (
	"mov"          => 0x0528A000,
	"cpy"          => 0x0528A000,
	"st4w"         => 0xE5606000,
	"st1w"         => 0xE5004000,
	"ld1w"         => 0xA5404000);

my %opcode_gather_pred = (
	"ld1w"         => 0x85204000);

my  %opcode_pred = (
	"eor"          => 0x04190000,
	"add"          => 0x04000000,
	"orr"          => 0x04180000,
	"whilelo"      => 0x25200C00,
	"whilelt"      => 0x25200400,
	"cntp"         => 0x25208000,
	"addvl"        => 0x04205000,
	"lsl"          => 0x04038000,
	"lsr"          => 0x04018000,
	"sel"          => 0x0520C000,
	"mov"          => 0x0520C000,
	"ptrue"        => 0x2518E000,
	"pfalse"       => 0x2518E400,
	"ptrues"       => 0x2519E000,
	"pnext"        => 0x2519C400,
	"ld4w"         => 0xA560E000,
	"st4w"         => 0xE570E000,
	"st1w"         => 0xE500E000,
	"ld1w"         => 0xA540A000,
	"ld1rw"        => 0x8540C000,
	"lasta"        => 0x0520A000,
	"revh"         => 0x05258000,
	"revb"         => 0x05248000);

my  %tsize = (
	'b'          => 0,
	'h'          => 1,
	's'          => 2,
	'd'          => 3);

my %sf = (
	"w"          => 0,
	"x"          => 1);

my %pattern = (
	"POW2"       => 0,
	"VL1"        => 1,
	"VL2"        => 2,
	"VL3"        => 3,
	"VL4"        => 4,
	"VL5"        => 5,
	"VL6"        => 6,
	"VL7"        => 7,
	"VL8"        => 8,
	"VL16"       => 9,
	"VL32"       => 10,
	"VL64"       => 11,
	"VL128"      => 12,
	"VL256"      => 13,
	"MUL4"       => 29,
	"MUL3"       => 30,
	"ALL"        => 31);

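# Debug aid: with $debug_encoder=1 the script writes compile_sve.sh, feeds
# every instruction through a real toolchain, and compares that encoding
# against the hand-computed one in verify_inst() below.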
sub create_verifier {
	my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
	open(FH, '>', $filename) or die $!;
	print FH $scripts;
	close(FH);
	system("chmod a+x ./compile_sve.sh");
}

sub compile_sve {
	return `./compile_sve.sh '@_'`
}

sub verify_inst {
	my ($code,$inst)=@_;
	my $hexcode = (sprintf "%08x", $code);

	if ($debug_encoder == 1) {
		my $expect=&compile_sve($inst);
		if ($expect ne $hexcode) {
			return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
		}
	}
	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}

sub reg_code {
	my $code = shift;

	if ($code eq "zr") {
		return "31";
	}
	return $code;
}

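# SVE shift immediates fold the element size into the encoding: lsl by n
# encodes esize+n, while lsr and xar encode 2*esize-n.  E.g. "lsl z.s,#12"
# yields 32+12 = 44 = 0b101100, split below into a high part at bit 22 and
# five low bits at bit 16 (bit 5 for the predicated form).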
sub encode_size_imm() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}

sub encode_shift_pred() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}

sub sve_unpred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
					$inst)
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $regs=$3;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
					$inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
		} else {
			my $encoded_size = 0;
			if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
				$encoded_size = ($tsize{$isize}<<22);
			}
			if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
				$1 == $regd) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
			} elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
			}
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
					$inst)
	}
	sprintf "%s // failed to parse", $inst;
}

sub sve_pred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
		my $zt = $1;
		my $size = $tsize{$2};
		my $pg = $3;
		my $addr = $5;
		my $xn = 31;

		if ($addr =~ m/x([0-9]+)\s*/o) {
			$xn = $1;
		}

		if ($mnemonic =~m/ld1r[bhwd]/o) {
			$size = 0;
		}
		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
			my $xs = ($2 eq "SXTW") ? 1 : 0;
			return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $pg = $3;
		my $mod = $4;
		my $regs = $5;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& $regd == $1
				&& $mod eq 'm'
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
		} elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
			if ($mnemonic eq "sel") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
			} elsif ($mnemonic eq "mov") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
			} elsif (length $2 > 0) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
			} else {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
			}
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
		my $pg = $1;
		my $isize = $2;
		my $regs = $3;

		if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
		} elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
	}

	sprintf "%s // failed to parse", $inst;
}

sub sve_other {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
	} elsif ($arg =~ m/(x|w)([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*z([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$5}<<22)|$1|($3<<10)|($4<<5)|$2, $inst);
	} elsif ($mnemonic =~ /inc[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16)|0xE000, $inst);
		} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16)|0xC000, $inst);
		} elsif ($arg =~ m/x([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16)|0xE000, $inst);
		}
	} elsif ($mnemonic =~ /cnt[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		}
	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
	}
	sprintf "%s // failed to parse", $inst;
}
}

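# Copy the comment banner at the top of this file into the generated
# assembly, rewriting '#' comments as '//'.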
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

if ($debug_encoder == 1) {
	&create_verifier();
}

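# Final pass over the accumulated $code: evaluate `backtick` interpolation,
# then route each SVE mnemonic through the matching encoder above; anything
# that does not match is printed unchanged for the stock assembler.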
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
	s/\b(movprfx|lasta|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z|w).*)/sve_other($1,$2)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";