#! /usr/bin/env perl
# Copyright 2016-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# April 2019
#
# Replace the 3xNEON+1xIALU code path with 4+1. 4+1 is actually the
# fastest option on most(*), but not all, processors, yet 6+2 is
# retained. This is because its penalties are considered tolerable in
# comparison to the improvement on processors where 6+2 helps, most
# notably +37% on ThunderX2, a server-oriented processor that has to
# serve as many requests as possible. The others are mostly client
# processors, where performance doesn't have to be absolute top-notch,
# just fast enough, as the majority of time is spent "entertaining" a
# relatively slow human.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9	4xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	2.72		1.60
# Cortex-A53		8.40/+80%	4.06		4.45(*)
# Cortex-A57		8.06/+43%	4.15		4.40(*)
# Denver		4.50/+82%	2.30		2.70(*)
# X-Gene		9.50/+46%	8.20		8.90(*)
# Mongoose		8.00/+44%	2.74		3.12(*)
# Kryo			8.17/+50%	4.47		4.65(*)
# ThunderX2		7.22/+48%	5.64		4.10
#
# (*)	slower than 4+1:-(

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
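# Typical invocation (a sketch only; the exact flavour and output names
# come from the build system): "perl chacha-armv8.pl linux64 chacha-armv8.S".
# The flavour is passed on to arm-xlate.pl and the generated assembly is
# written to the named output file.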

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
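# For reference (an illustrative note, not generated code): a call such as
# &ror_32("x7","x7",16) is routed through AUTOLOAD, the first "_" becomes
# ".", the trailing numeric argument gains a "#", and "\tror.32\tx7,x7,#16"
# is appended to $code; the post-processing loop at the bottom later
# rewrites ".32" pseudo-ops to operate on w-registers.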

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	 "&ror_32	(@x[$d1],@x[$d1],16)",
	  "&ror_32	(@x[$d2],@x[$d2],16)",
	   "&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	 "&ror_32	(@x[$b1],@x[$b1],20)",
	  "&ror_32	(@x[$b2],@x[$b2],20)",
	   "&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	 "&ror_32	(@x[$d1],@x[$d1],24)",
	  "&ror_32	(@x[$d2],@x[$d2],24)",
	   "&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	 "&ror_32	(@x[$b1],@x[$b1],25)",
	  "&ror_32	(@x[$b2],@x[$b2],25)",
	   "&ror_32	(@x[$b3],@x[$b3],25)"
    );
}
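# For reference: ROUND() returns the text of four interleaved ChaCha
# quarter-rounds, each of which is the usual
#	a += b; d ^= a; d >>>= 16;
#	c += d; b ^= c; b >>>= 20;
#	a += b; d ^= a; d >>>= 24;
#	c += d; b ^= c; b >>>= 25;
# sequence on 32-bit words (ror by 16/20/24/25 is rotate-left by
# 16/12/8/7).  The index arithmetic in the map() lines rotates the
# indices so that one call covers all four columns, or all four
# diagonals, depending on the arguments.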

$code.=<<___;
#include "arm_arch.h"
#ifndef	__KERNEL__
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.extern ChaCha20_ctr32_sve
#endif

.rodata

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,2,3,4
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"
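// Reference notes on the constants above (assumptions spelled out, not
// generated logic): .Lsigma is the "expand 32-byte k" constant, .Lone
// provides the per-lane counter offsets 1..4 used by the 4x NEON path,
// and .Lrot24 is a TBL byte permutation that rotates each 32-bit lane
// left by 8 bits (i.e. right by 24) for the 24-bit rotation step.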

.text

.globl	ChaCha20_ctr32_dflt
.type	ChaCha20_ctr32_dflt,%function
.align	5
ChaCha20_ctr32_dflt:
	AARCH64_SIGN_LINK_REGISTER
	cmp	$len,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
.Lcheck_neon:
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__AARCH64EB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
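# Ten iterations of one column round plus one diagonal round give the
# twenty ChaCha rounds; the $ctr register doubles as the round counter.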
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_SIGN_LINK_REGISTER
	cbz	$len,.Labort
	cmp	$len,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV8_SVE
	b.eq	.Lcheck_neon
	stp	x29,x30,[sp,#-16]!
	sub	sp,sp,#16
	// SVE handling will inevitably increment the counter.  The
	// NEON/scalar code that follows to process the tail data has to
	// use the new counter value; unfortunately, the input counter
	// buffer pointed to by ctr is meant to be read-only per the API
	// contract, so copy it to the stack where SVE can update it.
	ldp	x5,x6,[$ctr]
	stp	x5,x6,[sp]
	mov	$ctr,sp
	bl	ChaCha20_ctr32_sve
	cbz	$len,1f
	bl	ChaCha20_ctr32_dflt
1:
	add	sp,sp,#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
#endif
	b	.Lshort
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my @K = map("v$_.4s",(0..3));
my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9));
my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X;

sub NEON_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("'$_'",@X);

	(
	"&add		(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	 "&add		(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	  "&add		(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	   "&add	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&eor		(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor		(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor		(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor	(@x[$d3],@x[$d3],@x[$a3])",
	"&rev32_16	(@x[$d0],@x[$d0])",
	 "&rev32_16	(@x[$d1],@x[$d1])",
	  "&rev32_16	(@x[$d2],@x[$d2])",
	   "&rev32_16	(@x[$d3],@x[$d3])",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	 "&add		(@x[$c1],@x[$c1],@x[$d1])",
	  "&add		(@x[$c2],@x[$c2],@x[$d2])",
	   "&add	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	 "&eor		('$xt1',@x[$b1],@x[$c1])",
	  "&eor		('$xt2',@x[$b2],@x[$c2])",
	   "&eor	('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',20)",
	 "&ushr		(@x[$b1],'$xt1',20)",
	  "&ushr	(@x[$b2],'$xt2',20)",
	   "&ushr	(@x[$b3],'$xt3',20)",
	"&sli		(@x[$b0],'$xt0',12)",
	 "&sli		(@x[$b1],'$xt1',12)",
	  "&sli		(@x[$b2],'$xt2',12)",
	   "&sli	(@x[$b3],'$xt3',12)",

	"&add		(@x[$a0],@x[$a0],@x[$b0])",
	 "&add		(@x[$a1],@x[$a1],@x[$b1])",
	  "&add		(@x[$a2],@x[$a2],@x[$b2])",
	   "&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor		('$xt0',@x[$d0],@x[$a0])",
	 "&eor		('$xt1',@x[$d1],@x[$a1])",
	  "&eor		('$xt2',@x[$d2],@x[$a2])",
	   "&eor	('$xt3',@x[$d3],@x[$a3])",
	"&tbl		(@x[$d0],'{$xt0}','$ROT24')",
	 "&tbl		(@x[$d1],'{$xt1}','$ROT24')",
	  "&tbl		(@x[$d2],'{$xt2}','$ROT24')",
	   "&tbl	(@x[$d3],'{$xt3}','$ROT24')",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	 "&add		(@x[$c1],@x[$c1],@x[$d1])",
	  "&add		(@x[$c2],@x[$c2],@x[$d2])",
	   "&add	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	 "&eor		('$xt1',@x[$b1],@x[$c1])",
	  "&eor		('$xt2',@x[$b2],@x[$c2])",
	   "&eor	('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',25)",
	 "&ushr		(@x[$b1],'$xt1',25)",
	  "&ushr	(@x[$b2],'$xt2',25)",
	   "&ushr	(@x[$b3],'$xt3',25)",
	"&sli		(@x[$b0],'$xt0',7)",
	 "&sli		(@x[$b1],'$xt1',7)",
	  "&sli		(@x[$b2],'$xt2',7)",
	   "&sli	(@x[$b3],'$xt3',7)"
	);
}
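# For reference: in the NEON lane rounds each rotation uses the cheapest
# available idiom rather than a single instruction - rotate by 16 as
# rev32 on .8h elements, rotate by 8 (the "ror 24" step) as a tbl byte
# shuffle through the .Lrot24 table, and rotates by 12 and 7 as
# ushr+sli pairs (shift right by 20/25, then insert the left shift).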

$code.=<<___;

#ifdef	__KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
.LChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	stp	d8,d9,[sp]			// meet ABI requirements
	ld1	{$CTR,$ROT24},[@x[0]]
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer_neon:
	dup	$xa0,@{K[0]}[0]			// unpack key block
	 mov.32	@x[0],@d[0]
	dup	$xa1,@{K[0]}[1]
	 lsr	@x[1],@d[0],#32
	dup	$xa2,@{K[0]}[2]
	 mov.32	@x[2],@d[1]
	dup	$xa3,@{K[0]}[3]
	 lsr	@x[3],@d[1],#32
	dup	$xb0,@{K[1]}[0]
	 mov.32	@x[4],@d[2]
	dup	$xb1,@{K[1]}[1]
	 lsr	@x[5],@d[2],#32
	dup	$xb2,@{K[1]}[2]
	 mov.32	@x[6],@d[3]
	dup	$xb3,@{K[1]}[3]
	 lsr	@x[7],@d[3],#32
	dup	$xd0,@{K[3]}[0]
	 mov.32	@x[8],@d[4]
	dup	$xd1,@{K[3]}[1]
	 lsr	@x[9],@d[4],#32
	dup	$xd2,@{K[3]}[2]
	 mov.32	@x[10],@d[5]
	dup	$xd3,@{K[3]}[3]
	 lsr	@x[11],@d[5],#32
	add	$xd0,$xd0,$CTR
	 mov.32	@x[12],@d[6]
	dup	$xc0,@{K[2]}[0]
	 lsr	@x[13],@d[6],#32
	dup	$xc1,@{K[2]}[1]
	 mov.32	@x[14],@d[7]
	dup	$xc2,@{K[2]}[2]
	 lsr	@x[15],@d[7],#32
	dup	$xc3,@{K[2]}[3]

	mov	$ctr,#10
	subs	$len,$len,#320
.Loop_neon:
	sub	$ctr,$ctr,#1
___
	my @plus_one=&ROUND(0,4,8,12);
	foreach (&NEON_lane_ROUND(0,4,8,12))  { eval; eval(shift(@plus_one)); }

	@plus_one=&ROUND(0,5,10,15);
	foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); }
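# This interleaves the scalar (IALU) round with the NEON lane rounds:
# four blocks live in vector registers and a fifth in general-purpose
# registers, which is the 4xNEON+1xIALU path (5 x 64 = 320 bytes per
# outer iteration, matching the "subs $len,$len,#320" above).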
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add	$xd0,$xd0,$CTR

	zip1	$xt0,$xa0,$xa1			// transpose data
	zip1	$xt1,$xa2,$xa3
	zip2	$xt2,$xa0,$xa1
	zip2	$xt3,$xa2,$xa3
	zip1.64	$xa0,$xt0,$xt1
	zip2.64	$xa1,$xt0,$xt1
	zip1.64	$xa2,$xt2,$xt3
	zip2.64	$xa3,$xt2,$xt3

	zip1	$xt0,$xb0,$xb1
	zip1	$xt1,$xb2,$xb3
	zip2	$xt2,$xb0,$xb1
	zip2	$xt3,$xb2,$xb3
	zip1.64	$xb0,$xt0,$xt1
	zip2.64	$xb1,$xt0,$xt1
	zip1.64	$xb2,$xt2,$xt3
	zip2.64	$xb3,$xt2,$xt3

	zip1	$xt0,$xc0,$xc1
	 add.32	@x[0],@x[0],@d[0]		// accumulate key block
	zip1	$xt1,$xc2,$xc3
	 add	@x[1],@x[1],@d[0],lsr#32
	zip2	$xt2,$xc0,$xc1
	 add.32	@x[2],@x[2],@d[1]
	zip2	$xt3,$xc2,$xc3
	 add	@x[3],@x[3],@d[1],lsr#32
	zip1.64	$xc0,$xt0,$xt1
	 add.32	@x[4],@x[4],@d[2]
	zip2.64	$xc1,$xt0,$xt1
	 add	@x[5],@x[5],@d[2],lsr#32
	zip1.64	$xc2,$xt2,$xt3
	 add.32	@x[6],@x[6],@d[3]
	zip2.64	$xc3,$xt2,$xt3
	 add	@x[7],@x[7],@d[3],lsr#32

	zip1	$xt0,$xd0,$xd1
	 add.32	@x[8],@x[8],@d[4]
	zip1	$xt1,$xd2,$xd3
	 add	@x[9],@x[9],@d[4],lsr#32
	zip2	$xt2,$xd0,$xd1
	 add.32	@x[10],@x[10],@d[5]
	zip2	$xt3,$xd2,$xd3
	 add	@x[11],@x[11],@d[5],lsr#32
	zip1.64	$xd0,$xt0,$xt1
	 add.32	@x[12],@x[12],@d[6]
	zip2.64	$xd1,$xt0,$xt1
	 add	@x[13],@x[13],@d[6],lsr#32
	zip1.64	$xd2,$xt2,$xt3
	 add.32	@x[14],@x[14],@d[7]
	zip2.64	$xd3,$xt2,$xt3
	 add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$xa0,$xa0,@K[0]			// accumulate key block
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$xb0,$xb0,@K[1]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$xc0,$xc0,@K[2]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$xd0,$xd0,@K[3]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	 add	$xa1,$xa1,@K[0]
	eor	@x[2],@x[2],@x[3]
	 add	$xb1,$xb1,@K[1]
	eor	@x[4],@x[4],@x[5]
	 add	$xc1,$xc1,@K[2]
	eor	@x[6],@x[6],@x[7]
	 add	$xd1,$xd1,@K[3]
	eor	@x[8],@x[8],@x[9]
	 eor	$xa0,$xa0,$xt0
	 movi	$xt0,#5
	eor	@x[10],@x[10],@x[11]
	 eor	$xb0,$xb0,$xt1
	eor	@x[12],@x[12],@x[13]
	 eor	$xc0,$xc0,$xt2
	eor	@x[14],@x[14],@x[15]
	 eor	$xd0,$xd0,$xt3
	 add	$CTR,$CTR,$xt0			// += 5
	 ld1.8	{$xt0-$xt3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#5			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	st1.8	{$xa0-$xd0},[$out],#64
	 add	$xa2,$xa2,@K[0]
	 add	$xb2,$xb2,@K[1]
	 add	$xc2,$xc2,@K[2]
	 add	$xd2,$xd2,@K[3]
	ld1.8	{$xa0-$xd0},[$inp],#64

	eor	$xa1,$xa1,$xt0
	eor	$xb1,$xb1,$xt1
	eor	$xc1,$xc1,$xt2
	eor	$xd1,$xd1,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	 add	$xa3,$xa3,@K[0]
	 add	$xb3,$xb3,@K[1]
	 add	$xc3,$xc3,@K[2]
	 add	$xd3,$xd3,@K[3]
	ld1.8	{$xa1-$xd1},[$inp],#64

	eor	$xa2,$xa2,$xa0
	eor	$xb2,$xb2,$xb0
	eor	$xc2,$xc2,$xc0
	eor	$xd2,$xd2,$xd0
	st1.8	{$xa2-$xd2},[$out],#64

	eor	$xa3,$xa3,$xa1
	eor	$xb3,$xb3,$xb1
	eor	$xc3,$xc3,$xc1
	eor	$xd3,$xd3,$xd1
	st1.8	{$xa3-$xd3},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	d8,d9,[sp]			// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail_neon:
	add	$len,$len,#320
	ldp	d8,d9,[sp]			// meet ABI requirements
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	$xa0,$xa0,@K[0]			// accumulate key block
	stp	@x[4],@x[6],[$out,#16]
	 add	$xb0,$xb0,@K[1]
	stp	@x[8],@x[10],[$out,#32]
	 add	$xc0,$xc0,@K[2]
	stp	@x[12],@x[14],[$out,#48]
	 add	$xd0,$xd0,@K[3]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa0,$xa0,$xt0
	eor	$xb0,$xb0,$xt1
	eor	$xc0,$xc0,$xt2
	eor	$xd0,$xd0,$xt3
	st1.8	{$xa0-$xd0},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa1,@K[0]
	add	$xb0,$xb1,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc1,@K[2]
	cmp	$len,#64
	add	$xd0,$xd1,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa1,$xa0,$xt0
	eor	$xb1,$xb0,$xt1
	eor	$xc1,$xc0,$xt2
	eor	$xd1,$xd0,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa2,@K[0]
	add	$xb0,$xb2,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc2,@K[2]
	cmp	$len,#64
	add	$xd0,$xd2,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa2,$xa0,$xt0
	eor	$xb2,$xb0,$xt1
	eor	$xc2,$xc0,$xt2
	eor	$xd2,$xd0,$xt3
	st1.8	{$xa2-$xd2},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa3,@K[0]
	add	$xb0,$xb3,@K[1]
	add	$xc0,$xc3,@K[2]
	add	$xd0,$xd3,@K[3]
	sub	$len,$len,#64

.Last_neon:
	st1.8	{$xa0-$xd0},[sp]

	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my @K = map("v$_.4s",(0..6));
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31));
my $rot24 = @K[6];
my $ONE = "v7.4s";

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&tbl		('$d','{$d}','$rot24')",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
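# For reference: NEONROUND() emits one ChaCha round for a single 64-byte
# block whose four state rows each occupy one .4s register, so the
# quarter-round acts on all four columns at once.  The trailing ext
# instructions rotate the b/c/d rows by one, two and three lanes to
# switch between the column and diagonal arrangement ($odd selects the
# direction).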

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	eor	$ONE,$ONE,$ONE
	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE}[0],[@x[0]]
	add	$key,@x[0],#16			// .Lrot24
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	 mov	$A0,@K[0]
	 mov	$A1,@K[0]
	 mov	$A2,@K[0]
	 mov	$A3,@K[0]
	 mov	$A4,@K[0]
	 mov	$A5,@K[0]
	 mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	 mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	 mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	 mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	 mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	 mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	 mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	 mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	 mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	 mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	 mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	 mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	 add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	 add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	 mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	 mov	$C4,@K[2]
	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	 mov	$C5,@K[2]
	 stp	@K[5],@K[6],[sp,#80]

	mov	$ctr,#5
	ld1	{$rot24},[$key]
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
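# Six such block "threads" in NEON registers are interleaved with the
# scalar ROUND() instructions below; together with the two scalar blocks
# produced per outer iteration this forms the 6xNEON+2xIALU path
# (8 x 64 = 512 bytes per iteration).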
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	 stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 stp	@x[12],@x[14],[$out,#48]
	 add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	 ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	 ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	 ldr	@K[6],[sp,#96]
	 add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	 add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	 add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	 add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	 add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	 add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	 add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	 add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	 add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	 add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	 add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	 add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	 add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	 add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	 add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	 add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	 add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	 add	$B5,$B5,@K[1]

#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	 ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	 ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$ONE,$ONE,#1			// 4 -> 2

	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],@K[0],[sp,#0]		// wipe off-load area
	stp	@K[0],@K[0],[sp,#32]
	stp	@K[0],@K[0],[sp,#64]

	b.eq	.Ldone_512_neon

	sub	$key,$key,#16			// .Lone
	cmp	$len,#192
	add	sp,sp,#128
	sub	@K[3],@K[3],$ONE		// -= 2
	ld1	{$CTR,$ROT24},[$key]
	b.hs	.Loop_outer_neon

	ldp	d8,d9,[sp,#0]			// meet ABI requirements
	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

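# Post-processing of the generated text (a summary of the rules below):
# backtick expressions are evaluated, "op.32" pseudo-instructions are
# rewritten to operate on w-registers, and the .4s arrangement is
# adjusted per instruction - .16b for eor/ext/mov/tbl and the ld1/st1
# ".8" forms, q-registers for ldr/str/ldp/stp, .s[n] for dup/ld1 lane
# forms, .2d for the 64-bit zips, and .8h for the rev32-based rotate.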
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1))	or
	(s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# flush