xref: /freebsd/sys/crypto/openssl/aarch64/chacha-armv8.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
2#include "arm_arch.h"
3#ifndef	__KERNEL__
4
5.hidden	OPENSSL_armcap_P
6
7
8#endif
9
// Read-only constants used by the scalar and NEON code paths in this file.
10.section	.rodata
11
12.align	5
// ChaCha constant "expand 32-byte k": read least-significant byte first, the
// two 64-bit values spell "expand 3" and "2-byte k".
13.Lsigma:
14.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Per-lane block-counter offsets. The NEON path loads these into v8 and adds
// them to the replicated counter word so the four lanes get distinct counters.
15.Lone:
16.long	1,2,3,4
// TBL byte-index table: within every 32-bit lane it selects source bytes
// 3,0,1,2, i.e. rotates the lane left by 8 bits (the vector counterpart of the
// scalar "ror #24").
17.Lrot24:
18.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm"
19.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
20.align	2
21
22.text
23
// ChaCha20_ctr32_dflt -- scalar (general-purpose register) ChaCha20 path,
// with a hand-off to the NEON code for longer inputs in non-kernel builds.
//
// Register use, as established by the loads and stores in this function:
//   x0 = output pointer        x1 = input pointer      x2 = length in bytes
//   x3 = key pointer (32 bytes are read)
//   x4 = counter block pointer (16 bytes are read)
//
// Dispatch: len < 192 always takes the scalar code at .Lshort. Otherwise, when
// not built for the kernel, OPENSSL_armcap_P is tested for ARMV7_NEON and the
// NEON implementation is entered at .LChaCha20_neon if the bit is set.
//
// NOTE(review): unlike ChaCha20_ctr32 there is no zero-length check on this
// entry point. With x2 == 0 control would reach .Loop_tail with index 0, whose
// cbnz exit test only fires once the index wraps. The visible caller
// (ChaCha20_ctr32) only calls here with x2 != 0 -- confirm that any other
// caller of this global symbol guarantees len > 0.
24.globl	ChaCha20_ctr32_dflt
25.type	ChaCha20_ctr32_dflt,%function
26.align	5
27ChaCha20_ctr32_dflt:
28	AARCH64_SIGN_LINK_REGISTER
29	cmp	x2,#192
30	b.lo	.Lshort
31#ifndef	__KERNEL__
32	adrp	x17,OPENSSL_armcap_P
33	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
// .Lcheck_neon is also a branch target of ChaCha20_ctr32, which arrives with
// the capability word already loaded in w17.
34.Lcheck_neon:
35	tst	w17,#ARMV7_NEON
36	b.ne	.LChaCha20_neon
37#endif
38
// .Lshort: scalar implementation. Also entered from ChaCha20_ctr32.
// Frame layout after the prologue: 96 bytes holding x29/x30 and x19-x28
// (addressed through x29), with 64 bytes of scratch below it at [sp].
39.Lshort:
40	stp	x29,x30,[sp,#-96]!
41	add	x29,sp,#0
42
43	adrp	x5,.Lsigma
44	add	x5,x5,#:lo12:.Lsigma
45	stp	x19,x20,[sp,#16]
46	stp	x21,x22,[sp,#32]
47	stp	x23,x24,[sp,#48]
48	stp	x25,x26,[sp,#64]
49	stp	x27,x28,[sp,#80]
50	sub	sp,sp,#64
51
// The 16-word input state is kept packed two 32-bit words per register in
// x22-x28 and x30. x30 (the link register) is free to use as data here because
// it was saved above and is reloaded in the epilogue.
52	ldp	x22,x23,[x5]		// load sigma
53	ldp	x24,x25,[x3]		// load key
54	ldp	x26,x27,[x3,#16]
55	ldp	x28,x30,[x4]		// load counter
// On big-endian builds the two 32-bit halves of each loaded register are
// swapped so the packing matches what the unpack code below expects.
56#ifdef	__AARCH64EB__
57	ror	x24,x24,#32
58	ror	x25,x25,#32
59	ror	x26,x26,#32
60	ror	x27,x27,#32
61	ror	x28,x28,#32
62	ror	x30,x30,#32
63#endif
64
// .Loop_outer: produce one 64-byte keystream block per iteration.
// The packed state in x22-x28/x30 is split into sixteen 32-bit working words:
//   words 0-3  -> w5,w6,w7,w8        words 4-7   -> w9,w10,w11,w12
//   words 8-11 -> w13,w14,w15,w16    words 12-15 -> w17,w19,w20,w21
// ("mov wN,wM" takes the low half, "lsr xN,xM,#32" the high half).
65.Loop_outer:
66	mov	w5,w22			// unpack key block
67	lsr	x6,x22,#32
68	mov	w7,w23
69	lsr	x8,x23,#32
70	mov	w9,w24
71	lsr	x10,x24,#32
72	mov	w11,w25
73	lsr	x12,x25,#32
74	mov	w13,w26
75	lsr	x14,x26,#32
76	mov	w15,w27
77	lsr	x16,x27,#32
78	mov	w17,w28
79	lsr	x19,x28,#32
80	mov	w20,w30
81	lsr	x21,x30,#32
82
// x4 counts 10 passes of .Loop; each pass is one column round followed by one
// diagonal round. The flags set by this subs (len -= 64) stay live through the
// whole round loop -- nothing below sets flags -- and are consumed by
// "b.lo .Ltail" and "b.hi .Loop_outer" further down.
83	mov	x4,#10
84	subs	x2,x2,#64
85.Loop:
86	sub	x4,x4,#1
// Column round, four quarter-rounds interleaved: a += b
87	add	w5,w5,w9
88	add	w6,w6,w10
89	add	w7,w7,w11
90	add	w8,w8,w12
// d ^= a
91	eor	w17,w17,w5
92	eor	w19,w19,w6
93	eor	w20,w20,w7
94	eor	w21,w21,w8
// d = rotate-right(d, 16)
95	ror	w17,w17,#16
96	ror	w19,w19,#16
97	ror	w20,w20,#16
98	ror	w21,w21,#16
// c += d
99	add	w13,w13,w17
100	add	w14,w14,w19
101	add	w15,w15,w20
102	add	w16,w16,w21
// b ^= c
103	eor	w9,w9,w13
104	eor	w10,w10,w14
105	eor	w11,w11,w15
106	eor	w12,w12,w16
// b = rotate-right(b, 20), i.e. rotate-left by 12
107	ror	w9,w9,#20
108	ror	w10,w10,#20
109	ror	w11,w11,#20
110	ror	w12,w12,#20
// a += b
111	add	w5,w5,w9
112	add	w6,w6,w10
113	add	w7,w7,w11
114	add	w8,w8,w12
// d ^= a
115	eor	w17,w17,w5
116	eor	w19,w19,w6
117	eor	w20,w20,w7
118	eor	w21,w21,w8
// d = rotate-right(d, 24), i.e. rotate-left by 8
119	ror	w17,w17,#24
120	ror	w19,w19,#24
121	ror	w20,w20,#24
122	ror	w21,w21,#24
// c += d
123	add	w13,w13,w17
124	add	w14,w14,w19
125	add	w15,w15,w20
126	add	w16,w16,w21
// b ^= c
127	eor	w9,w9,w13
128	eor	w10,w10,w14
129	eor	w11,w11,w15
130	eor	w12,w12,w16
// b = rotate-right(b, 25), i.e. rotate-left by 7
131	ror	w9,w9,#25
132	ror	w10,w10,#25
133	ror	w11,w11,#25
134	ror	w12,w12,#25
// Diagonal round: same sequence of operations, but the b/c/d operands are
// taken one, two and three columns over (e.g. w5 pairs with w10, w15, w21).
135	add	w5,w5,w10
136	add	w6,w6,w11
137	add	w7,w7,w12
138	add	w8,w8,w9
139	eor	w21,w21,w5
140	eor	w17,w17,w6
141	eor	w19,w19,w7
142	eor	w20,w20,w8
143	ror	w21,w21,#16
144	ror	w17,w17,#16
145	ror	w19,w19,#16
146	ror	w20,w20,#16
147	add	w15,w15,w21
148	add	w16,w16,w17
149	add	w13,w13,w19
150	add	w14,w14,w20
151	eor	w10,w10,w15
152	eor	w11,w11,w16
153	eor	w12,w12,w13
154	eor	w9,w9,w14
155	ror	w10,w10,#20
156	ror	w11,w11,#20
157	ror	w12,w12,#20
158	ror	w9,w9,#20
159	add	w5,w5,w10
160	add	w6,w6,w11
161	add	w7,w7,w12
162	add	w8,w8,w9
163	eor	w21,w21,w5
164	eor	w17,w17,w6
165	eor	w19,w19,w7
166	eor	w20,w20,w8
167	ror	w21,w21,#24
168	ror	w17,w17,#24
169	ror	w19,w19,#24
170	ror	w20,w20,#24
171	add	w15,w15,w21
172	add	w16,w16,w17
173	add	w13,w13,w19
174	add	w14,w14,w20
175	eor	w10,w10,w15
176	eor	w11,w11,w16
177	eor	w12,w12,w13
178	eor	w9,w9,w14
179	ror	w10,w10,#25
180	ror	w11,w11,#25
181	ror	w12,w12,#25
182	ror	w9,w9,#25
183	cbnz	x4,.Loop
184
// Add the original input state to the permuted words. Even words use a 32-bit
// add (result zero-extended); odd words use a 64-bit add of the packed high
// half, so any carry lands above bit 31 and is shifted out by the "lsl#32"
// when the pair is packed below.
185	add	w5,w5,w22		// accumulate key block
186	add	x6,x6,x22,lsr#32
187	add	w7,w7,w23
188	add	x8,x8,x23,lsr#32
189	add	w9,w9,w24
190	add	x10,x10,x24,lsr#32
191	add	w11,w11,w25
192	add	x12,x12,x25,lsr#32
193	add	w13,w13,w26
194	add	x14,x14,x26,lsr#32
195	add	w15,w15,w27
196	add	x16,x16,x27,lsr#32
197	add	w17,w17,w28
198	add	x19,x19,x28,lsr#32
199	add	w20,w20,w30
200	add	x21,x21,x30,lsr#32
201
// Flags are still those of "subs x2,x2,#64": fewer than 64 bytes were left,
// so finish with the byte-wise tail.
202	b.lo	.Ltail
203
// Full block: pack word pairs into 64-bit registers while loading 64 input
// bytes into the registers that have just been consumed by the packing.
204	add	x5,x5,x6,lsl#32	// pack
205	add	x7,x7,x8,lsl#32
206	ldp	x6,x8,[x1,#0]		// load input
207	add	x9,x9,x10,lsl#32
208	add	x11,x11,x12,lsl#32
209	ldp	x10,x12,[x1,#16]
210	add	x13,x13,x14,lsl#32
211	add	x15,x15,x16,lsl#32
212	ldp	x14,x16,[x1,#32]
213	add	x17,x17,x19,lsl#32
214	add	x20,x20,x21,lsl#32
215	ldp	x19,x21,[x1,#48]
216	add	x1,x1,#64
// Big-endian builds byte-reverse the keystream registers before the XOR.
217#ifdef	__AARCH64EB__
218	rev	x5,x5
219	rev	x7,x7
220	rev	x9,x9
221	rev	x11,x11
222	rev	x13,x13
223	rev	x15,x15
224	rev	x17,x17
225	rev	x20,x20
226#endif
// output = keystream XOR input
227	eor	x5,x5,x6
228	eor	x7,x7,x8
229	eor	x9,x9,x10
230	eor	x11,x11,x12
231	eor	x13,x13,x14
232	eor	x15,x15,x16
233	eor	x17,x17,x19
234	eor	x20,x20,x21
235
// The counter increment is a 64-bit add on the packed register x28: its low
// half is state word 12 and its high half is state word 13 (see the unpack at
// the top of .Loop_outer).
236	stp	x5,x7,[x0,#0]		// store output
237	add	x28,x28,#1			// increment counter
238	stp	x9,x11,[x0,#16]
239	stp	x13,x15,[x0,#32]
240	stp	x17,x20,[x0,#48]
241	add	x0,x0,#64
242
// Still the flags from "subs x2,x2,#64": loop while bytes remain.
243	b.hi	.Loop_outer
244
// Epilogue: callee-saved registers are reloaded through the frame pointer,
// the 64-byte scratch area is released, then x29/x30 are popped.
245	ldp	x19,x20,[x29,#16]
246	add	sp,sp,#64
247	ldp	x21,x22,[x29,#32]
248	ldp	x23,x24,[x29,#48]
249	ldp	x25,x26,[x29,#64]
250	ldp	x27,x28,[x29,#80]
251	ldp	x29,x30,[sp],#96
// .Labort is also the zero-length early exit of ChaCha20_ctr32, reached there
// right after its AARCH64_SIGN_LINK_REGISTER with no frame set up.
252.Labort:
253	AARCH64_VALIDATE_LINK_REGISTER
254	ret
255
// Partial final block of the scalar path. .Ltail undoes the "subs x2,x2,#64"
// so x2 is again the number of bytes left.
256.align	4
257.Ltail:
258	add	x2,x2,#64
// .Less_than_64 is also entered from .Ltail_neon when fewer than 64 bytes
// remain; in both cases the accumulated, still unpacked keystream block is in
// w5-w17/w19-w21.
// The pointers are moved to the end of the data and x2 is negated so the byte
// loop can index with a negative offset that counts up to zero. x0 gets an
// extra -1 because the store in the loop happens after x2 was incremented.
// x4 points x2 bytes into the 64-byte stack scratch area.
259.Less_than_64:
260	sub	x0,x0,#1
261	add	x1,x1,x2
262	add	x0,x0,x2
263	add	x4,sp,x2
264	neg	x2,x2
265
266	add	x5,x5,x6,lsl#32	// pack
267	add	x7,x7,x8,lsl#32
268	add	x9,x9,x10,lsl#32
269	add	x11,x11,x12,lsl#32
270	add	x13,x13,x14,lsl#32
271	add	x15,x15,x16,lsl#32
272	add	x17,x17,x19,lsl#32
273	add	x20,x20,x21,lsl#32
274#ifdef	__AARCH64EB__
275	rev	x5,x5
276	rev	x7,x7
277	rev	x9,x9
278	rev	x11,x11
279	rev	x13,x13
280	rev	x15,x15
281	rev	x17,x17
282	rev	x20,x20
283#endif
// Spill the whole 64-byte keystream block to the stack scratch area so it can
// be consumed one byte at a time.
284	stp	x5,x7,[sp,#0]
285	stp	x9,x11,[sp,#16]
286	stp	x13,x15,[sp,#32]
287	stp	x17,x20,[sp,#48]
288
// out[i] = in[i] ^ keystream[i], one byte per iteration, until x2 reaches 0.
289.Loop_tail:
290	ldrb	w10,[x1,x2]
291	ldrb	w11,[x4,x2]
292	add	x2,x2,#1
293	eor	w10,w10,w11
294	strb	w10,[x0,x2]
295	cbnz	x2,.Loop_tail
296
// Overwrite the keystream copy on the stack with zeros before returning.
297	stp	xzr,xzr,[sp,#0]
298	stp	xzr,xzr,[sp,#16]
299	stp	xzr,xzr,[sp,#32]
300	stp	xzr,xzr,[sp,#48]
301
// Epilogue, identical in shape to the one after .Loop_outer.
302	ldp	x19,x20,[x29,#16]
303	add	sp,sp,#64
304	ldp	x21,x22,[x29,#32]
305	ldp	x23,x24,[x29,#48]
306	ldp	x25,x26,[x29,#64]
307	ldp	x27,x28,[x29,#80]
308	ldp	x29,x30,[sp],#96
309	AARCH64_VALIDATE_LINK_REGISTER
310	ret
311.size	ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt
312
// ChaCha20_ctr32 -- public entry point and dispatcher.
// Arguments are passed through unchanged to the implementations it selects:
//   x0 = output, x1 = input, x2 = length in bytes, x3 = key, x4 = counter block.
//
//   len == 0            -> return immediately via .Labort
//   len <  192          -> scalar code at .Lshort
//   len >= 192, kernel  -> scalar code at .Lshort (the final "b .Lshort")
//   len >= 192, userland:
//       ARMV8_SVE clear -> .Lcheck_neon (NEON if available, else scalar)
//       ARMV8_SVE set   -> ChaCha20_ctr32_sve, then ChaCha20_ctr32_dflt for
//                          whatever length is left in x2
313.globl	ChaCha20_ctr32
314.type	ChaCha20_ctr32,%function
315.align	5
316ChaCha20_ctr32:
317	AARCH64_SIGN_LINK_REGISTER
318	cbz	x2,.Labort
319	cmp	x2,#192
320	b.lo	.Lshort
321#ifndef	__KERNEL__
322	adrp	x17,OPENSSL_armcap_P
323	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
324	tst	w17,#ARMV8_SVE
325	b.eq	.Lcheck_neon
// SVE path: a 16-byte frame for x29/x30 plus 16 bytes of scratch that receive
// a private copy of the caller's counter block.
326	stp	x29,x30,[sp,#-16]!
327	sub	sp,sp,#16
328	// SVE handling will inevitably increment the counter
329	// Neon/Scalar code that follows to process tail data needs to
330	// use new counter, unfortunately the input counter buffer
331	// pointed to by ctr is meant to be read-only per API contract
332	// we have to copy the buffer to stack to be writable by SVE
333	ldp	x5,x6,[x4]
334	stp	x5,x6,[sp]
335	mov	x4,sp
// ChaCha20_ctr32_sve is defined outside this file. Presumably it returns the
// unprocessed byte count in x2 with x0/x1 advanced -- confirm against its
// definition. A zero x2 skips the fallback call.
336	bl	ChaCha20_ctr32_sve
337	cbz	x2,1f
338	bl	ChaCha20_ctr32_dflt
3391:
340	add	sp,sp,#16
341	ldp	x29,x30,[sp],#16
342	AARCH64_VALIDATE_LINK_REGISTER
343	ret
344#endif
// Only reachable in __KERNEL__ builds (the userland code above always
// branches away or returns): lengths >= 192 also use the scalar code.
345	b	.Lshort
346.size	ChaCha20_ctr32,.-ChaCha20_ctr32
347
// ChaCha20_neon -- combined scalar + NEON implementation, five 64-byte blocks
// per outer iteration (one in general-purpose registers, four in vector lanes).
// Same argument registers as ChaCha20_ctr32_dflt (x0 out, x1 in, x2 len,
// x3 key, x4 counter block). The symbol is exported only in __KERNEL__ builds.
348#ifdef	__KERNEL__
349.globl	ChaCha20_neon
350#endif
351.type	ChaCha20_neon,%function
352.align	5
353ChaCha20_neon:
354	AARCH64_SIGN_LINK_REGISTER
// .LChaCha20_neon: entry used by ChaCha20_ctr32_dflt, which has already
// executed AARCH64_SIGN_LINK_REGISTER and therefore skips the one above.
355.LChaCha20_neon:
356	stp	x29,x30,[sp,#-96]!
357	add	x29,sp,#0
358
359	adrp	x5,.Lsigma
360	add	x5,x5,#:lo12:.Lsigma
361	stp	x19,x20,[sp,#16]
362	stp	x21,x22,[sp,#32]
363	stp	x23,x24,[sp,#48]
364	stp	x25,x26,[sp,#64]
365	stp	x27,x28,[sp,#80]
// Inputs of 512 bytes or more continue inside ChaCha20_512_neon, which expects
// the same 96-byte frame and x5 = .Lsigma that were just set up.
366	cmp	x2,#512
367	b.hs	.L512_or_more_neon
368
// 64 bytes of scratch: holds d8/d9 while the loop runs and is reused as the
// keystream buffer for a trailing partial block.
369	sub	sp,sp,#64
370
// The state is loaded twice: packed in x22-x28/x30 for the scalar block and as
// vectors for the NEON blocks (v0 = sigma, v1/v2 = key, v3 = counter block).
// The post-incremented sigma load leaves x5 at .Lone, so the final ld1 fetches
// v8 = .Lone (1,2,3,4) and v9 = .Lrot24 (TBL index table).
371	ldp	x22,x23,[x5]		// load sigma
372	ld1	{v0.4s},[x5],#16
373	ldp	x24,x25,[x3]		// load key
374	ldp	x26,x27,[x3,#16]
375	ld1	{v1.4s,v2.4s},[x3]
376	ldp	x28,x30,[x4]		// load counter
377	ld1	{v3.4s},[x4]
// d8/d9 are saved before v8/v9 are overwritten with constants.
378	stp	d8,d9,[sp]			// meet ABI requirements
379	ld1	{v8.4s,v9.4s},[x5]
380#ifdef	__AARCH64EB__
381	rev64	v0.4s,v0.4s
382	ror	x24,x24,#32
383	ror	x25,x25,#32
384	ror	x26,x26,#32
385	ror	x27,x27,#32
386	ror	x28,x28,#32
387	ror	x30,x30,#32
388#endif
389
// .Loop_outer_neon: one iteration yields 320 bytes of keystream -- one block
// in scalar registers (same register assignment as .Loop_outer) and four
// blocks in NEON registers, with the two instruction streams interleaved.
//
// Vector layout during the rounds: every state word is broadcast to all four
// lanes of its own register, so lane i of each register belongs to block i:
//   words 0-3   -> v16,v20,v24,v28      words 4-7   -> v17,v21,v25,v29
//   words 8-11  -> v18,v22,v26,v30      words 12-15 -> v19,v23,v27,v31
// v19 (word 12, the block counter) additionally gets the per-lane offsets
// held in v8.
390.Loop_outer_neon:
391	dup	v16.4s,v0.s[0]			// unpack key block
392	mov	w5,w22
393	dup	v20.4s,v0.s[1]
394	lsr	x6,x22,#32
395	dup	v24.4s,v0.s[2]
396	mov	w7,w23
397	dup	v28.4s,v0.s[3]
398	lsr	x8,x23,#32
399	dup	v17.4s,v1.s[0]
400	mov	w9,w24
401	dup	v21.4s,v1.s[1]
402	lsr	x10,x24,#32
403	dup	v25.4s,v1.s[2]
404	mov	w11,w25
405	dup	v29.4s,v1.s[3]
406	lsr	x12,x25,#32
407	dup	v19.4s,v3.s[0]
408	mov	w13,w26
409	dup	v23.4s,v3.s[1]
410	lsr	x14,x26,#32
411	dup	v27.4s,v3.s[2]
412	mov	w15,w27
413	dup	v31.4s,v3.s[3]
414	lsr	x16,x27,#32
415	add	v19.4s,v19.4s,v8.4s
416	mov	w17,w28
417	dup	v18.4s,v2.s[0]
418	lsr	x19,x28,#32
419	dup	v22.4s,v2.s[1]
420	mov	w20,w30
421	dup	v26.4s,v2.s[2]
422	lsr	x21,x30,#32
423	dup	v30.4s,v2.s[3]
424
// 10 passes of .Loop_neon, each a column round plus a diagonal round.
// The flags from this subs (len -= 320) are not disturbed by anything below
// and are consumed by "b.lo .Ltail_neon" and "b.hi .Loop_outer_neon".
425	mov	x4,#10
426	subs	x2,x2,#320
// Vector rotations used in the loop:
//   rev32 on .8h elements             -> rotate each 32-bit lane by 16
//   ushr #20 followed by sli #12      -> rotate left by 12
//   tbl with v9 (.Lrot24)             -> rotate left by 8
//   ushr #25 followed by sli #7       -> rotate left by 7
// v4-v7 are temporaries for the XOR results that feed the shift pairs and tbl.
427.Loop_neon:
428	sub	x4,x4,#1
// Column round: a += b
429	add	v16.4s,v16.4s,v17.4s
430	add	w5,w5,w9
431	add	v20.4s,v20.4s,v21.4s
432	add	w6,w6,w10
433	add	v24.4s,v24.4s,v25.4s
434	add	w7,w7,w11
435	add	v28.4s,v28.4s,v29.4s
436	add	w8,w8,w12
// d ^= a
437	eor	v19.16b,v19.16b,v16.16b
438	eor	w17,w17,w5
439	eor	v23.16b,v23.16b,v20.16b
440	eor	w19,w19,w6
441	eor	v27.16b,v27.16b,v24.16b
442	eor	w20,w20,w7
443	eor	v31.16b,v31.16b,v28.16b
444	eor	w21,w21,w8
// d rotated by 16
445	rev32	v19.8h,v19.8h
446	ror	w17,w17,#16
447	rev32	v23.8h,v23.8h
448	ror	w19,w19,#16
449	rev32	v27.8h,v27.8h
450	ror	w20,w20,#16
451	rev32	v31.8h,v31.8h
452	ror	w21,w21,#16
// c += d
453	add	v18.4s,v18.4s,v19.4s
454	add	w13,w13,w17
455	add	v22.4s,v22.4s,v23.4s
456	add	w14,w14,w19
457	add	v26.4s,v26.4s,v27.4s
458	add	w15,w15,w20
459	add	v30.4s,v30.4s,v31.4s
460	add	w16,w16,w21
// b ^= c (vector result goes to temporaries v4-v7)
461	eor	v4.16b,v17.16b,v18.16b
462	eor	w9,w9,w13
463	eor	v5.16b,v21.16b,v22.16b
464	eor	w10,w10,w14
465	eor	v6.16b,v25.16b,v26.16b
466	eor	w11,w11,w15
467	eor	v7.16b,v29.16b,v30.16b
468	eor	w12,w12,w16
// b rotated left by 12 (vector: ushr here, sli below)
469	ushr	v17.4s,v4.4s,#20
470	ror	w9,w9,#20
471	ushr	v21.4s,v5.4s,#20
472	ror	w10,w10,#20
473	ushr	v25.4s,v6.4s,#20
474	ror	w11,w11,#20
475	ushr	v29.4s,v7.4s,#20
476	ror	w12,w12,#20
477	sli	v17.4s,v4.4s,#12
478	add	w5,w5,w9
479	sli	v21.4s,v5.4s,#12
480	add	w6,w6,w10
481	sli	v25.4s,v6.4s,#12
482	add	w7,w7,w11
483	sli	v29.4s,v7.4s,#12
484	add	w8,w8,w12
// a += b (vector); the scalar stream is already one step ahead
485	add	v16.4s,v16.4s,v17.4s
486	eor	w17,w17,w5
487	add	v20.4s,v20.4s,v21.4s
488	eor	w19,w19,w6
489	add	v24.4s,v24.4s,v25.4s
490	eor	w20,w20,w7
491	add	v28.4s,v28.4s,v29.4s
492	eor	w21,w21,w8
// d ^= a (vector, into temporaries)
493	eor	v4.16b,v19.16b,v16.16b
494	ror	w17,w17,#24
495	eor	v5.16b,v23.16b,v20.16b
496	ror	w19,w19,#24
497	eor	v6.16b,v27.16b,v24.16b
498	ror	w20,w20,#24
499	eor	v7.16b,v31.16b,v28.16b
500	ror	w21,w21,#24
// d rotated left by 8 via the .Lrot24 table in v9
501	tbl	v19.16b,{v4.16b},v9.16b
502	add	w13,w13,w17
503	tbl	v23.16b,{v5.16b},v9.16b
504	add	w14,w14,w19
505	tbl	v27.16b,{v6.16b},v9.16b
506	add	w15,w15,w20
507	tbl	v31.16b,{v7.16b},v9.16b
508	add	w16,w16,w21
// c += d (vector)
509	add	v18.4s,v18.4s,v19.4s
510	eor	w9,w9,w13
511	add	v22.4s,v22.4s,v23.4s
512	eor	w10,w10,w14
513	add	v26.4s,v26.4s,v27.4s
514	eor	w11,w11,w15
515	add	v30.4s,v30.4s,v31.4s
516	eor	w12,w12,w16
// b ^= c, then b rotated left by 7
517	eor	v4.16b,v17.16b,v18.16b
518	ror	w9,w9,#25
519	eor	v5.16b,v21.16b,v22.16b
520	ror	w10,w10,#25
521	eor	v6.16b,v25.16b,v26.16b
522	ror	w11,w11,#25
523	eor	v7.16b,v29.16b,v30.16b
524	ror	w12,w12,#25
525	ushr	v17.4s,v4.4s,#25
526	ushr	v21.4s,v5.4s,#25
527	ushr	v25.4s,v6.4s,#25
528	ushr	v29.4s,v7.4s,#25
529	sli	v17.4s,v4.4s,#7
530	sli	v21.4s,v5.4s,#7
531	sli	v25.4s,v6.4s,#7
532	sli	v29.4s,v7.4s,#7
// Diagonal round: same operations with the b/c/d operands taken one, two and
// three columns over; the vector registers are simply paired differently.
533	add	v16.4s,v16.4s,v21.4s
534	add	w5,w5,w10
535	add	v20.4s,v20.4s,v25.4s
536	add	w6,w6,w11
537	add	v24.4s,v24.4s,v29.4s
538	add	w7,w7,w12
539	add	v28.4s,v28.4s,v17.4s
540	add	w8,w8,w9
541	eor	v31.16b,v31.16b,v16.16b
542	eor	w21,w21,w5
543	eor	v19.16b,v19.16b,v20.16b
544	eor	w17,w17,w6
545	eor	v23.16b,v23.16b,v24.16b
546	eor	w19,w19,w7
547	eor	v27.16b,v27.16b,v28.16b
548	eor	w20,w20,w8
549	rev32	v31.8h,v31.8h
550	ror	w21,w21,#16
551	rev32	v19.8h,v19.8h
552	ror	w17,w17,#16
553	rev32	v23.8h,v23.8h
554	ror	w19,w19,#16
555	rev32	v27.8h,v27.8h
556	ror	w20,w20,#16
557	add	v26.4s,v26.4s,v31.4s
558	add	w15,w15,w21
559	add	v30.4s,v30.4s,v19.4s
560	add	w16,w16,w17
561	add	v18.4s,v18.4s,v23.4s
562	add	w13,w13,w19
563	add	v22.4s,v22.4s,v27.4s
564	add	w14,w14,w20
565	eor	v4.16b,v21.16b,v26.16b
566	eor	w10,w10,w15
567	eor	v5.16b,v25.16b,v30.16b
568	eor	w11,w11,w16
569	eor	v6.16b,v29.16b,v18.16b
570	eor	w12,w12,w13
571	eor	v7.16b,v17.16b,v22.16b
572	eor	w9,w9,w14
573	ushr	v21.4s,v4.4s,#20
574	ror	w10,w10,#20
575	ushr	v25.4s,v5.4s,#20
576	ror	w11,w11,#20
577	ushr	v29.4s,v6.4s,#20
578	ror	w12,w12,#20
579	ushr	v17.4s,v7.4s,#20
580	ror	w9,w9,#20
581	sli	v21.4s,v4.4s,#12
582	add	w5,w5,w10
583	sli	v25.4s,v5.4s,#12
584	add	w6,w6,w11
585	sli	v29.4s,v6.4s,#12
586	add	w7,w7,w12
587	sli	v17.4s,v7.4s,#12
588	add	w8,w8,w9
589	add	v16.4s,v16.4s,v21.4s
590	eor	w21,w21,w5
591	add	v20.4s,v20.4s,v25.4s
592	eor	w17,w17,w6
593	add	v24.4s,v24.4s,v29.4s
594	eor	w19,w19,w7
595	add	v28.4s,v28.4s,v17.4s
596	eor	w20,w20,w8
597	eor	v4.16b,v31.16b,v16.16b
598	ror	w21,w21,#24
599	eor	v5.16b,v19.16b,v20.16b
600	ror	w17,w17,#24
601	eor	v6.16b,v23.16b,v24.16b
602	ror	w19,w19,#24
603	eor	v7.16b,v27.16b,v28.16b
604	ror	w20,w20,#24
605	tbl	v31.16b,{v4.16b},v9.16b
606	add	w15,w15,w21
607	tbl	v19.16b,{v5.16b},v9.16b
608	add	w16,w16,w17
609	tbl	v23.16b,{v6.16b},v9.16b
610	add	w13,w13,w19
611	tbl	v27.16b,{v7.16b},v9.16b
612	add	w14,w14,w20
613	add	v26.4s,v26.4s,v31.4s
614	eor	w10,w10,w15
615	add	v30.4s,v30.4s,v19.4s
616	eor	w11,w11,w16
617	add	v18.4s,v18.4s,v23.4s
618	eor	w12,w12,w13
619	add	v22.4s,v22.4s,v27.4s
620	eor	w9,w9,w14
621	eor	v4.16b,v21.16b,v26.16b
622	ror	w10,w10,#25
623	eor	v5.16b,v25.16b,v30.16b
624	ror	w11,w11,#25
625	eor	v6.16b,v29.16b,v18.16b
626	ror	w12,w12,#25
627	eor	v7.16b,v17.16b,v22.16b
628	ror	w9,w9,#25
629	ushr	v21.4s,v4.4s,#25
630	ushr	v25.4s,v5.4s,#25
631	ushr	v29.4s,v6.4s,#25
632	ushr	v17.4s,v7.4s,#25
633	sli	v21.4s,v4.4s,#7
634	sli	v25.4s,v5.4s,#7
635	sli	v29.4s,v6.4s,#7
636	sli	v17.4s,v7.4s,#7
637	cbnz	x4,.Loop_neon
638
// Re-apply the per-lane counter offsets: the input state added back later
// comes from v3, which does not contain them.
639	add	v19.4s,v19.4s,v8.4s
640
// 4x4 transposes (zip1/zip2 on .4s, then on .2d) turn the word-per-register
// layout into block-per-register-group: afterwards v16-v19, v20-v23, v24-v27
// and v28-v31 each hold the 16 words of one block. From the third transpose
// on, the scalar block's "add input state" step is interleaved.
641	zip1	v4.4s,v16.4s,v20.4s			// transpose data
642	zip1	v5.4s,v24.4s,v28.4s
643	zip2	v6.4s,v16.4s,v20.4s
644	zip2	v7.4s,v24.4s,v28.4s
645	zip1	v16.2d,v4.2d,v5.2d
646	zip2	v20.2d,v4.2d,v5.2d
647	zip1	v24.2d,v6.2d,v7.2d
648	zip2	v28.2d,v6.2d,v7.2d
649
650	zip1	v4.4s,v17.4s,v21.4s
651	zip1	v5.4s,v25.4s,v29.4s
652	zip2	v6.4s,v17.4s,v21.4s
653	zip2	v7.4s,v25.4s,v29.4s
654	zip1	v17.2d,v4.2d,v5.2d
655	zip2	v21.2d,v4.2d,v5.2d
656	zip1	v25.2d,v6.2d,v7.2d
657	zip2	v29.2d,v6.2d,v7.2d
658
659	zip1	v4.4s,v18.4s,v22.4s
660	add	w5,w5,w22		// accumulate key block
661	zip1	v5.4s,v26.4s,v30.4s
662	add	x6,x6,x22,lsr#32
663	zip2	v6.4s,v18.4s,v22.4s
664	add	w7,w7,w23
665	zip2	v7.4s,v26.4s,v30.4s
666	add	x8,x8,x23,lsr#32
667	zip1	v18.2d,v4.2d,v5.2d
668	add	w9,w9,w24
669	zip2	v22.2d,v4.2d,v5.2d
670	add	x10,x10,x24,lsr#32
671	zip1	v26.2d,v6.2d,v7.2d
672	add	w11,w11,w25
673	zip2	v30.2d,v6.2d,v7.2d
674	add	x12,x12,x25,lsr#32
675
676	zip1	v4.4s,v19.4s,v23.4s
677	add	w13,w13,w26
678	zip1	v5.4s,v27.4s,v31.4s
679	add	x14,x14,x26,lsr#32
680	zip2	v6.4s,v19.4s,v23.4s
681	add	w15,w15,w27
682	zip2	v7.4s,v27.4s,v31.4s
683	add	x16,x16,x27,lsr#32
684	zip1	v19.2d,v4.2d,v5.2d
685	add	w17,w17,w28
686	zip2	v23.2d,v4.2d,v5.2d
687	add	x19,x19,x28,lsr#32
688	zip1	v27.2d,v6.2d,v7.2d
689	add	w20,w20,w30
690	zip2	v31.2d,v6.2d,v7.2d
691	add	x21,x21,x30,lsr#32
692
// Flags from "subs x2,x2,#320": fewer than 320 bytes were left, so hand the
// five computed blocks to the tail code.
693	b.lo	.Ltail_neon
694
// Full 320 bytes: pack and XOR the scalar block against the first 64 input
// bytes while adding the input state (v0-v3) to the NEON blocks and streaming
// the remaining 256 input bytes through v4-v7 / already-stored registers.
695	add	x5,x5,x6,lsl#32	// pack
696	add	x7,x7,x8,lsl#32
697	ldp	x6,x8,[x1,#0]		// load input
698	add	v16.4s,v16.4s,v0.4s			// accumulate key block
699	add	x9,x9,x10,lsl#32
700	add	x11,x11,x12,lsl#32
701	ldp	x10,x12,[x1,#16]
702	add	v17.4s,v17.4s,v1.4s
703	add	x13,x13,x14,lsl#32
704	add	x15,x15,x16,lsl#32
705	ldp	x14,x16,[x1,#32]
706	add	v18.4s,v18.4s,v2.4s
707	add	x17,x17,x19,lsl#32
708	add	x20,x20,x21,lsl#32
709	ldp	x19,x21,[x1,#48]
710	add	v19.4s,v19.4s,v3.4s
711	add	x1,x1,#64
712#ifdef	__AARCH64EB__
713	rev	x5,x5
714	rev	x7,x7
715	rev	x9,x9
716	rev	x11,x11
717	rev	x13,x13
718	rev	x15,x15
719	rev	x17,x17
720	rev	x20,x20
721#endif
722	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
723	eor	x5,x5,x6
724	add	v20.4s,v20.4s,v0.4s
725	eor	x7,x7,x8
726	add	v21.4s,v21.4s,v1.4s
727	eor	x9,x9,x10
728	add	v22.4s,v22.4s,v2.4s
729	eor	x11,x11,x12
730	add	v23.4s,v23.4s,v3.4s
731	eor	x13,x13,x14
732	eor	v16.16b,v16.16b,v4.16b
733	movi	v4.4s,#5
734	eor	x15,x15,x16
735	eor	v17.16b,v17.16b,v5.16b
736	eor	x17,x17,x19
737	eor	v18.16b,v18.16b,v6.16b
738	eor	x20,x20,x21
739	eor	v19.16b,v19.16b,v7.16b
// Five blocks were consumed: advance the vector lane offsets here and the
// packed scalar counter (x28) just below by 5.
740	add	v8.4s,v8.4s,v4.4s			// += 5
741	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
742
743	stp	x5,x7,[x0,#0]		// store output
744	add	x28,x28,#5			// increment counter
745	stp	x9,x11,[x0,#16]
746	stp	x13,x15,[x0,#32]
747	stp	x17,x20,[x0,#48]
748	add	x0,x0,#64
749
750	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
751	add	v24.4s,v24.4s,v0.4s
752	add	v25.4s,v25.4s,v1.4s
753	add	v26.4s,v26.4s,v2.4s
754	add	v27.4s,v27.4s,v3.4s
755	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
756
757	eor	v20.16b,v20.16b,v4.16b
758	eor	v21.16b,v21.16b,v5.16b
759	eor	v22.16b,v22.16b,v6.16b
760	eor	v23.16b,v23.16b,v7.16b
761	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
762	add	v28.4s,v28.4s,v0.4s
763	add	v29.4s,v29.4s,v1.4s
764	add	v30.4s,v30.4s,v2.4s
765	add	v31.4s,v31.4s,v3.4s
766	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
767
768	eor	v24.16b,v24.16b,v16.16b
769	eor	v25.16b,v25.16b,v17.16b
770	eor	v26.16b,v26.16b,v18.16b
771	eor	v27.16b,v27.16b,v19.16b
772	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
773
774	eor	v28.16b,v28.16b,v20.16b
775	eor	v29.16b,v29.16b,v21.16b
776	eor	v30.16b,v30.16b,v22.16b
777	eor	v31.16b,v31.16b,v23.16b
778	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
779
// Still the flags from "subs x2,x2,#320": loop while bytes remain.
780	b.hi	.Loop_outer_neon
781
// Exactly consumed: restore d8/d9 from the scratch area, then the usual
// epilogue (callee-saved registers via x29, release scratch, pop frame).
782	ldp	d8,d9,[sp]			// meet ABI requirements
783
784	ldp	x19,x20,[x29,#16]
785	add	sp,sp,#64
786	ldp	x21,x22,[x29,#32]
787	ldp	x23,x24,[x29,#48]
788	ldp	x25,x26,[x29,#64]
789	ldp	x27,x28,[x29,#80]
790	ldp	x29,x30,[sp],#96
791	AARCH64_VALIDATE_LINK_REGISTER
792	ret
793
// .Ltail_neon: fewer than 320 bytes remain, but five keystream blocks are
// available -- the scalar block (still unpacked in w5-w17/w19-w21) and four
// NEON blocks in v16-v19, v20-v23, v24-v27, v28-v31 (input state not yet
// added). Blocks are consumed in that order, 64 bytes at a time; a trailing
// partial block is finished byte by byte.
794.align	4
795.Ltail_neon:
796	add	x2,x2,#320
// d8/d9 are restored now because the scratch area that holds them is about to
// be reused as the keystream buffer; v8/v9 are not referenced on the tail path.
797	ldp	d8,d9,[sp]			// meet ABI requirements
798	cmp	x2,#64
// Under 64 bytes in total: the scalar partial-block code in
// ChaCha20_ctr32_dflt handles it (same frame layout and register contents).
799	b.lo	.Less_than_64
800
801	add	x5,x5,x6,lsl#32	// pack
802	add	x7,x7,x8,lsl#32
803	ldp	x6,x8,[x1,#0]		// load input
804	add	x9,x9,x10,lsl#32
805	add	x11,x11,x12,lsl#32
806	ldp	x10,x12,[x1,#16]
807	add	x13,x13,x14,lsl#32
808	add	x15,x15,x16,lsl#32
809	ldp	x14,x16,[x1,#32]
810	add	x17,x17,x19,lsl#32
811	add	x20,x20,x21,lsl#32
812	ldp	x19,x21,[x1,#48]
813	add	x1,x1,#64
814#ifdef	__AARCH64EB__
815	rev	x5,x5
816	rev	x7,x7
817	rev	x9,x9
818	rev	x11,x11
819	rev	x13,x13
820	rev	x15,x15
821	rev	x17,x17
822	rev	x20,x20
823#endif
824	eor	x5,x5,x6
825	eor	x7,x7,x8
826	eor	x9,x9,x10
827	eor	x11,x11,x12
828	eor	x13,x13,x14
829	eor	x15,x15,x16
830	eor	x17,x17,x19
831	eor	x20,x20,x21
832
// Store the scalar block and prepare the first NEON block (add input state)
// in v16-v19. None of these instructions touch the flags, so the b.eq below
// still tests the "cmp x2,#64" above (exactly 64 bytes were left).
833	stp	x5,x7,[x0,#0]		// store output
834	add	v16.4s,v16.4s,v0.4s			// accumulate key block
835	stp	x9,x11,[x0,#16]
836	add	v17.4s,v17.4s,v1.4s
837	stp	x13,x15,[x0,#32]
838	add	v18.4s,v18.4s,v2.4s
839	stp	x17,x20,[x0,#48]
840	add	v19.4s,v19.4s,v3.4s
841	add	x0,x0,#64
842	b.eq	.Ldone_neon
// From here each step follows one pattern: subtract 64, and if fewer than 64
// bytes remain go to .Last_neon with the next keystream block in v16-v19;
// otherwise XOR/store a full block and exit if that used up the input exactly.
843	sub	x2,x2,#64
844	cmp	x2,#64
845	b.lo	.Last_neon
846
847	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
848	eor	v16.16b,v16.16b,v4.16b
849	eor	v17.16b,v17.16b,v5.16b
850	eor	v18.16b,v18.16b,v6.16b
851	eor	v19.16b,v19.16b,v7.16b
852	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
853	b.eq	.Ldone_neon
854
// Second NEON block (v20-v23 + input state) staged in v16-v19.
855	add	v16.4s,v20.4s,v0.4s
856	add	v17.4s,v21.4s,v1.4s
857	sub	x2,x2,#64
858	add	v18.4s,v22.4s,v2.4s
859	cmp	x2,#64
860	add	v19.4s,v23.4s,v3.4s
861	b.lo	.Last_neon
862
863	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
864	eor	v20.16b,v16.16b,v4.16b
865	eor	v21.16b,v17.16b,v5.16b
866	eor	v22.16b,v18.16b,v6.16b
867	eor	v23.16b,v19.16b,v7.16b
868	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
869	b.eq	.Ldone_neon
870
// Third NEON block (v24-v27 + input state).
871	add	v16.4s,v24.4s,v0.4s
872	add	v17.4s,v25.4s,v1.4s
873	sub	x2,x2,#64
874	add	v18.4s,v26.4s,v2.4s
875	cmp	x2,#64
876	add	v19.4s,v27.4s,v3.4s
877	b.lo	.Last_neon
878
879	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
880	eor	v24.16b,v16.16b,v4.16b
881	eor	v25.16b,v17.16b,v5.16b
882	eor	v26.16b,v18.16b,v6.16b
883	eor	v27.16b,v19.16b,v7.16b
884	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
885	b.eq	.Ldone_neon
886
// Fourth NEON block (v28-v31 + input state); what is left now is necessarily
// a partial block, so control falls through into .Last_neon.
887	add	v16.4s,v28.4s,v0.4s
888	add	v17.4s,v29.4s,v1.4s
889	add	v18.4s,v30.4s,v2.4s
890	add	v19.4s,v31.4s,v3.4s
891	sub	x2,x2,#64
892
// .Last_neon: spill the staged keystream block to the stack scratch area and
// bias the pointers exactly as .Less_than_64 does (negative index counting up
// to zero, x0 offset by -1 because the store follows the index increment).
893.Last_neon:
894	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
895
896	sub	x0,x0,#1
897	add	x1,x1,x2
898	add	x0,x0,x2
899	add	x4,sp,x2
900	neg	x2,x2
901
// out[i] = in[i] ^ keystream[i], one byte per iteration.
902.Loop_tail_neon:
903	ldrb	w10,[x1,x2]
904	ldrb	w11,[x4,x2]
905	add	x2,x2,#1
906	eor	w10,w10,w11
907	strb	w10,[x0,x2]
908	cbnz	x2,.Loop_tail_neon
909
// Overwrite the keystream copy on the stack with zeros.
910	stp	xzr,xzr,[sp,#0]
911	stp	xzr,xzr,[sp,#16]
912	stp	xzr,xzr,[sp,#32]
913	stp	xzr,xzr,[sp,#48]
914
// Common exit for the tail path; d8/d9 were already restored at .Ltail_neon.
915.Ldone_neon:
916	ldp	x19,x20,[x29,#16]
917	add	sp,sp,#64
918	ldp	x21,x22,[x29,#32]
919	ldp	x23,x24,[x29,#48]
920	ldp	x25,x26,[x29,#64]
921	ldp	x27,x28,[x29,#80]
922	ldp	x29,x30,[sp],#96
923	AARCH64_VALIDATE_LINK_REGISTER
924	ret
925.size	ChaCha20_neon,.-ChaCha20_neon
926.type	ChaCha20_512_neon,%function
927.align	5
928ChaCha20_512_neon:
929	AARCH64_SIGN_LINK_REGISTER
930	stp	x29,x30,[sp,#-96]!
931	add	x29,sp,#0
932
933	adrp	x5,.Lsigma
934	add	x5,x5,#:lo12:.Lsigma
935	stp	x19,x20,[sp,#16]
936	stp	x21,x22,[sp,#32]
937	stp	x23,x24,[sp,#48]
938	stp	x25,x26,[sp,#64]
939	stp	x27,x28,[sp,#80]
940
941.L512_or_more_neon:
942	sub	sp,sp,#128+64
943
944	eor	v7.16b,v7.16b,v7.16b
945	ldp	x22,x23,[x5]		// load sigma
946	ld1	{v0.4s},[x5],#16
947	ldp	x24,x25,[x3]		// load key
948	ldp	x26,x27,[x3,#16]
949	ld1	{v1.4s,v2.4s},[x3]
950	ldp	x28,x30,[x4]		// load counter
951	ld1	{v3.4s},[x4]
952	ld1	{v7.s}[0],[x5]
953	add	x3,x5,#16			// .Lrot24
954#ifdef	__AARCH64EB__
955	rev64	v0.4s,v0.4s
956	ror	x24,x24,#32
957	ror	x25,x25,#32
958	ror	x26,x26,#32
959	ror	x27,x27,#32
960	ror	x28,x28,#32
961	ror	x30,x30,#32
962#endif
963	add	v3.4s,v3.4s,v7.4s		// += 1
964	stp	q0,q1,[sp,#0]		// off-load key block, invariant part
965	add	v3.4s,v3.4s,v7.4s		// not typo
966	str	q2,[sp,#32]
967	add	v4.4s,v3.4s,v7.4s
968	add	v5.4s,v4.4s,v7.4s
969	add	v6.4s,v5.4s,v7.4s
970	shl	v7.4s,v7.4s,#2			// 1 -> 4
971
972	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
973	stp	d10,d11,[sp,#128+16]
974	stp	d12,d13,[sp,#128+32]
975	stp	d14,d15,[sp,#128+48]
976
977	sub	x2,x2,#512			// not typo
978
979.Loop_outer_512_neon:
980	mov	v8.16b,v0.16b
981	mov	v12.16b,v0.16b
982	mov	v16.16b,v0.16b
983	mov	v20.16b,v0.16b
984	mov	v24.16b,v0.16b
985	mov	v28.16b,v0.16b
986	mov	v9.16b,v1.16b
987	mov	w5,w22			// unpack key block
988	mov	v13.16b,v1.16b
989	lsr	x6,x22,#32
990	mov	v17.16b,v1.16b
991	mov	w7,w23
992	mov	v21.16b,v1.16b
993	lsr	x8,x23,#32
994	mov	v25.16b,v1.16b
995	mov	w9,w24
996	mov	v29.16b,v1.16b
997	lsr	x10,x24,#32
998	mov	v11.16b,v3.16b
999	mov	w11,w25
1000	mov	v15.16b,v4.16b
1001	lsr	x12,x25,#32
1002	mov	v19.16b,v5.16b
1003	mov	w13,w26
1004	mov	v23.16b,v6.16b
1005	lsr	x14,x26,#32
1006	mov	v10.16b,v2.16b
1007	mov	w15,w27
1008	mov	v14.16b,v2.16b
1009	lsr	x16,x27,#32
1010	add	v27.4s,v11.4s,v7.4s			// +4
1011	mov	w17,w28
1012	add	v31.4s,v15.4s,v7.4s			// +4
1013	lsr	x19,x28,#32
1014	mov	v18.16b,v2.16b
1015	mov	w20,w30
1016	mov	v22.16b,v2.16b
1017	lsr	x21,x30,#32
1018	mov	v26.16b,v2.16b
1019	stp	q3,q4,[sp,#48]		// off-load key block, variable part
1020	mov	v30.16b,v2.16b
1021	stp	q5,q6,[sp,#80]
1022
1023	mov	x4,#5
1024	ld1	{v6.4s},[x3]
1025	subs	x2,x2,#512
1026.Loop_upper_neon:
1027	sub	x4,x4,#1
1028	add	v8.4s,v8.4s,v9.4s
1029	add	w5,w5,w9
1030	add	v12.4s,v12.4s,v13.4s
1031	add	w6,w6,w10
1032	add	v16.4s,v16.4s,v17.4s
1033	add	w7,w7,w11
1034	add	v20.4s,v20.4s,v21.4s
1035	add	w8,w8,w12
1036	add	v24.4s,v24.4s,v25.4s
1037	eor	w17,w17,w5
1038	add	v28.4s,v28.4s,v29.4s
1039	eor	w19,w19,w6
1040	eor	v11.16b,v11.16b,v8.16b
1041	eor	w20,w20,w7
1042	eor	v15.16b,v15.16b,v12.16b
1043	eor	w21,w21,w8
1044	eor	v19.16b,v19.16b,v16.16b
1045	ror	w17,w17,#16
1046	eor	v23.16b,v23.16b,v20.16b
1047	ror	w19,w19,#16
1048	eor	v27.16b,v27.16b,v24.16b
1049	ror	w20,w20,#16
1050	eor	v31.16b,v31.16b,v28.16b
1051	ror	w21,w21,#16
1052	rev32	v11.8h,v11.8h
1053	add	w13,w13,w17
1054	rev32	v15.8h,v15.8h
1055	add	w14,w14,w19
1056	rev32	v19.8h,v19.8h
1057	add	w15,w15,w20
1058	rev32	v23.8h,v23.8h
1059	add	w16,w16,w21
1060	rev32	v27.8h,v27.8h
1061	eor	w9,w9,w13
1062	rev32	v31.8h,v31.8h
1063	eor	w10,w10,w14
1064	add	v10.4s,v10.4s,v11.4s
1065	eor	w11,w11,w15
1066	add	v14.4s,v14.4s,v15.4s
1067	eor	w12,w12,w16
1068	add	v18.4s,v18.4s,v19.4s
1069	ror	w9,w9,#20
1070	add	v22.4s,v22.4s,v23.4s
1071	ror	w10,w10,#20
1072	add	v26.4s,v26.4s,v27.4s
1073	ror	w11,w11,#20
1074	add	v30.4s,v30.4s,v31.4s
1075	ror	w12,w12,#20
1076	eor	v0.16b,v9.16b,v10.16b
1077	add	w5,w5,w9
1078	eor	v1.16b,v13.16b,v14.16b
1079	add	w6,w6,w10
1080	eor	v2.16b,v17.16b,v18.16b
1081	add	w7,w7,w11
1082	eor	v3.16b,v21.16b,v22.16b
1083	add	w8,w8,w12
1084	eor	v4.16b,v25.16b,v26.16b
1085	eor	w17,w17,w5
1086	eor	v5.16b,v29.16b,v30.16b
1087	eor	w19,w19,w6
1088	ushr	v9.4s,v0.4s,#20
1089	eor	w20,w20,w7
1090	ushr	v13.4s,v1.4s,#20
1091	eor	w21,w21,w8
1092	ushr	v17.4s,v2.4s,#20
1093	ror	w17,w17,#24
1094	ushr	v21.4s,v3.4s,#20
1095	ror	w19,w19,#24
1096	ushr	v25.4s,v4.4s,#20
1097	ror	w20,w20,#24
1098	ushr	v29.4s,v5.4s,#20
1099	ror	w21,w21,#24
1100	sli	v9.4s,v0.4s,#12
1101	add	w13,w13,w17
1102	sli	v13.4s,v1.4s,#12
1103	add	w14,w14,w19
1104	sli	v17.4s,v2.4s,#12
1105	add	w15,w15,w20
1106	sli	v21.4s,v3.4s,#12
1107	add	w16,w16,w21
1108	sli	v25.4s,v4.4s,#12
1109	eor	w9,w9,w13
1110	sli	v29.4s,v5.4s,#12
1111	eor	w10,w10,w14
1112	add	v8.4s,v8.4s,v9.4s
1113	eor	w11,w11,w15
1114	add	v12.4s,v12.4s,v13.4s
1115	eor	w12,w12,w16
1116	add	v16.4s,v16.4s,v17.4s
1117	ror	w9,w9,#25
1118	add	v20.4s,v20.4s,v21.4s
1119	ror	w10,w10,#25
1120	add	v24.4s,v24.4s,v25.4s
1121	ror	w11,w11,#25
1122	add	v28.4s,v28.4s,v29.4s
1123	ror	w12,w12,#25
1124	eor	v11.16b,v11.16b,v8.16b
1125	add	w5,w5,w10
1126	eor	v15.16b,v15.16b,v12.16b
1127	add	w6,w6,w11
1128	eor	v19.16b,v19.16b,v16.16b
1129	add	w7,w7,w12
1130	eor	v23.16b,v23.16b,v20.16b
1131	add	w8,w8,w9
1132	eor	v27.16b,v27.16b,v24.16b
1133	eor	w21,w21,w5
1134	eor	v31.16b,v31.16b,v28.16b
1135	eor	w17,w17,w6
1136	tbl	v11.16b,{v11.16b},v6.16b
1137	eor	w19,w19,w7
1138	tbl	v15.16b,{v15.16b},v6.16b
1139	eor	w20,w20,w8
1140	tbl	v19.16b,{v19.16b},v6.16b
1141	ror	w21,w21,#16
1142	tbl	v23.16b,{v23.16b},v6.16b
1143	ror	w17,w17,#16
1144	tbl	v27.16b,{v27.16b},v6.16b
1145	ror	w19,w19,#16
1146	tbl	v31.16b,{v31.16b},v6.16b
1147	ror	w20,w20,#16
1148	add	v10.4s,v10.4s,v11.4s
1149	add	w15,w15,w21
1150	add	v14.4s,v14.4s,v15.4s
1151	add	w16,w16,w17
1152	add	v18.4s,v18.4s,v19.4s
1153	add	w13,w13,w19
1154	add	v22.4s,v22.4s,v23.4s
1155	add	w14,w14,w20
1156	add	v26.4s,v26.4s,v27.4s
1157	eor	w10,w10,w15
1158	add	v30.4s,v30.4s,v31.4s
1159	eor	w11,w11,w16
1160	eor	v0.16b,v9.16b,v10.16b
1161	eor	w12,w12,w13
1162	eor	v1.16b,v13.16b,v14.16b
1163	eor	w9,w9,w14
1164	eor	v2.16b,v17.16b,v18.16b
1165	ror	w10,w10,#20
1166	eor	v3.16b,v21.16b,v22.16b
1167	ror	w11,w11,#20
1168	eor	v4.16b,v25.16b,v26.16b
1169	ror	w12,w12,#20
1170	eor	v5.16b,v29.16b,v30.16b
1171	ror	w9,w9,#20
1172	ushr	v9.4s,v0.4s,#25
1173	add	w5,w5,w10
1174	ushr	v13.4s,v1.4s,#25
1175	add	w6,w6,w11
1176	ushr	v17.4s,v2.4s,#25
1177	add	w7,w7,w12
1178	ushr	v21.4s,v3.4s,#25
1179	add	w8,w8,w9
1180	ushr	v25.4s,v4.4s,#25
1181	eor	w21,w21,w5
1182	ushr	v29.4s,v5.4s,#25
1183	eor	w17,w17,w6
1184	sli	v9.4s,v0.4s,#7
1185	eor	w19,w19,w7
1186	sli	v13.4s,v1.4s,#7
1187	eor	w20,w20,w8
1188	sli	v17.4s,v2.4s,#7
1189	ror	w21,w21,#24
1190	sli	v21.4s,v3.4s,#7
1191	ror	w17,w17,#24
1192	sli	v25.4s,v4.4s,#7
1193	ror	w19,w19,#24
1194	sli	v29.4s,v5.4s,#7
1195	ror	w20,w20,#24
1196	ext	v10.16b,v10.16b,v10.16b,#8
1197	add	w15,w15,w21
1198	ext	v14.16b,v14.16b,v14.16b,#8
1199	add	w16,w16,w17
1200	ext	v18.16b,v18.16b,v18.16b,#8
1201	add	w13,w13,w19
1202	ext	v22.16b,v22.16b,v22.16b,#8
1203	add	w14,w14,w20
1204	ext	v26.16b,v26.16b,v26.16b,#8
1205	eor	w10,w10,w15
1206	ext	v30.16b,v30.16b,v30.16b,#8
1207	eor	w11,w11,w16
1208	ext	v11.16b,v11.16b,v11.16b,#12
1209	eor	w12,w12,w13
1210	ext	v15.16b,v15.16b,v15.16b,#12
1211	eor	w9,w9,w14
1212	ext	v19.16b,v19.16b,v19.16b,#12
1213	ror	w10,w10,#25
1214	ext	v23.16b,v23.16b,v23.16b,#12
1215	ror	w11,w11,#25
1216	ext	v27.16b,v27.16b,v27.16b,#12
1217	ror	w12,w12,#25
1218	ext	v31.16b,v31.16b,v31.16b,#12
1219	ror	w9,w9,#25
1220	ext	v9.16b,v9.16b,v9.16b,#4
1221	ext	v13.16b,v13.16b,v13.16b,#4
1222	ext	v17.16b,v17.16b,v17.16b,#4
1223	ext	v21.16b,v21.16b,v21.16b,#4
1224	ext	v25.16b,v25.16b,v25.16b,#4
1225	ext	v29.16b,v29.16b,v29.16b,#4
1226	add	v8.4s,v8.4s,v9.4s
1227	add	w5,w5,w9
1228	add	v12.4s,v12.4s,v13.4s
1229	add	w6,w6,w10
1230	add	v16.4s,v16.4s,v17.4s
1231	add	w7,w7,w11
1232	add	v20.4s,v20.4s,v21.4s
1233	add	w8,w8,w12
1234	add	v24.4s,v24.4s,v25.4s
1235	eor	w17,w17,w5
1236	add	v28.4s,v28.4s,v29.4s
1237	eor	w19,w19,w6
1238	eor	v11.16b,v11.16b,v8.16b
1239	eor	w20,w20,w7
1240	eor	v15.16b,v15.16b,v12.16b
1241	eor	w21,w21,w8
1242	eor	v19.16b,v19.16b,v16.16b
1243	ror	w17,w17,#16
1244	eor	v23.16b,v23.16b,v20.16b
1245	ror	w19,w19,#16
1246	eor	v27.16b,v27.16b,v24.16b
1247	ror	w20,w20,#16
1248	eor	v31.16b,v31.16b,v28.16b
1249	ror	w21,w21,#16
1250	rev32	v11.8h,v11.8h
1251	add	w13,w13,w17
1252	rev32	v15.8h,v15.8h
1253	add	w14,w14,w19
1254	rev32	v19.8h,v19.8h
1255	add	w15,w15,w20
1256	rev32	v23.8h,v23.8h
1257	add	w16,w16,w21
1258	rev32	v27.8h,v27.8h
1259	eor	w9,w9,w13
1260	rev32	v31.8h,v31.8h
1261	eor	w10,w10,w14
1262	add	v10.4s,v10.4s,v11.4s
1263	eor	w11,w11,w15
1264	add	v14.4s,v14.4s,v15.4s
1265	eor	w12,w12,w16
1266	add	v18.4s,v18.4s,v19.4s
1267	ror	w9,w9,#20
1268	add	v22.4s,v22.4s,v23.4s
1269	ror	w10,w10,#20
1270	add	v26.4s,v26.4s,v27.4s
1271	ror	w11,w11,#20
1272	add	v30.4s,v30.4s,v31.4s
1273	ror	w12,w12,#20
1274	eor	v0.16b,v9.16b,v10.16b
1275	add	w5,w5,w9
1276	eor	v1.16b,v13.16b,v14.16b
1277	add	w6,w6,w10
1278	eor	v2.16b,v17.16b,v18.16b
1279	add	w7,w7,w11
1280	eor	v3.16b,v21.16b,v22.16b
1281	add	w8,w8,w12
1282	eor	v4.16b,v25.16b,v26.16b
1283	eor	w17,w17,w5
1284	eor	v5.16b,v29.16b,v30.16b
1285	eor	w19,w19,w6
1286	ushr	v9.4s,v0.4s,#20
1287	eor	w20,w20,w7
1288	ushr	v13.4s,v1.4s,#20
1289	eor	w21,w21,w8
1290	ushr	v17.4s,v2.4s,#20
1291	ror	w17,w17,#24
1292	ushr	v21.4s,v3.4s,#20
1293	ror	w19,w19,#24
1294	ushr	v25.4s,v4.4s,#20
1295	ror	w20,w20,#24
1296	ushr	v29.4s,v5.4s,#20
1297	ror	w21,w21,#24
1298	sli	v9.4s,v0.4s,#12
1299	add	w13,w13,w17
1300	sli	v13.4s,v1.4s,#12
1301	add	w14,w14,w19
1302	sli	v17.4s,v2.4s,#12
1303	add	w15,w15,w20
1304	sli	v21.4s,v3.4s,#12
1305	add	w16,w16,w21
1306	sli	v25.4s,v4.4s,#12
1307	eor	w9,w9,w13
1308	sli	v29.4s,v5.4s,#12
1309	eor	w10,w10,w14
1310	add	v8.4s,v8.4s,v9.4s
1311	eor	w11,w11,w15
1312	add	v12.4s,v12.4s,v13.4s
1313	eor	w12,w12,w16
1314	add	v16.4s,v16.4s,v17.4s
1315	ror	w9,w9,#25
1316	add	v20.4s,v20.4s,v21.4s
1317	ror	w10,w10,#25
1318	add	v24.4s,v24.4s,v25.4s
1319	ror	w11,w11,#25
1320	add	v28.4s,v28.4s,v29.4s
1321	ror	w12,w12,#25
1322	eor	v11.16b,v11.16b,v8.16b
1323	add	w5,w5,w10
1324	eor	v15.16b,v15.16b,v12.16b
1325	add	w6,w6,w11
1326	eor	v19.16b,v19.16b,v16.16b
1327	add	w7,w7,w12
1328	eor	v23.16b,v23.16b,v20.16b
1329	add	w8,w8,w9
1330	eor	v27.16b,v27.16b,v24.16b
1331	eor	w21,w21,w5
1332	eor	v31.16b,v31.16b,v28.16b
1333	eor	w17,w17,w6
1334	tbl	v11.16b,{v11.16b},v6.16b
1335	eor	w19,w19,w7
1336	tbl	v15.16b,{v15.16b},v6.16b
1337	eor	w20,w20,w8
1338	tbl	v19.16b,{v19.16b},v6.16b
1339	ror	w21,w21,#16
1340	tbl	v23.16b,{v23.16b},v6.16b
1341	ror	w17,w17,#16
1342	tbl	v27.16b,{v27.16b},v6.16b
1343	ror	w19,w19,#16
1344	tbl	v31.16b,{v31.16b},v6.16b
1345	ror	w20,w20,#16
1346	add	v10.4s,v10.4s,v11.4s
1347	add	w15,w15,w21
1348	add	v14.4s,v14.4s,v15.4s
1349	add	w16,w16,w17
1350	add	v18.4s,v18.4s,v19.4s
1351	add	w13,w13,w19
1352	add	v22.4s,v22.4s,v23.4s
1353	add	w14,w14,w20
1354	add	v26.4s,v26.4s,v27.4s
1355	eor	w10,w10,w15
1356	add	v30.4s,v30.4s,v31.4s
1357	eor	w11,w11,w16
1358	eor	v0.16b,v9.16b,v10.16b
1359	eor	w12,w12,w13
1360	eor	v1.16b,v13.16b,v14.16b
1361	eor	w9,w9,w14
1362	eor	v2.16b,v17.16b,v18.16b
1363	ror	w10,w10,#20
1364	eor	v3.16b,v21.16b,v22.16b
1365	ror	w11,w11,#20
1366	eor	v4.16b,v25.16b,v26.16b
1367	ror	w12,w12,#20
1368	eor	v5.16b,v29.16b,v30.16b
1369	ror	w9,w9,#20
1370	ushr	v9.4s,v0.4s,#25
1371	add	w5,w5,w10
1372	ushr	v13.4s,v1.4s,#25
1373	add	w6,w6,w11
1374	ushr	v17.4s,v2.4s,#25
1375	add	w7,w7,w12
1376	ushr	v21.4s,v3.4s,#25
1377	add	w8,w8,w9
1378	ushr	v25.4s,v4.4s,#25
1379	eor	w21,w21,w5
1380	ushr	v29.4s,v5.4s,#25
1381	eor	w17,w17,w6
1382	sli	v9.4s,v0.4s,#7
1383	eor	w19,w19,w7
1384	sli	v13.4s,v1.4s,#7
1385	eor	w20,w20,w8
1386	sli	v17.4s,v2.4s,#7
1387	ror	w21,w21,#24
1388	sli	v21.4s,v3.4s,#7
1389	ror	w17,w17,#24
1390	sli	v25.4s,v4.4s,#7
1391	ror	w19,w19,#24
1392	sli	v29.4s,v5.4s,#7
1393	ror	w20,w20,#24
1394	ext	v10.16b,v10.16b,v10.16b,#8
1395	add	w15,w15,w21
1396	ext	v14.16b,v14.16b,v14.16b,#8
1397	add	w16,w16,w17
1398	ext	v18.16b,v18.16b,v18.16b,#8
1399	add	w13,w13,w19
1400	ext	v22.16b,v22.16b,v22.16b,#8
1401	add	w14,w14,w20
1402	ext	v26.16b,v26.16b,v26.16b,#8
1403	eor	w10,w10,w15
1404	ext	v30.16b,v30.16b,v30.16b,#8
1405	eor	w11,w11,w16
1406	ext	v11.16b,v11.16b,v11.16b,#4
1407	eor	w12,w12,w13
1408	ext	v15.16b,v15.16b,v15.16b,#4
1409	eor	w9,w9,w14
1410	ext	v19.16b,v19.16b,v19.16b,#4
1411	ror	w10,w10,#25
1412	ext	v23.16b,v23.16b,v23.16b,#4
1413	ror	w11,w11,#25
1414	ext	v27.16b,v27.16b,v27.16b,#4
1415	ror	w12,w12,#25
1416	ext	v31.16b,v31.16b,v31.16b,#4
1417	ror	w9,w9,#25
1418	ext	v9.16b,v9.16b,v9.16b,#12
1419	ext	v13.16b,v13.16b,v13.16b,#12
1420	ext	v17.16b,v17.16b,v17.16b,#12
1421	ext	v21.16b,v21.16b,v21.16b,#12
1422	ext	v25.16b,v25.16b,v25.16b,#12
1423	ext	v29.16b,v29.16b,v29.16b,#12
1424	cbnz	x4,.Loop_upper_neon
1425
1426	add	w5,w5,w22		// accumulate key block
1427	add	x6,x6,x22,lsr#32
1428	add	w7,w7,w23
1429	add	x8,x8,x23,lsr#32
1430	add	w9,w9,w24
1431	add	x10,x10,x24,lsr#32
1432	add	w11,w11,w25
1433	add	x12,x12,x25,lsr#32
1434	add	w13,w13,w26
1435	add	x14,x14,x26,lsr#32
1436	add	w15,w15,w27
1437	add	x16,x16,x27,lsr#32
1438	add	w17,w17,w28
1439	add	x19,x19,x28,lsr#32
1440	add	w20,w20,w30
1441	add	x21,x21,x30,lsr#32
1442
1443	add	x5,x5,x6,lsl#32	// pack
1444	add	x7,x7,x8,lsl#32
1445	ldp	x6,x8,[x1,#0]		// load input
1446	add	x9,x9,x10,lsl#32
1447	add	x11,x11,x12,lsl#32
1448	ldp	x10,x12,[x1,#16]
1449	add	x13,x13,x14,lsl#32
1450	add	x15,x15,x16,lsl#32
1451	ldp	x14,x16,[x1,#32]
1452	add	x17,x17,x19,lsl#32
1453	add	x20,x20,x21,lsl#32
1454	ldp	x19,x21,[x1,#48]
1455	add	x1,x1,#64
1456#ifdef	__AARCH64EB__
1457	rev	x5,x5
1458	rev	x7,x7
1459	rev	x9,x9
1460	rev	x11,x11
1461	rev	x13,x13
1462	rev	x15,x15
1463	rev	x17,x17
1464	rev	x20,x20
1465#endif
1466	eor	x5,x5,x6
1467	eor	x7,x7,x8
1468	eor	x9,x9,x10
1469	eor	x11,x11,x12
1470	eor	x13,x13,x14
1471	eor	x15,x15,x16
1472	eor	x17,x17,x19
1473	eor	x20,x20,x21
1474
1475	stp	x5,x7,[x0,#0]		// store output
1476	add	x28,x28,#1			// increment counter
1477	mov	w5,w22			// unpack key block
1478	lsr	x6,x22,#32
1479	stp	x9,x11,[x0,#16]
1480	mov	w7,w23
1481	lsr	x8,x23,#32
1482	stp	x13,x15,[x0,#32]
1483	mov	w9,w24
1484	lsr	x10,x24,#32
1485	stp	x17,x20,[x0,#48]
1486	add	x0,x0,#64
1487	mov	w11,w25
1488	lsr	x12,x25,#32
1489	mov	w13,w26
1490	lsr	x14,x26,#32
1491	mov	w15,w27
1492	lsr	x16,x27,#32
1493	mov	w17,w28
1494	lsr	x19,x28,#32
1495	mov	w20,w30
1496	lsr	x21,x30,#32
1497
1498	mov	x4,#5
1499.Loop_lower_neon:
1500	sub	x4,x4,#1
1501	add	v8.4s,v8.4s,v9.4s
1502	add	w5,w5,w9
1503	add	v12.4s,v12.4s,v13.4s
1504	add	w6,w6,w10
1505	add	v16.4s,v16.4s,v17.4s
1506	add	w7,w7,w11
1507	add	v20.4s,v20.4s,v21.4s
1508	add	w8,w8,w12
1509	add	v24.4s,v24.4s,v25.4s
1510	eor	w17,w17,w5
1511	add	v28.4s,v28.4s,v29.4s
1512	eor	w19,w19,w6
1513	eor	v11.16b,v11.16b,v8.16b
1514	eor	w20,w20,w7
1515	eor	v15.16b,v15.16b,v12.16b
1516	eor	w21,w21,w8
1517	eor	v19.16b,v19.16b,v16.16b
1518	ror	w17,w17,#16
1519	eor	v23.16b,v23.16b,v20.16b
1520	ror	w19,w19,#16
1521	eor	v27.16b,v27.16b,v24.16b
1522	ror	w20,w20,#16
1523	eor	v31.16b,v31.16b,v28.16b
1524	ror	w21,w21,#16
1525	rev32	v11.8h,v11.8h
1526	add	w13,w13,w17
1527	rev32	v15.8h,v15.8h
1528	add	w14,w14,w19
1529	rev32	v19.8h,v19.8h
1530	add	w15,w15,w20
1531	rev32	v23.8h,v23.8h
1532	add	w16,w16,w21
1533	rev32	v27.8h,v27.8h
1534	eor	w9,w9,w13
1535	rev32	v31.8h,v31.8h
1536	eor	w10,w10,w14
1537	add	v10.4s,v10.4s,v11.4s
1538	eor	w11,w11,w15
1539	add	v14.4s,v14.4s,v15.4s
1540	eor	w12,w12,w16
1541	add	v18.4s,v18.4s,v19.4s
1542	ror	w9,w9,#20
1543	add	v22.4s,v22.4s,v23.4s
1544	ror	w10,w10,#20
1545	add	v26.4s,v26.4s,v27.4s
1546	ror	w11,w11,#20
1547	add	v30.4s,v30.4s,v31.4s
1548	ror	w12,w12,#20
1549	eor	v0.16b,v9.16b,v10.16b
1550	add	w5,w5,w9
1551	eor	v1.16b,v13.16b,v14.16b
1552	add	w6,w6,w10
1553	eor	v2.16b,v17.16b,v18.16b
1554	add	w7,w7,w11
1555	eor	v3.16b,v21.16b,v22.16b
1556	add	w8,w8,w12
1557	eor	v4.16b,v25.16b,v26.16b
1558	eor	w17,w17,w5
1559	eor	v5.16b,v29.16b,v30.16b
1560	eor	w19,w19,w6
1561	ushr	v9.4s,v0.4s,#20
1562	eor	w20,w20,w7
1563	ushr	v13.4s,v1.4s,#20
1564	eor	w21,w21,w8
1565	ushr	v17.4s,v2.4s,#20
1566	ror	w17,w17,#24
1567	ushr	v21.4s,v3.4s,#20
1568	ror	w19,w19,#24
1569	ushr	v25.4s,v4.4s,#20
1570	ror	w20,w20,#24
1571	ushr	v29.4s,v5.4s,#20
1572	ror	w21,w21,#24
1573	sli	v9.4s,v0.4s,#12
1574	add	w13,w13,w17
1575	sli	v13.4s,v1.4s,#12
1576	add	w14,w14,w19
1577	sli	v17.4s,v2.4s,#12
1578	add	w15,w15,w20
1579	sli	v21.4s,v3.4s,#12
1580	add	w16,w16,w21
1581	sli	v25.4s,v4.4s,#12
1582	eor	w9,w9,w13
1583	sli	v29.4s,v5.4s,#12
1584	eor	w10,w10,w14
1585	add	v8.4s,v8.4s,v9.4s
1586	eor	w11,w11,w15
1587	add	v12.4s,v12.4s,v13.4s
1588	eor	w12,w12,w16
1589	add	v16.4s,v16.4s,v17.4s
1590	ror	w9,w9,#25
1591	add	v20.4s,v20.4s,v21.4s
1592	ror	w10,w10,#25
1593	add	v24.4s,v24.4s,v25.4s
1594	ror	w11,w11,#25
1595	add	v28.4s,v28.4s,v29.4s
1596	ror	w12,w12,#25
1597	eor	v11.16b,v11.16b,v8.16b
1598	add	w5,w5,w10
1599	eor	v15.16b,v15.16b,v12.16b
1600	add	w6,w6,w11
1601	eor	v19.16b,v19.16b,v16.16b
1602	add	w7,w7,w12
1603	eor	v23.16b,v23.16b,v20.16b
1604	add	w8,w8,w9
1605	eor	v27.16b,v27.16b,v24.16b
1606	eor	w21,w21,w5
1607	eor	v31.16b,v31.16b,v28.16b
1608	eor	w17,w17,w6
1609	tbl	v11.16b,{v11.16b},v6.16b
1610	eor	w19,w19,w7
1611	tbl	v15.16b,{v15.16b},v6.16b
1612	eor	w20,w20,w8
1613	tbl	v19.16b,{v19.16b},v6.16b
1614	ror	w21,w21,#16
1615	tbl	v23.16b,{v23.16b},v6.16b
1616	ror	w17,w17,#16
1617	tbl	v27.16b,{v27.16b},v6.16b
1618	ror	w19,w19,#16
1619	tbl	v31.16b,{v31.16b},v6.16b
1620	ror	w20,w20,#16
1621	add	v10.4s,v10.4s,v11.4s
1622	add	w15,w15,w21
1623	add	v14.4s,v14.4s,v15.4s
1624	add	w16,w16,w17
1625	add	v18.4s,v18.4s,v19.4s
1626	add	w13,w13,w19
1627	add	v22.4s,v22.4s,v23.4s
1628	add	w14,w14,w20
1629	add	v26.4s,v26.4s,v27.4s
1630	eor	w10,w10,w15
1631	add	v30.4s,v30.4s,v31.4s
1632	eor	w11,w11,w16
1633	eor	v0.16b,v9.16b,v10.16b
1634	eor	w12,w12,w13
1635	eor	v1.16b,v13.16b,v14.16b
1636	eor	w9,w9,w14
1637	eor	v2.16b,v17.16b,v18.16b
1638	ror	w10,w10,#20
1639	eor	v3.16b,v21.16b,v22.16b
1640	ror	w11,w11,#20
1641	eor	v4.16b,v25.16b,v26.16b
1642	ror	w12,w12,#20
1643	eor	v5.16b,v29.16b,v30.16b
1644	ror	w9,w9,#20
1645	ushr	v9.4s,v0.4s,#25
1646	add	w5,w5,w10
1647	ushr	v13.4s,v1.4s,#25
1648	add	w6,w6,w11
1649	ushr	v17.4s,v2.4s,#25
1650	add	w7,w7,w12
1651	ushr	v21.4s,v3.4s,#25
1652	add	w8,w8,w9
1653	ushr	v25.4s,v4.4s,#25
1654	eor	w21,w21,w5
1655	ushr	v29.4s,v5.4s,#25
1656	eor	w17,w17,w6
1657	sli	v9.4s,v0.4s,#7
1658	eor	w19,w19,w7
1659	sli	v13.4s,v1.4s,#7
1660	eor	w20,w20,w8
1661	sli	v17.4s,v2.4s,#7
1662	ror	w21,w21,#24
1663	sli	v21.4s,v3.4s,#7
1664	ror	w17,w17,#24
1665	sli	v25.4s,v4.4s,#7
1666	ror	w19,w19,#24
1667	sli	v29.4s,v5.4s,#7
1668	ror	w20,w20,#24
1669	ext	v10.16b,v10.16b,v10.16b,#8
1670	add	w15,w15,w21
1671	ext	v14.16b,v14.16b,v14.16b,#8
1672	add	w16,w16,w17
1673	ext	v18.16b,v18.16b,v18.16b,#8
1674	add	w13,w13,w19
1675	ext	v22.16b,v22.16b,v22.16b,#8
1676	add	w14,w14,w20
1677	ext	v26.16b,v26.16b,v26.16b,#8
1678	eor	w10,w10,w15
1679	ext	v30.16b,v30.16b,v30.16b,#8
1680	eor	w11,w11,w16
1681	ext	v11.16b,v11.16b,v11.16b,#12
1682	eor	w12,w12,w13
1683	ext	v15.16b,v15.16b,v15.16b,#12
1684	eor	w9,w9,w14
1685	ext	v19.16b,v19.16b,v19.16b,#12
1686	ror	w10,w10,#25
1687	ext	v23.16b,v23.16b,v23.16b,#12
1688	ror	w11,w11,#25
1689	ext	v27.16b,v27.16b,v27.16b,#12
1690	ror	w12,w12,#25
1691	ext	v31.16b,v31.16b,v31.16b,#12
1692	ror	w9,w9,#25
1693	ext	v9.16b,v9.16b,v9.16b,#4
1694	ext	v13.16b,v13.16b,v13.16b,#4
1695	ext	v17.16b,v17.16b,v17.16b,#4
1696	ext	v21.16b,v21.16b,v21.16b,#4
1697	ext	v25.16b,v25.16b,v25.16b,#4
1698	ext	v29.16b,v29.16b,v29.16b,#4
1699	add	v8.4s,v8.4s,v9.4s
1700	add	w5,w5,w9
1701	add	v12.4s,v12.4s,v13.4s
1702	add	w6,w6,w10
1703	add	v16.4s,v16.4s,v17.4s
1704	add	w7,w7,w11
1705	add	v20.4s,v20.4s,v21.4s
1706	add	w8,w8,w12
1707	add	v24.4s,v24.4s,v25.4s
1708	eor	w17,w17,w5
1709	add	v28.4s,v28.4s,v29.4s
1710	eor	w19,w19,w6
1711	eor	v11.16b,v11.16b,v8.16b
1712	eor	w20,w20,w7
1713	eor	v15.16b,v15.16b,v12.16b
1714	eor	w21,w21,w8
1715	eor	v19.16b,v19.16b,v16.16b
1716	ror	w17,w17,#16
1717	eor	v23.16b,v23.16b,v20.16b
1718	ror	w19,w19,#16
1719	eor	v27.16b,v27.16b,v24.16b
1720	ror	w20,w20,#16
1721	eor	v31.16b,v31.16b,v28.16b
1722	ror	w21,w21,#16
1723	rev32	v11.8h,v11.8h
1724	add	w13,w13,w17
1725	rev32	v15.8h,v15.8h
1726	add	w14,w14,w19
1727	rev32	v19.8h,v19.8h
1728	add	w15,w15,w20
1729	rev32	v23.8h,v23.8h
1730	add	w16,w16,w21
1731	rev32	v27.8h,v27.8h
1732	eor	w9,w9,w13
1733	rev32	v31.8h,v31.8h
1734	eor	w10,w10,w14
1735	add	v10.4s,v10.4s,v11.4s
1736	eor	w11,w11,w15
1737	add	v14.4s,v14.4s,v15.4s
1738	eor	w12,w12,w16
1739	add	v18.4s,v18.4s,v19.4s
1740	ror	w9,w9,#20
1741	add	v22.4s,v22.4s,v23.4s
1742	ror	w10,w10,#20
1743	add	v26.4s,v26.4s,v27.4s
1744	ror	w11,w11,#20
1745	add	v30.4s,v30.4s,v31.4s
1746	ror	w12,w12,#20
1747	eor	v0.16b,v9.16b,v10.16b
1748	add	w5,w5,w9
1749	eor	v1.16b,v13.16b,v14.16b
1750	add	w6,w6,w10
1751	eor	v2.16b,v17.16b,v18.16b
1752	add	w7,w7,w11
1753	eor	v3.16b,v21.16b,v22.16b
1754	add	w8,w8,w12
1755	eor	v4.16b,v25.16b,v26.16b
1756	eor	w17,w17,w5
1757	eor	v5.16b,v29.16b,v30.16b
1758	eor	w19,w19,w6
1759	ushr	v9.4s,v0.4s,#20
1760	eor	w20,w20,w7
1761	ushr	v13.4s,v1.4s,#20
1762	eor	w21,w21,w8
1763	ushr	v17.4s,v2.4s,#20
1764	ror	w17,w17,#24
1765	ushr	v21.4s,v3.4s,#20
1766	ror	w19,w19,#24
1767	ushr	v25.4s,v4.4s,#20
1768	ror	w20,w20,#24
1769	ushr	v29.4s,v5.4s,#20
1770	ror	w21,w21,#24
1771	sli	v9.4s,v0.4s,#12
1772	add	w13,w13,w17
1773	sli	v13.4s,v1.4s,#12
1774	add	w14,w14,w19
1775	sli	v17.4s,v2.4s,#12
1776	add	w15,w15,w20
1777	sli	v21.4s,v3.4s,#12
1778	add	w16,w16,w21
1779	sli	v25.4s,v4.4s,#12
1780	eor	w9,w9,w13
1781	sli	v29.4s,v5.4s,#12
1782	eor	w10,w10,w14
1783	add	v8.4s,v8.4s,v9.4s
1784	eor	w11,w11,w15
1785	add	v12.4s,v12.4s,v13.4s
1786	eor	w12,w12,w16
1787	add	v16.4s,v16.4s,v17.4s
1788	ror	w9,w9,#25
1789	add	v20.4s,v20.4s,v21.4s
1790	ror	w10,w10,#25
1791	add	v24.4s,v24.4s,v25.4s
1792	ror	w11,w11,#25
1793	add	v28.4s,v28.4s,v29.4s
1794	ror	w12,w12,#25
1795	eor	v11.16b,v11.16b,v8.16b
1796	add	w5,w5,w10
1797	eor	v15.16b,v15.16b,v12.16b
1798	add	w6,w6,w11
1799	eor	v19.16b,v19.16b,v16.16b
1800	add	w7,w7,w12
1801	eor	v23.16b,v23.16b,v20.16b
1802	add	w8,w8,w9
1803	eor	v27.16b,v27.16b,v24.16b
1804	eor	w21,w21,w5
1805	eor	v31.16b,v31.16b,v28.16b
1806	eor	w17,w17,w6
1807	tbl	v11.16b,{v11.16b},v6.16b
1808	eor	w19,w19,w7
1809	tbl	v15.16b,{v15.16b},v6.16b
1810	eor	w20,w20,w8
1811	tbl	v19.16b,{v19.16b},v6.16b
1812	ror	w21,w21,#16
1813	tbl	v23.16b,{v23.16b},v6.16b
1814	ror	w17,w17,#16
1815	tbl	v27.16b,{v27.16b},v6.16b
1816	ror	w19,w19,#16
1817	tbl	v31.16b,{v31.16b},v6.16b
1818	ror	w20,w20,#16
1819	add	v10.4s,v10.4s,v11.4s
1820	add	w15,w15,w21
1821	add	v14.4s,v14.4s,v15.4s
1822	add	w16,w16,w17
1823	add	v18.4s,v18.4s,v19.4s
1824	add	w13,w13,w19
1825	add	v22.4s,v22.4s,v23.4s
1826	add	w14,w14,w20
1827	add	v26.4s,v26.4s,v27.4s
1828	eor	w10,w10,w15
1829	add	v30.4s,v30.4s,v31.4s
1830	eor	w11,w11,w16
1831	eor	v0.16b,v9.16b,v10.16b
1832	eor	w12,w12,w13
1833	eor	v1.16b,v13.16b,v14.16b
1834	eor	w9,w9,w14
1835	eor	v2.16b,v17.16b,v18.16b
1836	ror	w10,w10,#20
1837	eor	v3.16b,v21.16b,v22.16b
1838	ror	w11,w11,#20
1839	eor	v4.16b,v25.16b,v26.16b
1840	ror	w12,w12,#20
1841	eor	v5.16b,v29.16b,v30.16b
1842	ror	w9,w9,#20
1843	ushr	v9.4s,v0.4s,#25
1844	add	w5,w5,w10
1845	ushr	v13.4s,v1.4s,#25
1846	add	w6,w6,w11
1847	ushr	v17.4s,v2.4s,#25
1848	add	w7,w7,w12
1849	ushr	v21.4s,v3.4s,#25
1850	add	w8,w8,w9
1851	ushr	v25.4s,v4.4s,#25
1852	eor	w21,w21,w5
1853	ushr	v29.4s,v5.4s,#25
1854	eor	w17,w17,w6
1855	sli	v9.4s,v0.4s,#7
1856	eor	w19,w19,w7
1857	sli	v13.4s,v1.4s,#7
1858	eor	w20,w20,w8
1859	sli	v17.4s,v2.4s,#7
1860	ror	w21,w21,#24
1861	sli	v21.4s,v3.4s,#7
1862	ror	w17,w17,#24
1863	sli	v25.4s,v4.4s,#7
1864	ror	w19,w19,#24
1865	sli	v29.4s,v5.4s,#7
1866	ror	w20,w20,#24
1867	ext	v10.16b,v10.16b,v10.16b,#8
1868	add	w15,w15,w21
1869	ext	v14.16b,v14.16b,v14.16b,#8
1870	add	w16,w16,w17
1871	ext	v18.16b,v18.16b,v18.16b,#8
1872	add	w13,w13,w19
1873	ext	v22.16b,v22.16b,v22.16b,#8
1874	add	w14,w14,w20
1875	ext	v26.16b,v26.16b,v26.16b,#8
1876	eor	w10,w10,w15
1877	ext	v30.16b,v30.16b,v30.16b,#8
1878	eor	w11,w11,w16
1879	ext	v11.16b,v11.16b,v11.16b,#4
1880	eor	w12,w12,w13
1881	ext	v15.16b,v15.16b,v15.16b,#4
1882	eor	w9,w9,w14
1883	ext	v19.16b,v19.16b,v19.16b,#4
1884	ror	w10,w10,#25
1885	ext	v23.16b,v23.16b,v23.16b,#4
1886	ror	w11,w11,#25
1887	ext	v27.16b,v27.16b,v27.16b,#4
1888	ror	w12,w12,#25
1889	ext	v31.16b,v31.16b,v31.16b,#4
1890	ror	w9,w9,#25
1891	ext	v9.16b,v9.16b,v9.16b,#12
1892	ext	v13.16b,v13.16b,v13.16b,#12
1893	ext	v17.16b,v17.16b,v17.16b,#12
1894	ext	v21.16b,v21.16b,v21.16b,#12
1895	ext	v25.16b,v25.16b,v25.16b,#12
1896	ext	v29.16b,v29.16b,v29.16b,#12
1897	cbnz	x4,.Loop_lower_neon
1898
1899	add	w5,w5,w22		// accumulate key block
1900	ldp	q0,q1,[sp,#0]
1901	add	x6,x6,x22,lsr#32
1902	ldp	q2,q3,[sp,#32]
1903	add	w7,w7,w23
1904	ldp	q4,q5,[sp,#64]
1905	add	x8,x8,x23,lsr#32
1906	ldr	q6,[sp,#96]
1907	add	v8.4s,v8.4s,v0.4s
1908	add	w9,w9,w24
1909	add	v12.4s,v12.4s,v0.4s
1910	add	x10,x10,x24,lsr#32
1911	add	v16.4s,v16.4s,v0.4s
1912	add	w11,w11,w25
1913	add	v20.4s,v20.4s,v0.4s
1914	add	x12,x12,x25,lsr#32
1915	add	v24.4s,v24.4s,v0.4s
1916	add	w13,w13,w26
1917	add	v28.4s,v28.4s,v0.4s
1918	add	x14,x14,x26,lsr#32
1919	add	v10.4s,v10.4s,v2.4s
1920	add	w15,w15,w27
1921	add	v14.4s,v14.4s,v2.4s
1922	add	x16,x16,x27,lsr#32
1923	add	v18.4s,v18.4s,v2.4s
1924	add	w17,w17,w28
1925	add	v22.4s,v22.4s,v2.4s
1926	add	x19,x19,x28,lsr#32
1927	add	v26.4s,v26.4s,v2.4s
1928	add	w20,w20,w30
1929	add	v30.4s,v30.4s,v2.4s
1930	add	x21,x21,x30,lsr#32
1931	add	v27.4s,v27.4s,v7.4s			// +4
1932	add	x5,x5,x6,lsl#32	// pack
1933	add	v31.4s,v31.4s,v7.4s			// +4
1934	add	x7,x7,x8,lsl#32
1935	add	v11.4s,v11.4s,v3.4s
1936	ldp	x6,x8,[x1,#0]		// load input
1937	add	v15.4s,v15.4s,v4.4s
1938	add	x9,x9,x10,lsl#32
1939	add	v19.4s,v19.4s,v5.4s
1940	add	x11,x11,x12,lsl#32
1941	add	v23.4s,v23.4s,v6.4s
1942	ldp	x10,x12,[x1,#16]
1943	add	v27.4s,v27.4s,v3.4s
1944	add	x13,x13,x14,lsl#32
1945	add	v31.4s,v31.4s,v4.4s
1946	add	x15,x15,x16,lsl#32
1947	add	v9.4s,v9.4s,v1.4s
1948	ldp	x14,x16,[x1,#32]
1949	add	v13.4s,v13.4s,v1.4s
1950	add	x17,x17,x19,lsl#32
1951	add	v17.4s,v17.4s,v1.4s
1952	add	x20,x20,x21,lsl#32
1953	add	v21.4s,v21.4s,v1.4s
1954	ldp	x19,x21,[x1,#48]
1955	add	v25.4s,v25.4s,v1.4s
1956	add	x1,x1,#64
1957	add	v29.4s,v29.4s,v1.4s
1958
1959#ifdef	__AARCH64EB__
1960	rev	x5,x5
1961	rev	x7,x7
1962	rev	x9,x9
1963	rev	x11,x11
1964	rev	x13,x13
1965	rev	x15,x15
1966	rev	x17,x17
1967	rev	x20,x20
1968#endif
1969	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1970	eor	x5,x5,x6
1971	eor	x7,x7,x8
1972	eor	x9,x9,x10
1973	eor	x11,x11,x12
1974	eor	x13,x13,x14
1975	eor	v8.16b,v8.16b,v0.16b
1976	eor	x15,x15,x16
1977	eor	v9.16b,v9.16b,v1.16b
1978	eor	x17,x17,x19
1979	eor	v10.16b,v10.16b,v2.16b
1980	eor	x20,x20,x21
1981	eor	v11.16b,v11.16b,v3.16b
1982	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1983
1984	stp	x5,x7,[x0,#0]		// store output
1985	add	x28,x28,#7			// increment counter
1986	stp	x9,x11,[x0,#16]
1987	stp	x13,x15,[x0,#32]
1988	stp	x17,x20,[x0,#48]
1989	add	x0,x0,#64
1990	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1991
1992	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1993	eor	v12.16b,v12.16b,v0.16b
1994	eor	v13.16b,v13.16b,v1.16b
1995	eor	v14.16b,v14.16b,v2.16b
1996	eor	v15.16b,v15.16b,v3.16b
1997	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1998
1999	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
2000	eor	v16.16b,v16.16b,v8.16b
2001	ldp	q0,q1,[sp,#0]
2002	eor	v17.16b,v17.16b,v9.16b
2003	ldp	q2,q3,[sp,#32]
2004	eor	v18.16b,v18.16b,v10.16b
2005	eor	v19.16b,v19.16b,v11.16b
2006	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
2007
2008	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
2009	eor	v20.16b,v20.16b,v12.16b
2010	eor	v21.16b,v21.16b,v13.16b
2011	eor	v22.16b,v22.16b,v14.16b
2012	eor	v23.16b,v23.16b,v15.16b
2013	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
2014
2015	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
2016	eor	v24.16b,v24.16b,v16.16b
2017	eor	v25.16b,v25.16b,v17.16b
2018	eor	v26.16b,v26.16b,v18.16b
2019	eor	v27.16b,v27.16b,v19.16b
2020	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
2021
2022	shl	v8.4s,v7.4s,#1			// 4 -> 8
2023	eor	v28.16b,v28.16b,v20.16b
2024	eor	v29.16b,v29.16b,v21.16b
2025	eor	v30.16b,v30.16b,v22.16b
2026	eor	v31.16b,v31.16b,v23.16b
2027	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
2028
2029	add	v3.4s,v3.4s,v8.4s			// += 8
2030	add	v4.4s,v4.4s,v8.4s
2031	add	v5.4s,v5.4s,v8.4s
2032	add	v6.4s,v6.4s,v8.4s
2033
2034	b.hs	.Loop_outer_512_neon
2035
2036	adds	x2,x2,#512
2037	ushr	v7.4s,v7.4s,#1			// 4 -> 2
2038
2039	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
2040	ldp	d12,d13,[sp,#128+32]
2041	ldp	d14,d15,[sp,#128+48]
2042
2043	stp	q0,q0,[sp,#0]		// wipe off-load area
2044	stp	q0,q0,[sp,#32]
2045	stp	q0,q0,[sp,#64]
2046
2047	b.eq	.Ldone_512_neon
2048
2049	sub	x3,x3,#16			// .Lone
2050	cmp	x2,#192
2051	add	sp,sp,#128
2052	sub	v3.4s,v3.4s,v7.4s		// -= 2
2053	ld1	{v8.4s,v9.4s},[x3]
2054	b.hs	.Loop_outer_neon
2055
2056	ldp	d8,d9,[sp,#0]			// meet ABI requirements
2057	eor	v1.16b,v1.16b,v1.16b
2058	eor	v2.16b,v2.16b,v2.16b
2059	eor	v3.16b,v3.16b,v3.16b
2060	eor	v4.16b,v4.16b,v4.16b
2061	eor	v5.16b,v5.16b,v5.16b
2062	eor	v6.16b,v6.16b,v6.16b
2063	b	.Loop_outer
2064
2065.Ldone_512_neon:
2066	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
2067	ldp	x19,x20,[x29,#16]
2068	add	sp,sp,#128+64
2069	ldp	x21,x22,[x29,#32]
2070	ldp	x23,x24,[x29,#48]
2071	ldp	x25,x26,[x29,#64]
2072	ldp	x27,x28,[x29,#80]
2073	ldp	x29,x30,[sp],#96
2074	AARCH64_VALIDATE_LINK_REGISTER
2075	ret
2076.size	ChaCha20_512_neon,.-ChaCha20_512_neon
2077