xref: /freebsd/sys/crypto/openssl/aarch64/chacha-armv8.S (revision 59c8e88e72633afbc47a4ace0d2170d00d51f7dc)
1/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
2#include "arm_arch.h"
3#ifndef	__KERNEL__
4
5.hidden	OPENSSL_armcap_P
6#endif
7
8.text
9
// Read-only constants used by the ChaCha20 routines in this file.
10.align	5
// .Lsigma: the 16 ASCII bytes "expand 32-byte k" stored as two 64-bit values.
// They are loaded as the first four 32-bit state words (x22/x23 in the scalar
// code, v0 in the NEON code).
11.Lsigma:
12.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// .Lone: per-lane counter offsets. ChaCha20_neon loads these into v8 and adds
// them to the broadcast counter word so each NEON lane works on its own block.
13.Lone:
14.long	1,2,3,4
// .Lrot24: TBL byte-index table. Within every 32-bit lane the result bytes
// 0..3 are taken from source bytes 3,0,1,2, i.e. a rotate right by 24 bits
// (rotate left by 8).
15.Lrot24:
16.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm"
17.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
18.align	2
19
// ChaCha20_ctr32: exported entry point. Register roles, as used by the code:
//   x0 = output pointer (input XOR keystream is stored here)
//   x1 = input pointer
//   x2 = length in bytes (zero returns immediately through .Labort)
//   x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter block
// Inputs shorter than 192 bytes always use the scalar code at .Lshort. Longer
// inputs branch to .LChaCha20_neon when OPENSSL_armcap_P has ARMV7_NEON set.
// When __KERNEL__ is defined the capability test is compiled out, so this
// entry point always runs the scalar code.
20.globl	ChaCha20_ctr32
21.type	ChaCha20_ctr32,%function
22.align	5
23ChaCha20_ctr32:
// Macro supplied by arm_arch.h; presumably signs x30 for pointer
// authentication (its definition is not visible in this file). It is paired
// with AARCH64_VALIDATE_LINK_REGISTER before every ret.
24	AARCH64_SIGN_LINK_REGISTER
25	cbz	x2,.Labort
26	cmp	x2,#192
27	b.lo	.Lshort
28
29#ifndef	__KERNEL__
// x17 is used as scratch for the capability word. The branch target lies
// after ChaCha20_neon's own AARCH64_SIGN_LINK_REGISTER, so that macro is not
// executed a second time on this path.
30	adrp	x17,OPENSSL_armcap_P
31	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
32	tst	w17,#ARMV7_NEON
33	b.ne	.LChaCha20_neon
34#endif
35
// Scalar path. Build a 96-byte frame holding x29/x30 and the callee-saved
// registers x19-x28; x29 becomes the frame pointer that the epilogues use to
// reload them.
36.Lshort:
37	stp	x29,x30,[sp,#-96]!
38	add	x29,sp,#0
39
40	adr	x5,.Lsigma
41	stp	x19,x20,[sp,#16]
42	stp	x21,x22,[sp,#32]
43	stp	x23,x24,[sp,#48]
44	stp	x25,x26,[sp,#64]
45	stp	x27,x28,[sp,#80]
// Reserve a further 64 bytes below the frame; .Ltail stages one keystream
// block there.
46	sub	sp,sp,#64
47
// Load the 16 state words packed two per 64-bit register:
// x22,x23 = sigma, x24-x27 = key, x28,x30 = counter block. x30 (the link
// register) is reused as a data register; its saved copy is reloaded from
// the frame on exit.
48	ldp	x22,x23,[x5]		// load sigma
49	ldp	x24,x25,[x3]		// load key
50	ldp	x26,x27,[x3,#16]
51	ldp	x28,x30,[x4]		// load counter
52#ifdef	__AARCH64EB__
// Big-endian builds: swap the 32-bit halves of each 64-bit load so the low
// half holds the lower-addressed word, which is what the unpack code expects.
// Sigma needs no swap (it is stored as endian-neutral .quad values).
53	ror	x24,x24,#32
54	ror	x25,x25,#32
55	ror	x26,x26,#32
56	ror	x27,x27,#32
57	ror	x28,x28,#32
58	ror	x30,x30,#32
59#endif
60
// .Loop_outer: start of one 64-byte block in the scalar path. The packed
// state in x22-x28,x30 is split into sixteen 32-bit working registers
// w5-w17,w19-w21: the low word with "mov wN", the high word with "lsr #32".
// The packed copies are left untouched so they can be added back after the
// rounds.
61.Loop_outer:
62	mov	w5,w22			// unpack key block
63	lsr	x6,x22,#32
64	mov	w7,w23
65	lsr	x8,x23,#32
66	mov	w9,w24
67	lsr	x10,x24,#32
68	mov	w11,w25
69	lsr	x12,x25,#32
70	mov	w13,w26
71	lsr	x14,x26,#32
72	mov	w15,w27
73	lsr	x16,x27,#32
74	mov	w17,w28
75	lsr	x19,x28,#32
76	mov	w20,w30
77	lsr	x21,x30,#32
78
// x4 counts iterations of .Loop: 10 iterations, each applying two groups of
// four quarter-rounds (20 rounds in total).
79	mov	x4,#10
// Consume 64 bytes of length. The flags set here are tested only after the
// rounds, by "b.lo .Ltail" and "b.hi .Loop_outer"; no instruction in between
// writes the condition flags.
80	subs	x2,x2,#64
// .Loop: one double round on the scalar working state (w5-w8 = words 0-3,
// w9-w12 = words 4-7, w13-w16 = words 8-11, w17,w19,w20,w21 = words 12-15).
// Four quarter-rounds are interleaved instruction by instruction. The
// "ror #16/#20/#24/#25" instructions are rotate-lefts by 16/12/8/7.
81.Loop:
82	sub	x4,x4,#1
// First group: quarter-rounds on (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15).
83	add	w5,w5,w9
84	add	w6,w6,w10
85	add	w7,w7,w11
86	add	w8,w8,w12
87	eor	w17,w17,w5
88	eor	w19,w19,w6
89	eor	w20,w20,w7
90	eor	w21,w21,w8
91	ror	w17,w17,#16
92	ror	w19,w19,#16
93	ror	w20,w20,#16
94	ror	w21,w21,#16
95	add	w13,w13,w17
96	add	w14,w14,w19
97	add	w15,w15,w20
98	add	w16,w16,w21
99	eor	w9,w9,w13
100	eor	w10,w10,w14
101	eor	w11,w11,w15
102	eor	w12,w12,w16
103	ror	w9,w9,#20
104	ror	w10,w10,#20
105	ror	w11,w11,#20
106	ror	w12,w12,#20
107	add	w5,w5,w9
108	add	w6,w6,w10
109	add	w7,w7,w11
110	add	w8,w8,w12
111	eor	w17,w17,w5
112	eor	w19,w19,w6
113	eor	w20,w20,w7
114	eor	w21,w21,w8
115	ror	w17,w17,#24
116	ror	w19,w19,#24
117	ror	w20,w20,#24
118	ror	w21,w21,#24
119	add	w13,w13,w17
120	add	w14,w14,w19
121	add	w15,w15,w20
122	add	w16,w16,w21
123	eor	w9,w9,w13
124	eor	w10,w10,w14
125	eor	w11,w11,w15
126	eor	w12,w12,w16
127	ror	w9,w9,#25
128	ror	w10,w10,#25
129	ror	w11,w11,#25
130	ror	w12,w12,#25
// Second group: the same quarter-round applied to the diagonals
// (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14), selected purely by which
// registers are paired; no data is moved.
131	add	w5,w5,w10
132	add	w6,w6,w11
133	add	w7,w7,w12
134	add	w8,w8,w9
135	eor	w21,w21,w5
136	eor	w17,w17,w6
137	eor	w19,w19,w7
138	eor	w20,w20,w8
139	ror	w21,w21,#16
140	ror	w17,w17,#16
141	ror	w19,w19,#16
142	ror	w20,w20,#16
143	add	w15,w15,w21
144	add	w16,w16,w17
145	add	w13,w13,w19
146	add	w14,w14,w20
147	eor	w10,w10,w15
148	eor	w11,w11,w16
149	eor	w12,w12,w13
150	eor	w9,w9,w14
151	ror	w10,w10,#20
152	ror	w11,w11,#20
153	ror	w12,w12,#20
154	ror	w9,w9,#20
155	add	w5,w5,w10
156	add	w6,w6,w11
157	add	w7,w7,w12
158	add	w8,w8,w9
159	eor	w21,w21,w5
160	eor	w17,w17,w6
161	eor	w19,w19,w7
162	eor	w20,w20,w8
163	ror	w21,w21,#24
164	ror	w17,w17,#24
165	ror	w19,w19,#24
166	ror	w20,w20,#24
167	add	w15,w15,w21
168	add	w16,w16,w17
169	add	w13,w13,w19
170	add	w14,w14,w20
171	eor	w10,w10,w15
172	eor	w11,w11,w16
173	eor	w12,w12,w13
174	eor	w9,w9,w14
175	ror	w10,w10,#25
176	ror	w11,w11,#25
177	ror	w12,w12,#25
178	ror	w9,w9,#25
179	cbnz	x4,.Loop
180
// Add the original state back in. Low words use a 32-bit add; high words use
// a 64-bit add of the source shifted right by 32. Bits above 31 of the
// odd-numbered registers are either shifted out by the "lsl #32" pack below
// or dropped by the rev in the big-endian variant.
181	add	w5,w5,w22		// accumulate key block
182	add	x6,x6,x22,lsr#32
183	add	w7,w7,w23
184	add	x8,x8,x23,lsr#32
185	add	w9,w9,w24
186	add	x10,x10,x24,lsr#32
187	add	w11,w11,w25
188	add	x12,x12,x25,lsr#32
189	add	w13,w13,w26
190	add	x14,x14,x26,lsr#32
191	add	w15,w15,w27
192	add	x16,x16,x27,lsr#32
193	add	w17,w17,w28
194	add	x19,x19,x28,lsr#32
195	add	w20,w20,w30
196	add	x21,x21,x30,lsr#32
197
// Flags are still those of "subs x2,x2,#64": lower means fewer than 64 bytes
// were left, so finish with the byte-wise tail.
198	b.lo	.Ltail
199
// Full block: pack word pairs back into 64-bit registers, interleaved with
// loading the 64 input bytes into the odd registers that packing has freed.
200	add	x5,x5,x6,lsl#32	// pack
201	add	x7,x7,x8,lsl#32
202	ldp	x6,x8,[x1,#0]		// load input
203	add	x9,x9,x10,lsl#32
204	add	x11,x11,x12,lsl#32
205	ldp	x10,x12,[x1,#16]
206	add	x13,x13,x14,lsl#32
207	add	x15,x15,x16,lsl#32
208	ldp	x14,x16,[x1,#32]
209	add	x17,x17,x19,lsl#32
210	add	x20,x20,x21,lsl#32
211	ldp	x19,x21,[x1,#48]
212	add	x1,x1,#64
213#ifdef	__AARCH64EB__
// Big-endian builds: byte-reverse the packed keystream before XORing it with
// the big-endian-loaded input.
214	rev	x5,x5
215	rev	x7,x7
216	rev	x9,x9
217	rev	x11,x11
218	rev	x13,x13
219	rev	x15,x15
220	rev	x17,x17
221	rev	x20,x20
222#endif
223	eor	x5,x5,x6
224	eor	x7,x7,x8
225	eor	x9,x9,x10
226	eor	x11,x11,x12
227	eor	x13,x13,x14
228	eor	x15,x15,x16
229	eor	x17,x17,x19
230	eor	x20,x20,x21
231
// x28 holds the first two counter-block words; the 64-bit add bumps the low
// word (the block counter) by one.
232	stp	x5,x7,[x0,#0]		// store output
233	add	x28,x28,#1			// increment counter
234	stp	x9,x11,[x0,#16]
235	stp	x13,x15,[x0,#32]
236	stp	x17,x20,[x0,#48]
237	add	x0,x0,#64
238
// Still the flags from "subs x2,x2,#64": higher means bytes remain.
239	b.hi	.Loop_outer
240
// Epilogue: reload callee-saved registers through the frame pointer, release
// the 64-byte scratch area, then pop the 96-byte frame.
241	ldp	x19,x20,[x29,#16]
242	add	sp,sp,#64
243	ldp	x21,x22,[x29,#32]
244	ldp	x23,x24,[x29,#48]
245	ldp	x25,x26,[x29,#64]
246	ldp	x27,x28,[x29,#80]
247	ldp	x29,x30,[sp],#96
// .Labort is also the target of the zero-length check at entry, which
// branches here before any frame has been created.
248.Labort:
249	AARCH64_VALIDATE_LINK_REGISTER
250	ret
251
// .Ltail: final partial block of the scalar path, reached when
// "subs x2,x2,#64" went below zero. Adding 64 back makes x2 the number of
// bytes still to produce.
252.align	4
253.Ltail:
254	add	x2,x2,#64
// .Less_than_64 is also entered from .Ltail_neon in ChaCha20_neon, which
// builds the same frame and 64-byte scratch area and leaves the scalar block
// in the same working registers.
255.Less_than_64:
// Bias the pointers so the byte loop can count a negative index up to zero:
// x1 = end of input, x4 = scratch + len, x2 = -len. x0 gets an extra -1
// because the loop increments x2 before it stores.
256	sub	x0,x0,#1
257	add	x1,x1,x2
258	add	x0,x0,x2
259	add	x4,sp,x2
260	neg	x2,x2
261
262	add	x5,x5,x6,lsl#32	// pack
263	add	x7,x7,x8,lsl#32
264	add	x9,x9,x10,lsl#32
265	add	x11,x11,x12,lsl#32
266	add	x13,x13,x14,lsl#32
267	add	x15,x15,x16,lsl#32
268	add	x17,x17,x19,lsl#32
269	add	x20,x20,x21,lsl#32
270#ifdef	__AARCH64EB__
271	rev	x5,x5
272	rev	x7,x7
273	rev	x9,x9
274	rev	x11,x11
275	rev	x13,x13
276	rev	x15,x15
277	rev	x17,x17
278	rev	x20,x20
279#endif
// Stage the whole 64-byte keystream block in the stack scratch area.
280	stp	x5,x7,[sp,#0]
281	stp	x9,x11,[sp,#16]
282	stp	x13,x15,[sp,#32]
283	stp	x17,x20,[sp,#48]
284
// XOR one byte per iteration: input byte ^ staged keystream byte -> output.
// The loop ends when the negative index x2 reaches zero.
285.Loop_tail:
286	ldrb	w10,[x1,x2]
287	ldrb	w11,[x4,x2]
288	add	x2,x2,#1
289	eor	w10,w10,w11
290	strb	w10,[x0,x2]
291	cbnz	x2,.Loop_tail
292
// Overwrite the staged keystream with zeros before releasing the stack.
293	stp	xzr,xzr,[sp,#0]
294	stp	xzr,xzr,[sp,#16]
295	stp	xzr,xzr,[sp,#32]
296	stp	xzr,xzr,[sp,#48]
297
// Epilogue, identical to the one on the full-block path.
298	ldp	x19,x20,[x29,#16]
299	add	sp,sp,#64
300	ldp	x21,x22,[x29,#32]
301	ldp	x23,x24,[x29,#48]
302	ldp	x25,x26,[x29,#64]
303	ldp	x27,x28,[x29,#80]
304	ldp	x29,x30,[sp],#96
305	AARCH64_VALIDATE_LINK_REGISTER
306	ret
307.size	ChaCha20_ctr32,.-ChaCha20_ctr32
308
// ChaCha20_neon: NEON-assisted variant that produces five 64-byte blocks per
// outer iteration (four in vector lanes plus one in scalar registers). It
// uses the same register arguments as ChaCha20_ctr32: x0 = output,
// x1 = input, x2 = length in bytes, x3 = key pointer, x4 = counter pointer.
// The symbol is exported only when __KERNEL__ is defined; otherwise it is
// reached through the branch to .LChaCha20_neon inside ChaCha20_ctr32.
// NOTE(review): unlike ChaCha20_ctr32 there is no zero-length check on this
// entry; callers presumably guarantee a non-zero length - confirm.
309#ifdef	__KERNEL__
310.globl	ChaCha20_neon
311#endif
312.type	ChaCha20_neon,%function
313.align	5
314ChaCha20_neon:
315	AARCH64_SIGN_LINK_REGISTER
// Internal entry used by ChaCha20_ctr32, placed after the macro above.
316.LChaCha20_neon:
// Same 96-byte frame as the scalar path: x29/x30 plus callee-saved x19-x28.
317	stp	x29,x30,[sp,#-96]!
318	add	x29,sp,#0
319
320	adr	x5,.Lsigma
321	stp	x19,x20,[sp,#16]
322	stp	x21,x22,[sp,#32]
323	stp	x23,x24,[sp,#48]
324	stp	x25,x26,[sp,#64]
325	stp	x27,x28,[sp,#80]
// Inputs of 512 bytes or more are handled by the code at .L512_or_more_neon.
326	cmp	x2,#512
327	b.hs	.L512_or_more_neon
328
// 64-byte scratch area: it holds the saved d8/d9 while the main loop runs and
// is reused to stage a keystream block in the tail code.
329	sub	sp,sp,#64
330
// The state is loaded twice: packed into x22-x28,x30 for the scalar block,
// and as rows v0 (sigma), v1/v2 (key), v3 (counter block) for the vector
// blocks. The post-indexed ld1 advances x5 past .Lsigma, so the final ld1
// fetches .Lone into v8 and .Lrot24 into v9.
331	ldp	x22,x23,[x5]		// load sigma
332	ld1	{v0.4s},[x5],#16
333	ldp	x24,x25,[x3]		// load key
334	ldp	x26,x27,[x3,#16]
335	ld1	{v1.4s,v2.4s},[x3]
336	ldp	x28,x30,[x4]		// load counter
337	ld1	{v3.4s},[x4]
// The low 64 bits of v8/v9 are callee-saved, so spill d8/d9 before they are
// overwritten.
338	stp	d8,d9,[sp]			// meet ABI requirements
339	ld1	{v8.4s,v9.4s},[x5]
340#ifdef	__AARCH64EB__
// Big-endian builds: fix up word order in the vector sigma row and in the
// 64-bit scalar loads of key and counter.
341	rev64	v0.4s,v0.4s
342	ror	x24,x24,#32
343	ror	x25,x25,#32
344	ror	x26,x26,#32
345	ror	x27,x27,#32
346	ror	x28,x28,#32
347	ror	x30,x30,#32
348#endif
349
// .Loop_outer_neon: set up five blocks. Each "dup" broadcasts one state word
// to all four lanes, so a vector register holds the same word of four
// different blocks:
//   v16,v20,v24,v28 = words 0-3   (from v0, sigma)
//   v17,v21,v25,v29 = words 4-7   (from v1, key)
//   v18,v22,v26,v30 = words 8-11  (from v2, key)
//   v19,v23,v27,v31 = words 12-15 (from v3, counter block)
// Interleaved with this, the scalar block is unpacked into w5-w17,w19-w21
// exactly as in ChaCha20_ctr32.
350.Loop_outer_neon:
351	dup	v16.4s,v0.s[0]			// unpack key block
352	mov	w5,w22
353	dup	v20.4s,v0.s[1]
354	lsr	x6,x22,#32
355	dup	v24.4s,v0.s[2]
356	mov	w7,w23
357	dup	v28.4s,v0.s[3]
358	lsr	x8,x23,#32
359	dup	v17.4s,v1.s[0]
360	mov	w9,w24
361	dup	v21.4s,v1.s[1]
362	lsr	x10,x24,#32
363	dup	v25.4s,v1.s[2]
364	mov	w11,w25
365	dup	v29.4s,v1.s[3]
366	lsr	x12,x25,#32
367	dup	v19.4s,v3.s[0]
368	mov	w13,w26
369	dup	v23.4s,v3.s[1]
370	lsr	x14,x26,#32
371	dup	v27.4s,v3.s[2]
372	mov	w15,w27
373	dup	v31.4s,v3.s[3]
374	lsr	x16,x27,#32
// Give each vector lane its own counter: broadcast word 12 plus the lane
// offsets in v8. The scalar block uses the unmodified counter in w17.
375	add	v19.4s,v19.4s,v8.4s
376	mov	w17,w28
377	dup	v18.4s,v2.s[0]
378	lsr	x19,x28,#32
379	dup	v22.4s,v2.s[1]
380	mov	w20,w30
381	dup	v26.4s,v2.s[2]
382	lsr	x21,x30,#32
383	dup	v30.4s,v2.s[3]
384
// 10 iterations of .Loop_neon, two groups of quarter-rounds each.
385	mov	x4,#10
// Consume 5 x 64 bytes of length. The flags are tested after the rounds by
// "b.lo .Ltail_neon" and "b.hi .Loop_outer_neon"; nothing in between writes
// the condition flags.
386	subs	x2,x2,#320
// .Loop_neon: one double round on all five blocks. Vector and scalar
// instructions alternate so both pipelines stay busy. The rotates are:
//   left 16: rev32 on .8h (vector)      / ror #16 (scalar)
//   left 12: ushr #20 + sli #12         / ror #20
//   left  8: tbl with v9 (.Lrot24)      / ror #24
//   left  7: ushr #25 + sli #7          / ror #25
// v4-v7 are temporaries for the two-instruction vector rotates.
387.Loop_neon:
388	sub	x4,x4,#1
// First group: columns (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15).
389	add	v16.4s,v16.4s,v17.4s
390	add	w5,w5,w9
391	add	v20.4s,v20.4s,v21.4s
392	add	w6,w6,w10
393	add	v24.4s,v24.4s,v25.4s
394	add	w7,w7,w11
395	add	v28.4s,v28.4s,v29.4s
396	add	w8,w8,w12
397	eor	v19.16b,v19.16b,v16.16b
398	eor	w17,w17,w5
399	eor	v23.16b,v23.16b,v20.16b
400	eor	w19,w19,w6
401	eor	v27.16b,v27.16b,v24.16b
402	eor	w20,w20,w7
403	eor	v31.16b,v31.16b,v28.16b
404	eor	w21,w21,w8
405	rev32	v19.8h,v19.8h
406	ror	w17,w17,#16
407	rev32	v23.8h,v23.8h
408	ror	w19,w19,#16
409	rev32	v27.8h,v27.8h
410	ror	w20,w20,#16
411	rev32	v31.8h,v31.8h
412	ror	w21,w21,#16
413	add	v18.4s,v18.4s,v19.4s
414	add	w13,w13,w17
415	add	v22.4s,v22.4s,v23.4s
416	add	w14,w14,w19
417	add	v26.4s,v26.4s,v27.4s
418	add	w15,w15,w20
419	add	v30.4s,v30.4s,v31.4s
420	add	w16,w16,w21
421	eor	v4.16b,v17.16b,v18.16b
422	eor	w9,w9,w13
423	eor	v5.16b,v21.16b,v22.16b
424	eor	w10,w10,w14
425	eor	v6.16b,v25.16b,v26.16b
426	eor	w11,w11,w15
427	eor	v7.16b,v29.16b,v30.16b
428	eor	w12,w12,w16
429	ushr	v17.4s,v4.4s,#20
430	ror	w9,w9,#20
431	ushr	v21.4s,v5.4s,#20
432	ror	w10,w10,#20
433	ushr	v25.4s,v6.4s,#20
434	ror	w11,w11,#20
435	ushr	v29.4s,v7.4s,#20
436	ror	w12,w12,#20
437	sli	v17.4s,v4.4s,#12
438	add	w5,w5,w9
439	sli	v21.4s,v5.4s,#12
440	add	w6,w6,w10
441	sli	v25.4s,v6.4s,#12
442	add	w7,w7,w11
443	sli	v29.4s,v7.4s,#12
444	add	w8,w8,w12
445	add	v16.4s,v16.4s,v17.4s
446	eor	w17,w17,w5
447	add	v20.4s,v20.4s,v21.4s
448	eor	w19,w19,w6
449	add	v24.4s,v24.4s,v25.4s
450	eor	w20,w20,w7
451	add	v28.4s,v28.4s,v29.4s
452	eor	w21,w21,w8
453	eor	v4.16b,v19.16b,v16.16b
454	ror	w17,w17,#24
455	eor	v5.16b,v23.16b,v20.16b
456	ror	w19,w19,#24
457	eor	v6.16b,v27.16b,v24.16b
458	ror	w20,w20,#24
459	eor	v7.16b,v31.16b,v28.16b
460	ror	w21,w21,#24
461	tbl	v19.16b,{v4.16b},v9.16b
462	add	w13,w13,w17
463	tbl	v23.16b,{v5.16b},v9.16b
464	add	w14,w14,w19
465	tbl	v27.16b,{v6.16b},v9.16b
466	add	w15,w15,w20
467	tbl	v31.16b,{v7.16b},v9.16b
468	add	w16,w16,w21
469	add	v18.4s,v18.4s,v19.4s
470	eor	w9,w9,w13
471	add	v22.4s,v22.4s,v23.4s
472	eor	w10,w10,w14
473	add	v26.4s,v26.4s,v27.4s
474	eor	w11,w11,w15
475	add	v30.4s,v30.4s,v31.4s
476	eor	w12,w12,w16
477	eor	v4.16b,v17.16b,v18.16b
478	ror	w9,w9,#25
479	eor	v5.16b,v21.16b,v22.16b
480	ror	w10,w10,#25
481	eor	v6.16b,v25.16b,v26.16b
482	ror	w11,w11,#25
483	eor	v7.16b,v29.16b,v30.16b
484	ror	w12,w12,#25
485	ushr	v17.4s,v4.4s,#25
486	ushr	v21.4s,v5.4s,#25
487	ushr	v25.4s,v6.4s,#25
488	ushr	v29.4s,v7.4s,#25
489	sli	v17.4s,v4.4s,#7
490	sli	v21.4s,v5.4s,#7
491	sli	v25.4s,v6.4s,#7
492	sli	v29.4s,v7.4s,#7
// Second group: diagonals (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14),
// selected by register pairing only.
493	add	v16.4s,v16.4s,v21.4s
494	add	w5,w5,w10
495	add	v20.4s,v20.4s,v25.4s
496	add	w6,w6,w11
497	add	v24.4s,v24.4s,v29.4s
498	add	w7,w7,w12
499	add	v28.4s,v28.4s,v17.4s
500	add	w8,w8,w9
501	eor	v31.16b,v31.16b,v16.16b
502	eor	w21,w21,w5
503	eor	v19.16b,v19.16b,v20.16b
504	eor	w17,w17,w6
505	eor	v23.16b,v23.16b,v24.16b
506	eor	w19,w19,w7
507	eor	v27.16b,v27.16b,v28.16b
508	eor	w20,w20,w8
509	rev32	v31.8h,v31.8h
510	ror	w21,w21,#16
511	rev32	v19.8h,v19.8h
512	ror	w17,w17,#16
513	rev32	v23.8h,v23.8h
514	ror	w19,w19,#16
515	rev32	v27.8h,v27.8h
516	ror	w20,w20,#16
517	add	v26.4s,v26.4s,v31.4s
518	add	w15,w15,w21
519	add	v30.4s,v30.4s,v19.4s
520	add	w16,w16,w17
521	add	v18.4s,v18.4s,v23.4s
522	add	w13,w13,w19
523	add	v22.4s,v22.4s,v27.4s
524	add	w14,w14,w20
525	eor	v4.16b,v21.16b,v26.16b
526	eor	w10,w10,w15
527	eor	v5.16b,v25.16b,v30.16b
528	eor	w11,w11,w16
529	eor	v6.16b,v29.16b,v18.16b
530	eor	w12,w12,w13
531	eor	v7.16b,v17.16b,v22.16b
532	eor	w9,w9,w14
533	ushr	v21.4s,v4.4s,#20
534	ror	w10,w10,#20
535	ushr	v25.4s,v5.4s,#20
536	ror	w11,w11,#20
537	ushr	v29.4s,v6.4s,#20
538	ror	w12,w12,#20
539	ushr	v17.4s,v7.4s,#20
540	ror	w9,w9,#20
541	sli	v21.4s,v4.4s,#12
542	add	w5,w5,w10
543	sli	v25.4s,v5.4s,#12
544	add	w6,w6,w11
545	sli	v29.4s,v6.4s,#12
546	add	w7,w7,w12
547	sli	v17.4s,v7.4s,#12
548	add	w8,w8,w9
549	add	v16.4s,v16.4s,v21.4s
550	eor	w21,w21,w5
551	add	v20.4s,v20.4s,v25.4s
552	eor	w17,w17,w6
553	add	v24.4s,v24.4s,v29.4s
554	eor	w19,w19,w7
555	add	v28.4s,v28.4s,v17.4s
556	eor	w20,w20,w8
557	eor	v4.16b,v31.16b,v16.16b
558	ror	w21,w21,#24
559	eor	v5.16b,v19.16b,v20.16b
560	ror	w17,w17,#24
561	eor	v6.16b,v23.16b,v24.16b
562	ror	w19,w19,#24
563	eor	v7.16b,v27.16b,v28.16b
564	ror	w20,w20,#24
565	tbl	v31.16b,{v4.16b},v9.16b
566	add	w15,w15,w21
567	tbl	v19.16b,{v5.16b},v9.16b
568	add	w16,w16,w17
569	tbl	v23.16b,{v6.16b},v9.16b
570	add	w13,w13,w19
571	tbl	v27.16b,{v7.16b},v9.16b
572	add	w14,w14,w20
573	add	v26.4s,v26.4s,v31.4s
574	eor	w10,w10,w15
575	add	v30.4s,v30.4s,v19.4s
576	eor	w11,w11,w16
577	add	v18.4s,v18.4s,v23.4s
578	eor	w12,w12,w13
579	add	v22.4s,v22.4s,v27.4s
580	eor	w9,w9,w14
581	eor	v4.16b,v21.16b,v26.16b
582	ror	w10,w10,#25
583	eor	v5.16b,v25.16b,v30.16b
584	ror	w11,w11,#25
585	eor	v6.16b,v29.16b,v18.16b
586	ror	w12,w12,#25
587	eor	v7.16b,v17.16b,v22.16b
588	ror	w9,w9,#25
589	ushr	v21.4s,v4.4s,#25
590	ushr	v25.4s,v5.4s,#25
591	ushr	v29.4s,v6.4s,#25
592	ushr	v17.4s,v7.4s,#25
593	sli	v21.4s,v4.4s,#7
594	sli	v25.4s,v5.4s,#7
595	sli	v29.4s,v6.4s,#7
596	sli	v17.4s,v7.4s,#7
597	cbnz	x4,.Loop_neon
598
// Add the per-lane counter offsets to word 12 while it is still laid out one
// block per lane; adding row v3 after the transpose then completes the
// "plus initial state" step for the counter word.
599	add	v19.4s,v19.4s,v8.4s
600
// 4x4 transposes: convert "one word of four blocks" per register into "four
// consecutive words of one block" per register. Afterwards v16-v19 hold the
// block from lane 0, v20-v23 lane 1, v24-v27 lane 2, v28-v31 lane 3.
601	zip1	v4.4s,v16.4s,v20.4s			// transpose data
602	zip1	v5.4s,v24.4s,v28.4s
603	zip2	v6.4s,v16.4s,v20.4s
604	zip2	v7.4s,v24.4s,v28.4s
605	zip1	v16.2d,v4.2d,v5.2d
606	zip2	v20.2d,v4.2d,v5.2d
607	zip1	v24.2d,v6.2d,v7.2d
608	zip2	v28.2d,v6.2d,v7.2d
609
610	zip1	v4.4s,v17.4s,v21.4s
611	zip1	v5.4s,v25.4s,v29.4s
612	zip2	v6.4s,v17.4s,v21.4s
613	zip2	v7.4s,v25.4s,v29.4s
614	zip1	v17.2d,v4.2d,v5.2d
615	zip2	v21.2d,v4.2d,v5.2d
616	zip1	v25.2d,v6.2d,v7.2d
617	zip2	v29.2d,v6.2d,v7.2d
618
// The remaining two transposes are interleaved with the scalar block's
// "add the original state back in" step.
619	zip1	v4.4s,v18.4s,v22.4s
620	add	w5,w5,w22		// accumulate key block
621	zip1	v5.4s,v26.4s,v30.4s
622	add	x6,x6,x22,lsr#32
623	zip2	v6.4s,v18.4s,v22.4s
624	add	w7,w7,w23
625	zip2	v7.4s,v26.4s,v30.4s
626	add	x8,x8,x23,lsr#32
627	zip1	v18.2d,v4.2d,v5.2d
628	add	w9,w9,w24
629	zip2	v22.2d,v4.2d,v5.2d
630	add	x10,x10,x24,lsr#32
631	zip1	v26.2d,v6.2d,v7.2d
632	add	w11,w11,w25
633	zip2	v30.2d,v6.2d,v7.2d
634	add	x12,x12,x25,lsr#32
635
636	zip1	v4.4s,v19.4s,v23.4s
637	add	w13,w13,w26
638	zip1	v5.4s,v27.4s,v31.4s
639	add	x14,x14,x26,lsr#32
640	zip2	v6.4s,v19.4s,v23.4s
641	add	w15,w15,w27
642	zip2	v7.4s,v27.4s,v31.4s
643	add	x16,x16,x27,lsr#32
644	zip1	v19.2d,v4.2d,v5.2d
645	add	w17,w17,w28
646	zip2	v23.2d,v4.2d,v5.2d
647	add	x19,x19,x28,lsr#32
648	zip1	v27.2d,v6.2d,v7.2d
649	add	w20,w20,w30
650	zip2	v31.2d,v6.2d,v7.2d
651	add	x21,x21,x30,lsr#32
652
// Flags are still those of "subs x2,x2,#320": lower means fewer than 320
// bytes were left for this batch of five blocks.
653	b.lo	.Ltail_neon
654
// Full 320 bytes. The scalar block goes out first: pack it and load its 64
// input bytes with ldp, while the vector blocks get rows v0-v3 added in.
655	add	x5,x5,x6,lsl#32	// pack
656	add	x7,x7,x8,lsl#32
657	ldp	x6,x8,[x1,#0]		// load input
658	add	v16.4s,v16.4s,v0.4s			// accumulate key block
659	add	x9,x9,x10,lsl#32
660	add	x11,x11,x12,lsl#32
661	ldp	x10,x12,[x1,#16]
662	add	v17.4s,v17.4s,v1.4s
663	add	x13,x13,x14,lsl#32
664	add	x15,x15,x16,lsl#32
665	ldp	x14,x16,[x1,#32]
666	add	v18.4s,v18.4s,v2.4s
667	add	x17,x17,x19,lsl#32
668	add	x20,x20,x21,lsl#32
669	ldp	x19,x21,[x1,#48]
670	add	v19.4s,v19.4s,v3.4s
671	add	x1,x1,#64
672#ifdef	__AARCH64EB__
673	rev	x5,x5
674	rev	x7,x7
675	rev	x9,x9
676	rev	x11,x11
677	rev	x13,x13
678	rev	x15,x15
679	rev	x17,x17
680	rev	x20,x20
681#endif
// Input for the first vector block goes into v4-v7 (free after the
// transposes).
682	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
683	eor	x5,x5,x6
684	add	v20.4s,v20.4s,v0.4s
685	eor	x7,x7,x8
686	add	v21.4s,v21.4s,v1.4s
687	eor	x9,x9,x10
688	add	v22.4s,v22.4s,v2.4s
689	eor	x11,x11,x12
690	add	v23.4s,v23.4s,v3.4s
691	eor	x13,x13,x14
692	eor	v16.16b,v16.16b,v4.16b
// v4 has been consumed by the eor above, so it can hold the constant 5 that
// advances the lane offsets in v8 for the next batch.
693	movi	v4.4s,#5
694	eor	x15,x15,x16
695	eor	v17.16b,v17.16b,v5.16b
696	eor	x17,x17,x19
697	eor	v18.16b,v18.16b,v6.16b
698	eor	x20,x20,x21
699	eor	v19.16b,v19.16b,v7.16b
700	add	v8.4s,v8.4s,v4.4s			// += 5
701	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
702
// The scalar counter advances by 5 to match the five blocks just produced.
703	stp	x5,x7,[x0,#0]		// store output
704	add	x28,x28,#5			// increment counter
705	stp	x9,x11,[x0,#16]
706	stp	x13,x15,[x0,#32]
707	stp	x17,x20,[x0,#48]
708	add	x0,x0,#64
709
// The vector blocks follow in lane order. Each store frees four registers,
// which are immediately reused as the input buffer for a later block.
710	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
711	add	v24.4s,v24.4s,v0.4s
712	add	v25.4s,v25.4s,v1.4s
713	add	v26.4s,v26.4s,v2.4s
714	add	v27.4s,v27.4s,v3.4s
715	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
716
717	eor	v20.16b,v20.16b,v4.16b
718	eor	v21.16b,v21.16b,v5.16b
719	eor	v22.16b,v22.16b,v6.16b
720	eor	v23.16b,v23.16b,v7.16b
721	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
722	add	v28.4s,v28.4s,v0.4s
723	add	v29.4s,v29.4s,v1.4s
724	add	v30.4s,v30.4s,v2.4s
725	add	v31.4s,v31.4s,v3.4s
726	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
727
728	eor	v24.16b,v24.16b,v16.16b
729	eor	v25.16b,v25.16b,v17.16b
730	eor	v26.16b,v26.16b,v18.16b
731	eor	v27.16b,v27.16b,v19.16b
732	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
733
734	eor	v28.16b,v28.16b,v20.16b
735	eor	v29.16b,v29.16b,v21.16b
736	eor	v30.16b,v30.16b,v22.16b
737	eor	v31.16b,v31.16b,v23.16b
738	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
739
// Still the flags from "subs x2,x2,#320": higher means bytes remain.
740	b.hi	.Loop_outer_neon
741
// Length was an exact multiple of 320: restore d8/d9 from the scratch area,
// then run the same epilogue as the scalar path.
742	ldp	d8,d9,[sp]			// meet ABI requirements
743
744	ldp	x19,x20,[x29,#16]
745	add	sp,sp,#64
746	ldp	x21,x22,[x29,#32]
747	ldp	x23,x24,[x29,#48]
748	ldp	x25,x26,[x29,#64]
749	ldp	x27,x28,[x29,#80]
750	ldp	x29,x30,[sp],#96
751	AARCH64_VALIDATE_LINK_REGISTER
752	ret
753
// .Ltail_neon: fewer than 320 bytes were left for the last batch. Adding 320
// back makes x2 the number of bytes still to produce. The five keystream
// blocks are already computed: the scalar block in w5-w21 (accumulated but
// not yet packed) and the four vector blocks in v16-v31 (transposed, rows
// v0-v3 not yet added).
754.align	4
755.Ltail_neon:
756	add	x2,x2,#320
// v8/v9 are not used past this point, and the scratch slot holding d8/d9 is
// about to be reused for staging keystream, so restore them now.
757	ldp	d8,d9,[sp]			// meet ABI requirements
758	cmp	x2,#64
// Fewer than 64 bytes: only part of the scalar block is needed. Reuse the
// byte-wise tail in ChaCha20_ctr32, which expects this register and stack
// layout.
759	b.lo	.Less_than_64
760
// At least 64 bytes: emit the whole scalar block.
761	add	x5,x5,x6,lsl#32	// pack
762	add	x7,x7,x8,lsl#32
763	ldp	x6,x8,[x1,#0]		// load input
764	add	x9,x9,x10,lsl#32
765	add	x11,x11,x12,lsl#32
766	ldp	x10,x12,[x1,#16]
767	add	x13,x13,x14,lsl#32
768	add	x15,x15,x16,lsl#32
769	ldp	x14,x16,[x1,#32]
770	add	x17,x17,x19,lsl#32
771	add	x20,x20,x21,lsl#32
772	ldp	x19,x21,[x1,#48]
773	add	x1,x1,#64
774#ifdef	__AARCH64EB__
775	rev	x5,x5
776	rev	x7,x7
777	rev	x9,x9
778	rev	x11,x11
779	rev	x13,x13
780	rev	x15,x15
781	rev	x17,x17
782	rev	x20,x20
783#endif
784	eor	x5,x5,x6
785	eor	x7,x7,x8
786	eor	x9,x9,x10
787	eor	x11,x11,x12
788	eor	x13,x13,x14
789	eor	x15,x15,x16
790	eor	x17,x17,x19
791	eor	x20,x20,x21
792
// While storing, finish the first vector block (add rows v0-v3) so that
// v16-v19 always hold the next 64 bytes of keystream.
793	stp	x5,x7,[x0,#0]		// store output
794	add	v16.4s,v16.4s,v0.4s			// accumulate key block
795	stp	x9,x11,[x0,#16]
796	add	v17.4s,v17.4s,v1.4s
797	stp	x13,x15,[x0,#32]
798	add	v18.4s,v18.4s,v2.4s
799	stp	x17,x20,[x0,#48]
800	add	v19.4s,v19.4s,v3.4s
801	add	x0,x0,#64
// Flags are still those of "cmp x2,#64" above: equal means exactly 64 bytes
// remained, so the job is finished.
802	b.eq	.Ldone_neon
803	sub	x2,x2,#64
804	cmp	x2,#64
// The same pattern repeats for each vector block: fewer than 64 bytes left
// goes to .Last_neon with the keystream in v16-v19, exactly 64 stores the
// block and finishes, more than 64 stores the block and continues.
805	b.lo	.Last_neon
806
807	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
808	eor	v16.16b,v16.16b,v4.16b
809	eor	v17.16b,v17.16b,v5.16b
810	eor	v18.16b,v18.16b,v6.16b
811	eor	v19.16b,v19.16b,v7.16b
812	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
813	b.eq	.Ldone_neon
814
// Second vector block: accumulate into v16-v19 so .Last_neon finds it there.
815	add	v16.4s,v20.4s,v0.4s
816	add	v17.4s,v21.4s,v1.4s
817	sub	x2,x2,#64
818	add	v18.4s,v22.4s,v2.4s
819	cmp	x2,#64
820	add	v19.4s,v23.4s,v3.4s
821	b.lo	.Last_neon
822
823	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
824	eor	v20.16b,v16.16b,v4.16b
825	eor	v21.16b,v17.16b,v5.16b
826	eor	v22.16b,v18.16b,v6.16b
827	eor	v23.16b,v19.16b,v7.16b
828	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
829	b.eq	.Ldone_neon
830
// Third vector block.
831	add	v16.4s,v24.4s,v0.4s
832	add	v17.4s,v25.4s,v1.4s
833	sub	x2,x2,#64
834	add	v18.4s,v26.4s,v2.4s
835	cmp	x2,#64
836	add	v19.4s,v27.4s,v3.4s
837	b.lo	.Last_neon
838
839	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
840	eor	v24.16b,v16.16b,v4.16b
841	eor	v25.16b,v17.16b,v5.16b
842	eor	v26.16b,v18.16b,v6.16b
843	eor	v27.16b,v19.16b,v7.16b
844	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
845	b.eq	.Ldone_neon
846
// Fourth vector block. Because the total was below 320 bytes, what remains
// is a partial block, so fall through into .Last_neon.
847	add	v16.4s,v28.4s,v0.4s
848	add	v17.4s,v29.4s,v1.4s
849	add	v18.4s,v30.4s,v2.4s
850	add	v19.4s,v31.4s,v3.4s
851	sub	x2,x2,#64
852
// .Last_neon: final partial block of the NEON path. v16-v19 hold 64 bytes of
// keystream and x2 the number of bytes still needed. Stage the keystream in
// the stack scratch area (d8/d9 were already reloaded from it in .Ltail_neon).
853.Last_neon:
854	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
855
// Bias the pointers so the byte loop can count a negative index up to zero:
// x1 = end of input, x4 = scratch + len, x2 = -len. x0 gets an extra -1
// because the loop increments x2 before it stores.
856	sub	x0,x0,#1
857	add	x1,x1,x2
858	add	x0,x0,x2
859	add	x4,sp,x2
860	neg	x2,x2
861
// XOR one byte per iteration: input byte ^ staged keystream byte -> output.
862.Loop_tail_neon:
863	ldrb	w10,[x1,x2]
864	ldrb	w11,[x4,x2]
865	add	x2,x2,#1
866	eor	w10,w10,w11
867	strb	w10,[x0,x2]
868	cbnz	x2,.Loop_tail_neon
869
// Overwrite the staged keystream with zeros before releasing the stack.
870	stp	xzr,xzr,[sp,#0]
871	stp	xzr,xzr,[sp,#16]
872	stp	xzr,xzr,[sp,#32]
873	stp	xzr,xzr,[sp,#48]
874
// Common exit for the tail paths: reload callee-saved registers through the
// frame pointer, release the scratch area and pop the 96-byte frame.
875.Ldone_neon:
876	ldp	x19,x20,[x29,#16]
877	add	sp,sp,#64
878	ldp	x21,x22,[x29,#32]
879	ldp	x23,x24,[x29,#48]
880	ldp	x25,x26,[x29,#64]
881	ldp	x27,x28,[x29,#80]
882	ldp	x29,x30,[sp],#96
883	AARCH64_VALIDATE_LINK_REGISTER
884	ret
885.size	ChaCha20_neon,.-ChaCha20_neon
886.type	ChaCha20_512_neon,%function
887.align	5
888ChaCha20_512_neon:
889	AARCH64_SIGN_LINK_REGISTER
890	stp	x29,x30,[sp,#-96]!
891	add	x29,sp,#0
892
893	adr	x5,.Lsigma
894	stp	x19,x20,[sp,#16]
895	stp	x21,x22,[sp,#32]
896	stp	x23,x24,[sp,#48]
897	stp	x25,x26,[sp,#64]
898	stp	x27,x28,[sp,#80]
899
900.L512_or_more_neon:
901	sub	sp,sp,#128+64
902
903	eor	v7.16b,v7.16b,v7.16b
904	ldp	x22,x23,[x5]		// load sigma
905	ld1	{v0.4s},[x5],#16
906	ldp	x24,x25,[x3]		// load key
907	ldp	x26,x27,[x3,#16]
908	ld1	{v1.4s,v2.4s},[x3]
909	ldp	x28,x30,[x4]		// load counter
910	ld1	{v3.4s},[x4]
911	ld1	{v7.s}[0],[x5]
912	add	x3,x5,#16			// .Lrot24
913#ifdef	__AARCH64EB__
914	rev64	v0.4s,v0.4s
915	ror	x24,x24,#32
916	ror	x25,x25,#32
917	ror	x26,x26,#32
918	ror	x27,x27,#32
919	ror	x28,x28,#32
920	ror	x30,x30,#32
921#endif
922	add	v3.4s,v3.4s,v7.4s		// += 1
923	stp	q0,q1,[sp,#0]		// off-load key block, invariant part
924	add	v3.4s,v3.4s,v7.4s		// not typo
925	str	q2,[sp,#32]
926	add	v4.4s,v3.4s,v7.4s
927	add	v5.4s,v4.4s,v7.4s
928	add	v6.4s,v5.4s,v7.4s
929	shl	v7.4s,v7.4s,#2			// 1 -> 4
930
931	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
932	stp	d10,d11,[sp,#128+16]
933	stp	d12,d13,[sp,#128+32]
934	stp	d14,d15,[sp,#128+48]
935
936	sub	x2,x2,#512			// not typo
937
938.Loop_outer_512_neon:
939	mov	v8.16b,v0.16b
940	mov	v12.16b,v0.16b
941	mov	v16.16b,v0.16b
942	mov	v20.16b,v0.16b
943	mov	v24.16b,v0.16b
944	mov	v28.16b,v0.16b
945	mov	v9.16b,v1.16b
946	mov	w5,w22			// unpack key block
947	mov	v13.16b,v1.16b
948	lsr	x6,x22,#32
949	mov	v17.16b,v1.16b
950	mov	w7,w23
951	mov	v21.16b,v1.16b
952	lsr	x8,x23,#32
953	mov	v25.16b,v1.16b
954	mov	w9,w24
955	mov	v29.16b,v1.16b
956	lsr	x10,x24,#32
957	mov	v11.16b,v3.16b
958	mov	w11,w25
959	mov	v15.16b,v4.16b
960	lsr	x12,x25,#32
961	mov	v19.16b,v5.16b
962	mov	w13,w26
963	mov	v23.16b,v6.16b
964	lsr	x14,x26,#32
965	mov	v10.16b,v2.16b
966	mov	w15,w27
967	mov	v14.16b,v2.16b
968	lsr	x16,x27,#32
969	add	v27.4s,v11.4s,v7.4s			// +4
970	mov	w17,w28
971	add	v31.4s,v15.4s,v7.4s			// +4
972	lsr	x19,x28,#32
973	mov	v18.16b,v2.16b
974	mov	w20,w30
975	mov	v22.16b,v2.16b
976	lsr	x21,x30,#32
977	mov	v26.16b,v2.16b
978	stp	q3,q4,[sp,#48]		// off-load key block, variable part
979	mov	v30.16b,v2.16b
980	stp	q5,q6,[sp,#80]
981
982	mov	x4,#5
983	ld1	{v6.4s},[x3]
984	subs	x2,x2,#512
985.Loop_upper_neon:
986	sub	x4,x4,#1
987	add	v8.4s,v8.4s,v9.4s
988	add	w5,w5,w9
989	add	v12.4s,v12.4s,v13.4s
990	add	w6,w6,w10
991	add	v16.4s,v16.4s,v17.4s
992	add	w7,w7,w11
993	add	v20.4s,v20.4s,v21.4s
994	add	w8,w8,w12
995	add	v24.4s,v24.4s,v25.4s
996	eor	w17,w17,w5
997	add	v28.4s,v28.4s,v29.4s
998	eor	w19,w19,w6
999	eor	v11.16b,v11.16b,v8.16b
1000	eor	w20,w20,w7
1001	eor	v15.16b,v15.16b,v12.16b
1002	eor	w21,w21,w8
1003	eor	v19.16b,v19.16b,v16.16b
1004	ror	w17,w17,#16
1005	eor	v23.16b,v23.16b,v20.16b
1006	ror	w19,w19,#16
1007	eor	v27.16b,v27.16b,v24.16b
1008	ror	w20,w20,#16
1009	eor	v31.16b,v31.16b,v28.16b
1010	ror	w21,w21,#16
1011	rev32	v11.8h,v11.8h
1012	add	w13,w13,w17
1013	rev32	v15.8h,v15.8h
1014	add	w14,w14,w19
1015	rev32	v19.8h,v19.8h
1016	add	w15,w15,w20
1017	rev32	v23.8h,v23.8h
1018	add	w16,w16,w21
1019	rev32	v27.8h,v27.8h
1020	eor	w9,w9,w13
1021	rev32	v31.8h,v31.8h
1022	eor	w10,w10,w14
1023	add	v10.4s,v10.4s,v11.4s
1024	eor	w11,w11,w15
1025	add	v14.4s,v14.4s,v15.4s
1026	eor	w12,w12,w16
1027	add	v18.4s,v18.4s,v19.4s
1028	ror	w9,w9,#20
1029	add	v22.4s,v22.4s,v23.4s
1030	ror	w10,w10,#20
1031	add	v26.4s,v26.4s,v27.4s
1032	ror	w11,w11,#20
1033	add	v30.4s,v30.4s,v31.4s
1034	ror	w12,w12,#20
1035	eor	v0.16b,v9.16b,v10.16b
1036	add	w5,w5,w9
1037	eor	v1.16b,v13.16b,v14.16b
1038	add	w6,w6,w10
1039	eor	v2.16b,v17.16b,v18.16b
1040	add	w7,w7,w11
1041	eor	v3.16b,v21.16b,v22.16b
1042	add	w8,w8,w12
1043	eor	v4.16b,v25.16b,v26.16b
1044	eor	w17,w17,w5
1045	eor	v5.16b,v29.16b,v30.16b
1046	eor	w19,w19,w6
1047	ushr	v9.4s,v0.4s,#20
1048	eor	w20,w20,w7
1049	ushr	v13.4s,v1.4s,#20
1050	eor	w21,w21,w8
1051	ushr	v17.4s,v2.4s,#20
1052	ror	w17,w17,#24
1053	ushr	v21.4s,v3.4s,#20
1054	ror	w19,w19,#24
1055	ushr	v25.4s,v4.4s,#20
1056	ror	w20,w20,#24
1057	ushr	v29.4s,v5.4s,#20
1058	ror	w21,w21,#24
1059	sli	v9.4s,v0.4s,#12
1060	add	w13,w13,w17
1061	sli	v13.4s,v1.4s,#12
1062	add	w14,w14,w19
1063	sli	v17.4s,v2.4s,#12
1064	add	w15,w15,w20
1065	sli	v21.4s,v3.4s,#12
1066	add	w16,w16,w21
1067	sli	v25.4s,v4.4s,#12
1068	eor	w9,w9,w13
1069	sli	v29.4s,v5.4s,#12
1070	eor	w10,w10,w14
1071	add	v8.4s,v8.4s,v9.4s
1072	eor	w11,w11,w15
1073	add	v12.4s,v12.4s,v13.4s
1074	eor	w12,w12,w16
1075	add	v16.4s,v16.4s,v17.4s
1076	ror	w9,w9,#25
1077	add	v20.4s,v20.4s,v21.4s
1078	ror	w10,w10,#25
1079	add	v24.4s,v24.4s,v25.4s
1080	ror	w11,w11,#25
1081	add	v28.4s,v28.4s,v29.4s
1082	ror	w12,w12,#25
1083	eor	v11.16b,v11.16b,v8.16b
1084	add	w5,w5,w10
1085	eor	v15.16b,v15.16b,v12.16b
1086	add	w6,w6,w11
1087	eor	v19.16b,v19.16b,v16.16b
1088	add	w7,w7,w12
1089	eor	v23.16b,v23.16b,v20.16b
1090	add	w8,w8,w9
1091	eor	v27.16b,v27.16b,v24.16b
1092	eor	w21,w21,w5
1093	eor	v31.16b,v31.16b,v28.16b
1094	eor	w17,w17,w6
1095	tbl	v11.16b,{v11.16b},v6.16b
1096	eor	w19,w19,w7
1097	tbl	v15.16b,{v15.16b},v6.16b
1098	eor	w20,w20,w8
1099	tbl	v19.16b,{v19.16b},v6.16b
1100	ror	w21,w21,#16
1101	tbl	v23.16b,{v23.16b},v6.16b
1102	ror	w17,w17,#16
1103	tbl	v27.16b,{v27.16b},v6.16b
1104	ror	w19,w19,#16
1105	tbl	v31.16b,{v31.16b},v6.16b
1106	ror	w20,w20,#16
1107	add	v10.4s,v10.4s,v11.4s
1108	add	w15,w15,w21
1109	add	v14.4s,v14.4s,v15.4s
1110	add	w16,w16,w17
1111	add	v18.4s,v18.4s,v19.4s
1112	add	w13,w13,w19
1113	add	v22.4s,v22.4s,v23.4s
1114	add	w14,w14,w20
1115	add	v26.4s,v26.4s,v27.4s
1116	eor	w10,w10,w15
1117	add	v30.4s,v30.4s,v31.4s
1118	eor	w11,w11,w16
1119	eor	v0.16b,v9.16b,v10.16b
1120	eor	w12,w12,w13
1121	eor	v1.16b,v13.16b,v14.16b
1122	eor	w9,w9,w14
1123	eor	v2.16b,v17.16b,v18.16b
1124	ror	w10,w10,#20
1125	eor	v3.16b,v21.16b,v22.16b
1126	ror	w11,w11,#20
1127	eor	v4.16b,v25.16b,v26.16b
1128	ror	w12,w12,#20
1129	eor	v5.16b,v29.16b,v30.16b
1130	ror	w9,w9,#20
1131	ushr	v9.4s,v0.4s,#25
1132	add	w5,w5,w10
1133	ushr	v13.4s,v1.4s,#25
1134	add	w6,w6,w11
1135	ushr	v17.4s,v2.4s,#25
1136	add	w7,w7,w12
1137	ushr	v21.4s,v3.4s,#25
1138	add	w8,w8,w9
1139	ushr	v25.4s,v4.4s,#25
1140	eor	w21,w21,w5
1141	ushr	v29.4s,v5.4s,#25
1142	eor	w17,w17,w6
1143	sli	v9.4s,v0.4s,#7
1144	eor	w19,w19,w7
1145	sli	v13.4s,v1.4s,#7
1146	eor	w20,w20,w8
1147	sli	v17.4s,v2.4s,#7
1148	ror	w21,w21,#24
1149	sli	v21.4s,v3.4s,#7
1150	ror	w17,w17,#24
1151	sli	v25.4s,v4.4s,#7
1152	ror	w19,w19,#24
1153	sli	v29.4s,v5.4s,#7
1154	ror	w20,w20,#24
1155	ext	v10.16b,v10.16b,v10.16b,#8
1156	add	w15,w15,w21
1157	ext	v14.16b,v14.16b,v14.16b,#8
1158	add	w16,w16,w17
1159	ext	v18.16b,v18.16b,v18.16b,#8
1160	add	w13,w13,w19
1161	ext	v22.16b,v22.16b,v22.16b,#8
1162	add	w14,w14,w20
1163	ext	v26.16b,v26.16b,v26.16b,#8
1164	eor	w10,w10,w15
1165	ext	v30.16b,v30.16b,v30.16b,#8
1166	eor	w11,w11,w16
1167	ext	v11.16b,v11.16b,v11.16b,#12
1168	eor	w12,w12,w13
1169	ext	v15.16b,v15.16b,v15.16b,#12
1170	eor	w9,w9,w14
1171	ext	v19.16b,v19.16b,v19.16b,#12
1172	ror	w10,w10,#25
1173	ext	v23.16b,v23.16b,v23.16b,#12
1174	ror	w11,w11,#25
1175	ext	v27.16b,v27.16b,v27.16b,#12
1176	ror	w12,w12,#25
1177	ext	v31.16b,v31.16b,v31.16b,#12
1178	ror	w9,w9,#25
1179	ext	v9.16b,v9.16b,v9.16b,#4
1180	ext	v13.16b,v13.16b,v13.16b,#4
1181	ext	v17.16b,v17.16b,v17.16b,#4
1182	ext	v21.16b,v21.16b,v21.16b,#4
1183	ext	v25.16b,v25.16b,v25.16b,#4
1184	ext	v29.16b,v29.16b,v29.16b,#4
1185	add	v8.4s,v8.4s,v9.4s
1186	add	w5,w5,w9
1187	add	v12.4s,v12.4s,v13.4s
1188	add	w6,w6,w10
1189	add	v16.4s,v16.4s,v17.4s
1190	add	w7,w7,w11
1191	add	v20.4s,v20.4s,v21.4s
1192	add	w8,w8,w12
1193	add	v24.4s,v24.4s,v25.4s
1194	eor	w17,w17,w5
1195	add	v28.4s,v28.4s,v29.4s
1196	eor	w19,w19,w6
1197	eor	v11.16b,v11.16b,v8.16b
1198	eor	w20,w20,w7
1199	eor	v15.16b,v15.16b,v12.16b
1200	eor	w21,w21,w8
1201	eor	v19.16b,v19.16b,v16.16b
1202	ror	w17,w17,#16
1203	eor	v23.16b,v23.16b,v20.16b
1204	ror	w19,w19,#16
1205	eor	v27.16b,v27.16b,v24.16b
1206	ror	w20,w20,#16
1207	eor	v31.16b,v31.16b,v28.16b
1208	ror	w21,w21,#16
1209	rev32	v11.8h,v11.8h
1210	add	w13,w13,w17
1211	rev32	v15.8h,v15.8h
1212	add	w14,w14,w19
1213	rev32	v19.8h,v19.8h
1214	add	w15,w15,w20
1215	rev32	v23.8h,v23.8h
1216	add	w16,w16,w21
1217	rev32	v27.8h,v27.8h
1218	eor	w9,w9,w13
1219	rev32	v31.8h,v31.8h
1220	eor	w10,w10,w14
1221	add	v10.4s,v10.4s,v11.4s
1222	eor	w11,w11,w15
1223	add	v14.4s,v14.4s,v15.4s
1224	eor	w12,w12,w16
1225	add	v18.4s,v18.4s,v19.4s
1226	ror	w9,w9,#20
1227	add	v22.4s,v22.4s,v23.4s
1228	ror	w10,w10,#20
1229	add	v26.4s,v26.4s,v27.4s
1230	ror	w11,w11,#20
1231	add	v30.4s,v30.4s,v31.4s
1232	ror	w12,w12,#20
1233	eor	v0.16b,v9.16b,v10.16b
1234	add	w5,w5,w9
1235	eor	v1.16b,v13.16b,v14.16b
1236	add	w6,w6,w10
1237	eor	v2.16b,v17.16b,v18.16b
1238	add	w7,w7,w11
1239	eor	v3.16b,v21.16b,v22.16b
1240	add	w8,w8,w12
1241	eor	v4.16b,v25.16b,v26.16b
1242	eor	w17,w17,w5
1243	eor	v5.16b,v29.16b,v30.16b
1244	eor	w19,w19,w6
1245	ushr	v9.4s,v0.4s,#20
1246	eor	w20,w20,w7
1247	ushr	v13.4s,v1.4s,#20
1248	eor	w21,w21,w8
1249	ushr	v17.4s,v2.4s,#20
1250	ror	w17,w17,#24
1251	ushr	v21.4s,v3.4s,#20
1252	ror	w19,w19,#24
1253	ushr	v25.4s,v4.4s,#20
1254	ror	w20,w20,#24
1255	ushr	v29.4s,v5.4s,#20
1256	ror	w21,w21,#24
1257	sli	v9.4s,v0.4s,#12
1258	add	w13,w13,w17
1259	sli	v13.4s,v1.4s,#12
1260	add	w14,w14,w19
1261	sli	v17.4s,v2.4s,#12
1262	add	w15,w15,w20
1263	sli	v21.4s,v3.4s,#12
1264	add	w16,w16,w21
1265	sli	v25.4s,v4.4s,#12
1266	eor	w9,w9,w13
1267	sli	v29.4s,v5.4s,#12
1268	eor	w10,w10,w14
1269	add	v8.4s,v8.4s,v9.4s
1270	eor	w11,w11,w15
1271	add	v12.4s,v12.4s,v13.4s
1272	eor	w12,w12,w16
1273	add	v16.4s,v16.4s,v17.4s
1274	ror	w9,w9,#25
1275	add	v20.4s,v20.4s,v21.4s
1276	ror	w10,w10,#25
1277	add	v24.4s,v24.4s,v25.4s
1278	ror	w11,w11,#25
1279	add	v28.4s,v28.4s,v29.4s
1280	ror	w12,w12,#25
1281	eor	v11.16b,v11.16b,v8.16b
1282	add	w5,w5,w10
1283	eor	v15.16b,v15.16b,v12.16b
1284	add	w6,w6,w11
1285	eor	v19.16b,v19.16b,v16.16b
1286	add	w7,w7,w12
1287	eor	v23.16b,v23.16b,v20.16b
1288	add	w8,w8,w9
1289	eor	v27.16b,v27.16b,v24.16b
1290	eor	w21,w21,w5
1291	eor	v31.16b,v31.16b,v28.16b
1292	eor	w17,w17,w6
1293	tbl	v11.16b,{v11.16b},v6.16b
1294	eor	w19,w19,w7
1295	tbl	v15.16b,{v15.16b},v6.16b
1296	eor	w20,w20,w8
1297	tbl	v19.16b,{v19.16b},v6.16b
1298	ror	w21,w21,#16
1299	tbl	v23.16b,{v23.16b},v6.16b
1300	ror	w17,w17,#16
1301	tbl	v27.16b,{v27.16b},v6.16b
1302	ror	w19,w19,#16
1303	tbl	v31.16b,{v31.16b},v6.16b
1304	ror	w20,w20,#16
1305	add	v10.4s,v10.4s,v11.4s
1306	add	w15,w15,w21
1307	add	v14.4s,v14.4s,v15.4s
1308	add	w16,w16,w17
1309	add	v18.4s,v18.4s,v19.4s
1310	add	w13,w13,w19
1311	add	v22.4s,v22.4s,v23.4s
1312	add	w14,w14,w20
1313	add	v26.4s,v26.4s,v27.4s
1314	eor	w10,w10,w15
1315	add	v30.4s,v30.4s,v31.4s
1316	eor	w11,w11,w16
1317	eor	v0.16b,v9.16b,v10.16b
1318	eor	w12,w12,w13
1319	eor	v1.16b,v13.16b,v14.16b
1320	eor	w9,w9,w14
1321	eor	v2.16b,v17.16b,v18.16b
1322	ror	w10,w10,#20
1323	eor	v3.16b,v21.16b,v22.16b
1324	ror	w11,w11,#20
1325	eor	v4.16b,v25.16b,v26.16b
1326	ror	w12,w12,#20
1327	eor	v5.16b,v29.16b,v30.16b
1328	ror	w9,w9,#20
1329	ushr	v9.4s,v0.4s,#25
1330	add	w5,w5,w10
1331	ushr	v13.4s,v1.4s,#25
1332	add	w6,w6,w11
1333	ushr	v17.4s,v2.4s,#25
1334	add	w7,w7,w12
1335	ushr	v21.4s,v3.4s,#25
1336	add	w8,w8,w9
1337	ushr	v25.4s,v4.4s,#25
1338	eor	w21,w21,w5
1339	ushr	v29.4s,v5.4s,#25
1340	eor	w17,w17,w6
1341	sli	v9.4s,v0.4s,#7
1342	eor	w19,w19,w7
1343	sli	v13.4s,v1.4s,#7
1344	eor	w20,w20,w8
1345	sli	v17.4s,v2.4s,#7
1346	ror	w21,w21,#24
1347	sli	v21.4s,v3.4s,#7
1348	ror	w17,w17,#24
1349	sli	v25.4s,v4.4s,#7
1350	ror	w19,w19,#24
1351	sli	v29.4s,v5.4s,#7
1352	ror	w20,w20,#24
1353	ext	v10.16b,v10.16b,v10.16b,#8
1354	add	w15,w15,w21
1355	ext	v14.16b,v14.16b,v14.16b,#8
1356	add	w16,w16,w17
1357	ext	v18.16b,v18.16b,v18.16b,#8
1358	add	w13,w13,w19
1359	ext	v22.16b,v22.16b,v22.16b,#8
1360	add	w14,w14,w20
1361	ext	v26.16b,v26.16b,v26.16b,#8
1362	eor	w10,w10,w15
1363	ext	v30.16b,v30.16b,v30.16b,#8
1364	eor	w11,w11,w16
1365	ext	v11.16b,v11.16b,v11.16b,#4
1366	eor	w12,w12,w13
1367	ext	v15.16b,v15.16b,v15.16b,#4
1368	eor	w9,w9,w14
1369	ext	v19.16b,v19.16b,v19.16b,#4
1370	ror	w10,w10,#25
1371	ext	v23.16b,v23.16b,v23.16b,#4
1372	ror	w11,w11,#25
1373	ext	v27.16b,v27.16b,v27.16b,#4
1374	ror	w12,w12,#25
1375	ext	v31.16b,v31.16b,v31.16b,#4
1376	ror	w9,w9,#25
1377	ext	v9.16b,v9.16b,v9.16b,#12
1378	ext	v13.16b,v13.16b,v13.16b,#12
1379	ext	v17.16b,v17.16b,v17.16b,#12
1380	ext	v21.16b,v21.16b,v21.16b,#12
1381	ext	v25.16b,v25.16b,v25.16b,#12
1382	ext	v29.16b,v29.16b,v29.16b,#12
1383	cbnz	x4,.Loop_upper_neon
1384
// Finalize the scalar keystream block held in w5-w17,w19-w21: add the packed
// key-block words from x22-x28,x30 back in, XOR the result with 64 bytes of
// input at [x1], store 64 bytes at [x0], then unpack the key block again so
// the scalar lanes can start another block in .Loop_lower_neon.
// Each of x22-x28,x30 carries two 32-bit words. The low word is added with a
// w-register add (which also zero-extends the destination); the high word is
// added with a 64-bit add of "xN,lsr#32", and any carry above bit 31 of that
// sum is shifted out by the lsl#32 in the pack step.
1385	add	w5,w5,w22		// accumulate key block
1386	add	x6,x6,x22,lsr#32
1387	add	w7,w7,w23
1388	add	x8,x8,x23,lsr#32
1389	add	w9,w9,w24
1390	add	x10,x10,x24,lsr#32
1391	add	w11,w11,w25
1392	add	x12,x12,x25,lsr#32
1393	add	w13,w13,w26
1394	add	x14,x14,x26,lsr#32
1395	add	w15,w15,w27
1396	add	x16,x16,x27,lsr#32
1397	add	w17,w17,w28
1398	add	x19,x19,x28,lsr#32
1399	add	w20,w20,w30
1400	add	x21,x21,x30,lsr#32
1401
// Pack word pairs into 64-bit registers (even word low, odd word high). The
// odd-numbered temporaries are free again right after being packed, so they
// are reused immediately to receive the 64 input bytes.
1402	add	x5,x5,x6,lsl#32	// pack
1403	add	x7,x7,x8,lsl#32
1404	ldp	x6,x8,[x1,#0]		// load input
1405	add	x9,x9,x10,lsl#32
1406	add	x11,x11,x12,lsl#32
1407	ldp	x10,x12,[x1,#16]
1408	add	x13,x13,x14,lsl#32
1409	add	x15,x15,x16,lsl#32
1410	ldp	x14,x16,[x1,#32]
1411	add	x17,x17,x19,lsl#32
1412	add	x20,x20,x21,lsl#32
1413	ldp	x19,x21,[x1,#48]
1414	add	x1,x1,#64
// Big-endian builds only: byte-reverse each packed 64-bit keystream word
// before it is XORed with the input loaded above.
1415#ifdef	__AARCH64EB__
1416	rev	x5,x5
1417	rev	x7,x7
1418	rev	x9,x9
1419	rev	x11,x11
1420	rev	x13,x13
1421	rev	x15,x15
1422	rev	x17,x17
1423	rev	x20,x20
1424#endif
// output = keystream XOR input
1425	eor	x5,x5,x6
1426	eor	x7,x7,x8
1427	eor	x9,x9,x10
1428	eor	x11,x11,x12
1429	eor	x13,x13,x14
1430	eor	x15,x15,x16
1431	eor	x17,x17,x19
1432	eor	x20,x20,x21
1433
// The four 16-byte stores are interleaved with re-unpacking x22-x28,x30 into
// w5-w17,w19-w21; each working register is reloaded only after the stp that
// consumes its old value. x28 is bumped by 1 before its low half is copied
// into w17, so the next scalar block starts from the incremented value.
1434	stp	x5,x7,[x0,#0]		// store output
1435	add	x28,x28,#1			// increment counter
1436	mov	w5,w22			// unpack key block
1437	lsr	x6,x22,#32
1438	stp	x9,x11,[x0,#16]
1439	mov	w7,w23
1440	lsr	x8,x23,#32
1441	stp	x13,x15,[x0,#32]
1442	mov	w9,w24
1443	lsr	x10,x24,#32
1444	stp	x17,x20,[x0,#48]
1445	add	x0,x0,#64
1446	mov	w11,w25
1447	lsr	x12,x25,#32
1448	mov	w13,w26
1449	lsr	x14,x26,#32
1450	mov	w15,w27
1451	lsr	x16,x27,#32
1452	mov	w17,w28
1453	lsr	x19,x28,#32
1454	mov	w20,w30
1455	lsr	x21,x30,#32
1456
1457	mov	x4,#5			// x4 = iteration count for .Loop_lower_neon
// .Loop_lower_neon: round loop with scalar and NEON work interleaved line by
// line. x4 is the iteration counter: it is decremented at the top and tested
// by the cbnz at the bottom, so the loop runs until x4 reaches zero.
//
// Per iteration:
//  * scalar lanes (w5-w17,w19-w21, one 16-word state): two double rounds,
//    i.e. column quarter-rounds (w5/w9/w13/w17 ...) followed by diagonal
//    quarter-rounds (w5/w10/w15/w21 ...), twice. Rotations are written as
//    ror #16/#20/#24/#25, which equal rotate-left by 16/12/8/7.
//  * NEON lanes (six 4-register groups v8-v11, v12-v15, v16-v19, v20-v23,
//    v24-v27, v28-v31): one double round. v0-v5 are per-group temporaries.
//      rotate-left 16 : rev32 on .8h (swap 16-bit halves of each word)
//      rotate-left 12 : ushr #20 + sli #12
//      rotate-left  8 : tbl with index vector v6 (presumably the .Lrot24
//                       byte permutation, loaded outside this view)
//      rotate-left  7 : ushr #25 + sli #7
//    The ext instructions rotate whole 32-bit lanes within a register so the
//    second pass operates on diagonals, then rotate them back.
1458.Loop_lower_neon:
1459	sub	x4,x4,#1
// NEON pass 1 (columns) starts here; the scalar lanes start their first
// column quarter-rounds.
1460	add	v8.4s,v8.4s,v9.4s
1461	add	w5,w5,w9
1462	add	v12.4s,v12.4s,v13.4s
1463	add	w6,w6,w10
1464	add	v16.4s,v16.4s,v17.4s
1465	add	w7,w7,w11
1466	add	v20.4s,v20.4s,v21.4s
1467	add	w8,w8,w12
1468	add	v24.4s,v24.4s,v25.4s
1469	eor	w17,w17,w5
1470	add	v28.4s,v28.4s,v29.4s
1471	eor	w19,w19,w6
1472	eor	v11.16b,v11.16b,v8.16b
1473	eor	w20,w20,w7
1474	eor	v15.16b,v15.16b,v12.16b
1475	eor	w21,w21,w8
1476	eor	v19.16b,v19.16b,v16.16b
1477	ror	w17,w17,#16
1478	eor	v23.16b,v23.16b,v20.16b
1479	ror	w19,w19,#16
1480	eor	v27.16b,v27.16b,v24.16b
1481	ror	w20,w20,#16
1482	eor	v31.16b,v31.16b,v28.16b
1483	ror	w21,w21,#16
1484	rev32	v11.8h,v11.8h
1485	add	w13,w13,w17
1486	rev32	v15.8h,v15.8h
1487	add	w14,w14,w19
1488	rev32	v19.8h,v19.8h
1489	add	w15,w15,w20
1490	rev32	v23.8h,v23.8h
1491	add	w16,w16,w21
1492	rev32	v27.8h,v27.8h
1493	eor	w9,w9,w13
1494	rev32	v31.8h,v31.8h
1495	eor	w10,w10,w14
1496	add	v10.4s,v10.4s,v11.4s
1497	eor	w11,w11,w15
1498	add	v14.4s,v14.4s,v15.4s
1499	eor	w12,w12,w16
1500	add	v18.4s,v18.4s,v19.4s
1501	ror	w9,w9,#20
1502	add	v22.4s,v22.4s,v23.4s
1503	ror	w10,w10,#20
1504	add	v26.4s,v26.4s,v27.4s
1505	ror	w11,w11,#20
1506	add	v30.4s,v30.4s,v31.4s
1507	ror	w12,w12,#20
1508	eor	v0.16b,v9.16b,v10.16b
1509	add	w5,w5,w9
1510	eor	v1.16b,v13.16b,v14.16b
1511	add	w6,w6,w10
1512	eor	v2.16b,v17.16b,v18.16b
1513	add	w7,w7,w11
1514	eor	v3.16b,v21.16b,v22.16b
1515	add	w8,w8,w12
1516	eor	v4.16b,v25.16b,v26.16b
1517	eor	w17,w17,w5
1518	eor	v5.16b,v29.16b,v30.16b
1519	eor	w19,w19,w6
1520	ushr	v9.4s,v0.4s,#20
1521	eor	w20,w20,w7
1522	ushr	v13.4s,v1.4s,#20
1523	eor	w21,w21,w8
1524	ushr	v17.4s,v2.4s,#20
1525	ror	w17,w17,#24
1526	ushr	v21.4s,v3.4s,#20
1527	ror	w19,w19,#24
1528	ushr	v25.4s,v4.4s,#20
1529	ror	w20,w20,#24
1530	ushr	v29.4s,v5.4s,#20
1531	ror	w21,w21,#24
1532	sli	v9.4s,v0.4s,#12
1533	add	w13,w13,w17
1534	sli	v13.4s,v1.4s,#12
1535	add	w14,w14,w19
1536	sli	v17.4s,v2.4s,#12
1537	add	w15,w15,w20
1538	sli	v21.4s,v3.4s,#12
1539	add	w16,w16,w21
1540	sli	v25.4s,v4.4s,#12
1541	eor	w9,w9,w13
1542	sli	v29.4s,v5.4s,#12
1543	eor	w10,w10,w14
1544	add	v8.4s,v8.4s,v9.4s
1545	eor	w11,w11,w15
1546	add	v12.4s,v12.4s,v13.4s
1547	eor	w12,w12,w16
1548	add	v16.4s,v16.4s,v17.4s
1549	ror	w9,w9,#25
1550	add	v20.4s,v20.4s,v21.4s
1551	ror	w10,w10,#25
1552	add	v24.4s,v24.4s,v25.4s
1553	ror	w11,w11,#25
1554	add	v28.4s,v28.4s,v29.4s
1555	ror	w12,w12,#25
1556	eor	v11.16b,v11.16b,v8.16b
1557	add	w5,w5,w10
1558	eor	v15.16b,v15.16b,v12.16b
1559	add	w6,w6,w11
1560	eor	v19.16b,v19.16b,v16.16b
1561	add	w7,w7,w12
1562	eor	v23.16b,v23.16b,v20.16b
1563	add	w8,w8,w9
1564	eor	v27.16b,v27.16b,v24.16b
1565	eor	w21,w21,w5
1566	eor	v31.16b,v31.16b,v28.16b
1567	eor	w17,w17,w6
1568	tbl	v11.16b,{v11.16b},v6.16b
1569	eor	w19,w19,w7
1570	tbl	v15.16b,{v15.16b},v6.16b
1571	eor	w20,w20,w8
1572	tbl	v19.16b,{v19.16b},v6.16b
1573	ror	w21,w21,#16
1574	tbl	v23.16b,{v23.16b},v6.16b
1575	ror	w17,w17,#16
1576	tbl	v27.16b,{v27.16b},v6.16b
1577	ror	w19,w19,#16
1578	tbl	v31.16b,{v31.16b},v6.16b
1579	ror	w20,w20,#16
1580	add	v10.4s,v10.4s,v11.4s
1581	add	w15,w15,w21
1582	add	v14.4s,v14.4s,v15.4s
1583	add	w16,w16,w17
1584	add	v18.4s,v18.4s,v19.4s
1585	add	w13,w13,w19
1586	add	v22.4s,v22.4s,v23.4s
1587	add	w14,w14,w20
1588	add	v26.4s,v26.4s,v27.4s
1589	eor	w10,w10,w15
1590	add	v30.4s,v30.4s,v31.4s
1591	eor	w11,w11,w16
1592	eor	v0.16b,v9.16b,v10.16b
1593	eor	w12,w12,w13
1594	eor	v1.16b,v13.16b,v14.16b
1595	eor	w9,w9,w14
1596	eor	v2.16b,v17.16b,v18.16b
1597	ror	w10,w10,#20
1598	eor	v3.16b,v21.16b,v22.16b
1599	ror	w11,w11,#20
1600	eor	v4.16b,v25.16b,v26.16b
1601	ror	w12,w12,#20
1602	eor	v5.16b,v29.16b,v30.16b
1603	ror	w9,w9,#20
1604	ushr	v9.4s,v0.4s,#25
1605	add	w5,w5,w10
1606	ushr	v13.4s,v1.4s,#25
1607	add	w6,w6,w11
1608	ushr	v17.4s,v2.4s,#25
1609	add	w7,w7,w12
1610	ushr	v21.4s,v3.4s,#25
1611	add	w8,w8,w9
1612	ushr	v25.4s,v4.4s,#25
1613	eor	w21,w21,w5
1614	ushr	v29.4s,v5.4s,#25
1615	eor	w17,w17,w6
1616	sli	v9.4s,v0.4s,#7
1617	eor	w19,w19,w7
1618	sli	v13.4s,v1.4s,#7
1619	eor	w20,w20,w8
1620	sli	v17.4s,v2.4s,#7
1621	ror	w21,w21,#24
1622	sli	v21.4s,v3.4s,#7
1623	ror	w17,w17,#24
1624	sli	v25.4s,v4.4s,#7
1625	ror	w19,w19,#24
1626	sli	v29.4s,v5.4s,#7
1627	ror	w20,w20,#24
1628	ext	v10.16b,v10.16b,v10.16b,#8
1629	add	w15,w15,w21
1630	ext	v14.16b,v14.16b,v14.16b,#8
1631	add	w16,w16,w17
1632	ext	v18.16b,v18.16b,v18.16b,#8
1633	add	w13,w13,w19
1634	ext	v22.16b,v22.16b,v22.16b,#8
1635	add	w14,w14,w20
1636	ext	v26.16b,v26.16b,v26.16b,#8
1637	eor	w10,w10,w15
1638	ext	v30.16b,v30.16b,v30.16b,#8
1639	eor	w11,w11,w16
1640	ext	v11.16b,v11.16b,v11.16b,#12
1641	eor	w12,w12,w13
1642	ext	v15.16b,v15.16b,v15.16b,#12
1643	eor	w9,w9,w14
1644	ext	v19.16b,v19.16b,v19.16b,#12
1645	ror	w10,w10,#25
1646	ext	v23.16b,v23.16b,v23.16b,#12
1647	ror	w11,w11,#25
1648	ext	v27.16b,v27.16b,v27.16b,#12
1649	ror	w12,w12,#25
1650	ext	v31.16b,v31.16b,v31.16b,#12
1651	ror	w9,w9,#25
1652	ext	v9.16b,v9.16b,v9.16b,#4
1653	ext	v13.16b,v13.16b,v13.16b,#4
1654	ext	v17.16b,v17.16b,v17.16b,#4
1655	ext	v21.16b,v21.16b,v21.16b,#4
1656	ext	v25.16b,v25.16b,v25.16b,#4
1657	ext	v29.16b,v29.16b,v29.16b,#4
// NEON pass 2: same quarter-round sequence on the lane-rotated registers
// (diagonals); the scalar lanes begin their second double round.
1658	add	v8.4s,v8.4s,v9.4s
1659	add	w5,w5,w9
1660	add	v12.4s,v12.4s,v13.4s
1661	add	w6,w6,w10
1662	add	v16.4s,v16.4s,v17.4s
1663	add	w7,w7,w11
1664	add	v20.4s,v20.4s,v21.4s
1665	add	w8,w8,w12
1666	add	v24.4s,v24.4s,v25.4s
1667	eor	w17,w17,w5
1668	add	v28.4s,v28.4s,v29.4s
1669	eor	w19,w19,w6
1670	eor	v11.16b,v11.16b,v8.16b
1671	eor	w20,w20,w7
1672	eor	v15.16b,v15.16b,v12.16b
1673	eor	w21,w21,w8
1674	eor	v19.16b,v19.16b,v16.16b
1675	ror	w17,w17,#16
1676	eor	v23.16b,v23.16b,v20.16b
1677	ror	w19,w19,#16
1678	eor	v27.16b,v27.16b,v24.16b
1679	ror	w20,w20,#16
1680	eor	v31.16b,v31.16b,v28.16b
1681	ror	w21,w21,#16
1682	rev32	v11.8h,v11.8h
1683	add	w13,w13,w17
1684	rev32	v15.8h,v15.8h
1685	add	w14,w14,w19
1686	rev32	v19.8h,v19.8h
1687	add	w15,w15,w20
1688	rev32	v23.8h,v23.8h
1689	add	w16,w16,w21
1690	rev32	v27.8h,v27.8h
1691	eor	w9,w9,w13
1692	rev32	v31.8h,v31.8h
1693	eor	w10,w10,w14
1694	add	v10.4s,v10.4s,v11.4s
1695	eor	w11,w11,w15
1696	add	v14.4s,v14.4s,v15.4s
1697	eor	w12,w12,w16
1698	add	v18.4s,v18.4s,v19.4s
1699	ror	w9,w9,#20
1700	add	v22.4s,v22.4s,v23.4s
1701	ror	w10,w10,#20
1702	add	v26.4s,v26.4s,v27.4s
1703	ror	w11,w11,#20
1704	add	v30.4s,v30.4s,v31.4s
1705	ror	w12,w12,#20
1706	eor	v0.16b,v9.16b,v10.16b
1707	add	w5,w5,w9
1708	eor	v1.16b,v13.16b,v14.16b
1709	add	w6,w6,w10
1710	eor	v2.16b,v17.16b,v18.16b
1711	add	w7,w7,w11
1712	eor	v3.16b,v21.16b,v22.16b
1713	add	w8,w8,w12
1714	eor	v4.16b,v25.16b,v26.16b
1715	eor	w17,w17,w5
1716	eor	v5.16b,v29.16b,v30.16b
1717	eor	w19,w19,w6
1718	ushr	v9.4s,v0.4s,#20
1719	eor	w20,w20,w7
1720	ushr	v13.4s,v1.4s,#20
1721	eor	w21,w21,w8
1722	ushr	v17.4s,v2.4s,#20
1723	ror	w17,w17,#24
1724	ushr	v21.4s,v3.4s,#20
1725	ror	w19,w19,#24
1726	ushr	v25.4s,v4.4s,#20
1727	ror	w20,w20,#24
1728	ushr	v29.4s,v5.4s,#20
1729	ror	w21,w21,#24
1730	sli	v9.4s,v0.4s,#12
1731	add	w13,w13,w17
1732	sli	v13.4s,v1.4s,#12
1733	add	w14,w14,w19
1734	sli	v17.4s,v2.4s,#12
1735	add	w15,w15,w20
1736	sli	v21.4s,v3.4s,#12
1737	add	w16,w16,w21
1738	sli	v25.4s,v4.4s,#12
1739	eor	w9,w9,w13
1740	sli	v29.4s,v5.4s,#12
1741	eor	w10,w10,w14
1742	add	v8.4s,v8.4s,v9.4s
1743	eor	w11,w11,w15
1744	add	v12.4s,v12.4s,v13.4s
1745	eor	w12,w12,w16
1746	add	v16.4s,v16.4s,v17.4s
1747	ror	w9,w9,#25
1748	add	v20.4s,v20.4s,v21.4s
1749	ror	w10,w10,#25
1750	add	v24.4s,v24.4s,v25.4s
1751	ror	w11,w11,#25
1752	add	v28.4s,v28.4s,v29.4s
1753	ror	w12,w12,#25
1754	eor	v11.16b,v11.16b,v8.16b
1755	add	w5,w5,w10
1756	eor	v15.16b,v15.16b,v12.16b
1757	add	w6,w6,w11
1758	eor	v19.16b,v19.16b,v16.16b
1759	add	w7,w7,w12
1760	eor	v23.16b,v23.16b,v20.16b
1761	add	w8,w8,w9
1762	eor	v27.16b,v27.16b,v24.16b
1763	eor	w21,w21,w5
1764	eor	v31.16b,v31.16b,v28.16b
1765	eor	w17,w17,w6
1766	tbl	v11.16b,{v11.16b},v6.16b
1767	eor	w19,w19,w7
1768	tbl	v15.16b,{v15.16b},v6.16b
1769	eor	w20,w20,w8
1770	tbl	v19.16b,{v19.16b},v6.16b
1771	ror	w21,w21,#16
1772	tbl	v23.16b,{v23.16b},v6.16b
1773	ror	w17,w17,#16
1774	tbl	v27.16b,{v27.16b},v6.16b
1775	ror	w19,w19,#16
1776	tbl	v31.16b,{v31.16b},v6.16b
1777	ror	w20,w20,#16
1778	add	v10.4s,v10.4s,v11.4s
1779	add	w15,w15,w21
1780	add	v14.4s,v14.4s,v15.4s
1781	add	w16,w16,w17
1782	add	v18.4s,v18.4s,v19.4s
1783	add	w13,w13,w19
1784	add	v22.4s,v22.4s,v23.4s
1785	add	w14,w14,w20
1786	add	v26.4s,v26.4s,v27.4s
1787	eor	w10,w10,w15
1788	add	v30.4s,v30.4s,v31.4s
1789	eor	w11,w11,w16
1790	eor	v0.16b,v9.16b,v10.16b
1791	eor	w12,w12,w13
1792	eor	v1.16b,v13.16b,v14.16b
1793	eor	w9,w9,w14
1794	eor	v2.16b,v17.16b,v18.16b
1795	ror	w10,w10,#20
1796	eor	v3.16b,v21.16b,v22.16b
1797	ror	w11,w11,#20
1798	eor	v4.16b,v25.16b,v26.16b
1799	ror	w12,w12,#20
1800	eor	v5.16b,v29.16b,v30.16b
1801	ror	w9,w9,#20
1802	ushr	v9.4s,v0.4s,#25
1803	add	w5,w5,w10
1804	ushr	v13.4s,v1.4s,#25
1805	add	w6,w6,w11
1806	ushr	v17.4s,v2.4s,#25
1807	add	w7,w7,w12
1808	ushr	v21.4s,v3.4s,#25
1809	add	w8,w8,w9
1810	ushr	v25.4s,v4.4s,#25
1811	eor	w21,w21,w5
1812	ushr	v29.4s,v5.4s,#25
1813	eor	w17,w17,w6
1814	sli	v9.4s,v0.4s,#7
1815	eor	w19,w19,w7
1816	sli	v13.4s,v1.4s,#7
1817	eor	w20,w20,w8
1818	sli	v17.4s,v2.4s,#7
1819	ror	w21,w21,#24
1820	sli	v21.4s,v3.4s,#7
1821	ror	w17,w17,#24
1822	sli	v25.4s,v4.4s,#7
1823	ror	w19,w19,#24
1824	sli	v29.4s,v5.4s,#7
1825	ror	w20,w20,#24
1826	ext	v10.16b,v10.16b,v10.16b,#8
1827	add	w15,w15,w21
1828	ext	v14.16b,v14.16b,v14.16b,#8
1829	add	w16,w16,w17
1830	ext	v18.16b,v18.16b,v18.16b,#8
1831	add	w13,w13,w19
1832	ext	v22.16b,v22.16b,v22.16b,#8
1833	add	w14,w14,w20
1834	ext	v26.16b,v26.16b,v26.16b,#8
1835	eor	w10,w10,w15
1836	ext	v30.16b,v30.16b,v30.16b,#8
1837	eor	w11,w11,w16
1838	ext	v11.16b,v11.16b,v11.16b,#4
1839	eor	w12,w12,w13
1840	ext	v15.16b,v15.16b,v15.16b,#4
1841	eor	w9,w9,w14
1842	ext	v19.16b,v19.16b,v19.16b,#4
1843	ror	w10,w10,#25
1844	ext	v23.16b,v23.16b,v23.16b,#4
1845	ror	w11,w11,#25
1846	ext	v27.16b,v27.16b,v27.16b,#4
1847	ror	w12,w12,#25
1848	ext	v31.16b,v31.16b,v31.16b,#4
1849	ror	w9,w9,#25
1850	ext	v9.16b,v9.16b,v9.16b,#12
1851	ext	v13.16b,v13.16b,v13.16b,#12
1852	ext	v17.16b,v17.16b,v17.16b,#12
1853	ext	v21.16b,v21.16b,v21.16b,#12
1854	ext	v25.16b,v25.16b,v25.16b,#12
1855	ext	v29.16b,v29.16b,v29.16b,#12
1856	cbnz	x4,.Loop_lower_neon	// repeat until the iteration counter hits zero
1857
// Finalize the rest of one 512-byte iteration.
// Scalar side: add the packed key block (x22-x28,x30) to the state in
// w5-w17,w19-w21, pack word pairs, XOR with 64 bytes of input and store.
// NEON side: reload q0-q6 from the stack off-load area at [sp,#0..#111] and
// add them to the six 4-register groups v8-v11, v12-v15, v16-v19, v20-v23,
// v24-v27, v28-v31:
//   first register of each group  += v0
//   second register of each group += v1
//   third register of each group  += v2
//   fourth register               += v3 / v4 / v5 / v6 for groups 1-4, and
//                                    v7 + v3 / v7 + v4 for groups 5 and 6
//                                    (the "+4" lines).
// Each group is then XORed with 64 bytes of input and stored. Input is read
// through x1 and output written through x0 in 64-byte steps: 64 scalar bytes
// plus 6*64 vector bytes = 448 bytes in this section.
1858	add	w5,w5,w22		// accumulate key block
1859	ldp	q0,q1,[sp,#0]
1860	add	x6,x6,x22,lsr#32
1861	ldp	q2,q3,[sp,#32]
1862	add	w7,w7,w23
1863	ldp	q4,q5,[sp,#64]
1864	add	x8,x8,x23,lsr#32
1865	ldr	q6,[sp,#96]
1866	add	v8.4s,v8.4s,v0.4s
1867	add	w9,w9,w24
1868	add	v12.4s,v12.4s,v0.4s
1869	add	x10,x10,x24,lsr#32
1870	add	v16.4s,v16.4s,v0.4s
1871	add	w11,w11,w25
1872	add	v20.4s,v20.4s,v0.4s
1873	add	x12,x12,x25,lsr#32
1874	add	v24.4s,v24.4s,v0.4s
1875	add	w13,w13,w26
1876	add	v28.4s,v28.4s,v0.4s
1877	add	x14,x14,x26,lsr#32
1878	add	v10.4s,v10.4s,v2.4s
1879	add	w15,w15,w27
1880	add	v14.4s,v14.4s,v2.4s
1881	add	x16,x16,x27,lsr#32
1882	add	v18.4s,v18.4s,v2.4s
1883	add	w17,w17,w28
1884	add	v22.4s,v22.4s,v2.4s
1885	add	x19,x19,x28,lsr#32
1886	add	v26.4s,v26.4s,v2.4s
1887	add	w20,w20,w30
1888	add	v30.4s,v30.4s,v2.4s
1889	add	x21,x21,x30,lsr#32
1890	add	v27.4s,v27.4s,v7.4s			// +4
1891	add	x5,x5,x6,lsl#32	// pack
1892	add	v31.4s,v31.4s,v7.4s			// +4
1893	add	x7,x7,x8,lsl#32
1894	add	v11.4s,v11.4s,v3.4s
1895	ldp	x6,x8,[x1,#0]		// load input
1896	add	v15.4s,v15.4s,v4.4s
1897	add	x9,x9,x10,lsl#32
1898	add	v19.4s,v19.4s,v5.4s
1899	add	x11,x11,x12,lsl#32
1900	add	v23.4s,v23.4s,v6.4s
1901	ldp	x10,x12,[x1,#16]
1902	add	v27.4s,v27.4s,v3.4s
1903	add	x13,x13,x14,lsl#32
1904	add	v31.4s,v31.4s,v4.4s
1905	add	x15,x15,x16,lsl#32
1906	add	v9.4s,v9.4s,v1.4s
1907	ldp	x14,x16,[x1,#32]
1908	add	v13.4s,v13.4s,v1.4s
1909	add	x17,x17,x19,lsl#32
1910	add	v17.4s,v17.4s,v1.4s
1911	add	x20,x20,x21,lsl#32
1912	add	v21.4s,v21.4s,v1.4s
1913	ldp	x19,x21,[x1,#48]
1914	add	v25.4s,v25.4s,v1.4s
1915	add	x1,x1,#64
1916	add	v29.4s,v29.4s,v1.4s
1917
// Big-endian builds only: byte-reverse the packed scalar keystream words
// before the XOR with input.
1918#ifdef	__AARCH64EB__
1919	rev	x5,x5
1920	rev	x7,x7
1921	rev	x9,x9
1922	rev	x11,x11
1923	rev	x13,x13
1924	rev	x15,x15
1925	rev	x17,x17
1926	rev	x20,x20
1927#endif
// From here v0-v3 serve as input buffers (they are reloaded from the stack
// further down). Each group's registers are likewise reused as the input
// buffer for a later group once that group's own output has been stored.
1928	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1929	eor	x5,x5,x6
1930	eor	x7,x7,x8
1931	eor	x9,x9,x10
1932	eor	x11,x11,x12
1933	eor	x13,x13,x14
1934	eor	v8.16b,v8.16b,v0.16b
1935	eor	x15,x15,x16
1936	eor	v9.16b,v9.16b,v1.16b
1937	eor	x17,x17,x19
1938	eor	v10.16b,v10.16b,v2.16b
1939	eor	x20,x20,x21
1940	eor	v11.16b,v11.16b,v3.16b
1941	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1942
// x28 was already bumped by 1 after the first scalar block; adding 7 here
// makes the total advance 8 per pass through this code, matching the "+= 8"
// applied to v3-v6 at the end of this section.
1943	stp	x5,x7,[x0,#0]		// store output
1944	add	x28,x28,#7			// increment counter
1945	stp	x9,x11,[x0,#16]
1946	stp	x13,x15,[x0,#32]
1947	stp	x17,x20,[x0,#48]
1948	add	x0,x0,#64
1949	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1950
1951	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1952	eor	v12.16b,v12.16b,v0.16b
1953	eor	v13.16b,v13.16b,v1.16b
1954	eor	v14.16b,v14.16b,v2.16b
1955	eor	v15.16b,v15.16b,v3.16b
1956	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1957
// v0-v3 are no longer needed as input buffers: restore them from the stack
// off-load area while the remaining groups are processed.
1958	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1959	eor	v16.16b,v16.16b,v8.16b
1960	ldp	q0,q1,[sp,#0]
1961	eor	v17.16b,v17.16b,v9.16b
1962	ldp	q2,q3,[sp,#32]
1963	eor	v18.16b,v18.16b,v10.16b
1964	eor	v19.16b,v19.16b,v11.16b
1965	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1966
1967	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
1968	eor	v20.16b,v20.16b,v12.16b
1969	eor	v21.16b,v21.16b,v13.16b
1970	eor	v22.16b,v22.16b,v14.16b
1971	eor	v23.16b,v23.16b,v15.16b
1972	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1973
1974	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1975	eor	v24.16b,v24.16b,v16.16b
1976	eor	v25.16b,v25.16b,v17.16b
1977	eor	v26.16b,v26.16b,v18.16b
1978	eor	v27.16b,v27.16b,v19.16b
1979	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
1980
// v8 is free at this point (its last contents were consumed by the v16-v19
// XOR above), so it is used as scratch for 2*v7.
1981	shl	v8.4s,v7.4s,#1			// 4 -> 8
1982	eor	v28.16b,v28.16b,v20.16b
1983	eor	v29.16b,v29.16b,v21.16b
1984	eor	v30.16b,v30.16b,v22.16b
1985	eor	v31.16b,v31.16b,v23.16b
1986	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
1987
1988	add	v3.4s,v3.4s,v8.4s			// += 8
1989	add	v4.4s,v4.4s,v8.4s
1990	add	v5.4s,v5.4s,v8.4s
1991	add	v6.4s,v6.4s,v8.4s
1992
// Loop back for another 512-byte iteration while the carry flag is set. No
// instruction in the visible part of this function modifies the condition
// flags before this branch, so they come from earlier code (presumably a
// subs on x2 at the top of .Loop_outer_512_neon -- TODO confirm).
1993	b.hs	.Loop_outer_512_neon
1994
// Fell out of the 512-byte loop. adds sets Z if x2 becomes exactly zero; the
// loads, stores and vector shift that follow do not touch the flags, so the
// b.eq further down still tests this addition.
1995	adds	x2,x2,#512
1996	ushr	v7.4s,v7.4s,#1			// 4 -> 2
1997
// Restore callee-saved d10-d15 from the save area that sits 128 bytes above
// sp (d8/d9 are restored later, on whichever exit path is taken).
1998	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
1999	ldp	d12,d13,[sp,#128+32]
2000	ldp	d14,d15,[sp,#128+48]
2001
// Overwrite bytes 0..95 of the off-load area. q0 is not zero here: it holds
// whatever was last reloaded from [sp,#0], so the area is overwritten with
// copies of that value (presumably the public sigma constant -- TODO confirm
// against the code that fills the area).
2002	stp	q0,q0,[sp,#0]		// wipe off-load area
2003	stp	q0,q0,[sp,#32]
2004	stp	q0,q0,[sp,#64]
2005
2006	b.eq	.Ldone_512_neon		// nothing left: x2 == 0 after the adds
2007
// Some input remains. Step x3 back by 16 (to .Lone, per the original
// comment), release the 128-byte off-load area, undo part of the earlier
// "+= 8" on v3, and load 32 bytes of constants into v8/v9. With at least 192
// bytes left, continue in .Loop_outer_neon (defined outside this view).
2008	sub	x3,x3,#16			// .Lone
2009	cmp	x2,#192
2010	add	sp,sp,#128
2011	sub	v3.4s,v3.4s,v7.4s		// -= 2
2012	ld1	{v8.4s,v9.4s},[x3]
2013	b.hs	.Loop_outer_neon
2014
// Fewer than 192 bytes left: restore d8/d9 (now at [sp,#0] because sp has
// already been moved up by 128), zero v1-v6, and finish in the scalar
// .Loop_outer. NOTE(review): the remaining frame (64 bytes below a 96-byte
// register frame) appears to match what the scalar path sets up before
// .Loop_outer -- confirm against this function's prologue.
2015	ldp	d8,d9,[sp,#0]			// meet ABI requirements
2016	eor	v1.16b,v1.16b,v1.16b
2017	eor	v2.16b,v2.16b,v2.16b
2018	eor	v3.16b,v3.16b,v3.16b
2019	eor	v4.16b,v4.16b,v4.16b
2020	eor	v5.16b,v5.16b,v5.16b
2021	eor	v6.16b,v6.16b,v6.16b
2022	b	.Loop_outer
2023
// Exit path taken when the input was consumed exactly (x2 == 0 after the
// 512-byte loop). sp still points at the 128-byte off-load area here, so the
// d8/d9 save slot is at [sp,#128+0]. Restores d8/d9 and x19-x28 (the latter
// via x29-relative offsets), releases the 128+64 bytes below the register
// frame, then reloads x29/x30 while popping the 96-byte frame.
// AARCH64_VALIDATE_LINK_REGISTER is a macro defined outside this file
// (presumably in arm_arch.h, pairing with AARCH64_SIGN_LINK_REGISTER at
// function entry -- TODO confirm).
2024.Ldone_512_neon:
2025	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
2026	ldp	x19,x20,[x29,#16]
2027	add	sp,sp,#128+64
2028	ldp	x21,x22,[x29,#32]
2029	ldp	x23,x24,[x29,#48]
2030	ldp	x25,x26,[x29,#64]
2031	ldp	x27,x28,[x29,#80]
2032	ldp	x29,x30,[sp],#96
2033	AARCH64_VALIDATE_LINK_REGISTER
2034	ret
2035.size	ChaCha20_512_neon,.-ChaCha20_512_neon
2036