xref: /freebsd/sys/crypto/openssl/aarch64/chacha-armv8.S (revision f2d48b5e2c3b45850585e4d7aee324fe148afbf2)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
3#include "arm_arch.h"
4
5.text
6
7
8.hidden	OPENSSL_armcap_P
9
// Constant pool.  No section directive follows the .text above, so these
// constants sit alongside the code and are addressed with adr by the
// functions below.
10.align	5
11.Lsigma:
// The four 32-bit words 0x61707865,0x3320646e,0x79622d32,0x6b206574, i.e. the
// ASCII text "expand 32-byte k" read as little-endian words, stored as two
// 64-bit quantities.
12.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
13.Lone:
// Loaded into v31 by the NEON code and added to the counter row; only the
// first 32-bit lane is non-zero.
14.long	1,0,0,0
15.LOPENSSL_armcap_P:
// Distance from this location to OPENSSL_armcap_P (32-bit under __ILP32__,
// 64-bit otherwise).  ChaCha20_ctr32 adds it to the address of this label to
// reach the capability word.
16#ifdef	__ILP32__
17.long	OPENSSL_armcap_P-.
18#else
19.quad	OPENSSL_armcap_P-.
20#endif
// NUL-terminated ASCII banner:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
21.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
22.align	2
23
// ChaCha20_ctr32 -- integer-register ChaCha20 keystream XOR.
//
// Register roles on entry, as used by the code below:
//   x0 = output pointer          x1 = input pointer
//   x2 = byte count              x3 = key pointer (32 bytes are read)
//   x4 = counter block pointer (16 bytes are read)
//
// A zero byte count returns immediately.  When the count is at least 192 and
// the ARMV7_NEON bit is set in OPENSSL_armcap_P, control is handed to
// ChaCha20_neon; otherwise the scalar path at .Lshort handles everything.
//
// Scalar path register allocation:
//   x22-x28,x30     16-word input state, two 32-bit words packed per register
//                   (x22,x23 = sigma, x24-x27 = key, x28,x30 = counter block)
//   w5-w17,w19-w21  the 16 working words of the block being computed
//   x4              round-loop counter (the counter pointer is dead by then)
// Stack frame: 96 bytes holding x29,x30 and x19-x28, plus 64 bytes of scratch
// at sp that the tail code uses to stage one keystream block.
24.globl	ChaCha20_ctr32
25.type	ChaCha20_ctr32,%function
26.align	5
27ChaCha20_ctr32:
28	cbz	x2,.Labort		// zero length: plain ret, no frame is built
29	adr	x5,.LOPENSSL_armcap_P
30	cmp	x2,#192
31	b.lo	.Lshort		// short inputs never take the NEON path
// x6 = stored offset (OPENSSL_armcap_P - .LOPENSSL_armcap_P); adding x5 gives
// the address of the capability word itself.
32#ifdef	__ILP32__
33	ldrsw	x6,[x5]
34#else
35	ldr	x6,[x5]
36#endif
37	ldr	w17,[x6,x5]
38	tst	w17,#ARMV7_NEON
39	b.ne	ChaCha20_neon
40
41.Lshort:
42.inst	0xd503233f			// paciasp
43	stp	x29,x30,[sp,#-96]!
44	add	x29,sp,#0
45
46	adr	x5,.Lsigma
47	stp	x19,x20,[sp,#16]
48	stp	x21,x22,[sp,#32]
49	stp	x23,x24,[sp,#48]
50	stp	x25,x26,[sp,#64]
51	stp	x27,x28,[sp,#80]
52	sub	sp,sp,#64		// 64-byte scratch block used by .Less_than_64
53
54	ldp	x22,x23,[x5]		// load sigma
55	ldp	x24,x25,[x3]		// load key
56	ldp	x26,x27,[x3,#16]
57	ldp	x28,x30,[x4]		// load counter
// On big-endian builds swap the two 32-bit halves of each loaded register so
// that the first word in memory ends up in the low half, as the unpack code
// below expects.  Sigma (x22,x23) is not swapped here.
58#ifdef	__ARMEB__
59	ror	x24,x24,#32
60	ror	x25,x25,#32
61	ror	x26,x26,#32
62	ror	x27,x27,#32
63	ror	x28,x28,#32
64	ror	x30,x30,#32
65#endif
66
// One pass of .Loop_outer produces one 64-byte keystream block.
67.Loop_outer:
68	mov	w5,w22			// unpack key block
69	lsr	x6,x22,#32
70	mov	w7,w23
71	lsr	x8,x23,#32
72	mov	w9,w24
73	lsr	x10,x24,#32
74	mov	w11,w25
75	lsr	x12,x25,#32
76	mov	w13,w26
77	lsr	x14,x26,#32
78	mov	w15,w27
79	lsr	x16,x27,#32
80	mov	w17,w28
81	lsr	x19,x28,#32
82	mov	w20,w30
83	lsr	x21,x30,#32
84
// x4 counts 10 iterations; each iteration is two rounds (four quarter-rounds
// each).  The rotates are written as ror by 16/20/24/25, which on 32-bit
// registers equals rotate-left by 16/12/8/7.
// The subs below sets the flags consumed by "b.lo .Ltail" and
// "b.hi .Loop_outer" further down; no instruction in between modifies them.
85	mov	x4,#10
86	subs	x2,x2,#64
87.Loop:
88	sub	x4,x4,#1
// First round: quarter-rounds on (w5,w9,w13,w17) (w6,w10,w14,w19)
// (w7,w11,w15,w20) (w8,w12,w16,w21), interleaved step by step.
89	add	w5,w5,w9
90	add	w6,w6,w10
91	add	w7,w7,w11
92	add	w8,w8,w12
93	eor	w17,w17,w5
94	eor	w19,w19,w6
95	eor	w20,w20,w7
96	eor	w21,w21,w8
97	ror	w17,w17,#16
98	ror	w19,w19,#16
99	ror	w20,w20,#16
100	ror	w21,w21,#16
101	add	w13,w13,w17
102	add	w14,w14,w19
103	add	w15,w15,w20
104	add	w16,w16,w21
105	eor	w9,w9,w13
106	eor	w10,w10,w14
107	eor	w11,w11,w15
108	eor	w12,w12,w16
109	ror	w9,w9,#20
110	ror	w10,w10,#20
111	ror	w11,w11,#20
112	ror	w12,w12,#20
113	add	w5,w5,w9
114	add	w6,w6,w10
115	add	w7,w7,w11
116	add	w8,w8,w12
117	eor	w17,w17,w5
118	eor	w19,w19,w6
119	eor	w20,w20,w7
120	eor	w21,w21,w8
121	ror	w17,w17,#24
122	ror	w19,w19,#24
123	ror	w20,w20,#24
124	ror	w21,w21,#24
125	add	w13,w13,w17
126	add	w14,w14,w19
127	add	w15,w15,w20
128	add	w16,w16,w21
129	eor	w9,w9,w13
130	eor	w10,w10,w14
131	eor	w11,w11,w15
132	eor	w12,w12,w16
133	ror	w9,w9,#25
134	ror	w10,w10,#25
135	ror	w11,w11,#25
136	ror	w12,w12,#25
// Second round: quarter-rounds on (w5,w10,w15,w21) (w6,w11,w16,w17)
// (w7,w12,w13,w19) (w8,w9,w14,w20).
137	add	w5,w5,w10
138	add	w6,w6,w11
139	add	w7,w7,w12
140	add	w8,w8,w9
141	eor	w21,w21,w5
142	eor	w17,w17,w6
143	eor	w19,w19,w7
144	eor	w20,w20,w8
145	ror	w21,w21,#16
146	ror	w17,w17,#16
147	ror	w19,w19,#16
148	ror	w20,w20,#16
149	add	w15,w15,w21
150	add	w16,w16,w17
151	add	w13,w13,w19
152	add	w14,w14,w20
153	eor	w10,w10,w15
154	eor	w11,w11,w16
155	eor	w12,w12,w13
156	eor	w9,w9,w14
157	ror	w10,w10,#20
158	ror	w11,w11,#20
159	ror	w12,w12,#20
160	ror	w9,w9,#20
161	add	w5,w5,w10
162	add	w6,w6,w11
163	add	w7,w7,w12
164	add	w8,w8,w9
165	eor	w21,w21,w5
166	eor	w17,w17,w6
167	eor	w19,w19,w7
168	eor	w20,w20,w8
169	ror	w21,w21,#24
170	ror	w17,w17,#24
171	ror	w19,w19,#24
172	ror	w20,w20,#24
173	add	w15,w15,w21
174	add	w16,w16,w17
175	add	w13,w13,w19
176	add	w14,w14,w20
177	eor	w10,w10,w15
178	eor	w11,w11,w16
179	eor	w12,w12,w13
180	eor	w9,w9,w14
181	ror	w10,w10,#25
182	ror	w11,w11,#25
183	ror	w12,w12,#25
184	ror	w9,w9,#25
185	cbnz	x4,.Loop
186
// Add the input state back in.  Words held in the high halves are added with
// a 64-bit add of the shifted register; any carry above bit 31 is shifted
// out when the pair is packed with lsl #32 below.
187	add	w5,w5,w22		// accumulate key block
188	add	x6,x6,x22,lsr#32
189	add	w7,w7,w23
190	add	x8,x8,x23,lsr#32
191	add	w9,w9,w24
192	add	x10,x10,x24,lsr#32
193	add	w11,w11,w25
194	add	x12,x12,x25,lsr#32
195	add	w13,w13,w26
196	add	x14,x14,x26,lsr#32
197	add	w15,w15,w27
198	add	x16,x16,x27,lsr#32
199	add	w17,w17,w28
200	add	x19,x19,x28,lsr#32
201	add	w20,w20,w30
202	add	x21,x21,x30,lsr#32
203
204	b.lo	.Ltail		// flags from subs: fewer than 64 bytes were left
205
// Full block: pack word pairs into 64-bit registers while loading 64 input
// bytes into the registers that have just been consumed.
206	add	x5,x5,x6,lsl#32	// pack
207	add	x7,x7,x8,lsl#32
208	ldp	x6,x8,[x1,#0]		// load input
209	add	x9,x9,x10,lsl#32
210	add	x11,x11,x12,lsl#32
211	ldp	x10,x12,[x1,#16]
212	add	x13,x13,x14,lsl#32
213	add	x15,x15,x16,lsl#32
214	ldp	x14,x16,[x1,#32]
215	add	x17,x17,x19,lsl#32
216	add	x20,x20,x21,lsl#32
217	ldp	x19,x21,[x1,#48]
218	add	x1,x1,#64
219#ifdef	__ARMEB__
220	rev	x5,x5
221	rev	x7,x7
222	rev	x9,x9
223	rev	x11,x11
224	rev	x13,x13
225	rev	x15,x15
226	rev	x17,x17
227	rev	x20,x20
228#endif
229	eor	x5,x5,x6
230	eor	x7,x7,x8
231	eor	x9,x9,x10
232	eor	x11,x11,x12
233	eor	x13,x13,x14
234	eor	x15,x15,x16
235	eor	x17,x17,x19
236	eor	x20,x20,x21
237
238	stp	x5,x7,[x0,#0]		// store output
239	add	x28,x28,#1			// increment counter
240	stp	x9,x11,[x0,#16]
241	stp	x13,x15,[x0,#32]
242	stp	x17,x20,[x0,#48]
243	add	x0,x0,#64
244
245	b.hi	.Loop_outer		// flags from subs: more input remains
246
// Epilogue: callee-saved registers are reloaded relative to x29, the 64-byte
// scratch area is released, then the frame is popped.
247	ldp	x19,x20,[x29,#16]
248	add	sp,sp,#64
249	ldp	x21,x22,[x29,#32]
250	ldp	x23,x24,[x29,#48]
251	ldp	x25,x26,[x29,#64]
252	ldp	x27,x28,[x29,#80]
253	ldp	x29,x30,[sp],#96
254.inst	0xd50323bf			// autiasp
255.Labort:
256	ret
257
258.align	4
259.Ltail:
260	add	x2,x2,#64		// undo the subs: x2 = bytes left (1..63)
// .Less_than_64 is also entered from .Ltail_neon in ChaCha20_neon with x2
// already holding the remaining byte count and the same frame layout.
// Pointers are biased so that one negative index x2 can run up to zero:
//   x1 = input end, x4 = scratch + count, x0 = output end - 1
// (the store uses x2 after it has been incremented, hence the -1).
261.Less_than_64:
262	sub	x0,x0,#1
263	add	x1,x1,x2
264	add	x0,x0,x2
265	add	x4,sp,x2
266	neg	x2,x2
267
268	add	x5,x5,x6,lsl#32	// pack
269	add	x7,x7,x8,lsl#32
270	add	x9,x9,x10,lsl#32
271	add	x11,x11,x12,lsl#32
272	add	x13,x13,x14,lsl#32
273	add	x15,x15,x16,lsl#32
274	add	x17,x17,x19,lsl#32
275	add	x20,x20,x21,lsl#32
276#ifdef	__ARMEB__
277	rev	x5,x5
278	rev	x7,x7
279	rev	x9,x9
280	rev	x11,x11
281	rev	x13,x13
282	rev	x15,x15
283	rev	x17,x17
284	rev	x20,x20
285#endif
// Stage the whole keystream block in the stack scratch area.
286	stp	x5,x7,[sp,#0]
287	stp	x9,x11,[sp,#16]
288	stp	x13,x15,[sp,#32]
289	stp	x17,x20,[sp,#48]
290
// Byte-wise XOR of the remaining input with the staged keystream.
291.Loop_tail:
292	ldrb	w10,[x1,x2]
293	ldrb	w11,[x4,x2]
294	add	x2,x2,#1
295	eor	w10,w10,w11
296	strb	w10,[x0,x2]
297	cbnz	x2,.Loop_tail
298
// Overwrite the staged keystream with zeros before releasing the scratch.
299	stp	xzr,xzr,[sp,#0]
300	stp	xzr,xzr,[sp,#16]
301	stp	xzr,xzr,[sp,#32]
302	stp	xzr,xzr,[sp,#48]
303
304	ldp	x19,x20,[x29,#16]
305	add	sp,sp,#64
306	ldp	x21,x22,[x29,#32]
307	ldp	x23,x24,[x29,#48]
308	ldp	x25,x26,[x29,#64]
309	ldp	x27,x28,[x29,#80]
310	ldp	x29,x30,[sp],#96
311.inst	0xd50323bf			// autiasp
312	ret
313.size	ChaCha20_ctr32,.-ChaCha20_ctr32
314
// ChaCha20_neon -- NEON-assisted path, reached by branch from ChaCha20_ctr32
// with that function's register arguments still in place (x0 output,
// x1 input, x2 byte count, x3 key, x4 counter block).
//
// Byte counts of 512 or more are redirected to .L512_or_more_neon after the
// common 96-byte frame has been pushed.  Otherwise each outer iteration
// produces 256 bytes: one block in the integer registers (same allocation as
// ChaCha20_ctr32) interleaved with three blocks in vector registers.
//   v0-v3, v4-v7, v16-v19   the four state rows of NEON blocks 1, 2 and 3
//   v24 = sigma row, v25,v26 = key rows
//   v27,v28,v29             counter rows of the three NEON blocks (+1,+2,+3)
//   v31                     {1,0,0,0} from .Lone, then shifted to {4,0,0,0}
//   v20-v23                 temporaries for rotates / input data
// The integer block uses the loaded counter unchanged and is written first,
// followed by the three NEON blocks in order.
315.type	ChaCha20_neon,%function
316.align	5
317ChaCha20_neon:
318.inst	0xd503233f			// paciasp
319	stp	x29,x30,[sp,#-96]!
320	add	x29,sp,#0
321
322	adr	x5,.Lsigma
323	stp	x19,x20,[sp,#16]
324	stp	x21,x22,[sp,#32]
325	stp	x23,x24,[sp,#48]
326	stp	x25,x26,[sp,#64]
327	stp	x27,x28,[sp,#80]
328	cmp	x2,#512
329	b.hs	.L512_or_more_neon
330
331	sub	sp,sp,#64		// 64-byte scratch block for the tail code
332
333	ldp	x22,x23,[x5]		// load sigma
334	ld1	{v24.4s},[x5],#16		// post-increment leaves x5 at .Lone
335	ldp	x24,x25,[x3]		// load key
336	ldp	x26,x27,[x3,#16]
337	ld1	{v25.4s,v26.4s},[x3]
338	ldp	x28,x30,[x4]		// load counter
339	ld1	{v27.4s},[x4]
340	ld1	{v31.4s},[x5]		// v31 = {1,0,0,0}
341#ifdef	__ARMEB__
342	rev64	v24.4s,v24.4s
343	ror	x24,x24,#32
344	ror	x25,x25,#32
345	ror	x26,x26,#32
346	ror	x27,x27,#32
347	ror	x28,x28,#32
348	ror	x30,x30,#32
349#endif
// Counter rows for the three NEON blocks: loaded counter +1, +2 and +3.
350	add	v27.4s,v27.4s,v31.4s		// += 1
351	add	v28.4s,v27.4s,v31.4s
352	add	v29.4s,v28.4s,v31.4s
353	shl	v31.4s,v31.4s,#2			// 1 -> 4
354
// One pass of .Loop_outer_neon produces four 64-byte keystream blocks.
355.Loop_outer_neon:
356	mov	w5,w22			// unpack key block
357	lsr	x6,x22,#32
358	mov	v0.16b,v24.16b
359	mov	w7,w23
360	lsr	x8,x23,#32
361	mov	v4.16b,v24.16b
362	mov	w9,w24
363	lsr	x10,x24,#32
364	mov	v16.16b,v24.16b
365	mov	w11,w25
366	mov	v1.16b,v25.16b
367	lsr	x12,x25,#32
368	mov	v5.16b,v25.16b
369	mov	w13,w26
370	mov	v17.16b,v25.16b
371	lsr	x14,x26,#32
372	mov	v3.16b,v27.16b
373	mov	w15,w27
374	mov	v7.16b,v28.16b
375	lsr	x16,x27,#32
376	mov	v19.16b,v29.16b
377	mov	w17,w28
378	mov	v2.16b,v26.16b
379	lsr	x19,x28,#32
380	mov	v6.16b,v26.16b
381	mov	w20,w30
382	mov	v18.16b,v26.16b
383	lsr	x21,x30,#32
384
// x4 counts 10 iterations of two rounds each.  Integer and vector
// instructions are interleaved and operate on disjoint registers.
// In the vector stream, rev32 on .8h lanes swaps the halfwords of every
// 32-bit word (rotate by 16); each ushr #(32-n) / sli #n pair forms a
// rotate-left by n (12, 8 and 7).
// The subs below sets the flags consumed by "b.lo .Ltail_neon" and
// "b.hi .Loop_outer_neon"; no instruction in between modifies them.
385	mov	x4,#10
386	subs	x2,x2,#256
387.Loop_neon:
388	sub	x4,x4,#1
389	add	v0.4s,v0.4s,v1.4s
390	add	w5,w5,w9
391	add	v4.4s,v4.4s,v5.4s
392	add	w6,w6,w10
393	add	v16.4s,v16.4s,v17.4s
394	add	w7,w7,w11
395	eor	v3.16b,v3.16b,v0.16b
396	add	w8,w8,w12
397	eor	v7.16b,v7.16b,v4.16b
398	eor	w17,w17,w5
399	eor	v19.16b,v19.16b,v16.16b
400	eor	w19,w19,w6
401	rev32	v3.8h,v3.8h
402	eor	w20,w20,w7
403	rev32	v7.8h,v7.8h
404	eor	w21,w21,w8
405	rev32	v19.8h,v19.8h
406	ror	w17,w17,#16
407	add	v2.4s,v2.4s,v3.4s
408	ror	w19,w19,#16
409	add	v6.4s,v6.4s,v7.4s
410	ror	w20,w20,#16
411	add	v18.4s,v18.4s,v19.4s
412	ror	w21,w21,#16
413	eor	v20.16b,v1.16b,v2.16b
414	add	w13,w13,w17
415	eor	v21.16b,v5.16b,v6.16b
416	add	w14,w14,w19
417	eor	v22.16b,v17.16b,v18.16b
418	add	w15,w15,w20
419	ushr	v1.4s,v20.4s,#20
420	add	w16,w16,w21
421	ushr	v5.4s,v21.4s,#20
422	eor	w9,w9,w13
423	ushr	v17.4s,v22.4s,#20
424	eor	w10,w10,w14
425	sli	v1.4s,v20.4s,#12
426	eor	w11,w11,w15
427	sli	v5.4s,v21.4s,#12
428	eor	w12,w12,w16
429	sli	v17.4s,v22.4s,#12
430	ror	w9,w9,#20
431	add	v0.4s,v0.4s,v1.4s
432	ror	w10,w10,#20
433	add	v4.4s,v4.4s,v5.4s
434	ror	w11,w11,#20
435	add	v16.4s,v16.4s,v17.4s
436	ror	w12,w12,#20
437	eor	v20.16b,v3.16b,v0.16b
438	add	w5,w5,w9
439	eor	v21.16b,v7.16b,v4.16b
440	add	w6,w6,w10
441	eor	v22.16b,v19.16b,v16.16b
442	add	w7,w7,w11
443	ushr	v3.4s,v20.4s,#24
444	add	w8,w8,w12
445	ushr	v7.4s,v21.4s,#24
446	eor	w17,w17,w5
447	ushr	v19.4s,v22.4s,#24
448	eor	w19,w19,w6
449	sli	v3.4s,v20.4s,#8
450	eor	w20,w20,w7
451	sli	v7.4s,v21.4s,#8
452	eor	w21,w21,w8
453	sli	v19.4s,v22.4s,#8
454	ror	w17,w17,#24
455	add	v2.4s,v2.4s,v3.4s
456	ror	w19,w19,#24
457	add	v6.4s,v6.4s,v7.4s
458	ror	w20,w20,#24
459	add	v18.4s,v18.4s,v19.4s
460	ror	w21,w21,#24
461	eor	v20.16b,v1.16b,v2.16b
462	add	w13,w13,w17
463	eor	v21.16b,v5.16b,v6.16b
464	add	w14,w14,w19
465	eor	v22.16b,v17.16b,v18.16b
466	add	w15,w15,w20
467	ushr	v1.4s,v20.4s,#25
468	add	w16,w16,w21
469	ushr	v5.4s,v21.4s,#25
470	eor	w9,w9,w13
471	ushr	v17.4s,v22.4s,#25
472	eor	w10,w10,w14
473	sli	v1.4s,v20.4s,#7
474	eor	w11,w11,w15
475	sli	v5.4s,v21.4s,#7
476	eor	w12,w12,w16
477	sli	v17.4s,v22.4s,#7
478	ror	w9,w9,#25
// Rotate the lanes of rows 2, 3 and 1 by two, three and one word positions
// (ext #8, #12, #4) so that the second round lines up different words.
479	ext	v2.16b,v2.16b,v2.16b,#8
480	ror	w10,w10,#25
481	ext	v6.16b,v6.16b,v6.16b,#8
482	ror	w11,w11,#25
483	ext	v18.16b,v18.16b,v18.16b,#8
484	ror	w12,w12,#25
485	ext	v3.16b,v3.16b,v3.16b,#12
486	ext	v7.16b,v7.16b,v7.16b,#12
487	ext	v19.16b,v19.16b,v19.16b,#12
488	ext	v1.16b,v1.16b,v1.16b,#4
489	ext	v5.16b,v5.16b,v5.16b,#4
490	ext	v17.16b,v17.16b,v17.16b,#4
// Second round of the iteration.
491	add	v0.4s,v0.4s,v1.4s
492	add	w5,w5,w10
493	add	v4.4s,v4.4s,v5.4s
494	add	w6,w6,w11
495	add	v16.4s,v16.4s,v17.4s
496	add	w7,w7,w12
497	eor	v3.16b,v3.16b,v0.16b
498	add	w8,w8,w9
499	eor	v7.16b,v7.16b,v4.16b
500	eor	w21,w21,w5
501	eor	v19.16b,v19.16b,v16.16b
502	eor	w17,w17,w6
503	rev32	v3.8h,v3.8h
504	eor	w19,w19,w7
505	rev32	v7.8h,v7.8h
506	eor	w20,w20,w8
507	rev32	v19.8h,v19.8h
508	ror	w21,w21,#16
509	add	v2.4s,v2.4s,v3.4s
510	ror	w17,w17,#16
511	add	v6.4s,v6.4s,v7.4s
512	ror	w19,w19,#16
513	add	v18.4s,v18.4s,v19.4s
514	ror	w20,w20,#16
515	eor	v20.16b,v1.16b,v2.16b
516	add	w15,w15,w21
517	eor	v21.16b,v5.16b,v6.16b
518	add	w16,w16,w17
519	eor	v22.16b,v17.16b,v18.16b
520	add	w13,w13,w19
521	ushr	v1.4s,v20.4s,#20
522	add	w14,w14,w20
523	ushr	v5.4s,v21.4s,#20
524	eor	w10,w10,w15
525	ushr	v17.4s,v22.4s,#20
526	eor	w11,w11,w16
527	sli	v1.4s,v20.4s,#12
528	eor	w12,w12,w13
529	sli	v5.4s,v21.4s,#12
530	eor	w9,w9,w14
531	sli	v17.4s,v22.4s,#12
532	ror	w10,w10,#20
533	add	v0.4s,v0.4s,v1.4s
534	ror	w11,w11,#20
535	add	v4.4s,v4.4s,v5.4s
536	ror	w12,w12,#20
537	add	v16.4s,v16.4s,v17.4s
538	ror	w9,w9,#20
539	eor	v20.16b,v3.16b,v0.16b
540	add	w5,w5,w10
541	eor	v21.16b,v7.16b,v4.16b
542	add	w6,w6,w11
543	eor	v22.16b,v19.16b,v16.16b
544	add	w7,w7,w12
545	ushr	v3.4s,v20.4s,#24
546	add	w8,w8,w9
547	ushr	v7.4s,v21.4s,#24
548	eor	w21,w21,w5
549	ushr	v19.4s,v22.4s,#24
550	eor	w17,w17,w6
551	sli	v3.4s,v20.4s,#8
552	eor	w19,w19,w7
553	sli	v7.4s,v21.4s,#8
554	eor	w20,w20,w8
555	sli	v19.4s,v22.4s,#8
556	ror	w21,w21,#24
557	add	v2.4s,v2.4s,v3.4s
558	ror	w17,w17,#24
559	add	v6.4s,v6.4s,v7.4s
560	ror	w19,w19,#24
561	add	v18.4s,v18.4s,v19.4s
562	ror	w20,w20,#24
563	eor	v20.16b,v1.16b,v2.16b
564	add	w15,w15,w21
565	eor	v21.16b,v5.16b,v6.16b
566	add	w16,w16,w17
567	eor	v22.16b,v17.16b,v18.16b
568	add	w13,w13,w19
569	ushr	v1.4s,v20.4s,#25
570	add	w14,w14,w20
571	ushr	v5.4s,v21.4s,#25
572	eor	w10,w10,w15
573	ushr	v17.4s,v22.4s,#25
574	eor	w11,w11,w16
575	sli	v1.4s,v20.4s,#7
576	eor	w12,w12,w13
577	sli	v5.4s,v21.4s,#7
578	eor	w9,w9,w14
579	sli	v17.4s,v22.4s,#7
580	ror	w10,w10,#25
// Undo the lane rotation (ext #8, #4, #12) so the rows are back in their
// original word order for the next iteration.
581	ext	v2.16b,v2.16b,v2.16b,#8
582	ror	w11,w11,#25
583	ext	v6.16b,v6.16b,v6.16b,#8
584	ror	w12,w12,#25
585	ext	v18.16b,v18.16b,v18.16b,#8
586	ror	w9,w9,#25
587	ext	v3.16b,v3.16b,v3.16b,#4
588	ext	v7.16b,v7.16b,v7.16b,#4
589	ext	v19.16b,v19.16b,v19.16b,#4
590	ext	v1.16b,v1.16b,v1.16b,#12
591	ext	v5.16b,v5.16b,v5.16b,#12
592	ext	v17.16b,v17.16b,v17.16b,#12
593	cbnz	x4,.Loop_neon
594
// Add the input state back into all four blocks; each NEON block adds its
// own counter row (v27, v28, v29).
595	add	w5,w5,w22		// accumulate key block
596	add	v0.4s,v0.4s,v24.4s
597	add	x6,x6,x22,lsr#32
598	add	v4.4s,v4.4s,v24.4s
599	add	w7,w7,w23
600	add	v16.4s,v16.4s,v24.4s
601	add	x8,x8,x23,lsr#32
602	add	v2.4s,v2.4s,v26.4s
603	add	w9,w9,w24
604	add	v6.4s,v6.4s,v26.4s
605	add	x10,x10,x24,lsr#32
606	add	v18.4s,v18.4s,v26.4s
607	add	w11,w11,w25
608	add	v3.4s,v3.4s,v27.4s
609	add	x12,x12,x25,lsr#32
610	add	w13,w13,w26
611	add	v7.4s,v7.4s,v28.4s
612	add	x14,x14,x26,lsr#32
613	add	w15,w15,w27
614	add	v19.4s,v19.4s,v29.4s
615	add	x16,x16,x27,lsr#32
616	add	w17,w17,w28
617	add	v1.4s,v1.4s,v25.4s
618	add	x19,x19,x28,lsr#32
619	add	w20,w20,w30
620	add	v5.4s,v5.4s,v25.4s
621	add	x21,x21,x30,lsr#32
622	add	v17.4s,v17.4s,v25.4s
623
624	b.lo	.Ltail_neon		// flags from subs: fewer than 256 bytes were left
625
// Full 256-byte group.  x1 is advanced by 64 after the integer loads, then
// by 64 more on each post-indexed ld1.
626	add	x5,x5,x6,lsl#32	// pack
627	add	x7,x7,x8,lsl#32
628	ldp	x6,x8,[x1,#0]		// load input
629	add	x9,x9,x10,lsl#32
630	add	x11,x11,x12,lsl#32
631	ldp	x10,x12,[x1,#16]
632	add	x13,x13,x14,lsl#32
633	add	x15,x15,x16,lsl#32
634	ldp	x14,x16,[x1,#32]
635	add	x17,x17,x19,lsl#32
636	add	x20,x20,x21,lsl#32
637	ldp	x19,x21,[x1,#48]
638	add	x1,x1,#64
639#ifdef	__ARMEB__
640	rev	x5,x5
641	rev	x7,x7
642	rev	x9,x9
643	rev	x11,x11
644	rev	x13,x13
645	rev	x15,x15
646	rev	x17,x17
647	rev	x20,x20
648#endif
649	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
650	eor	x5,x5,x6
651	eor	x7,x7,x8
652	eor	x9,x9,x10
653	eor	x11,x11,x12
654	eor	x13,x13,x14
655	eor	v0.16b,v0.16b,v20.16b
656	eor	x15,x15,x16
657	eor	v1.16b,v1.16b,v21.16b
658	eor	x17,x17,x19
659	eor	v2.16b,v2.16b,v22.16b
660	eor	x20,x20,x21
661	eor	v3.16b,v3.16b,v23.16b
662	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
663
664	stp	x5,x7,[x0,#0]		// store output
665	add	x28,x28,#4			// increment counter
666	stp	x9,x11,[x0,#16]
667	add	v27.4s,v27.4s,v31.4s		// += 4
668	stp	x13,x15,[x0,#32]
669	add	v28.4s,v28.4s,v31.4s
670	stp	x17,x20,[x0,#48]
671	add	v29.4s,v29.4s,v31.4s
672	add	x0,x0,#64
673
674	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
675	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input data
676
677	eor	v4.16b,v4.16b,v20.16b
678	eor	v5.16b,v5.16b,v21.16b
679	eor	v6.16b,v6.16b,v22.16b
680	eor	v7.16b,v7.16b,v23.16b
681	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
682
683	eor	v16.16b,v16.16b,v0.16b
684	eor	v17.16b,v17.16b,v1.16b
685	eor	v18.16b,v18.16b,v2.16b
686	eor	v19.16b,v19.16b,v3.16b
687	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
688
689	b.hi	.Loop_outer_neon	// flags from subs: more input remains
690
691	ldp	x19,x20,[x29,#16]
692	add	sp,sp,#64
693	ldp	x21,x22,[x29,#32]
694	ldp	x23,x24,[x29,#48]
695	ldp	x25,x26,[x29,#64]
696	ldp	x27,x28,[x29,#80]
697	ldp	x29,x30,[sp],#96
698.inst	0xd50323bf			// autiasp
699	ret
700
// Fewer than 256 bytes remain.  x2 is restored to the remaining count.
// With fewer than 64 bytes the integer block alone is needed and the scalar
// tail inside ChaCha20_ctr32 (.Less_than_64) finishes the job, including the
// epilogue.  Otherwise whole blocks are written in order (integer, v0-v3,
// v4-v7) and the first block that is only partly needed is spilled to the
// stack scratch area for the byte loop at .Last_neon.
// Each "b.eq .Ldone_neon" reuses the flags of the preceding "cmp x2,#64":
// equal means the block just written was exactly the last one.
701.Ltail_neon:
702	add	x2,x2,#256
703	cmp	x2,#64
704	b.lo	.Less_than_64
705
706	add	x5,x5,x6,lsl#32	// pack
707	add	x7,x7,x8,lsl#32
708	ldp	x6,x8,[x1,#0]		// load input
709	add	x9,x9,x10,lsl#32
710	add	x11,x11,x12,lsl#32
711	ldp	x10,x12,[x1,#16]
712	add	x13,x13,x14,lsl#32
713	add	x15,x15,x16,lsl#32
714	ldp	x14,x16,[x1,#32]
715	add	x17,x17,x19,lsl#32
716	add	x20,x20,x21,lsl#32
717	ldp	x19,x21,[x1,#48]
718	add	x1,x1,#64
719#ifdef	__ARMEB__
720	rev	x5,x5
721	rev	x7,x7
722	rev	x9,x9
723	rev	x11,x11
724	rev	x13,x13
725	rev	x15,x15
726	rev	x17,x17
727	rev	x20,x20
728#endif
729	eor	x5,x5,x6
730	eor	x7,x7,x8
731	eor	x9,x9,x10
732	eor	x11,x11,x12
733	eor	x13,x13,x14
734	eor	x15,x15,x16
735	eor	x17,x17,x19
736	eor	x20,x20,x21
737
738	stp	x5,x7,[x0,#0]		// store output
739	add	x28,x28,#4			// increment counter
740	stp	x9,x11,[x0,#16]
741	stp	x13,x15,[x0,#32]
742	stp	x17,x20,[x0,#48]
743	add	x0,x0,#64
744	b.eq	.Ldone_neon
745	sub	x2,x2,#64
746	cmp	x2,#64
747	b.lo	.Less_than_128
748
749	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
750	eor	v0.16b,v0.16b,v20.16b
751	eor	v1.16b,v1.16b,v21.16b
752	eor	v2.16b,v2.16b,v22.16b
753	eor	v3.16b,v3.16b,v23.16b
754	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
755	b.eq	.Ldone_neon
756	sub	x2,x2,#64
757	cmp	x2,#64
758	b.lo	.Less_than_192
759
760	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
761	eor	v4.16b,v4.16b,v20.16b
762	eor	v5.16b,v5.16b,v21.16b
763	eor	v6.16b,v6.16b,v22.16b
764	eor	v7.16b,v7.16b,v23.16b
765	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
766	b.eq	.Ldone_neon
767	sub	x2,x2,#64
768
769	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]	// partial fourth block
770	b	.Last_neon
771
772.Less_than_128:
773	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]	// partial second block
774	b	.Last_neon
775.Less_than_192:
776	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]	// partial third block
777	b	.Last_neon
778
// Byte-wise tail: x2 = bytes left (1..63), keystream staged at sp.
// Pointers are biased so that one negative index x2 can run up to zero:
//   x1 = input end, x4 = scratch + count, x0 = output end - 1
// (the store uses x2 after it has been incremented, hence the -1).
779.align	4
780.Last_neon:
781	sub	x0,x0,#1
782	add	x1,x1,x2
783	add	x0,x0,x2
784	add	x4,sp,x2
785	neg	x2,x2
786
787.Loop_tail_neon:
788	ldrb	w10,[x1,x2]
789	ldrb	w11,[x4,x2]
790	add	x2,x2,#1
791	eor	w10,w10,w11
792	strb	w10,[x0,x2]
793	cbnz	x2,.Loop_tail_neon
794
// Overwrite the staged keystream with zeros before releasing the scratch.
795	stp	xzr,xzr,[sp,#0]
796	stp	xzr,xzr,[sp,#16]
797	stp	xzr,xzr,[sp,#32]
798	stp	xzr,xzr,[sp,#48]
799
// Common exit: restore callee-saved registers relative to x29, release the
// 64-byte scratch area, pop the frame.
800.Ldone_neon:
801	ldp	x19,x20,[x29,#16]
802	add	sp,sp,#64
803	ldp	x21,x22,[x29,#32]
804	ldp	x23,x24,[x29,#48]
805	ldp	x25,x26,[x29,#64]
806	ldp	x27,x28,[x29,#80]
807	ldp	x29,x30,[sp],#96
808.inst	0xd50323bf			// autiasp
809	ret
810.size	ChaCha20_neon,.-ChaCha20_neon
811.type	ChaCha20_512_neon,%function
812.align	5
813ChaCha20_512_neon:
814.inst	0xd503233f			// paciasp
815	stp	x29,x30,[sp,#-96]!
816	add	x29,sp,#0
817
818	adr	x5,.Lsigma
819	stp	x19,x20,[sp,#16]
820	stp	x21,x22,[sp,#32]
821	stp	x23,x24,[sp,#48]
822	stp	x25,x26,[sp,#64]
823	stp	x27,x28,[sp,#80]
824
825.L512_or_more_neon:
826	sub	sp,sp,#128+64
827
828	ldp	x22,x23,[x5]		// load sigma
829	ld1	{v24.4s},[x5],#16
830	ldp	x24,x25,[x3]		// load key
831	ldp	x26,x27,[x3,#16]
832	ld1	{v25.4s,v26.4s},[x3]
833	ldp	x28,x30,[x4]		// load counter
834	ld1	{v27.4s},[x4]
835	ld1	{v31.4s},[x5]
836#ifdef	__ARMEB__
837	rev64	v24.4s,v24.4s
838	ror	x24,x24,#32
839	ror	x25,x25,#32
840	ror	x26,x26,#32
841	ror	x27,x27,#32
842	ror	x28,x28,#32
843	ror	x30,x30,#32
844#endif
845	add	v27.4s,v27.4s,v31.4s		// += 1
846	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
847	add	v27.4s,v27.4s,v31.4s		// not typo
848	str	q26,[sp,#32]
849	add	v28.4s,v27.4s,v31.4s
850	add	v29.4s,v28.4s,v31.4s
851	add	v30.4s,v29.4s,v31.4s
852	shl	v31.4s,v31.4s,#2			// 1 -> 4
853
854	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
855	stp	d10,d11,[sp,#128+16]
856	stp	d12,d13,[sp,#128+32]
857	stp	d14,d15,[sp,#128+48]
858
859	sub	x2,x2,#512			// not typo
860
861.Loop_outer_512_neon:
862	mov	v0.16b,v24.16b
863	mov	v4.16b,v24.16b
864	mov	v8.16b,v24.16b
865	mov	v12.16b,v24.16b
866	mov	v16.16b,v24.16b
867	mov	v20.16b,v24.16b
868	mov	v1.16b,v25.16b
869	mov	w5,w22			// unpack key block
870	mov	v5.16b,v25.16b
871	lsr	x6,x22,#32
872	mov	v9.16b,v25.16b
873	mov	w7,w23
874	mov	v13.16b,v25.16b
875	lsr	x8,x23,#32
876	mov	v17.16b,v25.16b
877	mov	w9,w24
878	mov	v21.16b,v25.16b
879	lsr	x10,x24,#32
880	mov	v3.16b,v27.16b
881	mov	w11,w25
882	mov	v7.16b,v28.16b
883	lsr	x12,x25,#32
884	mov	v11.16b,v29.16b
885	mov	w13,w26
886	mov	v15.16b,v30.16b
887	lsr	x14,x26,#32
888	mov	v2.16b,v26.16b
889	mov	w15,w27
890	mov	v6.16b,v26.16b
891	lsr	x16,x27,#32
892	add	v19.4s,v3.4s,v31.4s			// +4
893	mov	w17,w28
894	add	v23.4s,v7.4s,v31.4s			// +4
895	lsr	x19,x28,#32
896	mov	v10.16b,v26.16b
897	mov	w20,w30
898	mov	v14.16b,v26.16b
899	lsr	x21,x30,#32
900	mov	v18.16b,v26.16b
901	stp	q27,q28,[sp,#48]		// off-load key block, variable part
902	mov	v22.16b,v26.16b
903	str	q29,[sp,#80]
904
905	mov	x4,#5
906	subs	x2,x2,#512
907.Loop_upper_neon:
908	sub	x4,x4,#1
909	add	v0.4s,v0.4s,v1.4s
910	add	w5,w5,w9
911	add	v4.4s,v4.4s,v5.4s
912	add	w6,w6,w10
913	add	v8.4s,v8.4s,v9.4s
914	add	w7,w7,w11
915	add	v12.4s,v12.4s,v13.4s
916	add	w8,w8,w12
917	add	v16.4s,v16.4s,v17.4s
918	eor	w17,w17,w5
919	add	v20.4s,v20.4s,v21.4s
920	eor	w19,w19,w6
921	eor	v3.16b,v3.16b,v0.16b
922	eor	w20,w20,w7
923	eor	v7.16b,v7.16b,v4.16b
924	eor	w21,w21,w8
925	eor	v11.16b,v11.16b,v8.16b
926	ror	w17,w17,#16
927	eor	v15.16b,v15.16b,v12.16b
928	ror	w19,w19,#16
929	eor	v19.16b,v19.16b,v16.16b
930	ror	w20,w20,#16
931	eor	v23.16b,v23.16b,v20.16b
932	ror	w21,w21,#16
933	rev32	v3.8h,v3.8h
934	add	w13,w13,w17
935	rev32	v7.8h,v7.8h
936	add	w14,w14,w19
937	rev32	v11.8h,v11.8h
938	add	w15,w15,w20
939	rev32	v15.8h,v15.8h
940	add	w16,w16,w21
941	rev32	v19.8h,v19.8h
942	eor	w9,w9,w13
943	rev32	v23.8h,v23.8h
944	eor	w10,w10,w14
945	add	v2.4s,v2.4s,v3.4s
946	eor	w11,w11,w15
947	add	v6.4s,v6.4s,v7.4s
948	eor	w12,w12,w16
949	add	v10.4s,v10.4s,v11.4s
950	ror	w9,w9,#20
951	add	v14.4s,v14.4s,v15.4s
952	ror	w10,w10,#20
953	add	v18.4s,v18.4s,v19.4s
954	ror	w11,w11,#20
955	add	v22.4s,v22.4s,v23.4s
956	ror	w12,w12,#20
957	eor	v24.16b,v1.16b,v2.16b
958	add	w5,w5,w9
959	eor	v25.16b,v5.16b,v6.16b
960	add	w6,w6,w10
961	eor	v26.16b,v9.16b,v10.16b
962	add	w7,w7,w11
963	eor	v27.16b,v13.16b,v14.16b
964	add	w8,w8,w12
965	eor	v28.16b,v17.16b,v18.16b
966	eor	w17,w17,w5
967	eor	v29.16b,v21.16b,v22.16b
968	eor	w19,w19,w6
969	ushr	v1.4s,v24.4s,#20
970	eor	w20,w20,w7
971	ushr	v5.4s,v25.4s,#20
972	eor	w21,w21,w8
973	ushr	v9.4s,v26.4s,#20
974	ror	w17,w17,#24
975	ushr	v13.4s,v27.4s,#20
976	ror	w19,w19,#24
977	ushr	v17.4s,v28.4s,#20
978	ror	w20,w20,#24
979	ushr	v21.4s,v29.4s,#20
980	ror	w21,w21,#24
981	sli	v1.4s,v24.4s,#12
982	add	w13,w13,w17
983	sli	v5.4s,v25.4s,#12
984	add	w14,w14,w19
985	sli	v9.4s,v26.4s,#12
986	add	w15,w15,w20
987	sli	v13.4s,v27.4s,#12
988	add	w16,w16,w21
989	sli	v17.4s,v28.4s,#12
990	eor	w9,w9,w13
991	sli	v21.4s,v29.4s,#12
992	eor	w10,w10,w14
993	add	v0.4s,v0.4s,v1.4s
994	eor	w11,w11,w15
995	add	v4.4s,v4.4s,v5.4s
996	eor	w12,w12,w16
997	add	v8.4s,v8.4s,v9.4s
998	ror	w9,w9,#25
999	add	v12.4s,v12.4s,v13.4s
1000	ror	w10,w10,#25
1001	add	v16.4s,v16.4s,v17.4s
1002	ror	w11,w11,#25
1003	add	v20.4s,v20.4s,v21.4s
1004	ror	w12,w12,#25
1005	eor	v24.16b,v3.16b,v0.16b
1006	add	w5,w5,w10
1007	eor	v25.16b,v7.16b,v4.16b
1008	add	w6,w6,w11
1009	eor	v26.16b,v11.16b,v8.16b
1010	add	w7,w7,w12
1011	eor	v27.16b,v15.16b,v12.16b
1012	add	w8,w8,w9
1013	eor	v28.16b,v19.16b,v16.16b
1014	eor	w21,w21,w5
1015	eor	v29.16b,v23.16b,v20.16b
1016	eor	w17,w17,w6
1017	ushr	v3.4s,v24.4s,#24
1018	eor	w19,w19,w7
1019	ushr	v7.4s,v25.4s,#24
1020	eor	w20,w20,w8
1021	ushr	v11.4s,v26.4s,#24
1022	ror	w21,w21,#16
1023	ushr	v15.4s,v27.4s,#24
1024	ror	w17,w17,#16
1025	ushr	v19.4s,v28.4s,#24
1026	ror	w19,w19,#16
1027	ushr	v23.4s,v29.4s,#24
1028	ror	w20,w20,#16
1029	sli	v3.4s,v24.4s,#8
1030	add	w15,w15,w21
1031	sli	v7.4s,v25.4s,#8
1032	add	w16,w16,w17
1033	sli	v11.4s,v26.4s,#8
1034	add	w13,w13,w19
1035	sli	v15.4s,v27.4s,#8
1036	add	w14,w14,w20
1037	sli	v19.4s,v28.4s,#8
1038	eor	w10,w10,w15
1039	sli	v23.4s,v29.4s,#8
1040	eor	w11,w11,w16
1041	add	v2.4s,v2.4s,v3.4s
1042	eor	w12,w12,w13
1043	add	v6.4s,v6.4s,v7.4s
1044	eor	w9,w9,w14
1045	add	v10.4s,v10.4s,v11.4s
1046	ror	w10,w10,#20
1047	add	v14.4s,v14.4s,v15.4s
1048	ror	w11,w11,#20
1049	add	v18.4s,v18.4s,v19.4s
1050	ror	w12,w12,#20
1051	add	v22.4s,v22.4s,v23.4s
1052	ror	w9,w9,#20
1053	eor	v24.16b,v1.16b,v2.16b
1054	add	w5,w5,w10
1055	eor	v25.16b,v5.16b,v6.16b
1056	add	w6,w6,w11
1057	eor	v26.16b,v9.16b,v10.16b
1058	add	w7,w7,w12
1059	eor	v27.16b,v13.16b,v14.16b
1060	add	w8,w8,w9
1061	eor	v28.16b,v17.16b,v18.16b
1062	eor	w21,w21,w5
1063	eor	v29.16b,v21.16b,v22.16b
1064	eor	w17,w17,w6
1065	ushr	v1.4s,v24.4s,#25
1066	eor	w19,w19,w7
1067	ushr	v5.4s,v25.4s,#25
1068	eor	w20,w20,w8
1069	ushr	v9.4s,v26.4s,#25
1070	ror	w21,w21,#24
1071	ushr	v13.4s,v27.4s,#25
1072	ror	w17,w17,#24
1073	ushr	v17.4s,v28.4s,#25
1074	ror	w19,w19,#24
1075	ushr	v21.4s,v29.4s,#25
1076	ror	w20,w20,#24
1077	sli	v1.4s,v24.4s,#7
1078	add	w15,w15,w21
1079	sli	v5.4s,v25.4s,#7
1080	add	w16,w16,w17
1081	sli	v9.4s,v26.4s,#7
1082	add	w13,w13,w19
1083	sli	v13.4s,v27.4s,#7
1084	add	w14,w14,w20
1085	sli	v17.4s,v28.4s,#7
1086	eor	w10,w10,w15
1087	sli	v21.4s,v29.4s,#7
1088	eor	w11,w11,w16
1089	ext	v2.16b,v2.16b,v2.16b,#8
1090	eor	w12,w12,w13
1091	ext	v6.16b,v6.16b,v6.16b,#8
1092	eor	w9,w9,w14
1093	ext	v10.16b,v10.16b,v10.16b,#8
1094	ror	w10,w10,#25
1095	ext	v14.16b,v14.16b,v14.16b,#8
1096	ror	w11,w11,#25
1097	ext	v18.16b,v18.16b,v18.16b,#8
1098	ror	w12,w12,#25
1099	ext	v22.16b,v22.16b,v22.16b,#8
1100	ror	w9,w9,#25
1101	ext	v3.16b,v3.16b,v3.16b,#12
1102	ext	v7.16b,v7.16b,v7.16b,#12
1103	ext	v11.16b,v11.16b,v11.16b,#12
1104	ext	v15.16b,v15.16b,v15.16b,#12
1105	ext	v19.16b,v19.16b,v19.16b,#12
1106	ext	v23.16b,v23.16b,v23.16b,#12
1107	ext	v1.16b,v1.16b,v1.16b,#4
1108	ext	v5.16b,v5.16b,v5.16b,#4
1109	ext	v9.16b,v9.16b,v9.16b,#4
1110	ext	v13.16b,v13.16b,v13.16b,#4
1111	ext	v17.16b,v17.16b,v17.16b,#4
1112	ext	v21.16b,v21.16b,v21.16b,#4
1113	add	v0.4s,v0.4s,v1.4s
1114	add	w5,w5,w9
1115	add	v4.4s,v4.4s,v5.4s
1116	add	w6,w6,w10
1117	add	v8.4s,v8.4s,v9.4s
1118	add	w7,w7,w11
1119	add	v12.4s,v12.4s,v13.4s
1120	add	w8,w8,w12
1121	add	v16.4s,v16.4s,v17.4s
1122	eor	w17,w17,w5
1123	add	v20.4s,v20.4s,v21.4s
1124	eor	w19,w19,w6
1125	eor	v3.16b,v3.16b,v0.16b
1126	eor	w20,w20,w7
1127	eor	v7.16b,v7.16b,v4.16b
1128	eor	w21,w21,w8
1129	eor	v11.16b,v11.16b,v8.16b
1130	ror	w17,w17,#16
1131	eor	v15.16b,v15.16b,v12.16b
1132	ror	w19,w19,#16
1133	eor	v19.16b,v19.16b,v16.16b
1134	ror	w20,w20,#16
1135	eor	v23.16b,v23.16b,v20.16b
1136	ror	w21,w21,#16
1137	rev32	v3.8h,v3.8h
1138	add	w13,w13,w17
1139	rev32	v7.8h,v7.8h
1140	add	w14,w14,w19
1141	rev32	v11.8h,v11.8h
1142	add	w15,w15,w20
1143	rev32	v15.8h,v15.8h
1144	add	w16,w16,w21
1145	rev32	v19.8h,v19.8h
1146	eor	w9,w9,w13
1147	rev32	v23.8h,v23.8h
1148	eor	w10,w10,w14
1149	add	v2.4s,v2.4s,v3.4s
1150	eor	w11,w11,w15
1151	add	v6.4s,v6.4s,v7.4s
1152	eor	w12,w12,w16
1153	add	v10.4s,v10.4s,v11.4s
1154	ror	w9,w9,#20
1155	add	v14.4s,v14.4s,v15.4s
1156	ror	w10,w10,#20
1157	add	v18.4s,v18.4s,v19.4s
1158	ror	w11,w11,#20
1159	add	v22.4s,v22.4s,v23.4s
1160	ror	w12,w12,#20
1161	eor	v24.16b,v1.16b,v2.16b
1162	add	w5,w5,w9
1163	eor	v25.16b,v5.16b,v6.16b
1164	add	w6,w6,w10
1165	eor	v26.16b,v9.16b,v10.16b
1166	add	w7,w7,w11
1167	eor	v27.16b,v13.16b,v14.16b
1168	add	w8,w8,w12
1169	eor	v28.16b,v17.16b,v18.16b
1170	eor	w17,w17,w5
1171	eor	v29.16b,v21.16b,v22.16b
1172	eor	w19,w19,w6
1173	ushr	v1.4s,v24.4s,#20
1174	eor	w20,w20,w7
1175	ushr	v5.4s,v25.4s,#20
1176	eor	w21,w21,w8
1177	ushr	v9.4s,v26.4s,#20
1178	ror	w17,w17,#24
1179	ushr	v13.4s,v27.4s,#20
1180	ror	w19,w19,#24
1181	ushr	v17.4s,v28.4s,#20
1182	ror	w20,w20,#24
1183	ushr	v21.4s,v29.4s,#20
1184	ror	w21,w21,#24
1185	sli	v1.4s,v24.4s,#12
1186	add	w13,w13,w17
1187	sli	v5.4s,v25.4s,#12
1188	add	w14,w14,w19
1189	sli	v9.4s,v26.4s,#12
1190	add	w15,w15,w20
1191	sli	v13.4s,v27.4s,#12
1192	add	w16,w16,w21
1193	sli	v17.4s,v28.4s,#12
1194	eor	w9,w9,w13
1195	sli	v21.4s,v29.4s,#12
1196	eor	w10,w10,w14
1197	add	v0.4s,v0.4s,v1.4s
1198	eor	w11,w11,w15
1199	add	v4.4s,v4.4s,v5.4s
1200	eor	w12,w12,w16
1201	add	v8.4s,v8.4s,v9.4s
1202	ror	w9,w9,#25
1203	add	v12.4s,v12.4s,v13.4s
1204	ror	w10,w10,#25
1205	add	v16.4s,v16.4s,v17.4s
1206	ror	w11,w11,#25
1207	add	v20.4s,v20.4s,v21.4s
1208	ror	w12,w12,#25
1209	eor	v24.16b,v3.16b,v0.16b
1210	add	w5,w5,w10
1211	eor	v25.16b,v7.16b,v4.16b
1212	add	w6,w6,w11
1213	eor	v26.16b,v11.16b,v8.16b
1214	add	w7,w7,w12
1215	eor	v27.16b,v15.16b,v12.16b
1216	add	w8,w8,w9
1217	eor	v28.16b,v19.16b,v16.16b
1218	eor	w21,w21,w5
1219	eor	v29.16b,v23.16b,v20.16b
1220	eor	w17,w17,w6
1221	ushr	v3.4s,v24.4s,#24
1222	eor	w19,w19,w7
1223	ushr	v7.4s,v25.4s,#24
1224	eor	w20,w20,w8
1225	ushr	v11.4s,v26.4s,#24
1226	ror	w21,w21,#16
1227	ushr	v15.4s,v27.4s,#24
1228	ror	w17,w17,#16
1229	ushr	v19.4s,v28.4s,#24
1230	ror	w19,w19,#16
1231	ushr	v23.4s,v29.4s,#24
1232	ror	w20,w20,#16
1233	sli	v3.4s,v24.4s,#8
1234	add	w15,w15,w21
1235	sli	v7.4s,v25.4s,#8
1236	add	w16,w16,w17
1237	sli	v11.4s,v26.4s,#8
1238	add	w13,w13,w19
1239	sli	v15.4s,v27.4s,#8
1240	add	w14,w14,w20
1241	sli	v19.4s,v28.4s,#8
1242	eor	w10,w10,w15
1243	sli	v23.4s,v29.4s,#8
1244	eor	w11,w11,w16
1245	add	v2.4s,v2.4s,v3.4s
1246	eor	w12,w12,w13
1247	add	v6.4s,v6.4s,v7.4s
1248	eor	w9,w9,w14
1249	add	v10.4s,v10.4s,v11.4s
1250	ror	w10,w10,#20
1251	add	v14.4s,v14.4s,v15.4s
1252	ror	w11,w11,#20
1253	add	v18.4s,v18.4s,v19.4s
1254	ror	w12,w12,#20
1255	add	v22.4s,v22.4s,v23.4s
1256	ror	w9,w9,#20
1257	eor	v24.16b,v1.16b,v2.16b
1258	add	w5,w5,w10
1259	eor	v25.16b,v5.16b,v6.16b
1260	add	w6,w6,w11
1261	eor	v26.16b,v9.16b,v10.16b
1262	add	w7,w7,w12
1263	eor	v27.16b,v13.16b,v14.16b
1264	add	w8,w8,w9
1265	eor	v28.16b,v17.16b,v18.16b
1266	eor	w21,w21,w5
1267	eor	v29.16b,v21.16b,v22.16b
1268	eor	w17,w17,w6
1269	ushr	v1.4s,v24.4s,#25
1270	eor	w19,w19,w7
1271	ushr	v5.4s,v25.4s,#25
1272	eor	w20,w20,w8
1273	ushr	v9.4s,v26.4s,#25
1274	ror	w21,w21,#24
1275	ushr	v13.4s,v27.4s,#25
1276	ror	w17,w17,#24
1277	ushr	v17.4s,v28.4s,#25
1278	ror	w19,w19,#24
1279	ushr	v21.4s,v29.4s,#25
1280	ror	w20,w20,#24
1281	sli	v1.4s,v24.4s,#7
1282	add	w15,w15,w21
1283	sli	v5.4s,v25.4s,#7
1284	add	w16,w16,w17
1285	sli	v9.4s,v26.4s,#7
1286	add	w13,w13,w19
1287	sli	v13.4s,v27.4s,#7
1288	add	w14,w14,w20
1289	sli	v17.4s,v28.4s,#7
1290	eor	w10,w10,w15
1291	sli	v21.4s,v29.4s,#7
1292	eor	w11,w11,w16
1293	ext	v2.16b,v2.16b,v2.16b,#8
1294	eor	w12,w12,w13
1295	ext	v6.16b,v6.16b,v6.16b,#8
1296	eor	w9,w9,w14
1297	ext	v10.16b,v10.16b,v10.16b,#8
1298	ror	w10,w10,#25
1299	ext	v14.16b,v14.16b,v14.16b,#8
1300	ror	w11,w11,#25
1301	ext	v18.16b,v18.16b,v18.16b,#8
1302	ror	w12,w12,#25
1303	ext	v22.16b,v22.16b,v22.16b,#8
1304	ror	w9,w9,#25
1305	ext	v3.16b,v3.16b,v3.16b,#4
1306	ext	v7.16b,v7.16b,v7.16b,#4
1307	ext	v11.16b,v11.16b,v11.16b,#4
1308	ext	v15.16b,v15.16b,v15.16b,#4
1309	ext	v19.16b,v19.16b,v19.16b,#4
1310	ext	v23.16b,v23.16b,v23.16b,#4
1311	ext	v1.16b,v1.16b,v1.16b,#12
1312	ext	v5.16b,v5.16b,v5.16b,#12
1313	ext	v9.16b,v9.16b,v9.16b,#12
1314	ext	v13.16b,v13.16b,v13.16b,#12
1315	ext	v17.16b,v17.16b,v17.16b,#12
1316	ext	v21.16b,v21.16b,v21.16b,#12
1317	cbnz	x4,.Loop_upper_neon
1318
// Finish the first scalar block of this pass: add the input state back into
// the working words, XOR the resulting 64 bytes with the input at x1, store
// them at x0, then reload w5-w21 from the packed state for the next scalar
// block.
// NOTE(review): x22-x28,x30 presumably hold the packed sigma/key/counter words
// as in ChaCha20_ctr32; they are loaded outside this view -- confirm.
1319	add	w5,w5,w22		// accumulate key block
1320	add	x6,x6,x22,lsr#32	// odd words use 64-bit adds; bits above 31 are dropped by the lsl#32 below
1321	add	w7,w7,w23
1322	add	x8,x8,x23,lsr#32
1323	add	w9,w9,w24
1324	add	x10,x10,x24,lsr#32
1325	add	w11,w11,w25
1326	add	x12,x12,x25,lsr#32
1327	add	w13,w13,w26
1328	add	x14,x14,x26,lsr#32
1329	add	w15,w15,w27
1330	add	x16,x16,x27,lsr#32
1331	add	w17,w17,w28
1332	add	x19,x19,x28,lsr#32
1333	add	w20,w20,w30
1334	add	x21,x21,x30,lsr#32
1335
1336	add	x5,x5,x6,lsl#32	// pack
1337	add	x7,x7,x8,lsl#32	// each pair of 32-bit words becomes one 64-bit register (low word | high word << 32)
1338	ldp	x6,x8,[x1,#0]		// load input
1339	add	x9,x9,x10,lsl#32
1340	add	x11,x11,x12,lsl#32
1341	ldp	x10,x12,[x1,#16]	// input loads reuse the odd registers as soon as they have been packed
1342	add	x13,x13,x14,lsl#32
1343	add	x15,x15,x16,lsl#32
1344	ldp	x14,x16,[x1,#32]
1345	add	x17,x17,x19,lsl#32
1346	add	x20,x20,x21,lsl#32
1347	ldp	x19,x21,[x1,#48]
1348	add	x1,x1,#64	// advance the input pointer past this 64-byte block
1349#ifdef	__ARMEB__
1350	rev	x5,x5	// big-endian builds only: byte-reverse each packed 64-bit word before the XOR
1351	rev	x7,x7
1352	rev	x9,x9
1353	rev	x11,x11
1354	rev	x13,x13
1355	rev	x15,x15
1356	rev	x17,x17
1357	rev	x20,x20
1358#endif
1359	eor	x5,x5,x6	// output = packed words XOR input
1360	eor	x7,x7,x8
1361	eor	x9,x9,x10
1362	eor	x11,x11,x12
1363	eor	x13,x13,x14
1364	eor	x15,x15,x16
1365	eor	x17,x17,x19
1366	eor	x20,x20,x21
1367
1368	stp	x5,x7,[x0,#0]		// store output
1369	add	x28,x28,#1			// increment counter
1370	mov	w5,w22			// unpack key block
1371	lsr	x6,x22,#32	// stores are interleaved with the reload of w5-w21 for the next block
1372	stp	x9,x11,[x0,#16]
1373	mov	w7,w23
1374	lsr	x8,x23,#32
1375	stp	x13,x15,[x0,#32]
1376	mov	w9,w24
1377	lsr	x10,x24,#32
1378	stp	x17,x20,[x0,#48]
1379	add	x0,x0,#64	// advance the output pointer past this 64-byte block
1380	mov	w11,w25
1381	lsr	x12,x25,#32
1382	mov	w13,w26
1383	lsr	x14,x26,#32
1384	mov	w15,w27
1385	lsr	x16,x27,#32
1386	mov	w17,w28	// picks up the counter word incremented above
1387	lsr	x19,x28,#32
1388	mov	w20,w30
1389	lsr	x21,x30,#32
1390
1391	mov	x4,#5	// pass count for .Loop_lower_neon
// .Loop_lower_neon runs until x4 reaches zero (x4 is decremented at the top
// and tested by the cbnz at the bottom). NEON and scalar instructions are
// interleaved one for one:
//   - NEON: one column round followed by one diagonal round over six
//     4-register groups (v0-v3, v4-v7, v8-v11, v12-v15, v16-v19, v20-v23),
//     with v24-v29 as scratch for the rotates;
//   - scalar: two column+diagonal round pairs over the 16 words held in
//     w5-w17 and w19-w21 (w18 is never touched).
// In the comments a/b/c/d name the four rows of a state: NEON registers
// v(4k)/v(4k+1)/v(4k+2)/v(4k+3), scalar w5-w8 / w9-w12 / w13-w16 /
// w17,w19,w20,w21. Scalar "ror #16/#20/#24/#25" equals rotate-left by
// 16/12/8/7; NEON uses rev32 on halfwords for 16 and ushr+sli pairs for 12/8/7.
1392.Loop_lower_neon:
1393	sub	x4,x4,#1	// one pass consumed; tested by the cbnz at the bottom
1394	add	v0.4s,v0.4s,v1.4s	// NEON column round: a += b
1395	add	w5,w5,w9	// scalar column round: a += b
1396	add	v4.4s,v4.4s,v5.4s
1397	add	w6,w6,w10
1398	add	v8.4s,v8.4s,v9.4s
1399	add	w7,w7,w11
1400	add	v12.4s,v12.4s,v13.4s
1401	add	w8,w8,w12
1402	add	v16.4s,v16.4s,v17.4s
1403	eor	w17,w17,w5	// scalar: d ^= a
1404	add	v20.4s,v20.4s,v21.4s
1405	eor	w19,w19,w6
1406	eor	v3.16b,v3.16b,v0.16b	// NEON: d ^= a
1407	eor	w20,w20,w7
1408	eor	v7.16b,v7.16b,v4.16b
1409	eor	w21,w21,w8
1410	eor	v11.16b,v11.16b,v8.16b
1411	ror	w17,w17,#16	// scalar: d = rotl(d,16)
1412	eor	v15.16b,v15.16b,v12.16b
1413	ror	w19,w19,#16
1414	eor	v19.16b,v19.16b,v16.16b
1415	ror	w20,w20,#16
1416	eor	v23.16b,v23.16b,v20.16b
1417	ror	w21,w21,#16
1418	rev32	v3.8h,v3.8h	// NEON: d = rotl(d,16) by swapping the halfwords of each 32-bit lane
1419	add	w13,w13,w17	// scalar: c += d
1420	rev32	v7.8h,v7.8h
1421	add	w14,w14,w19
1422	rev32	v11.8h,v11.8h
1423	add	w15,w15,w20
1424	rev32	v15.8h,v15.8h
1425	add	w16,w16,w21
1426	rev32	v19.8h,v19.8h
1427	eor	w9,w9,w13	// scalar: b ^= c
1428	rev32	v23.8h,v23.8h
1429	eor	w10,w10,w14
1430	add	v2.4s,v2.4s,v3.4s	// NEON: c += d
1431	eor	w11,w11,w15
1432	add	v6.4s,v6.4s,v7.4s
1433	eor	w12,w12,w16
1434	add	v10.4s,v10.4s,v11.4s
1435	ror	w9,w9,#20	// scalar: b = rotl(b,12)
1436	add	v14.4s,v14.4s,v15.4s
1437	ror	w10,w10,#20
1438	add	v18.4s,v18.4s,v19.4s
1439	ror	w11,w11,#20
1440	add	v22.4s,v22.4s,v23.4s
1441	ror	w12,w12,#20
1442	eor	v24.16b,v1.16b,v2.16b	// NEON: scratch = b ^ c
1443	add	w5,w5,w9
1444	eor	v25.16b,v5.16b,v6.16b
1445	add	w6,w6,w10
1446	eor	v26.16b,v9.16b,v10.16b
1447	add	w7,w7,w11
1448	eor	v27.16b,v13.16b,v14.16b
1449	add	w8,w8,w12
1450	eor	v28.16b,v17.16b,v18.16b
1451	eor	w17,w17,w5
1452	eor	v29.16b,v21.16b,v22.16b
1453	eor	w19,w19,w6
1454	ushr	v1.4s,v24.4s,#20	// NEON: b = scratch >> 20 ...
1455	eor	w20,w20,w7
1456	ushr	v5.4s,v25.4s,#20
1457	eor	w21,w21,w8
1458	ushr	v9.4s,v26.4s,#20
1459	ror	w17,w17,#24	// scalar: d = rotl(d,8)
1460	ushr	v13.4s,v27.4s,#20
1461	ror	w19,w19,#24
1462	ushr	v17.4s,v28.4s,#20
1463	ror	w20,w20,#24
1464	ushr	v21.4s,v29.4s,#20
1465	ror	w21,w21,#24
1466	sli	v1.4s,v24.4s,#12	// ... with scratch << 12 inserted: b = rotl(b ^ c,12)
1467	add	w13,w13,w17
1468	sli	v5.4s,v25.4s,#12
1469	add	w14,w14,w19
1470	sli	v9.4s,v26.4s,#12
1471	add	w15,w15,w20
1472	sli	v13.4s,v27.4s,#12
1473	add	w16,w16,w21
1474	sli	v17.4s,v28.4s,#12
1475	eor	w9,w9,w13
1476	sli	v21.4s,v29.4s,#12
1477	eor	w10,w10,w14
1478	add	v0.4s,v0.4s,v1.4s
1479	eor	w11,w11,w15
1480	add	v4.4s,v4.4s,v5.4s
1481	eor	w12,w12,w16
1482	add	v8.4s,v8.4s,v9.4s
1483	ror	w9,w9,#25	// scalar: b = rotl(b,7)
1484	add	v12.4s,v12.4s,v13.4s
1485	ror	w10,w10,#25
1486	add	v16.4s,v16.4s,v17.4s
1487	ror	w11,w11,#25
1488	add	v20.4s,v20.4s,v21.4s
1489	ror	w12,w12,#25
1490	eor	v24.16b,v3.16b,v0.16b	// NEON: scratch = d ^ a
1491	add	w5,w5,w10	// scalar diagonal round: a0+=b1, a1+=b2, a2+=b3, a3+=b0
1492	eor	v25.16b,v7.16b,v4.16b
1493	add	w6,w6,w11
1494	eor	v26.16b,v11.16b,v8.16b
1495	add	w7,w7,w12
1496	eor	v27.16b,v15.16b,v12.16b
1497	add	w8,w8,w9
1498	eor	v28.16b,v19.16b,v16.16b
1499	eor	w21,w21,w5
1500	eor	v29.16b,v23.16b,v20.16b
1501	eor	w17,w17,w6
1502	ushr	v3.4s,v24.4s,#24	// NEON: d = rotl(d ^ a,8) via ushr #24 + sli #8
1503	eor	w19,w19,w7
1504	ushr	v7.4s,v25.4s,#24
1505	eor	w20,w20,w8
1506	ushr	v11.4s,v26.4s,#24
1507	ror	w21,w21,#16
1508	ushr	v15.4s,v27.4s,#24
1509	ror	w17,w17,#16
1510	ushr	v19.4s,v28.4s,#24
1511	ror	w19,w19,#16
1512	ushr	v23.4s,v29.4s,#24
1513	ror	w20,w20,#16
1514	sli	v3.4s,v24.4s,#8
1515	add	w15,w15,w21
1516	sli	v7.4s,v25.4s,#8
1517	add	w16,w16,w17
1518	sli	v11.4s,v26.4s,#8
1519	add	w13,w13,w19
1520	sli	v15.4s,v27.4s,#8
1521	add	w14,w14,w20
1522	sli	v19.4s,v28.4s,#8
1523	eor	w10,w10,w15
1524	sli	v23.4s,v29.4s,#8
1525	eor	w11,w11,w16
1526	add	v2.4s,v2.4s,v3.4s
1527	eor	w12,w12,w13
1528	add	v6.4s,v6.4s,v7.4s
1529	eor	w9,w9,w14
1530	add	v10.4s,v10.4s,v11.4s
1531	ror	w10,w10,#20
1532	add	v14.4s,v14.4s,v15.4s
1533	ror	w11,w11,#20
1534	add	v18.4s,v18.4s,v19.4s
1535	ror	w12,w12,#20
1536	add	v22.4s,v22.4s,v23.4s
1537	ror	w9,w9,#20
1538	eor	v24.16b,v1.16b,v2.16b
1539	add	w5,w5,w10
1540	eor	v25.16b,v5.16b,v6.16b
1541	add	w6,w6,w11
1542	eor	v26.16b,v9.16b,v10.16b
1543	add	w7,w7,w12
1544	eor	v27.16b,v13.16b,v14.16b
1545	add	w8,w8,w9
1546	eor	v28.16b,v17.16b,v18.16b
1547	eor	w21,w21,w5
1548	eor	v29.16b,v21.16b,v22.16b
1549	eor	w17,w17,w6
1550	ushr	v1.4s,v24.4s,#25	// NEON: b = rotl(b ^ c,7) via ushr #25 + sli #7
1551	eor	w19,w19,w7
1552	ushr	v5.4s,v25.4s,#25
1553	eor	w20,w20,w8
1554	ushr	v9.4s,v26.4s,#25
1555	ror	w21,w21,#24
1556	ushr	v13.4s,v27.4s,#25
1557	ror	w17,w17,#24
1558	ushr	v17.4s,v28.4s,#25
1559	ror	w19,w19,#24
1560	ushr	v21.4s,v29.4s,#25
1561	ror	w20,w20,#24
1562	sli	v1.4s,v24.4s,#7
1563	add	w15,w15,w21
1564	sli	v5.4s,v25.4s,#7
1565	add	w16,w16,w17
1566	sli	v9.4s,v26.4s,#7
1567	add	w13,w13,w19
1568	sli	v13.4s,v27.4s,#7
1569	add	w14,w14,w20
1570	sli	v17.4s,v28.4s,#7
1571	eor	w10,w10,w15
1572	sli	v21.4s,v29.4s,#7
1573	eor	w11,w11,w16
1574	ext	v2.16b,v2.16b,v2.16b,#8	// NEON: rotate row c by 8 bytes ...
1575	eor	w12,w12,w13
1576	ext	v6.16b,v6.16b,v6.16b,#8
1577	eor	w9,w9,w14
1578	ext	v10.16b,v10.16b,v10.16b,#8
1579	ror	w10,w10,#25
1580	ext	v14.16b,v14.16b,v14.16b,#8
1581	ror	w11,w11,#25
1582	ext	v18.16b,v18.16b,v18.16b,#8
1583	ror	w12,w12,#25
1584	ext	v22.16b,v22.16b,v22.16b,#8
1585	ror	w9,w9,#25
1586	ext	v3.16b,v3.16b,v3.16b,#12	// ... row d by 12 bytes ...
1587	ext	v7.16b,v7.16b,v7.16b,#12
1588	ext	v11.16b,v11.16b,v11.16b,#12
1589	ext	v15.16b,v15.16b,v15.16b,#12
1590	ext	v19.16b,v19.16b,v19.16b,#12
1591	ext	v23.16b,v23.16b,v23.16b,#12
1592	ext	v1.16b,v1.16b,v1.16b,#4	// ... and row b by 4 bytes, so the next round works on diagonals
1593	ext	v5.16b,v5.16b,v5.16b,#4
1594	ext	v9.16b,v9.16b,v9.16b,#4
1595	ext	v13.16b,v13.16b,v13.16b,#4
1596	ext	v17.16b,v17.16b,v17.16b,#4
1597	ext	v21.16b,v21.16b,v21.16b,#4
1598	add	v0.4s,v0.4s,v1.4s	// NEON diagonal round on the rotated rows (same sequence as above)
1599	add	w5,w5,w9	// scalar: second column round of this pass
1600	add	v4.4s,v4.4s,v5.4s
1601	add	w6,w6,w10
1602	add	v8.4s,v8.4s,v9.4s
1603	add	w7,w7,w11
1604	add	v12.4s,v12.4s,v13.4s
1605	add	w8,w8,w12
1606	add	v16.4s,v16.4s,v17.4s
1607	eor	w17,w17,w5
1608	add	v20.4s,v20.4s,v21.4s
1609	eor	w19,w19,w6
1610	eor	v3.16b,v3.16b,v0.16b
1611	eor	w20,w20,w7
1612	eor	v7.16b,v7.16b,v4.16b
1613	eor	w21,w21,w8
1614	eor	v11.16b,v11.16b,v8.16b
1615	ror	w17,w17,#16
1616	eor	v15.16b,v15.16b,v12.16b
1617	ror	w19,w19,#16
1618	eor	v19.16b,v19.16b,v16.16b
1619	ror	w20,w20,#16
1620	eor	v23.16b,v23.16b,v20.16b
1621	ror	w21,w21,#16
1622	rev32	v3.8h,v3.8h
1623	add	w13,w13,w17
1624	rev32	v7.8h,v7.8h
1625	add	w14,w14,w19
1626	rev32	v11.8h,v11.8h
1627	add	w15,w15,w20
1628	rev32	v15.8h,v15.8h
1629	add	w16,w16,w21
1630	rev32	v19.8h,v19.8h
1631	eor	w9,w9,w13
1632	rev32	v23.8h,v23.8h
1633	eor	w10,w10,w14
1634	add	v2.4s,v2.4s,v3.4s
1635	eor	w11,w11,w15
1636	add	v6.4s,v6.4s,v7.4s
1637	eor	w12,w12,w16
1638	add	v10.4s,v10.4s,v11.4s
1639	ror	w9,w9,#20
1640	add	v14.4s,v14.4s,v15.4s
1641	ror	w10,w10,#20
1642	add	v18.4s,v18.4s,v19.4s
1643	ror	w11,w11,#20
1644	add	v22.4s,v22.4s,v23.4s
1645	ror	w12,w12,#20
1646	eor	v24.16b,v1.16b,v2.16b
1647	add	w5,w5,w9
1648	eor	v25.16b,v5.16b,v6.16b
1649	add	w6,w6,w10
1650	eor	v26.16b,v9.16b,v10.16b
1651	add	w7,w7,w11
1652	eor	v27.16b,v13.16b,v14.16b
1653	add	w8,w8,w12
1654	eor	v28.16b,v17.16b,v18.16b
1655	eor	w17,w17,w5
1656	eor	v29.16b,v21.16b,v22.16b
1657	eor	w19,w19,w6
1658	ushr	v1.4s,v24.4s,#20
1659	eor	w20,w20,w7
1660	ushr	v5.4s,v25.4s,#20
1661	eor	w21,w21,w8
1662	ushr	v9.4s,v26.4s,#20
1663	ror	w17,w17,#24
1664	ushr	v13.4s,v27.4s,#20
1665	ror	w19,w19,#24
1666	ushr	v17.4s,v28.4s,#20
1667	ror	w20,w20,#24
1668	ushr	v21.4s,v29.4s,#20
1669	ror	w21,w21,#24
1670	sli	v1.4s,v24.4s,#12
1671	add	w13,w13,w17
1672	sli	v5.4s,v25.4s,#12
1673	add	w14,w14,w19
1674	sli	v9.4s,v26.4s,#12
1675	add	w15,w15,w20
1676	sli	v13.4s,v27.4s,#12
1677	add	w16,w16,w21
1678	sli	v17.4s,v28.4s,#12
1679	eor	w9,w9,w13
1680	sli	v21.4s,v29.4s,#12
1681	eor	w10,w10,w14
1682	add	v0.4s,v0.4s,v1.4s
1683	eor	w11,w11,w15
1684	add	v4.4s,v4.4s,v5.4s
1685	eor	w12,w12,w16
1686	add	v8.4s,v8.4s,v9.4s
1687	ror	w9,w9,#25
1688	add	v12.4s,v12.4s,v13.4s
1689	ror	w10,w10,#25
1690	add	v16.4s,v16.4s,v17.4s
1691	ror	w11,w11,#25
1692	add	v20.4s,v20.4s,v21.4s
1693	ror	w12,w12,#25
1694	eor	v24.16b,v3.16b,v0.16b
1695	add	w5,w5,w10	// scalar: second diagonal round of this pass
1696	eor	v25.16b,v7.16b,v4.16b
1697	add	w6,w6,w11
1698	eor	v26.16b,v11.16b,v8.16b
1699	add	w7,w7,w12
1700	eor	v27.16b,v15.16b,v12.16b
1701	add	w8,w8,w9
1702	eor	v28.16b,v19.16b,v16.16b
1703	eor	w21,w21,w5
1704	eor	v29.16b,v23.16b,v20.16b
1705	eor	w17,w17,w6
1706	ushr	v3.4s,v24.4s,#24
1707	eor	w19,w19,w7
1708	ushr	v7.4s,v25.4s,#24
1709	eor	w20,w20,w8
1710	ushr	v11.4s,v26.4s,#24
1711	ror	w21,w21,#16
1712	ushr	v15.4s,v27.4s,#24
1713	ror	w17,w17,#16
1714	ushr	v19.4s,v28.4s,#24
1715	ror	w19,w19,#16
1716	ushr	v23.4s,v29.4s,#24
1717	ror	w20,w20,#16
1718	sli	v3.4s,v24.4s,#8
1719	add	w15,w15,w21
1720	sli	v7.4s,v25.4s,#8
1721	add	w16,w16,w17
1722	sli	v11.4s,v26.4s,#8
1723	add	w13,w13,w19
1724	sli	v15.4s,v27.4s,#8
1725	add	w14,w14,w20
1726	sli	v19.4s,v28.4s,#8
1727	eor	w10,w10,w15
1728	sli	v23.4s,v29.4s,#8
1729	eor	w11,w11,w16
1730	add	v2.4s,v2.4s,v3.4s
1731	eor	w12,w12,w13
1732	add	v6.4s,v6.4s,v7.4s
1733	eor	w9,w9,w14
1734	add	v10.4s,v10.4s,v11.4s
1735	ror	w10,w10,#20
1736	add	v14.4s,v14.4s,v15.4s
1737	ror	w11,w11,#20
1738	add	v18.4s,v18.4s,v19.4s
1739	ror	w12,w12,#20
1740	add	v22.4s,v22.4s,v23.4s
1741	ror	w9,w9,#20
1742	eor	v24.16b,v1.16b,v2.16b
1743	add	w5,w5,w10
1744	eor	v25.16b,v5.16b,v6.16b
1745	add	w6,w6,w11
1746	eor	v26.16b,v9.16b,v10.16b
1747	add	w7,w7,w12
1748	eor	v27.16b,v13.16b,v14.16b
1749	add	w8,w8,w9
1750	eor	v28.16b,v17.16b,v18.16b
1751	eor	w21,w21,w5
1752	eor	v29.16b,v21.16b,v22.16b
1753	eor	w17,w17,w6
1754	ushr	v1.4s,v24.4s,#25
1755	eor	w19,w19,w7
1756	ushr	v5.4s,v25.4s,#25
1757	eor	w20,w20,w8
1758	ushr	v9.4s,v26.4s,#25
1759	ror	w21,w21,#24
1760	ushr	v13.4s,v27.4s,#25
1761	ror	w17,w17,#24
1762	ushr	v17.4s,v28.4s,#25
1763	ror	w19,w19,#24
1764	ushr	v21.4s,v29.4s,#25
1765	ror	w20,w20,#24
1766	sli	v1.4s,v24.4s,#7
1767	add	w15,w15,w21
1768	sli	v5.4s,v25.4s,#7
1769	add	w16,w16,w17
1770	sli	v9.4s,v26.4s,#7
1771	add	w13,w13,w19
1772	sli	v13.4s,v27.4s,#7
1773	add	w14,w14,w20
1774	sli	v17.4s,v28.4s,#7
1775	eor	w10,w10,w15
1776	sli	v21.4s,v29.4s,#7
1777	eor	w11,w11,w16
1778	ext	v2.16b,v2.16b,v2.16b,#8	// NEON: rotate the rows back: c by 8 bytes ...
1779	eor	w12,w12,w13
1780	ext	v6.16b,v6.16b,v6.16b,#8
1781	eor	w9,w9,w14
1782	ext	v10.16b,v10.16b,v10.16b,#8
1783	ror	w10,w10,#25
1784	ext	v14.16b,v14.16b,v14.16b,#8
1785	ror	w11,w11,#25
1786	ext	v18.16b,v18.16b,v18.16b,#8
1787	ror	w12,w12,#25
1788	ext	v22.16b,v22.16b,v22.16b,#8
1789	ror	w9,w9,#25
1790	ext	v3.16b,v3.16b,v3.16b,#4	// ... d by 4 bytes ...
1791	ext	v7.16b,v7.16b,v7.16b,#4
1792	ext	v11.16b,v11.16b,v11.16b,#4
1793	ext	v15.16b,v15.16b,v15.16b,#4
1794	ext	v19.16b,v19.16b,v19.16b,#4
1795	ext	v23.16b,v23.16b,v23.16b,#4
1796	ext	v1.16b,v1.16b,v1.16b,#12	// ... and b by 12 bytes, restoring column order
1797	ext	v5.16b,v5.16b,v5.16b,#12
1798	ext	v9.16b,v9.16b,v9.16b,#12
1799	ext	v13.16b,v13.16b,v13.16b,#12
1800	ext	v17.16b,v17.16b,v17.16b,#12
1801	ext	v21.16b,v21.16b,v21.16b,#12
1802	cbnz	x4,.Loop_lower_neon	// repeat until x4 reaches zero
1803
// After .Loop_lower_neon: finish the second scalar block and the six NEON
// blocks. The saved rows are added back into the working state, 64 bytes
// (scalar) plus 6 x 64 bytes (NEON) of input at x1 are XORed in and stored at
// x0, and the counters are advanced before looping back.
// NOTE(review): v24-v30 presumably hold sigma, the two key rows and the
// per-block counter rows, and v31 the constant 4 in the counter lane (see the
// "+4" / "4 -> 8" comments); they are set up outside this view -- confirm.
1804	add	w5,w5,w22		// accumulate key block
1805	ldp	q24,q25,[sp,#0]	// reload v24-v29 from the stack; the loop used them as rotate scratch
1806	add	x6,x6,x22,lsr#32
1807	ldp	q26,q27,[sp,#32]
1808	add	w7,w7,w23
1809	ldp	q28,q29,[sp,#64]
1810	add	x8,x8,x23,lsr#32
1811	add	v0.4s,v0.4s,v24.4s	// row 0 of all six NEON blocks += v24
1812	add	w9,w9,w24
1813	add	v4.4s,v4.4s,v24.4s
1814	add	x10,x10,x24,lsr#32
1815	add	v8.4s,v8.4s,v24.4s
1816	add	w11,w11,w25
1817	add	v12.4s,v12.4s,v24.4s
1818	add	x12,x12,x25,lsr#32
1819	add	v16.4s,v16.4s,v24.4s
1820	add	w13,w13,w26
1821	add	v20.4s,v20.4s,v24.4s
1822	add	x14,x14,x26,lsr#32
1823	add	v2.4s,v2.4s,v26.4s	// row 2 of all six NEON blocks += v26
1824	add	w15,w15,w27
1825	add	v6.4s,v6.4s,v26.4s
1826	add	x16,x16,x27,lsr#32
1827	add	v10.4s,v10.4s,v26.4s
1828	add	w17,w17,w28
1829	add	v14.4s,v14.4s,v26.4s
1830	add	x19,x19,x28,lsr#32
1831	add	v18.4s,v18.4s,v26.4s
1832	add	w20,w20,w30
1833	add	v22.4s,v22.4s,v26.4s
1834	add	x21,x21,x30,lsr#32
1835	add	v19.4s,v19.4s,v31.4s			// +4
1836	add	x5,x5,x6,lsl#32	// pack
1837	add	v23.4s,v23.4s,v31.4s			// +4
1838	add	x7,x7,x8,lsl#32
1839	add	v3.4s,v3.4s,v27.4s	// row 3: blocks 0-3 add v27-v30; blocks 4,5 add v27,v28 on top of the +4 above
1840	ldp	x6,x8,[x1,#0]		// load input
1841	add	v7.4s,v7.4s,v28.4s
1842	add	x9,x9,x10,lsl#32
1843	add	v11.4s,v11.4s,v29.4s
1844	add	x11,x11,x12,lsl#32
1845	add	v15.4s,v15.4s,v30.4s
1846	ldp	x10,x12,[x1,#16]
1847	add	v19.4s,v19.4s,v27.4s
1848	add	x13,x13,x14,lsl#32
1849	add	v23.4s,v23.4s,v28.4s
1850	add	x15,x15,x16,lsl#32
1851	add	v1.4s,v1.4s,v25.4s	// row 1 of all six NEON blocks += v25
1852	ldp	x14,x16,[x1,#32]
1853	add	v5.4s,v5.4s,v25.4s
1854	add	x17,x17,x19,lsl#32
1855	add	v9.4s,v9.4s,v25.4s
1856	add	x20,x20,x21,lsl#32
1857	add	v13.4s,v13.4s,v25.4s
1858	ldp	x19,x21,[x1,#48]
1859	add	v17.4s,v17.4s,v25.4s
1860	add	x1,x1,#64	// input pointer now addresses the data for the NEON blocks
1861	add	v21.4s,v21.4s,v25.4s
1862
1863#ifdef	__ARMEB__
1864	rev	x5,x5	// big-endian builds only: byte-reverse the packed scalar words before the XOR
1865	rev	x7,x7
1866	rev	x9,x9
1867	rev	x11,x11
1868	rev	x13,x13
1869	rev	x15,x15
1870	rev	x17,x17
1871	rev	x20,x20
1872#endif
1873	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64	// input for NEON block 0; v24-v27 become input buffers
1874	eor	x5,x5,x6	// scalar block: output = packed words XOR input
1875	eor	x7,x7,x8
1876	eor	x9,x9,x10
1877	eor	x11,x11,x12
1878	eor	x13,x13,x14
1879	eor	v0.16b,v0.16b,v24.16b	// NEON block 0 ^= input
1880	eor	x15,x15,x16
1881	eor	v1.16b,v1.16b,v25.16b
1882	eor	x17,x17,x19
1883	eor	v2.16b,v2.16b,v26.16b
1884	eor	x20,x20,x21
1885	eor	v3.16b,v3.16b,v27.16b
1886	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64	// input for NEON block 1
1887
1888	stp	x5,x7,[x0,#0]		// store output
1889	add	x28,x28,#7			// increment counter: +7 here on top of the +1 after the first scalar block
1890	stp	x9,x11,[x0,#16]
1891	stp	x13,x15,[x0,#32]
1892	stp	x17,x20,[x0,#48]
1893	add	x0,x0,#64
1894	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64	// store NEON block 0
1895
1896	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 are stored; reuse them for the input of block 2
1897	eor	v4.16b,v4.16b,v24.16b
1898	eor	v5.16b,v5.16b,v25.16b
1899	eor	v6.16b,v6.16b,v26.16b
1900	eor	v7.16b,v7.16b,v27.16b
1901	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64	// store NEON block 1
1902
1903	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64	// input for block 3
1904	eor	v8.16b,v8.16b,v0.16b
1905	ldp	q24,q25,[sp,#0]	// restore v24-v27 from the stack; they were overwritten as input buffers
1906	eor	v9.16b,v9.16b,v1.16b
1907	ldp	q26,q27,[sp,#32]
1908	eor	v10.16b,v10.16b,v2.16b
1909	eor	v11.16b,v11.16b,v3.16b
1910	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64	// store NEON block 2
1911
1912	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64	// input for block 4
1913	eor	v12.16b,v12.16b,v4.16b
1914	eor	v13.16b,v13.16b,v5.16b
1915	eor	v14.16b,v14.16b,v6.16b
1916	eor	v15.16b,v15.16b,v7.16b
1917	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64	// store NEON block 3
1918
1919	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64	// input for block 5
1920	eor	v16.16b,v16.16b,v8.16b
1921	eor	v17.16b,v17.16b,v9.16b
1922	eor	v18.16b,v18.16b,v10.16b
1923	eor	v19.16b,v19.16b,v11.16b
1924	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64	// store NEON block 4
1925
1926	shl	v0.4s,v31.4s,#1			// 4 -> 8
1927	eor	v20.16b,v20.16b,v12.16b
1928	eor	v21.16b,v21.16b,v13.16b
1929	eor	v22.16b,v22.16b,v14.16b
1930	eor	v23.16b,v23.16b,v15.16b
1931	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64	// store NEON block 5
1932
1933	add	v27.4s,v27.4s,v0.4s			// += 8
1934	add	v28.4s,v28.4s,v0.4s
1935	add	v29.4s,v29.4s,v0.4s
1936	add	v30.4s,v30.4s,v0.4s
1937
1938	b.hs	.Loop_outer_512_neon	// no instruction in this view sets flags; presumably they come from a subs on x2 at the loop top -- TODO confirm
1939
// Leaving the 512-byte loop: restore d8-d15, overwrite the stack off-load
// area, then either return (nothing left), continue in .Loop_outer_neon
// (at least 192 bytes left) or fall back to the scalar .Loop_outer.
1940	adds	x2,x2,#512	// x2 += 512; Z set when no bytes remain (presumably undoes a subs at the loop top -- confirm)
1941	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1942
1943	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1944	ldp	d10,d11,[sp,#128+16]	// d8-d15 are callee-saved under AAPCS64
1945	ldp	d12,d13,[sp,#128+32]
1946	ldp	d14,d15,[sp,#128+48]
1947
1948	stp	q24,q31,[sp,#0]		// wipe off-load area
1949	stp	q24,q31,[sp,#32]	// the three stores overwrite the 96 bytes at sp with the contents of v24/v31
1950	stp	q24,q31,[sp,#64]
1951
1952	b.eq	.Ldone_512_neon	// uses Z from the adds above (the loads/stores in between leave flags alone)
1953
1954	cmp	x2,#192
1955	sub	v27.4s,v27.4s,v0.4s			// -= 1
1956	sub	v28.4s,v28.4s,v0.4s
1957	sub	v29.4s,v29.4s,v0.4s
1958	add	sp,sp,#128	// release the 128-byte area used only by the 512-byte path
1959	b.hs	.Loop_outer_neon	// 192 bytes or more remain
1960
1961	eor	v25.16b,v25.16b,v25.16b	// zero v25-v30 before leaving the NEON path
1962	eor	v26.16b,v26.16b,v26.16b	// NOTE(review): presumably scrubs key/counter material from the vector registers -- confirm
1963	eor	v27.16b,v27.16b,v27.16b
1964	eor	v28.16b,v28.16b,v28.16b
1965	eor	v29.16b,v29.16b,v29.16b
1966	eor	v30.16b,v30.16b,v30.16b
1967	b	.Loop_outer	// fewer than 192 bytes remain: finish in the scalar loop of ChaCha20_ctr32
1968
// Epilogue of ChaCha20_512_neon, reached when no input bytes remain:
// restore x19-x28 from the frame at x29, release the stack and return.
1969.Ldone_512_neon:
1970	ldp	x19,x20,[x29,#16]	// callee-saved x19-x28 are reloaded relative to the frame pointer
1971	add	sp,sp,#128+64	// drop the 128-byte and 64-byte local areas in one step
1972	ldp	x21,x22,[x29,#32]
1973	ldp	x23,x24,[x29,#48]
1974	ldp	x25,x26,[x29,#64]
1975	ldp	x27,x28,[x29,#80]
1976	ldp	x29,x30,[sp],#96	// restore fp/lr and pop the 96-byte register save frame
1977.inst	0xd50323bf			// autiasp
1978	ret
1979.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1980