xref: /freebsd/sys/crypto/openssl/aarch64/chacha-armv8.S (revision bd9588bca05f5cbdeac6e5f9f426b2589301d7c6)
1bc3d5698SJohn Baldwin/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
2bc3d5698SJohn Baldwin#include "arm_arch.h"
3*bd9588bcSAndrew Turner#ifndef	__KERNEL__
4bc3d5698SJohn Baldwin
5c3c73b4fSJung-uk Kim.hidden	OPENSSL_armcap_P
6c0855eaaSJohn Baldwin#endif
7c0855eaaSJohn Baldwin
// Read-only constants, placed in .text so the code below can reach them
// with a PC-relative `adr`.
8c0855eaaSJohn Baldwin.text
9bc3d5698SJohn Baldwin
// 2^5 = 32-byte alignment for the constant table.
10bc3d5698SJohn Baldwin.align	5
// .Lsigma: the four 32-bit ChaCha constants 0x61707865, 0x3320646e,
// 0x79622d32, 0x6b206574, packed two per quad (low half first).  Laid out
// little-endian these bytes spell "expand 32-byte k".  The code loads them
// as 64-bit quantities and splits low/high halves, which is why the same
// .quad values work for either byte order.
11bc3d5698SJohn Baldwin.Lsigma:
12bc3d5698SJohn Baldwin.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// .Lone: per-lane block-counter offsets 1,2,3,4.  The NEON path loads this
// vector into v8 and adds it to the replicated counter word.
13bc3d5698SJohn Baldwin.Lone:
14c0855eaaSJohn Baldwin.long	1,2,3,4
// .Lrot24: byte-index vector for TBL, loaded into v9 by the NEON path.
// Within each 32-bit lane it selects source bytes 3,0,1,2, i.e. rotates the
// word right by 24 bits (left by 8), matching the scalar `ror #24`.
15c0855eaaSJohn Baldwin.Lrot24:
16c0855eaaSJohn Baldwin.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII ident string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm"
17c0855eaaSJohn Baldwin.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
// Re-align to 4 bytes after the odd-length string.
18bc3d5698SJohn Baldwin.align	2
19bc3d5698SJohn Baldwin
// ChaCha20_ctr32 -- XOR a buffer with ChaCha20 keystream using only the
// integer registers (one 64-byte block per outer iteration).
//
// Register roles on entry, as used by the code below:
//   x0 = output pointer          (written with stp/strb)
//   x1 = input pointer           (read with ldp/ldrb)
//   x2 = length in bytes         (0 returns immediately)
//   x3 = pointer to 32 bytes of key
//   x4 = pointer to the 16-byte counter block
// NOTE(review): presumably the C prototype is
//   ChaCha20_ctr32(out, inp, len, key, counter) -- confirm against the caller.
//
// Dispatch: lengths below 192 bytes always take the scalar path at .Lshort.
// For longer inputs, non-kernel builds test OPENSSL_armcap_P for ARMV7_NEON
// and branch to .LChaCha20_neon when it is set; with __KERNEL__ defined that
// test is compiled out and execution falls through to .Lshort.
//
// Stack: a 96-byte frame holding x29/x30 and x19-x28, plus a 64-byte scratch
// area below it that the tail code uses to stage one keystream block.
//
// Working registers: the 16 state words live in w5-w17 and w19-w21 (x18 is
// not touched).  x22-x28 and x30 keep the packed input state (two 32-bit
// words per register) across blocks; x30 is free for this because the link
// register was saved in the frame and is reloaded in the epilogue.
20bc3d5698SJohn Baldwin.globl	ChaCha20_ctr32
21bc3d5698SJohn Baldwin.type	ChaCha20_ctr32,%function
22bc3d5698SJohn Baldwin.align	5
23bc3d5698SJohn BaldwinChaCha20_ctr32:
// NOTE(review): AARCH64_SIGN_LINK_REGISTER / AARCH64_VALIDATE_LINK_REGISTER
// are not defined in this file -- presumably pointer-authentication macros
// from arm_arch.h; confirm.
24*bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
// Nothing to do for a zero length.
25bc3d5698SJohn Baldwin	cbz	x2,.Labort
// Fewer than 192 bytes: skip the capability check and go scalar.
26bc3d5698SJohn Baldwin	cmp	x2,#192
27bc3d5698SJohn Baldwin	b.lo	.Lshort
28c0855eaaSJohn Baldwin
29c0855eaaSJohn Baldwin#ifndef	__KERNEL__
// Load the capability word and use the NEON implementation if the
// ARMV7_NEON bit is set.  x17 is only a temporary here.
30c0855eaaSJohn Baldwin	adrp	x17,OPENSSL_armcap_P
31c0855eaaSJohn Baldwin	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
32bc3d5698SJohn Baldwin	tst	w17,#ARMV7_NEON
33c0855eaaSJohn Baldwin	b.ne	.LChaCha20_neon
34c0855eaaSJohn Baldwin#endif
35bc3d5698SJohn Baldwin
36bc3d5698SJohn Baldwin.Lshort:
// Prologue: push x29/x30, allocate the 96-byte frame, set the frame pointer.
37bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-96]!
38bc3d5698SJohn Baldwin	add	x29,sp,#0
39bc3d5698SJohn Baldwin
// x5 = address of the sigma constants; save callee-saved x19-x28.
40bc3d5698SJohn Baldwin	adr	x5,.Lsigma
41bc3d5698SJohn Baldwin	stp	x19,x20,[sp,#16]
42bc3d5698SJohn Baldwin	stp	x21,x22,[sp,#32]
43bc3d5698SJohn Baldwin	stp	x23,x24,[sp,#48]
44bc3d5698SJohn Baldwin	stp	x25,x26,[sp,#64]
45bc3d5698SJohn Baldwin	stp	x27,x28,[sp,#80]
// 64-byte scratch area, used by the tail code as a keystream buffer.
46bc3d5698SJohn Baldwin	sub	sp,sp,#64
47bc3d5698SJohn Baldwin
// Load the whole 64-byte input state as eight 64-bit registers, two state
// words per register.
48bc3d5698SJohn Baldwin	ldp	x22,x23,[x5]		// load sigma
49bc3d5698SJohn Baldwin	ldp	x24,x25,[x3]		// load key
50bc3d5698SJohn Baldwin	ldp	x26,x27,[x3,#16]
51bc3d5698SJohn Baldwin	ldp	x28,x30,[x4]		// load counter
52c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
// Big-endian builds: swap the 32-bit halves of each key/counter register so
// the first word in memory ends up in the low half, as the unpack code below
// expects.  (Sigma needs no swap; see the "endian-neutral" .quad values.)
53bc3d5698SJohn Baldwin	ror	x24,x24,#32
54bc3d5698SJohn Baldwin	ror	x25,x25,#32
55bc3d5698SJohn Baldwin	ror	x26,x26,#32
56bc3d5698SJohn Baldwin	ror	x27,x27,#32
57bc3d5698SJohn Baldwin	ror	x28,x28,#32
58bc3d5698SJohn Baldwin	ror	x30,x30,#32
59bc3d5698SJohn Baldwin#endif
60bc3d5698SJohn Baldwin
// One iteration per 64-byte block.  Split every packed register into two
// 32-bit working words: `mov wN,wM` takes the low half, `lsr #32` the high.
61bc3d5698SJohn Baldwin.Loop_outer:
62bc3d5698SJohn Baldwin	mov	w5,w22			// unpack key block
63bc3d5698SJohn Baldwin	lsr	x6,x22,#32
64bc3d5698SJohn Baldwin	mov	w7,w23
65bc3d5698SJohn Baldwin	lsr	x8,x23,#32
66bc3d5698SJohn Baldwin	mov	w9,w24
67bc3d5698SJohn Baldwin	lsr	x10,x24,#32
68bc3d5698SJohn Baldwin	mov	w11,w25
69bc3d5698SJohn Baldwin	lsr	x12,x25,#32
70bc3d5698SJohn Baldwin	mov	w13,w26
71bc3d5698SJohn Baldwin	lsr	x14,x26,#32
72bc3d5698SJohn Baldwin	mov	w15,w27
73bc3d5698SJohn Baldwin	lsr	x16,x27,#32
74bc3d5698SJohn Baldwin	mov	w17,w28
75bc3d5698SJohn Baldwin	lsr	x19,x28,#32
76bc3d5698SJohn Baldwin	mov	w20,w30
77bc3d5698SJohn Baldwin	lsr	x21,x30,#32
78bc3d5698SJohn Baldwin
// x4 is reused as the round-loop counter: 10 passes, each doing one column
// group and one diagonal group of quarter-rounds (20 rounds in total).
79bc3d5698SJohn Baldwin	mov	x4,#10
// Consume 64 bytes of the length.  The flags set here are tested much later
// by `b.lo .Ltail` and `b.hi .Loop_outer`; no instruction in between sets
// flags, so nothing flag-setting may be inserted on that path.
80bc3d5698SJohn Baldwin	subs	x2,x2,#64
81bc3d5698SJohn Baldwin.Loop:
82bc3d5698SJohn Baldwin	sub	x4,x4,#1
// Column quarter-rounds on (5,9,13,17) (6,10,14,19) (7,11,15,20) (8,12,16,21),
// interleaved four-wide.  Step: a += b; d ^= a; d = rotl(d,16).
83bc3d5698SJohn Baldwin	add	w5,w5,w9
84bc3d5698SJohn Baldwin	add	w6,w6,w10
85bc3d5698SJohn Baldwin	add	w7,w7,w11
86bc3d5698SJohn Baldwin	add	w8,w8,w12
87bc3d5698SJohn Baldwin	eor	w17,w17,w5
88bc3d5698SJohn Baldwin	eor	w19,w19,w6
89bc3d5698SJohn Baldwin	eor	w20,w20,w7
90bc3d5698SJohn Baldwin	eor	w21,w21,w8
91bc3d5698SJohn Baldwin	ror	w17,w17,#16
92bc3d5698SJohn Baldwin	ror	w19,w19,#16
93bc3d5698SJohn Baldwin	ror	w20,w20,#16
94bc3d5698SJohn Baldwin	ror	w21,w21,#16
// c += d; b ^= c; b = rotl(b,12)   (ror #20 == rotl 12)
95bc3d5698SJohn Baldwin	add	w13,w13,w17
96bc3d5698SJohn Baldwin	add	w14,w14,w19
97bc3d5698SJohn Baldwin	add	w15,w15,w20
98bc3d5698SJohn Baldwin	add	w16,w16,w21
99bc3d5698SJohn Baldwin	eor	w9,w9,w13
100bc3d5698SJohn Baldwin	eor	w10,w10,w14
101bc3d5698SJohn Baldwin	eor	w11,w11,w15
102bc3d5698SJohn Baldwin	eor	w12,w12,w16
103bc3d5698SJohn Baldwin	ror	w9,w9,#20
104bc3d5698SJohn Baldwin	ror	w10,w10,#20
105bc3d5698SJohn Baldwin	ror	w11,w11,#20
106bc3d5698SJohn Baldwin	ror	w12,w12,#20
// a += b; d ^= a; d = rotl(d,8)    (ror #24 == rotl 8)
107bc3d5698SJohn Baldwin	add	w5,w5,w9
108bc3d5698SJohn Baldwin	add	w6,w6,w10
109bc3d5698SJohn Baldwin	add	w7,w7,w11
110bc3d5698SJohn Baldwin	add	w8,w8,w12
111bc3d5698SJohn Baldwin	eor	w17,w17,w5
112bc3d5698SJohn Baldwin	eor	w19,w19,w6
113bc3d5698SJohn Baldwin	eor	w20,w20,w7
114bc3d5698SJohn Baldwin	eor	w21,w21,w8
115bc3d5698SJohn Baldwin	ror	w17,w17,#24
116bc3d5698SJohn Baldwin	ror	w19,w19,#24
117bc3d5698SJohn Baldwin	ror	w20,w20,#24
118bc3d5698SJohn Baldwin	ror	w21,w21,#24
// c += d; b ^= c; b = rotl(b,7)    (ror #25 == rotl 7)
119bc3d5698SJohn Baldwin	add	w13,w13,w17
120bc3d5698SJohn Baldwin	add	w14,w14,w19
121bc3d5698SJohn Baldwin	add	w15,w15,w20
122bc3d5698SJohn Baldwin	add	w16,w16,w21
123bc3d5698SJohn Baldwin	eor	w9,w9,w13
124bc3d5698SJohn Baldwin	eor	w10,w10,w14
125bc3d5698SJohn Baldwin	eor	w11,w11,w15
126bc3d5698SJohn Baldwin	eor	w12,w12,w16
127bc3d5698SJohn Baldwin	ror	w9,w9,#25
128bc3d5698SJohn Baldwin	ror	w10,w10,#25
129bc3d5698SJohn Baldwin	ror	w11,w11,#25
130bc3d5698SJohn Baldwin	ror	w12,w12,#25
// Diagonal quarter-rounds on (5,10,15,21) (6,11,16,17) (7,12,13,19)
// (8,9,14,20): same four steps with the b/c/d operands rotated.
// a += b; d ^= a; d = rotl(d,16)
131bc3d5698SJohn Baldwin	add	w5,w5,w10
132bc3d5698SJohn Baldwin	add	w6,w6,w11
133bc3d5698SJohn Baldwin	add	w7,w7,w12
134bc3d5698SJohn Baldwin	add	w8,w8,w9
135bc3d5698SJohn Baldwin	eor	w21,w21,w5
136bc3d5698SJohn Baldwin	eor	w17,w17,w6
137bc3d5698SJohn Baldwin	eor	w19,w19,w7
138bc3d5698SJohn Baldwin	eor	w20,w20,w8
139bc3d5698SJohn Baldwin	ror	w21,w21,#16
140bc3d5698SJohn Baldwin	ror	w17,w17,#16
141bc3d5698SJohn Baldwin	ror	w19,w19,#16
142bc3d5698SJohn Baldwin	ror	w20,w20,#16
// c += d; b ^= c; b = rotl(b,12)
143bc3d5698SJohn Baldwin	add	w15,w15,w21
144bc3d5698SJohn Baldwin	add	w16,w16,w17
145bc3d5698SJohn Baldwin	add	w13,w13,w19
146bc3d5698SJohn Baldwin	add	w14,w14,w20
147bc3d5698SJohn Baldwin	eor	w10,w10,w15
148bc3d5698SJohn Baldwin	eor	w11,w11,w16
149bc3d5698SJohn Baldwin	eor	w12,w12,w13
150bc3d5698SJohn Baldwin	eor	w9,w9,w14
151bc3d5698SJohn Baldwin	ror	w10,w10,#20
152bc3d5698SJohn Baldwin	ror	w11,w11,#20
153bc3d5698SJohn Baldwin	ror	w12,w12,#20
154bc3d5698SJohn Baldwin	ror	w9,w9,#20
// a += b; d ^= a; d = rotl(d,8)
155bc3d5698SJohn Baldwin	add	w5,w5,w10
156bc3d5698SJohn Baldwin	add	w6,w6,w11
157bc3d5698SJohn Baldwin	add	w7,w7,w12
158bc3d5698SJohn Baldwin	add	w8,w8,w9
159bc3d5698SJohn Baldwin	eor	w21,w21,w5
160bc3d5698SJohn Baldwin	eor	w17,w17,w6
161bc3d5698SJohn Baldwin	eor	w19,w19,w7
162bc3d5698SJohn Baldwin	eor	w20,w20,w8
163bc3d5698SJohn Baldwin	ror	w21,w21,#24
164bc3d5698SJohn Baldwin	ror	w17,w17,#24
165bc3d5698SJohn Baldwin	ror	w19,w19,#24
166bc3d5698SJohn Baldwin	ror	w20,w20,#24
// c += d; b ^= c; b = rotl(b,7)
167bc3d5698SJohn Baldwin	add	w15,w15,w21
168bc3d5698SJohn Baldwin	add	w16,w16,w17
169bc3d5698SJohn Baldwin	add	w13,w13,w19
170bc3d5698SJohn Baldwin	add	w14,w14,w20
171bc3d5698SJohn Baldwin	eor	w10,w10,w15
172bc3d5698SJohn Baldwin	eor	w11,w11,w16
173bc3d5698SJohn Baldwin	eor	w12,w12,w13
174bc3d5698SJohn Baldwin	eor	w9,w9,w14
175bc3d5698SJohn Baldwin	ror	w10,w10,#25
176bc3d5698SJohn Baldwin	ror	w11,w11,#25
177bc3d5698SJohn Baldwin	ror	w12,w12,#25
178bc3d5698SJohn Baldwin	ror	w9,w9,#25
// cbnz does not touch the flags, so the result of the earlier subs survives.
179bc3d5698SJohn Baldwin	cbnz	x4,.Loop
180bc3d5698SJohn Baldwin
// Add the original input state back into the permuted state.  Even-indexed
// words use a 32-bit add; odd-indexed words use a 64-bit add of the high
// half, whose possible carry into bit 32 is thrown away by the `lsl#32` in
// the pack step that follows.
181bc3d5698SJohn Baldwin	add	w5,w5,w22		// accumulate key block
182bc3d5698SJohn Baldwin	add	x6,x6,x22,lsr#32
183bc3d5698SJohn Baldwin	add	w7,w7,w23
184bc3d5698SJohn Baldwin	add	x8,x8,x23,lsr#32
185bc3d5698SJohn Baldwin	add	w9,w9,w24
186bc3d5698SJohn Baldwin	add	x10,x10,x24,lsr#32
187bc3d5698SJohn Baldwin	add	w11,w11,w25
188bc3d5698SJohn Baldwin	add	x12,x12,x25,lsr#32
189bc3d5698SJohn Baldwin	add	w13,w13,w26
190bc3d5698SJohn Baldwin	add	x14,x14,x26,lsr#32
191bc3d5698SJohn Baldwin	add	w15,w15,w27
192bc3d5698SJohn Baldwin	add	x16,x16,x27,lsr#32
193bc3d5698SJohn Baldwin	add	w17,w17,w28
194bc3d5698SJohn Baldwin	add	x19,x19,x28,lsr#32
195bc3d5698SJohn Baldwin	add	w20,w20,w30
196bc3d5698SJohn Baldwin	add	x21,x21,x30,lsr#32
197bc3d5698SJohn Baldwin
// Flags from `subs x2,x2,#64`: borrow means fewer than 64 bytes were left,
// so this keystream block is only partially used -- handle it bytewise.
198bc3d5698SJohn Baldwin	b.lo	.Ltail
199bc3d5698SJohn Baldwin
// Full block: repack word pairs into 64-bit registers while loading the 64
// input bytes into the registers that have just been freed.
200bc3d5698SJohn Baldwin	add	x5,x5,x6,lsl#32	// pack
201bc3d5698SJohn Baldwin	add	x7,x7,x8,lsl#32
202bc3d5698SJohn Baldwin	ldp	x6,x8,[x1,#0]		// load input
203bc3d5698SJohn Baldwin	add	x9,x9,x10,lsl#32
204bc3d5698SJohn Baldwin	add	x11,x11,x12,lsl#32
205bc3d5698SJohn Baldwin	ldp	x10,x12,[x1,#16]
206bc3d5698SJohn Baldwin	add	x13,x13,x14,lsl#32
207bc3d5698SJohn Baldwin	add	x15,x15,x16,lsl#32
208bc3d5698SJohn Baldwin	ldp	x14,x16,[x1,#32]
209bc3d5698SJohn Baldwin	add	x17,x17,x19,lsl#32
210bc3d5698SJohn Baldwin	add	x20,x20,x21,lsl#32
211bc3d5698SJohn Baldwin	ldp	x19,x21,[x1,#48]
212bc3d5698SJohn Baldwin	add	x1,x1,#64
213c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
// Big-endian builds: byte-reverse the packed keystream so it lines up with
// the input, which ldp loaded in big-endian byte order.
214bc3d5698SJohn Baldwin	rev	x5,x5
215bc3d5698SJohn Baldwin	rev	x7,x7
216bc3d5698SJohn Baldwin	rev	x9,x9
217bc3d5698SJohn Baldwin	rev	x11,x11
218bc3d5698SJohn Baldwin	rev	x13,x13
219bc3d5698SJohn Baldwin	rev	x15,x15
220bc3d5698SJohn Baldwin	rev	x17,x17
221bc3d5698SJohn Baldwin	rev	x20,x20
222bc3d5698SJohn Baldwin#endif
// output = keystream XOR input
223bc3d5698SJohn Baldwin	eor	x5,x5,x6
224bc3d5698SJohn Baldwin	eor	x7,x7,x8
225bc3d5698SJohn Baldwin	eor	x9,x9,x10
226bc3d5698SJohn Baldwin	eor	x11,x11,x12
227bc3d5698SJohn Baldwin	eor	x13,x13,x14
228bc3d5698SJohn Baldwin	eor	x15,x15,x16
229bc3d5698SJohn Baldwin	eor	x17,x17,x19
230bc3d5698SJohn Baldwin	eor	x20,x20,x21
231bc3d5698SJohn Baldwin
// NOTE(review): the counter bump below is a 64-bit add on x28, which holds
// counter-block words 0 (low half) and 1 (high half); a wrap of the low
// 32-bit word therefore carries into word 1.  Confirm callers never let the
// 32-bit block counter wrap within one call.
232bc3d5698SJohn Baldwin	stp	x5,x7,[x0,#0]		// store output
233bc3d5698SJohn Baldwin	add	x28,x28,#1			// increment counter
234bc3d5698SJohn Baldwin	stp	x9,x11,[x0,#16]
235bc3d5698SJohn Baldwin	stp	x13,x15,[x0,#32]
236bc3d5698SJohn Baldwin	stp	x17,x20,[x0,#48]
237bc3d5698SJohn Baldwin	add	x0,x0,#64
238bc3d5698SJohn Baldwin
// Still the flags from `subs x2,x2,#64`: "hi" means bytes remain after this
// block; an exact multiple of 64 falls through to the epilogue.
239bc3d5698SJohn Baldwin	b.hi	.Loop_outer
240bc3d5698SJohn Baldwin
// Epilogue: reload x19-x28 relative to the frame pointer, drop the 64-byte
// scratch area, then pop x29/x30 (restoring the real link register).
241bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
242bc3d5698SJohn Baldwin	add	sp,sp,#64
243bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
244bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
245bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
246bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
247bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#96
// Also the target of the zero-length early exit; on that path no frame was
// ever pushed.
248bc3d5698SJohn Baldwin.Labort:
249*bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
250bc3d5698SJohn Baldwin	ret
251bc3d5698SJohn Baldwin
252bc3d5698SJohn Baldwin.align	4
// Partial final block.  Undo the subs so x2 is again the number of bytes
// still to process (1..63).
253bc3d5698SJohn Baldwin.Ltail:
254bc3d5698SJohn Baldwin	add	x2,x2,#64
// Shared with the NEON code, which branches here from .Ltail_neon when fewer
// than 64 bytes remain.  Expects the unpacked, key-accumulated state in
// w5..w21 and the 64-byte scratch area at sp.
255bc3d5698SJohn Baldwin.Less_than_64:
// Turn x2 into a negative index that counts up to zero:
//   x1 = inp + len, x4 = scratch + len, x0 = out + len - 1, x2 = -len.
// x0 is biased by -1 because the loop increments x2 before the store.
256bc3d5698SJohn Baldwin	sub	x0,x0,#1
257bc3d5698SJohn Baldwin	add	x1,x1,x2
258bc3d5698SJohn Baldwin	add	x0,x0,x2
259bc3d5698SJohn Baldwin	add	x4,sp,x2
260bc3d5698SJohn Baldwin	neg	x2,x2
261bc3d5698SJohn Baldwin
262bc3d5698SJohn Baldwin	add	x5,x5,x6,lsl#32	// pack
263bc3d5698SJohn Baldwin	add	x7,x7,x8,lsl#32
264bc3d5698SJohn Baldwin	add	x9,x9,x10,lsl#32
265bc3d5698SJohn Baldwin	add	x11,x11,x12,lsl#32
266bc3d5698SJohn Baldwin	add	x13,x13,x14,lsl#32
267bc3d5698SJohn Baldwin	add	x15,x15,x16,lsl#32
268bc3d5698SJohn Baldwin	add	x17,x17,x19,lsl#32
269bc3d5698SJohn Baldwin	add	x20,x20,x21,lsl#32
270c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
// Big-endian builds: byte-reverse so the stp below writes the keystream in
// the same byte order as the little-endian build.
271bc3d5698SJohn Baldwin	rev	x5,x5
272bc3d5698SJohn Baldwin	rev	x7,x7
273bc3d5698SJohn Baldwin	rev	x9,x9
274bc3d5698SJohn Baldwin	rev	x11,x11
275bc3d5698SJohn Baldwin	rev	x13,x13
276bc3d5698SJohn Baldwin	rev	x15,x15
277bc3d5698SJohn Baldwin	rev	x17,x17
278bc3d5698SJohn Baldwin	rev	x20,x20
279bc3d5698SJohn Baldwin#endif
// Spill the full 64-byte keystream block to the scratch area.
280bc3d5698SJohn Baldwin	stp	x5,x7,[sp,#0]
281bc3d5698SJohn Baldwin	stp	x9,x11,[sp,#16]
282bc3d5698SJohn Baldwin	stp	x13,x15,[sp,#32]
283bc3d5698SJohn Baldwin	stp	x17,x20,[sp,#48]
284bc3d5698SJohn Baldwin
// Bytewise out[i] = inp[i] ^ keystream[i] for the remaining bytes.
285bc3d5698SJohn Baldwin.Loop_tail:
286bc3d5698SJohn Baldwin	ldrb	w10,[x1,x2]
287bc3d5698SJohn Baldwin	ldrb	w11,[x4,x2]
288bc3d5698SJohn Baldwin	add	x2,x2,#1
289bc3d5698SJohn Baldwin	eor	w10,w10,w11
290bc3d5698SJohn Baldwin	strb	w10,[x0,x2]
291bc3d5698SJohn Baldwin	cbnz	x2,.Loop_tail
292bc3d5698SJohn Baldwin
// Overwrite the scratch area with zeros so no keystream is left on the stack.
293bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#0]
294bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#16]
295bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#32]
296bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#48]
297bc3d5698SJohn Baldwin
// Epilogue, same sequence as the full-block exit above.
298bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
299bc3d5698SJohn Baldwin	add	sp,sp,#64
300bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
301bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
302bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
303bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
304bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#96
305*bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
306bc3d5698SJohn Baldwin	ret
307bc3d5698SJohn Baldwin.size	ChaCha20_ctr32,.-ChaCha20_ctr32
308bc3d5698SJohn Baldwin
309c0855eaaSJohn Baldwin#ifdef	__KERNEL__
310c0855eaaSJohn Baldwin.globl	ChaCha20_neon
311c0855eaaSJohn Baldwin#endif
312bc3d5698SJohn Baldwin.type	ChaCha20_neon,%function
313bc3d5698SJohn Baldwin.align	5
314bc3d5698SJohn BaldwinChaCha20_neon:
315*bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
316c0855eaaSJohn Baldwin.LChaCha20_neon:
317bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-96]!
318bc3d5698SJohn Baldwin	add	x29,sp,#0
319bc3d5698SJohn Baldwin
320bc3d5698SJohn Baldwin	adr	x5,.Lsigma
321bc3d5698SJohn Baldwin	stp	x19,x20,[sp,#16]
322bc3d5698SJohn Baldwin	stp	x21,x22,[sp,#32]
323bc3d5698SJohn Baldwin	stp	x23,x24,[sp,#48]
324bc3d5698SJohn Baldwin	stp	x25,x26,[sp,#64]
325bc3d5698SJohn Baldwin	stp	x27,x28,[sp,#80]
326bc3d5698SJohn Baldwin	cmp	x2,#512
327bc3d5698SJohn Baldwin	b.hs	.L512_or_more_neon
328bc3d5698SJohn Baldwin
329bc3d5698SJohn Baldwin	sub	sp,sp,#64
330bc3d5698SJohn Baldwin
331bc3d5698SJohn Baldwin	ldp	x22,x23,[x5]		// load sigma
332c0855eaaSJohn Baldwin	ld1	{v0.4s},[x5],#16
333bc3d5698SJohn Baldwin	ldp	x24,x25,[x3]		// load key
334bc3d5698SJohn Baldwin	ldp	x26,x27,[x3,#16]
335c0855eaaSJohn Baldwin	ld1	{v1.4s,v2.4s},[x3]
336bc3d5698SJohn Baldwin	ldp	x28,x30,[x4]		// load counter
337c0855eaaSJohn Baldwin	ld1	{v3.4s},[x4]
338c0855eaaSJohn Baldwin	stp	d8,d9,[sp]			// meet ABI requirements
339c0855eaaSJohn Baldwin	ld1	{v8.4s,v9.4s},[x5]
340c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
341c0855eaaSJohn Baldwin	rev64	v0.4s,v0.4s
342bc3d5698SJohn Baldwin	ror	x24,x24,#32
343bc3d5698SJohn Baldwin	ror	x25,x25,#32
344bc3d5698SJohn Baldwin	ror	x26,x26,#32
345bc3d5698SJohn Baldwin	ror	x27,x27,#32
346bc3d5698SJohn Baldwin	ror	x28,x28,#32
347bc3d5698SJohn Baldwin	ror	x30,x30,#32
348bc3d5698SJohn Baldwin#endif
349bc3d5698SJohn Baldwin
350bc3d5698SJohn Baldwin.Loop_outer_neon:
351c0855eaaSJohn Baldwin	dup	v16.4s,v0.s[0]			// unpack key block
352c0855eaaSJohn Baldwin	mov	w5,w22
353c0855eaaSJohn Baldwin	dup	v20.4s,v0.s[1]
354bc3d5698SJohn Baldwin	lsr	x6,x22,#32
355c0855eaaSJohn Baldwin	dup	v24.4s,v0.s[2]
356bc3d5698SJohn Baldwin	mov	w7,w23
357c0855eaaSJohn Baldwin	dup	v28.4s,v0.s[3]
358bc3d5698SJohn Baldwin	lsr	x8,x23,#32
359c0855eaaSJohn Baldwin	dup	v17.4s,v1.s[0]
360bc3d5698SJohn Baldwin	mov	w9,w24
361c0855eaaSJohn Baldwin	dup	v21.4s,v1.s[1]
362bc3d5698SJohn Baldwin	lsr	x10,x24,#32
363c0855eaaSJohn Baldwin	dup	v25.4s,v1.s[2]
364bc3d5698SJohn Baldwin	mov	w11,w25
365c0855eaaSJohn Baldwin	dup	v29.4s,v1.s[3]
366bc3d5698SJohn Baldwin	lsr	x12,x25,#32
367c0855eaaSJohn Baldwin	dup	v19.4s,v3.s[0]
368bc3d5698SJohn Baldwin	mov	w13,w26
369c0855eaaSJohn Baldwin	dup	v23.4s,v3.s[1]
370bc3d5698SJohn Baldwin	lsr	x14,x26,#32
371c0855eaaSJohn Baldwin	dup	v27.4s,v3.s[2]
372bc3d5698SJohn Baldwin	mov	w15,w27
373c0855eaaSJohn Baldwin	dup	v31.4s,v3.s[3]
374bc3d5698SJohn Baldwin	lsr	x16,x27,#32
375c0855eaaSJohn Baldwin	add	v19.4s,v19.4s,v8.4s
376bc3d5698SJohn Baldwin	mov	w17,w28
377c0855eaaSJohn Baldwin	dup	v18.4s,v2.s[0]
378bc3d5698SJohn Baldwin	lsr	x19,x28,#32
379c0855eaaSJohn Baldwin	dup	v22.4s,v2.s[1]
380bc3d5698SJohn Baldwin	mov	w20,w30
381c0855eaaSJohn Baldwin	dup	v26.4s,v2.s[2]
382bc3d5698SJohn Baldwin	lsr	x21,x30,#32
383c0855eaaSJohn Baldwin	dup	v30.4s,v2.s[3]
384bc3d5698SJohn Baldwin
385bc3d5698SJohn Baldwin	mov	x4,#10
386c0855eaaSJohn Baldwin	subs	x2,x2,#320
387bc3d5698SJohn Baldwin.Loop_neon:
388bc3d5698SJohn Baldwin	sub	x4,x4,#1
389bc3d5698SJohn Baldwin	add	v16.4s,v16.4s,v17.4s
390c0855eaaSJohn Baldwin	add	w5,w5,w9
391c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v21.4s
392c0855eaaSJohn Baldwin	add	w6,w6,w10
393c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
394bc3d5698SJohn Baldwin	add	w7,w7,w11
395c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
396bc3d5698SJohn Baldwin	add	w8,w8,w12
397bc3d5698SJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
398c0855eaaSJohn Baldwin	eor	w17,w17,w5
399c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
400bc3d5698SJohn Baldwin	eor	w19,w19,w6
401c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
402bc3d5698SJohn Baldwin	eor	w20,w20,w7
403c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
404bc3d5698SJohn Baldwin	eor	w21,w21,w8
405bc3d5698SJohn Baldwin	rev32	v19.8h,v19.8h
406bc3d5698SJohn Baldwin	ror	w17,w17,#16
407c0855eaaSJohn Baldwin	rev32	v23.8h,v23.8h
408bc3d5698SJohn Baldwin	ror	w19,w19,#16
409c0855eaaSJohn Baldwin	rev32	v27.8h,v27.8h
410bc3d5698SJohn Baldwin	ror	w20,w20,#16
411c0855eaaSJohn Baldwin	rev32	v31.8h,v31.8h
412bc3d5698SJohn Baldwin	ror	w21,w21,#16
413c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v19.4s
414bc3d5698SJohn Baldwin	add	w13,w13,w17
415c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
416bc3d5698SJohn Baldwin	add	w14,w14,w19
417c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
418bc3d5698SJohn Baldwin	add	w15,w15,w20
419c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
420bc3d5698SJohn Baldwin	add	w16,w16,w21
421c0855eaaSJohn Baldwin	eor	v4.16b,v17.16b,v18.16b
422bc3d5698SJohn Baldwin	eor	w9,w9,w13
423c0855eaaSJohn Baldwin	eor	v5.16b,v21.16b,v22.16b
424bc3d5698SJohn Baldwin	eor	w10,w10,w14
425c0855eaaSJohn Baldwin	eor	v6.16b,v25.16b,v26.16b
426bc3d5698SJohn Baldwin	eor	w11,w11,w15
427c0855eaaSJohn Baldwin	eor	v7.16b,v29.16b,v30.16b
428bc3d5698SJohn Baldwin	eor	w12,w12,w16
429c0855eaaSJohn Baldwin	ushr	v17.4s,v4.4s,#20
430bc3d5698SJohn Baldwin	ror	w9,w9,#20
431c0855eaaSJohn Baldwin	ushr	v21.4s,v5.4s,#20
432bc3d5698SJohn Baldwin	ror	w10,w10,#20
433c0855eaaSJohn Baldwin	ushr	v25.4s,v6.4s,#20
434bc3d5698SJohn Baldwin	ror	w11,w11,#20
435c0855eaaSJohn Baldwin	ushr	v29.4s,v7.4s,#20
436bc3d5698SJohn Baldwin	ror	w12,w12,#20
437c0855eaaSJohn Baldwin	sli	v17.4s,v4.4s,#12
438bc3d5698SJohn Baldwin	add	w5,w5,w9
439c0855eaaSJohn Baldwin	sli	v21.4s,v5.4s,#12
440bc3d5698SJohn Baldwin	add	w6,w6,w10
441c0855eaaSJohn Baldwin	sli	v25.4s,v6.4s,#12
442bc3d5698SJohn Baldwin	add	w7,w7,w11
443c0855eaaSJohn Baldwin	sli	v29.4s,v7.4s,#12
444bc3d5698SJohn Baldwin	add	w8,w8,w12
445c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v17.4s
446bc3d5698SJohn Baldwin	eor	w17,w17,w5
447c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v21.4s
448bc3d5698SJohn Baldwin	eor	w19,w19,w6
449c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
450bc3d5698SJohn Baldwin	eor	w20,w20,w7
451c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
452bc3d5698SJohn Baldwin	eor	w21,w21,w8
453c0855eaaSJohn Baldwin	eor	v4.16b,v19.16b,v16.16b
454bc3d5698SJohn Baldwin	ror	w17,w17,#24
455c0855eaaSJohn Baldwin	eor	v5.16b,v23.16b,v20.16b
456bc3d5698SJohn Baldwin	ror	w19,w19,#24
457c0855eaaSJohn Baldwin	eor	v6.16b,v27.16b,v24.16b
458bc3d5698SJohn Baldwin	ror	w20,w20,#24
459c0855eaaSJohn Baldwin	eor	v7.16b,v31.16b,v28.16b
460bc3d5698SJohn Baldwin	ror	w21,w21,#24
461c0855eaaSJohn Baldwin	tbl	v19.16b,{v4.16b},v9.16b
462bc3d5698SJohn Baldwin	add	w13,w13,w17
463c0855eaaSJohn Baldwin	tbl	v23.16b,{v5.16b},v9.16b
464bc3d5698SJohn Baldwin	add	w14,w14,w19
465c0855eaaSJohn Baldwin	tbl	v27.16b,{v6.16b},v9.16b
466bc3d5698SJohn Baldwin	add	w15,w15,w20
467c0855eaaSJohn Baldwin	tbl	v31.16b,{v7.16b},v9.16b
468bc3d5698SJohn Baldwin	add	w16,w16,w21
469c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v19.4s
470bc3d5698SJohn Baldwin	eor	w9,w9,w13
471c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
472bc3d5698SJohn Baldwin	eor	w10,w10,w14
473c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
474bc3d5698SJohn Baldwin	eor	w11,w11,w15
475c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
476bc3d5698SJohn Baldwin	eor	w12,w12,w16
477c0855eaaSJohn Baldwin	eor	v4.16b,v17.16b,v18.16b
478bc3d5698SJohn Baldwin	ror	w9,w9,#25
479c0855eaaSJohn Baldwin	eor	v5.16b,v21.16b,v22.16b
480bc3d5698SJohn Baldwin	ror	w10,w10,#25
481c0855eaaSJohn Baldwin	eor	v6.16b,v25.16b,v26.16b
482bc3d5698SJohn Baldwin	ror	w11,w11,#25
483c0855eaaSJohn Baldwin	eor	v7.16b,v29.16b,v30.16b
484bc3d5698SJohn Baldwin	ror	w12,w12,#25
485c0855eaaSJohn Baldwin	ushr	v17.4s,v4.4s,#25
486c0855eaaSJohn Baldwin	ushr	v21.4s,v5.4s,#25
487c0855eaaSJohn Baldwin	ushr	v25.4s,v6.4s,#25
488c0855eaaSJohn Baldwin	ushr	v29.4s,v7.4s,#25
489c0855eaaSJohn Baldwin	sli	v17.4s,v4.4s,#7
490c0855eaaSJohn Baldwin	sli	v21.4s,v5.4s,#7
491c0855eaaSJohn Baldwin	sli	v25.4s,v6.4s,#7
492c0855eaaSJohn Baldwin	sli	v29.4s,v7.4s,#7
493c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v21.4s
494bc3d5698SJohn Baldwin	add	w5,w5,w10
495c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v25.4s
496bc3d5698SJohn Baldwin	add	w6,w6,w11
497c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v29.4s
498bc3d5698SJohn Baldwin	add	w7,w7,w12
499c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v17.4s
500bc3d5698SJohn Baldwin	add	w8,w8,w9
501c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v16.16b
502bc3d5698SJohn Baldwin	eor	w21,w21,w5
503c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v20.16b
504bc3d5698SJohn Baldwin	eor	w17,w17,w6
505c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v24.16b
506bc3d5698SJohn Baldwin	eor	w19,w19,w7
507c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v28.16b
508bc3d5698SJohn Baldwin	eor	w20,w20,w8
509c0855eaaSJohn Baldwin	rev32	v31.8h,v31.8h
510bc3d5698SJohn Baldwin	ror	w21,w21,#16
511c0855eaaSJohn Baldwin	rev32	v19.8h,v19.8h
512bc3d5698SJohn Baldwin	ror	w17,w17,#16
513c0855eaaSJohn Baldwin	rev32	v23.8h,v23.8h
514bc3d5698SJohn Baldwin	ror	w19,w19,#16
515c0855eaaSJohn Baldwin	rev32	v27.8h,v27.8h
516bc3d5698SJohn Baldwin	ror	w20,w20,#16
517c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v31.4s
518bc3d5698SJohn Baldwin	add	w15,w15,w21
519c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v19.4s
520bc3d5698SJohn Baldwin	add	w16,w16,w17
521c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v23.4s
522bc3d5698SJohn Baldwin	add	w13,w13,w19
523c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v27.4s
524bc3d5698SJohn Baldwin	add	w14,w14,w20
525c0855eaaSJohn Baldwin	eor	v4.16b,v21.16b,v26.16b
526bc3d5698SJohn Baldwin	eor	w10,w10,w15
527c0855eaaSJohn Baldwin	eor	v5.16b,v25.16b,v30.16b
528bc3d5698SJohn Baldwin	eor	w11,w11,w16
529c0855eaaSJohn Baldwin	eor	v6.16b,v29.16b,v18.16b
530bc3d5698SJohn Baldwin	eor	w12,w12,w13
531c0855eaaSJohn Baldwin	eor	v7.16b,v17.16b,v22.16b
532bc3d5698SJohn Baldwin	eor	w9,w9,w14
533c0855eaaSJohn Baldwin	ushr	v21.4s,v4.4s,#20
534bc3d5698SJohn Baldwin	ror	w10,w10,#20
535c0855eaaSJohn Baldwin	ushr	v25.4s,v5.4s,#20
536bc3d5698SJohn Baldwin	ror	w11,w11,#20
537c0855eaaSJohn Baldwin	ushr	v29.4s,v6.4s,#20
538bc3d5698SJohn Baldwin	ror	w12,w12,#20
539c0855eaaSJohn Baldwin	ushr	v17.4s,v7.4s,#20
540bc3d5698SJohn Baldwin	ror	w9,w9,#20
541c0855eaaSJohn Baldwin	sli	v21.4s,v4.4s,#12
542bc3d5698SJohn Baldwin	add	w5,w5,w10
543c0855eaaSJohn Baldwin	sli	v25.4s,v5.4s,#12
544bc3d5698SJohn Baldwin	add	w6,w6,w11
545c0855eaaSJohn Baldwin	sli	v29.4s,v6.4s,#12
546bc3d5698SJohn Baldwin	add	w7,w7,w12
547c0855eaaSJohn Baldwin	sli	v17.4s,v7.4s,#12
548bc3d5698SJohn Baldwin	add	w8,w8,w9
549c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v21.4s
550bc3d5698SJohn Baldwin	eor	w21,w21,w5
551c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v25.4s
552bc3d5698SJohn Baldwin	eor	w17,w17,w6
553c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v29.4s
554bc3d5698SJohn Baldwin	eor	w19,w19,w7
555c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v17.4s
556bc3d5698SJohn Baldwin	eor	w20,w20,w8
557c0855eaaSJohn Baldwin	eor	v4.16b,v31.16b,v16.16b
558bc3d5698SJohn Baldwin	ror	w21,w21,#24
559c0855eaaSJohn Baldwin	eor	v5.16b,v19.16b,v20.16b
560bc3d5698SJohn Baldwin	ror	w17,w17,#24
561c0855eaaSJohn Baldwin	eor	v6.16b,v23.16b,v24.16b
562bc3d5698SJohn Baldwin	ror	w19,w19,#24
563c0855eaaSJohn Baldwin	eor	v7.16b,v27.16b,v28.16b
564bc3d5698SJohn Baldwin	ror	w20,w20,#24
565c0855eaaSJohn Baldwin	tbl	v31.16b,{v4.16b},v9.16b
566bc3d5698SJohn Baldwin	add	w15,w15,w21
567c0855eaaSJohn Baldwin	tbl	v19.16b,{v5.16b},v9.16b
568bc3d5698SJohn Baldwin	add	w16,w16,w17
569c0855eaaSJohn Baldwin	tbl	v23.16b,{v6.16b},v9.16b
570bc3d5698SJohn Baldwin	add	w13,w13,w19
571c0855eaaSJohn Baldwin	tbl	v27.16b,{v7.16b},v9.16b
572bc3d5698SJohn Baldwin	add	w14,w14,w20
573c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v31.4s
574bc3d5698SJohn Baldwin	eor	w10,w10,w15
575c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v19.4s
576bc3d5698SJohn Baldwin	eor	w11,w11,w16
577c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v23.4s
578bc3d5698SJohn Baldwin	eor	w12,w12,w13
579c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v27.4s
580bc3d5698SJohn Baldwin	eor	w9,w9,w14
581c0855eaaSJohn Baldwin	eor	v4.16b,v21.16b,v26.16b
582bc3d5698SJohn Baldwin	ror	w10,w10,#25
583c0855eaaSJohn Baldwin	eor	v5.16b,v25.16b,v30.16b
584bc3d5698SJohn Baldwin	ror	w11,w11,#25
585c0855eaaSJohn Baldwin	eor	v6.16b,v29.16b,v18.16b
586bc3d5698SJohn Baldwin	ror	w12,w12,#25
587c0855eaaSJohn Baldwin	eor	v7.16b,v17.16b,v22.16b
588bc3d5698SJohn Baldwin	ror	w9,w9,#25
589c0855eaaSJohn Baldwin	ushr	v21.4s,v4.4s,#25
590c0855eaaSJohn Baldwin	ushr	v25.4s,v5.4s,#25
591c0855eaaSJohn Baldwin	ushr	v29.4s,v6.4s,#25
592c0855eaaSJohn Baldwin	ushr	v17.4s,v7.4s,#25
593c0855eaaSJohn Baldwin	sli	v21.4s,v4.4s,#7
594c0855eaaSJohn Baldwin	sli	v25.4s,v5.4s,#7
595c0855eaaSJohn Baldwin	sli	v29.4s,v6.4s,#7
596c0855eaaSJohn Baldwin	sli	v17.4s,v7.4s,#7
597bc3d5698SJohn Baldwin	cbnz	x4,.Loop_neon
598bc3d5698SJohn Baldwin
599c0855eaaSJohn Baldwin	add	v19.4s,v19.4s,v8.4s
600c0855eaaSJohn Baldwin
601c0855eaaSJohn Baldwin	zip1	v4.4s,v16.4s,v20.4s			// transpose data
602c0855eaaSJohn Baldwin	zip1	v5.4s,v24.4s,v28.4s
603c0855eaaSJohn Baldwin	zip2	v6.4s,v16.4s,v20.4s
604c0855eaaSJohn Baldwin	zip2	v7.4s,v24.4s,v28.4s
605c0855eaaSJohn Baldwin	zip1	v16.2d,v4.2d,v5.2d
606c0855eaaSJohn Baldwin	zip2	v20.2d,v4.2d,v5.2d
607c0855eaaSJohn Baldwin	zip1	v24.2d,v6.2d,v7.2d
608c0855eaaSJohn Baldwin	zip2	v28.2d,v6.2d,v7.2d
609c0855eaaSJohn Baldwin
610c0855eaaSJohn Baldwin	zip1	v4.4s,v17.4s,v21.4s
611c0855eaaSJohn Baldwin	zip1	v5.4s,v25.4s,v29.4s
612c0855eaaSJohn Baldwin	zip2	v6.4s,v17.4s,v21.4s
613c0855eaaSJohn Baldwin	zip2	v7.4s,v25.4s,v29.4s
614c0855eaaSJohn Baldwin	zip1	v17.2d,v4.2d,v5.2d
615c0855eaaSJohn Baldwin	zip2	v21.2d,v4.2d,v5.2d
616c0855eaaSJohn Baldwin	zip1	v25.2d,v6.2d,v7.2d
617c0855eaaSJohn Baldwin	zip2	v29.2d,v6.2d,v7.2d
618c0855eaaSJohn Baldwin
619c0855eaaSJohn Baldwin	zip1	v4.4s,v18.4s,v22.4s
620bc3d5698SJohn Baldwin	add	w5,w5,w22		// accumulate key block
621c0855eaaSJohn Baldwin	zip1	v5.4s,v26.4s,v30.4s
622bc3d5698SJohn Baldwin	add	x6,x6,x22,lsr#32
623c0855eaaSJohn Baldwin	zip2	v6.4s,v18.4s,v22.4s
624bc3d5698SJohn Baldwin	add	w7,w7,w23
625c0855eaaSJohn Baldwin	zip2	v7.4s,v26.4s,v30.4s
626bc3d5698SJohn Baldwin	add	x8,x8,x23,lsr#32
627c0855eaaSJohn Baldwin	zip1	v18.2d,v4.2d,v5.2d
628bc3d5698SJohn Baldwin	add	w9,w9,w24
629c0855eaaSJohn Baldwin	zip2	v22.2d,v4.2d,v5.2d
630bc3d5698SJohn Baldwin	add	x10,x10,x24,lsr#32
631c0855eaaSJohn Baldwin	zip1	v26.2d,v6.2d,v7.2d
632bc3d5698SJohn Baldwin	add	w11,w11,w25
633c0855eaaSJohn Baldwin	zip2	v30.2d,v6.2d,v7.2d
634bc3d5698SJohn Baldwin	add	x12,x12,x25,lsr#32
635c0855eaaSJohn Baldwin
636c0855eaaSJohn Baldwin	zip1	v4.4s,v19.4s,v23.4s
637bc3d5698SJohn Baldwin	add	w13,w13,w26
638c0855eaaSJohn Baldwin	zip1	v5.4s,v27.4s,v31.4s
639bc3d5698SJohn Baldwin	add	x14,x14,x26,lsr#32
640c0855eaaSJohn Baldwin	zip2	v6.4s,v19.4s,v23.4s
641bc3d5698SJohn Baldwin	add	w15,w15,w27
642c0855eaaSJohn Baldwin	zip2	v7.4s,v27.4s,v31.4s
643bc3d5698SJohn Baldwin	add	x16,x16,x27,lsr#32
644c0855eaaSJohn Baldwin	zip1	v19.2d,v4.2d,v5.2d
645bc3d5698SJohn Baldwin	add	w17,w17,w28
646c0855eaaSJohn Baldwin	zip2	v23.2d,v4.2d,v5.2d
647bc3d5698SJohn Baldwin	add	x19,x19,x28,lsr#32
648c0855eaaSJohn Baldwin	zip1	v27.2d,v6.2d,v7.2d
649bc3d5698SJohn Baldwin	add	w20,w20,w30
650c0855eaaSJohn Baldwin	zip2	v31.2d,v6.2d,v7.2d
651bc3d5698SJohn Baldwin	add	x21,x21,x30,lsr#32
652bc3d5698SJohn Baldwin
653bc3d5698SJohn Baldwin	b.lo	.Ltail_neon
654bc3d5698SJohn Baldwin
655bc3d5698SJohn Baldwin	add	x5,x5,x6,lsl#32	// pack
656bc3d5698SJohn Baldwin	add	x7,x7,x8,lsl#32
657bc3d5698SJohn Baldwin	ldp	x6,x8,[x1,#0]		// load input
658c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v0.4s			// accumulate key block
659bc3d5698SJohn Baldwin	add	x9,x9,x10,lsl#32
660bc3d5698SJohn Baldwin	add	x11,x11,x12,lsl#32
661bc3d5698SJohn Baldwin	ldp	x10,x12,[x1,#16]
662c0855eaaSJohn Baldwin	add	v17.4s,v17.4s,v1.4s
663bc3d5698SJohn Baldwin	add	x13,x13,x14,lsl#32
664bc3d5698SJohn Baldwin	add	x15,x15,x16,lsl#32
665bc3d5698SJohn Baldwin	ldp	x14,x16,[x1,#32]
666c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v2.4s
667bc3d5698SJohn Baldwin	add	x17,x17,x19,lsl#32
668bc3d5698SJohn Baldwin	add	x20,x20,x21,lsl#32
669bc3d5698SJohn Baldwin	ldp	x19,x21,[x1,#48]
670c0855eaaSJohn Baldwin	add	v19.4s,v19.4s,v3.4s
671bc3d5698SJohn Baldwin	add	x1,x1,#64
672c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
673bc3d5698SJohn Baldwin	rev	x5,x5
674bc3d5698SJohn Baldwin	rev	x7,x7
675bc3d5698SJohn Baldwin	rev	x9,x9
676bc3d5698SJohn Baldwin	rev	x11,x11
677bc3d5698SJohn Baldwin	rev	x13,x13
678bc3d5698SJohn Baldwin	rev	x15,x15
679bc3d5698SJohn Baldwin	rev	x17,x17
680bc3d5698SJohn Baldwin	rev	x20,x20
681bc3d5698SJohn Baldwin#endif
682c0855eaaSJohn Baldwin	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
683bc3d5698SJohn Baldwin	eor	x5,x5,x6
684c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v0.4s
685bc3d5698SJohn Baldwin	eor	x7,x7,x8
686c0855eaaSJohn Baldwin	add	v21.4s,v21.4s,v1.4s
687bc3d5698SJohn Baldwin	eor	x9,x9,x10
688c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v2.4s
689bc3d5698SJohn Baldwin	eor	x11,x11,x12
690c0855eaaSJohn Baldwin	add	v23.4s,v23.4s,v3.4s
691bc3d5698SJohn Baldwin	eor	x13,x13,x14
692c0855eaaSJohn Baldwin	eor	v16.16b,v16.16b,v4.16b
693c0855eaaSJohn Baldwin	movi	v4.4s,#5
694bc3d5698SJohn Baldwin	eor	x15,x15,x16
695c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v5.16b
696bc3d5698SJohn Baldwin	eor	x17,x17,x19
697c0855eaaSJohn Baldwin	eor	v18.16b,v18.16b,v6.16b
698bc3d5698SJohn Baldwin	eor	x20,x20,x21
699c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v7.16b
700c0855eaaSJohn Baldwin	add	v8.4s,v8.4s,v4.4s			// += 5
701c0855eaaSJohn Baldwin	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
702bc3d5698SJohn Baldwin
703bc3d5698SJohn Baldwin	stp	x5,x7,[x0,#0]		// store output
704c0855eaaSJohn Baldwin	add	x28,x28,#5			// increment counter
705bc3d5698SJohn Baldwin	stp	x9,x11,[x0,#16]
706bc3d5698SJohn Baldwin	stp	x13,x15,[x0,#32]
707bc3d5698SJohn Baldwin	stp	x17,x20,[x0,#48]
708bc3d5698SJohn Baldwin	add	x0,x0,#64
709bc3d5698SJohn Baldwin
710bc3d5698SJohn Baldwin	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
711c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v0.4s
712c0855eaaSJohn Baldwin	add	v25.4s,v25.4s,v1.4s
713c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v2.4s
714c0855eaaSJohn Baldwin	add	v27.4s,v27.4s,v3.4s
715c0855eaaSJohn Baldwin	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
716c0855eaaSJohn Baldwin
717c0855eaaSJohn Baldwin	eor	v20.16b,v20.16b,v4.16b
718c0855eaaSJohn Baldwin	eor	v21.16b,v21.16b,v5.16b
719c0855eaaSJohn Baldwin	eor	v22.16b,v22.16b,v6.16b
720c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v7.16b
721c0855eaaSJohn Baldwin	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
722c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v0.4s
723c0855eaaSJohn Baldwin	add	v29.4s,v29.4s,v1.4s
724c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v2.4s
725c0855eaaSJohn Baldwin	add	v31.4s,v31.4s,v3.4s
726c0855eaaSJohn Baldwin	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
727c0855eaaSJohn Baldwin
728c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v16.16b
729c0855eaaSJohn Baldwin	eor	v25.16b,v25.16b,v17.16b
730c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v18.16b
731c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v19.16b
732c0855eaaSJohn Baldwin	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
733c0855eaaSJohn Baldwin
734c0855eaaSJohn Baldwin	eor	v28.16b,v28.16b,v20.16b
735c0855eaaSJohn Baldwin	eor	v29.16b,v29.16b,v21.16b
736c0855eaaSJohn Baldwin	eor	v30.16b,v30.16b,v22.16b
737c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v23.16b
738c0855eaaSJohn Baldwin	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
739bc3d5698SJohn Baldwin
740bc3d5698SJohn Baldwin	b.hi	.Loop_outer_neon
741bc3d5698SJohn Baldwin
742c0855eaaSJohn Baldwin	ldp	d8,d9,[sp]			// meet ABI requirements
743c0855eaaSJohn Baldwin
744bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
745bc3d5698SJohn Baldwin	add	sp,sp,#64
746bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
747bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
748bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
749bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
750bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#96
751*bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
752bc3d5698SJohn Baldwin	ret
753bc3d5698SJohn Baldwin
// .Ltail_neon -- tail of ChaCha20_neon: emit the remaining output in 64-byte
// steps (one block held in general-purpose registers, then up to three blocks
// held in NEON registers) and hand any final partial block to .Last_neon.
//
// Registers as used in this segment:
//   x0 = output pointer, x1 = input pointer, x2 = byte count
//   x5-x17,x19-x21 = sixteen words of the scalar block (packed pairwise below)
//   v0-v3   = values added to every NEON block ("key block" per the comment below)
//   v16-v31 = four NEON blocks of four vectors each
//
// NOTE(review): the branch into this label and the subtraction that the
// "+320" below compensates for are outside this chunk; presumably the outer
// loop biases x2 by 320 (5 blocks x 64 bytes) per pass -- confirm there.
754c0855eaaSJohn Baldwin.align	4
755bc3d5698SJohn Baldwin.Ltail_neon:
756c0855eaaSJohn Baldwin	add	x2,x2,#320
// d8/d9 are reloaded from [sp] first; .Last_neon later overwrites [sp] with
// keystream, so this has to happen before that store.
757c0855eaaSJohn Baldwin	ldp	d8,d9,[sp]			// meet ABI requirements
// Fewer than 64 bytes in total: handled at .Less_than_64 (defined outside this chunk).
758bc3d5698SJohn Baldwin	cmp	x2,#64
759bc3d5698SJohn Baldwin	b.lo	.Less_than_64
760bc3d5698SJohn Baldwin
// Scalar block: fold each odd-numbered register into the upper half of its
// even-numbered partner (xN += xM << 32) while loading 64 input bytes.  The
// registers freed by packing (x6,x8,...,x21) are reused for the input words.
761bc3d5698SJohn Baldwin	add	x5,x5,x6,lsl#32	// pack
762bc3d5698SJohn Baldwin	add	x7,x7,x8,lsl#32
763bc3d5698SJohn Baldwin	ldp	x6,x8,[x1,#0]		// load input
764bc3d5698SJohn Baldwin	add	x9,x9,x10,lsl#32
765bc3d5698SJohn Baldwin	add	x11,x11,x12,lsl#32
766bc3d5698SJohn Baldwin	ldp	x10,x12,[x1,#16]
767bc3d5698SJohn Baldwin	add	x13,x13,x14,lsl#32
768bc3d5698SJohn Baldwin	add	x15,x15,x16,lsl#32
769bc3d5698SJohn Baldwin	ldp	x14,x16,[x1,#32]
770bc3d5698SJohn Baldwin	add	x17,x17,x19,lsl#32
771bc3d5698SJohn Baldwin	add	x20,x20,x21,lsl#32
772bc3d5698SJohn Baldwin	ldp	x19,x21,[x1,#48]
773bc3d5698SJohn Baldwin	add	x1,x1,#64
// Big-endian builds only: byte-reverse each packed doubleword before the XOR.
774c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
775bc3d5698SJohn Baldwin	rev	x5,x5
776bc3d5698SJohn Baldwin	rev	x7,x7
777bc3d5698SJohn Baldwin	rev	x9,x9
778bc3d5698SJohn Baldwin	rev	x11,x11
779bc3d5698SJohn Baldwin	rev	x13,x13
780bc3d5698SJohn Baldwin	rev	x15,x15
781bc3d5698SJohn Baldwin	rev	x17,x17
782bc3d5698SJohn Baldwin	rev	x20,x20
783bc3d5698SJohn Baldwin#endif
// output = packed block words XOR input words
784bc3d5698SJohn Baldwin	eor	x5,x5,x6
785bc3d5698SJohn Baldwin	eor	x7,x7,x8
786bc3d5698SJohn Baldwin	eor	x9,x9,x10
787bc3d5698SJohn Baldwin	eor	x11,x11,x12
788bc3d5698SJohn Baldwin	eor	x13,x13,x14
789bc3d5698SJohn Baldwin	eor	x15,x15,x16
790bc3d5698SJohn Baldwin	eor	x17,x17,x19
791bc3d5698SJohn Baldwin	eor	x20,x20,x21
792bc3d5698SJohn Baldwin
// Store the 64 scalar output bytes; interleaved with the stores, v0-v3 are
// added into the first NEON block (v16-v19) so it is ready for the next step.
793bc3d5698SJohn Baldwin	stp	x5,x7,[x0,#0]		// store output
794c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v0.4s			// accumulate key block
795bc3d5698SJohn Baldwin	stp	x9,x11,[x0,#16]
796c0855eaaSJohn Baldwin	add	v17.4s,v17.4s,v1.4s
797bc3d5698SJohn Baldwin	stp	x13,x15,[x0,#32]
798c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v2.4s
799bc3d5698SJohn Baldwin	stp	x17,x20,[x0,#48]
800c0855eaaSJohn Baldwin	add	v19.4s,v19.4s,v3.4s
801bc3d5698SJohn Baldwin	add	x0,x0,#64
// The flags are still those of "cmp x2,#64" above (nothing in between sets
// flags): exactly 64 bytes were left, so the scalar block was the last one.
802bc3d5698SJohn Baldwin	b.eq	.Ldone_neon
803bc3d5698SJohn Baldwin	sub	x2,x2,#64
804bc3d5698SJohn Baldwin	cmp	x2,#64
// Less than a full block left: .Last_neon consumes the keystream in v16-v19.
805c0855eaaSJohn Baldwin	b.lo	.Last_neon
806bc3d5698SJohn Baldwin
// First NEON block: XOR 64 input bytes with v16-v19 and store.
807c0855eaaSJohn Baldwin	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
808c0855eaaSJohn Baldwin	eor	v16.16b,v16.16b,v4.16b
809c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v5.16b
810c0855eaaSJohn Baldwin	eor	v18.16b,v18.16b,v6.16b
811c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v7.16b
812c0855eaaSJohn Baldwin	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
// Flags from the cmp before the block: equal means that was the final block.
813bc3d5698SJohn Baldwin	b.eq	.Ldone_neon
814c0855eaaSJohn Baldwin
// Second NEON block: v20-v23 + v0-v3 is formed in v16-v19, so that a branch
// to .Last_neon always finds the pending keystream in v16-v19.
815c0855eaaSJohn Baldwin	add	v16.4s,v20.4s,v0.4s
816c0855eaaSJohn Baldwin	add	v17.4s,v21.4s,v1.4s
817bc3d5698SJohn Baldwin	sub	x2,x2,#64
818c0855eaaSJohn Baldwin	add	v18.4s,v22.4s,v2.4s
819bc3d5698SJohn Baldwin	cmp	x2,#64
820c0855eaaSJohn Baldwin	add	v19.4s,v23.4s,v3.4s
821c0855eaaSJohn Baldwin	b.lo	.Last_neon
822bc3d5698SJohn Baldwin
823c0855eaaSJohn Baldwin	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
824c0855eaaSJohn Baldwin	eor	v20.16b,v16.16b,v4.16b
825c0855eaaSJohn Baldwin	eor	v21.16b,v17.16b,v5.16b
826c0855eaaSJohn Baldwin	eor	v22.16b,v18.16b,v6.16b
827c0855eaaSJohn Baldwin	eor	v23.16b,v19.16b,v7.16b
828c0855eaaSJohn Baldwin	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
829bc3d5698SJohn Baldwin	b.eq	.Ldone_neon
830c0855eaaSJohn Baldwin
// Third NEON block: same pattern with v24-v27.
831c0855eaaSJohn Baldwin	add	v16.4s,v24.4s,v0.4s
832c0855eaaSJohn Baldwin	add	v17.4s,v25.4s,v1.4s
833c0855eaaSJohn Baldwin	sub	x2,x2,#64
834c0855eaaSJohn Baldwin	add	v18.4s,v26.4s,v2.4s
835c0855eaaSJohn Baldwin	cmp	x2,#64
836c0855eaaSJohn Baldwin	add	v19.4s,v27.4s,v3.4s
837c0855eaaSJohn Baldwin	b.lo	.Last_neon
838c0855eaaSJohn Baldwin
839c0855eaaSJohn Baldwin	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
840c0855eaaSJohn Baldwin	eor	v24.16b,v16.16b,v4.16b
841c0855eaaSJohn Baldwin	eor	v25.16b,v17.16b,v5.16b
842c0855eaaSJohn Baldwin	eor	v26.16b,v18.16b,v6.16b
843c0855eaaSJohn Baldwin	eor	v27.16b,v19.16b,v7.16b
844c0855eaaSJohn Baldwin	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
845c0855eaaSJohn Baldwin	b.eq	.Ldone_neon
846c0855eaaSJohn Baldwin
// Fourth NEON block (v28-v31): only its keystream is prepared here; there is
// no length check, and execution falls through into .Last_neon with the
// remainder in x2.
// NOTE(review): this relies on x2 being below 64 at this point, i.e. on the
// entry count being below 320 -- the guarantee is outside this chunk.
847c0855eaaSJohn Baldwin	add	v16.4s,v28.4s,v0.4s
848c0855eaaSJohn Baldwin	add	v17.4s,v29.4s,v1.4s
849c0855eaaSJohn Baldwin	add	v18.4s,v30.4s,v2.4s
850c0855eaaSJohn Baldwin	add	v19.4s,v31.4s,v3.4s
851bc3d5698SJohn Baldwin	sub	x2,x2,#64
852bc3d5698SJohn Baldwin
// .Last_neon -- XOR the final partial block one byte at a time.
// In:  v16-v19 = 64 bytes to XOR with the input, x0 = output pointer,
//      x1 = input pointer, x2 = number of bytes still to process.
// The loop below tests x2 only after the first byte, so x2 must be non-zero
// on entry; that holds on every path from .Ltail_neon, which leaves via
// b.eq .Ldone_neon when nothing remains.
// NOTE(review): any other entry points are outside this chunk.
853bc3d5698SJohn Baldwin.Last_neon:
// Spill the 64 keystream bytes to the scratch area at [sp] so they can be
// read back byte by byte.
854c0855eaaSJohn Baldwin	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
855c0855eaaSJohn Baldwin
// Point x1 (input), x0 (output) and x4 (keystream on the stack) just past the
// last byte and negate x2, so a single negative index counts up to zero.
// x0 is biased by -1 because the loop increments x2 between the loads and
// the store.
856bc3d5698SJohn Baldwin	sub	x0,x0,#1
857bc3d5698SJohn Baldwin	add	x1,x1,x2
858bc3d5698SJohn Baldwin	add	x0,x0,x2
859bc3d5698SJohn Baldwin	add	x4,sp,x2
860bc3d5698SJohn Baldwin	neg	x2,x2
861bc3d5698SJohn Baldwin
// out[i] = in[i] ^ keystream[i], repeated until the index reaches zero.
862bc3d5698SJohn Baldwin.Loop_tail_neon:
863bc3d5698SJohn Baldwin	ldrb	w10,[x1,x2]
864bc3d5698SJohn Baldwin	ldrb	w11,[x4,x2]
865bc3d5698SJohn Baldwin	add	x2,x2,#1
866bc3d5698SJohn Baldwin	eor	w10,w10,w11
867bc3d5698SJohn Baldwin	strb	w10,[x0,x2]
868bc3d5698SJohn Baldwin	cbnz	x2,.Loop_tail_neon
869bc3d5698SJohn Baldwin
// Overwrite the 64 scratch bytes with zeros so the spilled keystream does not
// stay on the stack, then fall through to the epilogue.
870bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#0]
871bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#16]
872bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#32]
873bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#48]
874bc3d5698SJohn Baldwin
// .Ldone_neon -- common exit of the tail paths: reload x19-x28 from the frame
// addressed by x29, release the 64 bytes of scratch at sp, pop the 96-byte
// frame (restoring x29/x30) and return.
// The x19-x28 reloads are x29-relative, so the sp adjustment interleaved with
// them does not affect them.
// NOTE(review): the matching prologue of ChaCha20_neon is outside this chunk;
// the offsets used here (16..80 from x29, 96-byte frame, 64-byte scratch) are
// presumed to mirror it, with sp equal to x29 after the "add sp,sp,#64".
875bc3d5698SJohn Baldwin.Ldone_neon:
876bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
877bc3d5698SJohn Baldwin	add	sp,sp,#64
878bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
879bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
880bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
881bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
882bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#96
// Macro defined outside this file (presumably in arm_arch.h); it is the
// counterpart of the AARCH64_SIGN_LINK_REGISTER used at function entry.
883*bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
884bc3d5698SJohn Baldwin	ret
885bc3d5698SJohn Baldwin.size	ChaCha20_neon,.-ChaCha20_neon
886bc3d5698SJohn Baldwin.type	ChaCha20_512_neon,%function
887bc3d5698SJohn Baldwin.align	5
888bc3d5698SJohn BaldwinChaCha20_512_neon:
889*bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
890bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-96]!
891bc3d5698SJohn Baldwin	add	x29,sp,#0
892bc3d5698SJohn Baldwin
893bc3d5698SJohn Baldwin	adr	x5,.Lsigma
894bc3d5698SJohn Baldwin	stp	x19,x20,[sp,#16]
895bc3d5698SJohn Baldwin	stp	x21,x22,[sp,#32]
896bc3d5698SJohn Baldwin	stp	x23,x24,[sp,#48]
897bc3d5698SJohn Baldwin	stp	x25,x26,[sp,#64]
898bc3d5698SJohn Baldwin	stp	x27,x28,[sp,#80]
899bc3d5698SJohn Baldwin
900bc3d5698SJohn Baldwin.L512_or_more_neon:
901bc3d5698SJohn Baldwin	sub	sp,sp,#128+64
902bc3d5698SJohn Baldwin
903c0855eaaSJohn Baldwin	eor	v7.16b,v7.16b,v7.16b
904bc3d5698SJohn Baldwin	ldp	x22,x23,[x5]		// load sigma
905c0855eaaSJohn Baldwin	ld1	{v0.4s},[x5],#16
906bc3d5698SJohn Baldwin	ldp	x24,x25,[x3]		// load key
907bc3d5698SJohn Baldwin	ldp	x26,x27,[x3,#16]
908c0855eaaSJohn Baldwin	ld1	{v1.4s,v2.4s},[x3]
909bc3d5698SJohn Baldwin	ldp	x28,x30,[x4]		// load counter
910c0855eaaSJohn Baldwin	ld1	{v3.4s},[x4]
911c0855eaaSJohn Baldwin	ld1	{v7.s}[0],[x5]
912c0855eaaSJohn Baldwin	add	x3,x5,#16			// .Lrot24
913c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
914c0855eaaSJohn Baldwin	rev64	v0.4s,v0.4s
915bc3d5698SJohn Baldwin	ror	x24,x24,#32
916bc3d5698SJohn Baldwin	ror	x25,x25,#32
917bc3d5698SJohn Baldwin	ror	x26,x26,#32
918bc3d5698SJohn Baldwin	ror	x27,x27,#32
919bc3d5698SJohn Baldwin	ror	x28,x28,#32
920bc3d5698SJohn Baldwin	ror	x30,x30,#32
921bc3d5698SJohn Baldwin#endif
922c0855eaaSJohn Baldwin	add	v3.4s,v3.4s,v7.4s		// += 1
923c0855eaaSJohn Baldwin	stp	q0,q1,[sp,#0]		// off-load key block, invariant part
924c0855eaaSJohn Baldwin	add	v3.4s,v3.4s,v7.4s		// not typo
925c0855eaaSJohn Baldwin	str	q2,[sp,#32]
926c0855eaaSJohn Baldwin	add	v4.4s,v3.4s,v7.4s
927c0855eaaSJohn Baldwin	add	v5.4s,v4.4s,v7.4s
928c0855eaaSJohn Baldwin	add	v6.4s,v5.4s,v7.4s
929c0855eaaSJohn Baldwin	shl	v7.4s,v7.4s,#2			// 1 -> 4
930bc3d5698SJohn Baldwin
931bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
932bc3d5698SJohn Baldwin	stp	d10,d11,[sp,#128+16]
933bc3d5698SJohn Baldwin	stp	d12,d13,[sp,#128+32]
934bc3d5698SJohn Baldwin	stp	d14,d15,[sp,#128+48]
935bc3d5698SJohn Baldwin
936bc3d5698SJohn Baldwin	sub	x2,x2,#512			// not typo
937bc3d5698SJohn Baldwin
938bc3d5698SJohn Baldwin.Loop_outer_512_neon:
939c0855eaaSJohn Baldwin	mov	v8.16b,v0.16b
940c0855eaaSJohn Baldwin	mov	v12.16b,v0.16b
941c0855eaaSJohn Baldwin	mov	v16.16b,v0.16b
942c0855eaaSJohn Baldwin	mov	v20.16b,v0.16b
943c0855eaaSJohn Baldwin	mov	v24.16b,v0.16b
944c0855eaaSJohn Baldwin	mov	v28.16b,v0.16b
945c0855eaaSJohn Baldwin	mov	v9.16b,v1.16b
946bc3d5698SJohn Baldwin	mov	w5,w22			// unpack key block
947c0855eaaSJohn Baldwin	mov	v13.16b,v1.16b
948bc3d5698SJohn Baldwin	lsr	x6,x22,#32
949c0855eaaSJohn Baldwin	mov	v17.16b,v1.16b
950bc3d5698SJohn Baldwin	mov	w7,w23
951c0855eaaSJohn Baldwin	mov	v21.16b,v1.16b
952bc3d5698SJohn Baldwin	lsr	x8,x23,#32
953c0855eaaSJohn Baldwin	mov	v25.16b,v1.16b
954bc3d5698SJohn Baldwin	mov	w9,w24
955c0855eaaSJohn Baldwin	mov	v29.16b,v1.16b
956bc3d5698SJohn Baldwin	lsr	x10,x24,#32
957c0855eaaSJohn Baldwin	mov	v11.16b,v3.16b
958bc3d5698SJohn Baldwin	mov	w11,w25
959c0855eaaSJohn Baldwin	mov	v15.16b,v4.16b
960bc3d5698SJohn Baldwin	lsr	x12,x25,#32
961c0855eaaSJohn Baldwin	mov	v19.16b,v5.16b
962bc3d5698SJohn Baldwin	mov	w13,w26
963c0855eaaSJohn Baldwin	mov	v23.16b,v6.16b
964bc3d5698SJohn Baldwin	lsr	x14,x26,#32
965c0855eaaSJohn Baldwin	mov	v10.16b,v2.16b
966bc3d5698SJohn Baldwin	mov	w15,w27
967c0855eaaSJohn Baldwin	mov	v14.16b,v2.16b
968bc3d5698SJohn Baldwin	lsr	x16,x27,#32
969c0855eaaSJohn Baldwin	add	v27.4s,v11.4s,v7.4s			// +4
970bc3d5698SJohn Baldwin	mov	w17,w28
971c0855eaaSJohn Baldwin	add	v31.4s,v15.4s,v7.4s			// +4
972bc3d5698SJohn Baldwin	lsr	x19,x28,#32
973c0855eaaSJohn Baldwin	mov	v18.16b,v2.16b
974bc3d5698SJohn Baldwin	mov	w20,w30
975c0855eaaSJohn Baldwin	mov	v22.16b,v2.16b
976bc3d5698SJohn Baldwin	lsr	x21,x30,#32
977c0855eaaSJohn Baldwin	mov	v26.16b,v2.16b
978c0855eaaSJohn Baldwin	stp	q3,q4,[sp,#48]		// off-load key block, variable part
979c0855eaaSJohn Baldwin	mov	v30.16b,v2.16b
980c0855eaaSJohn Baldwin	stp	q5,q6,[sp,#80]
981bc3d5698SJohn Baldwin
982bc3d5698SJohn Baldwin	mov	x4,#5
983c0855eaaSJohn Baldwin	ld1	{v6.4s},[x3]
984bc3d5698SJohn Baldwin	subs	x2,x2,#512
985bc3d5698SJohn Baldwin.Loop_upper_neon:
986bc3d5698SJohn Baldwin	sub	x4,x4,#1
987bc3d5698SJohn Baldwin	add	v8.4s,v8.4s,v9.4s
988c0855eaaSJohn Baldwin	add	w5,w5,w9
989bc3d5698SJohn Baldwin	add	v12.4s,v12.4s,v13.4s
990c0855eaaSJohn Baldwin	add	w6,w6,w10
991bc3d5698SJohn Baldwin	add	v16.4s,v16.4s,v17.4s
992c0855eaaSJohn Baldwin	add	w7,w7,w11
993bc3d5698SJohn Baldwin	add	v20.4s,v20.4s,v21.4s
994c0855eaaSJohn Baldwin	add	w8,w8,w12
995c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
996c0855eaaSJohn Baldwin	eor	w17,w17,w5
997c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
998bc3d5698SJohn Baldwin	eor	w19,w19,w6
999bc3d5698SJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1000bc3d5698SJohn Baldwin	eor	w20,w20,w7
1001c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1002bc3d5698SJohn Baldwin	eor	w21,w21,w8
1003c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1004bc3d5698SJohn Baldwin	ror	w17,w17,#16
1005c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1006bc3d5698SJohn Baldwin	ror	w19,w19,#16
1007c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1008bc3d5698SJohn Baldwin	ror	w20,w20,#16
1009c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1010c0855eaaSJohn Baldwin	ror	w21,w21,#16
1011c0855eaaSJohn Baldwin	rev32	v11.8h,v11.8h
1012c0855eaaSJohn Baldwin	add	w13,w13,w17
1013c0855eaaSJohn Baldwin	rev32	v15.8h,v15.8h
1014c0855eaaSJohn Baldwin	add	w14,w14,w19
1015c0855eaaSJohn Baldwin	rev32	v19.8h,v19.8h
1016c0855eaaSJohn Baldwin	add	w15,w15,w20
1017c0855eaaSJohn Baldwin	rev32	v23.8h,v23.8h
1018c0855eaaSJohn Baldwin	add	w16,w16,w21
1019c0855eaaSJohn Baldwin	rev32	v27.8h,v27.8h
1020c0855eaaSJohn Baldwin	eor	w9,w9,w13
1021c0855eaaSJohn Baldwin	rev32	v31.8h,v31.8h
1022c0855eaaSJohn Baldwin	eor	w10,w10,w14
1023bc3d5698SJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1024c0855eaaSJohn Baldwin	eor	w11,w11,w15
1025bc3d5698SJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1026c0855eaaSJohn Baldwin	eor	w12,w12,w16
1027bc3d5698SJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1028bc3d5698SJohn Baldwin	ror	w9,w9,#20
1029c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1030c0855eaaSJohn Baldwin	ror	w10,w10,#20
1031c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1032c0855eaaSJohn Baldwin	ror	w11,w11,#20
1033c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1034c0855eaaSJohn Baldwin	ror	w12,w12,#20
1035c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1036c0855eaaSJohn Baldwin	add	w5,w5,w9
1037c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1038c0855eaaSJohn Baldwin	add	w6,w6,w10
1039c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1040c0855eaaSJohn Baldwin	add	w7,w7,w11
1041c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1042c0855eaaSJohn Baldwin	add	w8,w8,w12
1043c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1044c0855eaaSJohn Baldwin	eor	w17,w17,w5
1045c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1046c0855eaaSJohn Baldwin	eor	w19,w19,w6
1047c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#20
1048c0855eaaSJohn Baldwin	eor	w20,w20,w7
1049c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#20
1050c0855eaaSJohn Baldwin	eor	w21,w21,w8
1051c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#20
1052bc3d5698SJohn Baldwin	ror	w17,w17,#24
1053c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#20
1054bc3d5698SJohn Baldwin	ror	w19,w19,#24
1055c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#20
1056bc3d5698SJohn Baldwin	ror	w20,w20,#24
1057c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#20
1058c0855eaaSJohn Baldwin	ror	w21,w21,#24
1059c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#12
1060c0855eaaSJohn Baldwin	add	w13,w13,w17
1061c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#12
1062c0855eaaSJohn Baldwin	add	w14,w14,w19
1063c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#12
1064c0855eaaSJohn Baldwin	add	w15,w15,w20
1065c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#12
1066c0855eaaSJohn Baldwin	add	w16,w16,w21
1067c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#12
1068c0855eaaSJohn Baldwin	eor	w9,w9,w13
1069c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#12
1070c0855eaaSJohn Baldwin	eor	w10,w10,w14
1071c0855eaaSJohn Baldwin	add	v8.4s,v8.4s,v9.4s
1072c0855eaaSJohn Baldwin	eor	w11,w11,w15
1073c0855eaaSJohn Baldwin	add	v12.4s,v12.4s,v13.4s
1074c0855eaaSJohn Baldwin	eor	w12,w12,w16
1075c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v17.4s
1076bc3d5698SJohn Baldwin	ror	w9,w9,#25
1077c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v21.4s
1078c0855eaaSJohn Baldwin	ror	w10,w10,#25
1079c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
1080c0855eaaSJohn Baldwin	ror	w11,w11,#25
1081c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
1082c0855eaaSJohn Baldwin	ror	w12,w12,#25
1083c0855eaaSJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1084c0855eaaSJohn Baldwin	add	w5,w5,w10
1085c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1086c0855eaaSJohn Baldwin	add	w6,w6,w11
1087c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1088c0855eaaSJohn Baldwin	add	w7,w7,w12
1089c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1090c0855eaaSJohn Baldwin	add	w8,w8,w9
1091c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1092c0855eaaSJohn Baldwin	eor	w21,w21,w5
1093c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1094c0855eaaSJohn Baldwin	eor	w17,w17,w6
1095c0855eaaSJohn Baldwin	tbl	v11.16b,{v11.16b},v6.16b
1096c0855eaaSJohn Baldwin	eor	w19,w19,w7
1097c0855eaaSJohn Baldwin	tbl	v15.16b,{v15.16b},v6.16b
1098c0855eaaSJohn Baldwin	eor	w20,w20,w8
1099c0855eaaSJohn Baldwin	tbl	v19.16b,{v19.16b},v6.16b
1100c0855eaaSJohn Baldwin	ror	w21,w21,#16
1101c0855eaaSJohn Baldwin	tbl	v23.16b,{v23.16b},v6.16b
1102c0855eaaSJohn Baldwin	ror	w17,w17,#16
1103c0855eaaSJohn Baldwin	tbl	v27.16b,{v27.16b},v6.16b
1104c0855eaaSJohn Baldwin	ror	w19,w19,#16
1105c0855eaaSJohn Baldwin	tbl	v31.16b,{v31.16b},v6.16b
1106c0855eaaSJohn Baldwin	ror	w20,w20,#16
1107c0855eaaSJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1108c0855eaaSJohn Baldwin	add	w15,w15,w21
1109c0855eaaSJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1110c0855eaaSJohn Baldwin	add	w16,w16,w17
1111c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1112c0855eaaSJohn Baldwin	add	w13,w13,w19
1113c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1114c0855eaaSJohn Baldwin	add	w14,w14,w20
1115c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1116c0855eaaSJohn Baldwin	eor	w10,w10,w15
1117c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1118c0855eaaSJohn Baldwin	eor	w11,w11,w16
1119c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1120c0855eaaSJohn Baldwin	eor	w12,w12,w13
1121c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1122c0855eaaSJohn Baldwin	eor	w9,w9,w14
1123c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1124c0855eaaSJohn Baldwin	ror	w10,w10,#20
1125c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1126c0855eaaSJohn Baldwin	ror	w11,w11,#20
1127c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1128c0855eaaSJohn Baldwin	ror	w12,w12,#20
1129c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1130c0855eaaSJohn Baldwin	ror	w9,w9,#20
1131c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#25
1132c0855eaaSJohn Baldwin	add	w5,w5,w10
1133c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#25
1134c0855eaaSJohn Baldwin	add	w6,w6,w11
1135c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#25
1136c0855eaaSJohn Baldwin	add	w7,w7,w12
1137c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#25
1138c0855eaaSJohn Baldwin	add	w8,w8,w9
1139c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#25
1140c0855eaaSJohn Baldwin	eor	w21,w21,w5
1141c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#25
1142c0855eaaSJohn Baldwin	eor	w17,w17,w6
1143c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#7
1144c0855eaaSJohn Baldwin	eor	w19,w19,w7
1145c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#7
1146c0855eaaSJohn Baldwin	eor	w20,w20,w8
1147c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#7
1148c0855eaaSJohn Baldwin	ror	w21,w21,#24
1149c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#7
1150c0855eaaSJohn Baldwin	ror	w17,w17,#24
1151c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#7
1152c0855eaaSJohn Baldwin	ror	w19,w19,#24
1153c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#7
1154c0855eaaSJohn Baldwin	ror	w20,w20,#24
1155c0855eaaSJohn Baldwin	ext	v10.16b,v10.16b,v10.16b,#8
1156c0855eaaSJohn Baldwin	add	w15,w15,w21
1157c0855eaaSJohn Baldwin	ext	v14.16b,v14.16b,v14.16b,#8
1158c0855eaaSJohn Baldwin	add	w16,w16,w17
1159c0855eaaSJohn Baldwin	ext	v18.16b,v18.16b,v18.16b,#8
1160c0855eaaSJohn Baldwin	add	w13,w13,w19
1161c0855eaaSJohn Baldwin	ext	v22.16b,v22.16b,v22.16b,#8
1162c0855eaaSJohn Baldwin	add	w14,w14,w20
1163c0855eaaSJohn Baldwin	ext	v26.16b,v26.16b,v26.16b,#8
1164c0855eaaSJohn Baldwin	eor	w10,w10,w15
1165c0855eaaSJohn Baldwin	ext	v30.16b,v30.16b,v30.16b,#8
1166c0855eaaSJohn Baldwin	eor	w11,w11,w16
1167bc3d5698SJohn Baldwin	ext	v11.16b,v11.16b,v11.16b,#12
1168c0855eaaSJohn Baldwin	eor	w12,w12,w13
1169bc3d5698SJohn Baldwin	ext	v15.16b,v15.16b,v15.16b,#12
1170c0855eaaSJohn Baldwin	eor	w9,w9,w14
1171bc3d5698SJohn Baldwin	ext	v19.16b,v19.16b,v19.16b,#12
1172c0855eaaSJohn Baldwin	ror	w10,w10,#25
1173bc3d5698SJohn Baldwin	ext	v23.16b,v23.16b,v23.16b,#12
1174c0855eaaSJohn Baldwin	ror	w11,w11,#25
1175c0855eaaSJohn Baldwin	ext	v27.16b,v27.16b,v27.16b,#12
1176c0855eaaSJohn Baldwin	ror	w12,w12,#25
1177c0855eaaSJohn Baldwin	ext	v31.16b,v31.16b,v31.16b,#12
1178c0855eaaSJohn Baldwin	ror	w9,w9,#25
1179bc3d5698SJohn Baldwin	ext	v9.16b,v9.16b,v9.16b,#4
1180bc3d5698SJohn Baldwin	ext	v13.16b,v13.16b,v13.16b,#4
1181bc3d5698SJohn Baldwin	ext	v17.16b,v17.16b,v17.16b,#4
1182bc3d5698SJohn Baldwin	ext	v21.16b,v21.16b,v21.16b,#4
1183c0855eaaSJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#4
1184c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#4
1185bc3d5698SJohn Baldwin	add	v8.4s,v8.4s,v9.4s
1186c0855eaaSJohn Baldwin	add	w5,w5,w9
1187bc3d5698SJohn Baldwin	add	v12.4s,v12.4s,v13.4s
1188c0855eaaSJohn Baldwin	add	w6,w6,w10
1189bc3d5698SJohn Baldwin	add	v16.4s,v16.4s,v17.4s
1190c0855eaaSJohn Baldwin	add	w7,w7,w11
1191bc3d5698SJohn Baldwin	add	v20.4s,v20.4s,v21.4s
1192c0855eaaSJohn Baldwin	add	w8,w8,w12
1193c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
1194c0855eaaSJohn Baldwin	eor	w17,w17,w5
1195c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
1196bc3d5698SJohn Baldwin	eor	w19,w19,w6
1197bc3d5698SJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1198bc3d5698SJohn Baldwin	eor	w20,w20,w7
1199c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1200bc3d5698SJohn Baldwin	eor	w21,w21,w8
1201c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1202bc3d5698SJohn Baldwin	ror	w17,w17,#16
1203c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1204bc3d5698SJohn Baldwin	ror	w19,w19,#16
1205c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1206bc3d5698SJohn Baldwin	ror	w20,w20,#16
1207c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1208c0855eaaSJohn Baldwin	ror	w21,w21,#16
1209c0855eaaSJohn Baldwin	rev32	v11.8h,v11.8h
1210c0855eaaSJohn Baldwin	add	w13,w13,w17
1211c0855eaaSJohn Baldwin	rev32	v15.8h,v15.8h
1212c0855eaaSJohn Baldwin	add	w14,w14,w19
1213c0855eaaSJohn Baldwin	rev32	v19.8h,v19.8h
1214c0855eaaSJohn Baldwin	add	w15,w15,w20
1215c0855eaaSJohn Baldwin	rev32	v23.8h,v23.8h
1216c0855eaaSJohn Baldwin	add	w16,w16,w21
1217c0855eaaSJohn Baldwin	rev32	v27.8h,v27.8h
1218c0855eaaSJohn Baldwin	eor	w9,w9,w13
1219c0855eaaSJohn Baldwin	rev32	v31.8h,v31.8h
1220c0855eaaSJohn Baldwin	eor	w10,w10,w14
1221bc3d5698SJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1222c0855eaaSJohn Baldwin	eor	w11,w11,w15
1223bc3d5698SJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1224c0855eaaSJohn Baldwin	eor	w12,w12,w16
1225bc3d5698SJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1226bc3d5698SJohn Baldwin	ror	w9,w9,#20
1227c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1228c0855eaaSJohn Baldwin	ror	w10,w10,#20
1229c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1230c0855eaaSJohn Baldwin	ror	w11,w11,#20
1231c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1232c0855eaaSJohn Baldwin	ror	w12,w12,#20
1233c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1234c0855eaaSJohn Baldwin	add	w5,w5,w9
1235c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1236c0855eaaSJohn Baldwin	add	w6,w6,w10
1237c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1238c0855eaaSJohn Baldwin	add	w7,w7,w11
1239c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1240c0855eaaSJohn Baldwin	add	w8,w8,w12
1241c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1242c0855eaaSJohn Baldwin	eor	w17,w17,w5
1243c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1244c0855eaaSJohn Baldwin	eor	w19,w19,w6
1245c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#20
1246c0855eaaSJohn Baldwin	eor	w20,w20,w7
1247c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#20
1248c0855eaaSJohn Baldwin	eor	w21,w21,w8
1249c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#20
1250bc3d5698SJohn Baldwin	ror	w17,w17,#24
1251c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#20
1252bc3d5698SJohn Baldwin	ror	w19,w19,#24
1253c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#20
1254bc3d5698SJohn Baldwin	ror	w20,w20,#24
1255c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#20
1256c0855eaaSJohn Baldwin	ror	w21,w21,#24
1257c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#12
1258c0855eaaSJohn Baldwin	add	w13,w13,w17
1259c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#12
1260c0855eaaSJohn Baldwin	add	w14,w14,w19
1261c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#12
1262c0855eaaSJohn Baldwin	add	w15,w15,w20
1263c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#12
1264c0855eaaSJohn Baldwin	add	w16,w16,w21
1265c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#12
1266c0855eaaSJohn Baldwin	eor	w9,w9,w13
1267c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#12
1268c0855eaaSJohn Baldwin	eor	w10,w10,w14
1269c0855eaaSJohn Baldwin	add	v8.4s,v8.4s,v9.4s
1270c0855eaaSJohn Baldwin	eor	w11,w11,w15
1271c0855eaaSJohn Baldwin	add	v12.4s,v12.4s,v13.4s
1272c0855eaaSJohn Baldwin	eor	w12,w12,w16
1273c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v17.4s
1274bc3d5698SJohn Baldwin	ror	w9,w9,#25
1275c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v21.4s
1276c0855eaaSJohn Baldwin	ror	w10,w10,#25
1277c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
1278c0855eaaSJohn Baldwin	ror	w11,w11,#25
1279c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
1280c0855eaaSJohn Baldwin	ror	w12,w12,#25
1281c0855eaaSJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1282c0855eaaSJohn Baldwin	add	w5,w5,w10
1283c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1284c0855eaaSJohn Baldwin	add	w6,w6,w11
1285c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1286c0855eaaSJohn Baldwin	add	w7,w7,w12
1287c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1288c0855eaaSJohn Baldwin	add	w8,w8,w9
1289c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1290c0855eaaSJohn Baldwin	eor	w21,w21,w5
1291c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1292c0855eaaSJohn Baldwin	eor	w17,w17,w6
1293c0855eaaSJohn Baldwin	tbl	v11.16b,{v11.16b},v6.16b
1294c0855eaaSJohn Baldwin	eor	w19,w19,w7
1295c0855eaaSJohn Baldwin	tbl	v15.16b,{v15.16b},v6.16b
1296c0855eaaSJohn Baldwin	eor	w20,w20,w8
1297c0855eaaSJohn Baldwin	tbl	v19.16b,{v19.16b},v6.16b
1298c0855eaaSJohn Baldwin	ror	w21,w21,#16
1299c0855eaaSJohn Baldwin	tbl	v23.16b,{v23.16b},v6.16b
1300c0855eaaSJohn Baldwin	ror	w17,w17,#16
1301c0855eaaSJohn Baldwin	tbl	v27.16b,{v27.16b},v6.16b
1302c0855eaaSJohn Baldwin	ror	w19,w19,#16
1303c0855eaaSJohn Baldwin	tbl	v31.16b,{v31.16b},v6.16b
1304c0855eaaSJohn Baldwin	ror	w20,w20,#16
1305c0855eaaSJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1306c0855eaaSJohn Baldwin	add	w15,w15,w21
1307c0855eaaSJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1308c0855eaaSJohn Baldwin	add	w16,w16,w17
1309c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1310c0855eaaSJohn Baldwin	add	w13,w13,w19
1311c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1312c0855eaaSJohn Baldwin	add	w14,w14,w20
1313c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1314c0855eaaSJohn Baldwin	eor	w10,w10,w15
1315c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1316c0855eaaSJohn Baldwin	eor	w11,w11,w16
1317c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1318c0855eaaSJohn Baldwin	eor	w12,w12,w13
1319c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1320c0855eaaSJohn Baldwin	eor	w9,w9,w14
1321c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1322c0855eaaSJohn Baldwin	ror	w10,w10,#20
1323c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1324c0855eaaSJohn Baldwin	ror	w11,w11,#20
1325c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1326c0855eaaSJohn Baldwin	ror	w12,w12,#20
1327c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1328c0855eaaSJohn Baldwin	ror	w9,w9,#20
1329c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#25
1330c0855eaaSJohn Baldwin	add	w5,w5,w10
1331c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#25
1332c0855eaaSJohn Baldwin	add	w6,w6,w11
1333c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#25
1334c0855eaaSJohn Baldwin	add	w7,w7,w12
1335c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#25
1336c0855eaaSJohn Baldwin	add	w8,w8,w9
1337c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#25
1338c0855eaaSJohn Baldwin	eor	w21,w21,w5
1339c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#25
1340c0855eaaSJohn Baldwin	eor	w17,w17,w6
1341c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#7
1342c0855eaaSJohn Baldwin	eor	w19,w19,w7
1343c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#7
1344c0855eaaSJohn Baldwin	eor	w20,w20,w8
1345c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#7
1346c0855eaaSJohn Baldwin	ror	w21,w21,#24
1347c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#7
1348c0855eaaSJohn Baldwin	ror	w17,w17,#24
1349c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#7
1350c0855eaaSJohn Baldwin	ror	w19,w19,#24
1351c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#7
1352c0855eaaSJohn Baldwin	ror	w20,w20,#24
1353c0855eaaSJohn Baldwin	ext	v10.16b,v10.16b,v10.16b,#8
1354c0855eaaSJohn Baldwin	add	w15,w15,w21
1355c0855eaaSJohn Baldwin	ext	v14.16b,v14.16b,v14.16b,#8
1356c0855eaaSJohn Baldwin	add	w16,w16,w17
1357c0855eaaSJohn Baldwin	ext	v18.16b,v18.16b,v18.16b,#8
1358c0855eaaSJohn Baldwin	add	w13,w13,w19
1359c0855eaaSJohn Baldwin	ext	v22.16b,v22.16b,v22.16b,#8
1360c0855eaaSJohn Baldwin	add	w14,w14,w20
1361c0855eaaSJohn Baldwin	ext	v26.16b,v26.16b,v26.16b,#8
1362c0855eaaSJohn Baldwin	eor	w10,w10,w15
1363c0855eaaSJohn Baldwin	ext	v30.16b,v30.16b,v30.16b,#8
1364c0855eaaSJohn Baldwin	eor	w11,w11,w16
1365bc3d5698SJohn Baldwin	ext	v11.16b,v11.16b,v11.16b,#4
1366c0855eaaSJohn Baldwin	eor	w12,w12,w13
1367bc3d5698SJohn Baldwin	ext	v15.16b,v15.16b,v15.16b,#4
1368c0855eaaSJohn Baldwin	eor	w9,w9,w14
1369bc3d5698SJohn Baldwin	ext	v19.16b,v19.16b,v19.16b,#4
1370c0855eaaSJohn Baldwin	ror	w10,w10,#25
1371bc3d5698SJohn Baldwin	ext	v23.16b,v23.16b,v23.16b,#4
1372c0855eaaSJohn Baldwin	ror	w11,w11,#25
1373c0855eaaSJohn Baldwin	ext	v27.16b,v27.16b,v27.16b,#4
1374c0855eaaSJohn Baldwin	ror	w12,w12,#25
1375c0855eaaSJohn Baldwin	ext	v31.16b,v31.16b,v31.16b,#4
1376c0855eaaSJohn Baldwin	ror	w9,w9,#25
1377bc3d5698SJohn Baldwin	ext	v9.16b,v9.16b,v9.16b,#12
1378bc3d5698SJohn Baldwin	ext	v13.16b,v13.16b,v13.16b,#12
1379bc3d5698SJohn Baldwin	ext	v17.16b,v17.16b,v17.16b,#12
1380bc3d5698SJohn Baldwin	ext	v21.16b,v21.16b,v21.16b,#12
1381c0855eaaSJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#12
1382c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#12
1383bc3d5698SJohn Baldwin	cbnz	x4,.Loop_upper_neon
1384bc3d5698SJohn Baldwin
1385bc3d5698SJohn Baldwin	add	w5,w5,w22		// accumulate key block
1386bc3d5698SJohn Baldwin	add	x6,x6,x22,lsr#32
1387bc3d5698SJohn Baldwin	add	w7,w7,w23
1388bc3d5698SJohn Baldwin	add	x8,x8,x23,lsr#32
1389bc3d5698SJohn Baldwin	add	w9,w9,w24
1390bc3d5698SJohn Baldwin	add	x10,x10,x24,lsr#32
1391bc3d5698SJohn Baldwin	add	w11,w11,w25
1392bc3d5698SJohn Baldwin	add	x12,x12,x25,lsr#32
1393bc3d5698SJohn Baldwin	add	w13,w13,w26
1394bc3d5698SJohn Baldwin	add	x14,x14,x26,lsr#32
1395bc3d5698SJohn Baldwin	add	w15,w15,w27
1396bc3d5698SJohn Baldwin	add	x16,x16,x27,lsr#32
1397bc3d5698SJohn Baldwin	add	w17,w17,w28
1398bc3d5698SJohn Baldwin	add	x19,x19,x28,lsr#32
1399bc3d5698SJohn Baldwin	add	w20,w20,w30
1400bc3d5698SJohn Baldwin	add	x21,x21,x30,lsr#32
1401bc3d5698SJohn Baldwin
1402bc3d5698SJohn Baldwin	add	x5,x5,x6,lsl#32	// pack
1403bc3d5698SJohn Baldwin	add	x7,x7,x8,lsl#32
1404bc3d5698SJohn Baldwin	ldp	x6,x8,[x1,#0]		// load input
1405bc3d5698SJohn Baldwin	add	x9,x9,x10,lsl#32
1406bc3d5698SJohn Baldwin	add	x11,x11,x12,lsl#32
1407bc3d5698SJohn Baldwin	ldp	x10,x12,[x1,#16]
1408bc3d5698SJohn Baldwin	add	x13,x13,x14,lsl#32
1409bc3d5698SJohn Baldwin	add	x15,x15,x16,lsl#32
1410bc3d5698SJohn Baldwin	ldp	x14,x16,[x1,#32]
1411bc3d5698SJohn Baldwin	add	x17,x17,x19,lsl#32
1412bc3d5698SJohn Baldwin	add	x20,x20,x21,lsl#32
1413bc3d5698SJohn Baldwin	ldp	x19,x21,[x1,#48]
1414bc3d5698SJohn Baldwin	add	x1,x1,#64
1415c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
1416bc3d5698SJohn Baldwin	rev	x5,x5
1417bc3d5698SJohn Baldwin	rev	x7,x7
1418bc3d5698SJohn Baldwin	rev	x9,x9
1419bc3d5698SJohn Baldwin	rev	x11,x11
1420bc3d5698SJohn Baldwin	rev	x13,x13
1421bc3d5698SJohn Baldwin	rev	x15,x15
1422bc3d5698SJohn Baldwin	rev	x17,x17
1423bc3d5698SJohn Baldwin	rev	x20,x20
1424bc3d5698SJohn Baldwin#endif
1425bc3d5698SJohn Baldwin	eor	x5,x5,x6
1426bc3d5698SJohn Baldwin	eor	x7,x7,x8
1427bc3d5698SJohn Baldwin	eor	x9,x9,x10
1428bc3d5698SJohn Baldwin	eor	x11,x11,x12
1429bc3d5698SJohn Baldwin	eor	x13,x13,x14
1430bc3d5698SJohn Baldwin	eor	x15,x15,x16
1431bc3d5698SJohn Baldwin	eor	x17,x17,x19
1432bc3d5698SJohn Baldwin	eor	x20,x20,x21
1433bc3d5698SJohn Baldwin
1434bc3d5698SJohn Baldwin	stp	x5,x7,[x0,#0]		// store output
1435bc3d5698SJohn Baldwin	add	x28,x28,#1			// increment counter
1436bc3d5698SJohn Baldwin	mov	w5,w22			// unpack key block
1437bc3d5698SJohn Baldwin	lsr	x6,x22,#32
1438bc3d5698SJohn Baldwin	stp	x9,x11,[x0,#16]
1439bc3d5698SJohn Baldwin	mov	w7,w23
1440bc3d5698SJohn Baldwin	lsr	x8,x23,#32
1441bc3d5698SJohn Baldwin	stp	x13,x15,[x0,#32]
1442bc3d5698SJohn Baldwin	mov	w9,w24
1443bc3d5698SJohn Baldwin	lsr	x10,x24,#32
1444bc3d5698SJohn Baldwin	stp	x17,x20,[x0,#48]
1445bc3d5698SJohn Baldwin	add	x0,x0,#64
1446bc3d5698SJohn Baldwin	mov	w11,w25
1447bc3d5698SJohn Baldwin	lsr	x12,x25,#32
1448bc3d5698SJohn Baldwin	mov	w13,w26
1449bc3d5698SJohn Baldwin	lsr	x14,x26,#32
1450bc3d5698SJohn Baldwin	mov	w15,w27
1451bc3d5698SJohn Baldwin	lsr	x16,x27,#32
1452bc3d5698SJohn Baldwin	mov	w17,w28
1453bc3d5698SJohn Baldwin	lsr	x19,x28,#32
1454bc3d5698SJohn Baldwin	mov	w20,w30
1455bc3d5698SJohn Baldwin	lsr	x21,x30,#32
1456bc3d5698SJohn Baldwin
1457bc3d5698SJohn Baldwin	mov	x4,#5
1458bc3d5698SJohn Baldwin.Loop_lower_neon:
1459bc3d5698SJohn Baldwin	sub	x4,x4,#1
1460bc3d5698SJohn Baldwin	add	v8.4s,v8.4s,v9.4s
1461c0855eaaSJohn Baldwin	add	w5,w5,w9
1462bc3d5698SJohn Baldwin	add	v12.4s,v12.4s,v13.4s
1463c0855eaaSJohn Baldwin	add	w6,w6,w10
1464bc3d5698SJohn Baldwin	add	v16.4s,v16.4s,v17.4s
1465c0855eaaSJohn Baldwin	add	w7,w7,w11
1466bc3d5698SJohn Baldwin	add	v20.4s,v20.4s,v21.4s
1467c0855eaaSJohn Baldwin	add	w8,w8,w12
1468c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
1469c0855eaaSJohn Baldwin	eor	w17,w17,w5
1470c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
1471bc3d5698SJohn Baldwin	eor	w19,w19,w6
1472bc3d5698SJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1473bc3d5698SJohn Baldwin	eor	w20,w20,w7
1474c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1475bc3d5698SJohn Baldwin	eor	w21,w21,w8
1476c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1477bc3d5698SJohn Baldwin	ror	w17,w17,#16
1478c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1479bc3d5698SJohn Baldwin	ror	w19,w19,#16
1480c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1481bc3d5698SJohn Baldwin	ror	w20,w20,#16
1482c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1483c0855eaaSJohn Baldwin	ror	w21,w21,#16
1484c0855eaaSJohn Baldwin	rev32	v11.8h,v11.8h
1485c0855eaaSJohn Baldwin	add	w13,w13,w17
1486c0855eaaSJohn Baldwin	rev32	v15.8h,v15.8h
1487c0855eaaSJohn Baldwin	add	w14,w14,w19
1488c0855eaaSJohn Baldwin	rev32	v19.8h,v19.8h
1489c0855eaaSJohn Baldwin	add	w15,w15,w20
1490c0855eaaSJohn Baldwin	rev32	v23.8h,v23.8h
1491c0855eaaSJohn Baldwin	add	w16,w16,w21
1492c0855eaaSJohn Baldwin	rev32	v27.8h,v27.8h
1493c0855eaaSJohn Baldwin	eor	w9,w9,w13
1494c0855eaaSJohn Baldwin	rev32	v31.8h,v31.8h
1495c0855eaaSJohn Baldwin	eor	w10,w10,w14
1496bc3d5698SJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1497c0855eaaSJohn Baldwin	eor	w11,w11,w15
1498bc3d5698SJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1499c0855eaaSJohn Baldwin	eor	w12,w12,w16
1500bc3d5698SJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1501bc3d5698SJohn Baldwin	ror	w9,w9,#20
1502c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1503c0855eaaSJohn Baldwin	ror	w10,w10,#20
1504c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1505c0855eaaSJohn Baldwin	ror	w11,w11,#20
1506c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1507c0855eaaSJohn Baldwin	ror	w12,w12,#20
1508c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1509c0855eaaSJohn Baldwin	add	w5,w5,w9
1510c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1511c0855eaaSJohn Baldwin	add	w6,w6,w10
1512c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1513c0855eaaSJohn Baldwin	add	w7,w7,w11
1514c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1515c0855eaaSJohn Baldwin	add	w8,w8,w12
1516c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1517c0855eaaSJohn Baldwin	eor	w17,w17,w5
1518c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1519c0855eaaSJohn Baldwin	eor	w19,w19,w6
1520c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#20
1521c0855eaaSJohn Baldwin	eor	w20,w20,w7
1522c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#20
1523c0855eaaSJohn Baldwin	eor	w21,w21,w8
1524c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#20
1525bc3d5698SJohn Baldwin	ror	w17,w17,#24
1526c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#20
1527bc3d5698SJohn Baldwin	ror	w19,w19,#24
1528c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#20
1529bc3d5698SJohn Baldwin	ror	w20,w20,#24
1530c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#20
1531c0855eaaSJohn Baldwin	ror	w21,w21,#24
1532c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#12
1533c0855eaaSJohn Baldwin	add	w13,w13,w17
1534c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#12
1535c0855eaaSJohn Baldwin	add	w14,w14,w19
1536c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#12
1537c0855eaaSJohn Baldwin	add	w15,w15,w20
1538c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#12
1539c0855eaaSJohn Baldwin	add	w16,w16,w21
1540c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#12
1541c0855eaaSJohn Baldwin	eor	w9,w9,w13
1542c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#12
1543c0855eaaSJohn Baldwin	eor	w10,w10,w14
1544c0855eaaSJohn Baldwin	add	v8.4s,v8.4s,v9.4s
1545c0855eaaSJohn Baldwin	eor	w11,w11,w15
1546c0855eaaSJohn Baldwin	add	v12.4s,v12.4s,v13.4s
1547c0855eaaSJohn Baldwin	eor	w12,w12,w16
1548c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v17.4s
1549bc3d5698SJohn Baldwin	ror	w9,w9,#25
1550c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v21.4s
1551c0855eaaSJohn Baldwin	ror	w10,w10,#25
1552c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
1553c0855eaaSJohn Baldwin	ror	w11,w11,#25
1554c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
1555c0855eaaSJohn Baldwin	ror	w12,w12,#25
1556c0855eaaSJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1557c0855eaaSJohn Baldwin	add	w5,w5,w10
1558c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1559c0855eaaSJohn Baldwin	add	w6,w6,w11
1560c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1561c0855eaaSJohn Baldwin	add	w7,w7,w12
1562c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1563c0855eaaSJohn Baldwin	add	w8,w8,w9
1564c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1565c0855eaaSJohn Baldwin	eor	w21,w21,w5
1566c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1567c0855eaaSJohn Baldwin	eor	w17,w17,w6
1568c0855eaaSJohn Baldwin	tbl	v11.16b,{v11.16b},v6.16b
1569c0855eaaSJohn Baldwin	eor	w19,w19,w7
1570c0855eaaSJohn Baldwin	tbl	v15.16b,{v15.16b},v6.16b
1571c0855eaaSJohn Baldwin	eor	w20,w20,w8
1572c0855eaaSJohn Baldwin	tbl	v19.16b,{v19.16b},v6.16b
1573c0855eaaSJohn Baldwin	ror	w21,w21,#16
1574c0855eaaSJohn Baldwin	tbl	v23.16b,{v23.16b},v6.16b
1575c0855eaaSJohn Baldwin	ror	w17,w17,#16
1576c0855eaaSJohn Baldwin	tbl	v27.16b,{v27.16b},v6.16b
1577c0855eaaSJohn Baldwin	ror	w19,w19,#16
1578c0855eaaSJohn Baldwin	tbl	v31.16b,{v31.16b},v6.16b
1579c0855eaaSJohn Baldwin	ror	w20,w20,#16
1580c0855eaaSJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1581c0855eaaSJohn Baldwin	add	w15,w15,w21
1582c0855eaaSJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1583c0855eaaSJohn Baldwin	add	w16,w16,w17
1584c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1585c0855eaaSJohn Baldwin	add	w13,w13,w19
1586c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1587c0855eaaSJohn Baldwin	add	w14,w14,w20
1588c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1589c0855eaaSJohn Baldwin	eor	w10,w10,w15
1590c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1591c0855eaaSJohn Baldwin	eor	w11,w11,w16
1592c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1593c0855eaaSJohn Baldwin	eor	w12,w12,w13
1594c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1595c0855eaaSJohn Baldwin	eor	w9,w9,w14
1596c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1597c0855eaaSJohn Baldwin	ror	w10,w10,#20
1598c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1599c0855eaaSJohn Baldwin	ror	w11,w11,#20
1600c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1601c0855eaaSJohn Baldwin	ror	w12,w12,#20
1602c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1603c0855eaaSJohn Baldwin	ror	w9,w9,#20
1604c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#25
1605c0855eaaSJohn Baldwin	add	w5,w5,w10
1606c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#25
1607c0855eaaSJohn Baldwin	add	w6,w6,w11
1608c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#25
1609c0855eaaSJohn Baldwin	add	w7,w7,w12
1610c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#25
1611c0855eaaSJohn Baldwin	add	w8,w8,w9
1612c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#25
1613c0855eaaSJohn Baldwin	eor	w21,w21,w5
1614c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#25
1615c0855eaaSJohn Baldwin	eor	w17,w17,w6
1616c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#7
1617c0855eaaSJohn Baldwin	eor	w19,w19,w7
1618c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#7
1619c0855eaaSJohn Baldwin	eor	w20,w20,w8
1620c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#7
1621c0855eaaSJohn Baldwin	ror	w21,w21,#24
1622c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#7
1623c0855eaaSJohn Baldwin	ror	w17,w17,#24
1624c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#7
1625c0855eaaSJohn Baldwin	ror	w19,w19,#24
1626c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#7
1627c0855eaaSJohn Baldwin	ror	w20,w20,#24
1628c0855eaaSJohn Baldwin	ext	v10.16b,v10.16b,v10.16b,#8
1629c0855eaaSJohn Baldwin	add	w15,w15,w21
1630c0855eaaSJohn Baldwin	ext	v14.16b,v14.16b,v14.16b,#8
1631c0855eaaSJohn Baldwin	add	w16,w16,w17
1632c0855eaaSJohn Baldwin	ext	v18.16b,v18.16b,v18.16b,#8
1633c0855eaaSJohn Baldwin	add	w13,w13,w19
1634c0855eaaSJohn Baldwin	ext	v22.16b,v22.16b,v22.16b,#8
1635c0855eaaSJohn Baldwin	add	w14,w14,w20
1636c0855eaaSJohn Baldwin	ext	v26.16b,v26.16b,v26.16b,#8
1637c0855eaaSJohn Baldwin	eor	w10,w10,w15
1638c0855eaaSJohn Baldwin	ext	v30.16b,v30.16b,v30.16b,#8
1639c0855eaaSJohn Baldwin	eor	w11,w11,w16
1640bc3d5698SJohn Baldwin	ext	v11.16b,v11.16b,v11.16b,#12
1641c0855eaaSJohn Baldwin	eor	w12,w12,w13
1642bc3d5698SJohn Baldwin	ext	v15.16b,v15.16b,v15.16b,#12
1643c0855eaaSJohn Baldwin	eor	w9,w9,w14
1644bc3d5698SJohn Baldwin	ext	v19.16b,v19.16b,v19.16b,#12
1645c0855eaaSJohn Baldwin	ror	w10,w10,#25
1646bc3d5698SJohn Baldwin	ext	v23.16b,v23.16b,v23.16b,#12
1647c0855eaaSJohn Baldwin	ror	w11,w11,#25
1648c0855eaaSJohn Baldwin	ext	v27.16b,v27.16b,v27.16b,#12
1649c0855eaaSJohn Baldwin	ror	w12,w12,#25
1650c0855eaaSJohn Baldwin	ext	v31.16b,v31.16b,v31.16b,#12
1651c0855eaaSJohn Baldwin	ror	w9,w9,#25
1652bc3d5698SJohn Baldwin	ext	v9.16b,v9.16b,v9.16b,#4
1653bc3d5698SJohn Baldwin	ext	v13.16b,v13.16b,v13.16b,#4
1654bc3d5698SJohn Baldwin	ext	v17.16b,v17.16b,v17.16b,#4
1655bc3d5698SJohn Baldwin	ext	v21.16b,v21.16b,v21.16b,#4
1656c0855eaaSJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#4
1657c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#4
1658bc3d5698SJohn Baldwin	add	v8.4s,v8.4s,v9.4s
1659c0855eaaSJohn Baldwin	add	w5,w5,w9
1660bc3d5698SJohn Baldwin	add	v12.4s,v12.4s,v13.4s
1661c0855eaaSJohn Baldwin	add	w6,w6,w10
1662bc3d5698SJohn Baldwin	add	v16.4s,v16.4s,v17.4s
1663c0855eaaSJohn Baldwin	add	w7,w7,w11
1664bc3d5698SJohn Baldwin	add	v20.4s,v20.4s,v21.4s
1665c0855eaaSJohn Baldwin	add	w8,w8,w12
1666c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
1667c0855eaaSJohn Baldwin	eor	w17,w17,w5
1668c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
1669bc3d5698SJohn Baldwin	eor	w19,w19,w6
1670bc3d5698SJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1671bc3d5698SJohn Baldwin	eor	w20,w20,w7
1672c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1673bc3d5698SJohn Baldwin	eor	w21,w21,w8
1674c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1675bc3d5698SJohn Baldwin	ror	w17,w17,#16
1676c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1677bc3d5698SJohn Baldwin	ror	w19,w19,#16
1678c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1679bc3d5698SJohn Baldwin	ror	w20,w20,#16
1680c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1681c0855eaaSJohn Baldwin	ror	w21,w21,#16
1682c0855eaaSJohn Baldwin	rev32	v11.8h,v11.8h
1683c0855eaaSJohn Baldwin	add	w13,w13,w17
1684c0855eaaSJohn Baldwin	rev32	v15.8h,v15.8h
1685c0855eaaSJohn Baldwin	add	w14,w14,w19
1686c0855eaaSJohn Baldwin	rev32	v19.8h,v19.8h
1687c0855eaaSJohn Baldwin	add	w15,w15,w20
1688c0855eaaSJohn Baldwin	rev32	v23.8h,v23.8h
1689c0855eaaSJohn Baldwin	add	w16,w16,w21
1690c0855eaaSJohn Baldwin	rev32	v27.8h,v27.8h
1691c0855eaaSJohn Baldwin	eor	w9,w9,w13
1692c0855eaaSJohn Baldwin	rev32	v31.8h,v31.8h
1693c0855eaaSJohn Baldwin	eor	w10,w10,w14
1694bc3d5698SJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1695c0855eaaSJohn Baldwin	eor	w11,w11,w15
1696bc3d5698SJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1697c0855eaaSJohn Baldwin	eor	w12,w12,w16
1698bc3d5698SJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1699bc3d5698SJohn Baldwin	ror	w9,w9,#20
1700c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1701c0855eaaSJohn Baldwin	ror	w10,w10,#20
1702c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1703c0855eaaSJohn Baldwin	ror	w11,w11,#20
1704c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1705c0855eaaSJohn Baldwin	ror	w12,w12,#20
1706c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1707c0855eaaSJohn Baldwin	add	w5,w5,w9
1708c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1709c0855eaaSJohn Baldwin	add	w6,w6,w10
1710c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1711c0855eaaSJohn Baldwin	add	w7,w7,w11
1712c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1713c0855eaaSJohn Baldwin	add	w8,w8,w12
1714c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1715c0855eaaSJohn Baldwin	eor	w17,w17,w5
1716c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1717c0855eaaSJohn Baldwin	eor	w19,w19,w6
1718c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#20
1719c0855eaaSJohn Baldwin	eor	w20,w20,w7
1720c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#20
1721c0855eaaSJohn Baldwin	eor	w21,w21,w8
1722c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#20
1723bc3d5698SJohn Baldwin	ror	w17,w17,#24
1724c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#20
1725bc3d5698SJohn Baldwin	ror	w19,w19,#24
1726c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#20
1727bc3d5698SJohn Baldwin	ror	w20,w20,#24
1728c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#20
1729c0855eaaSJohn Baldwin	ror	w21,w21,#24
1730c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#12
1731c0855eaaSJohn Baldwin	add	w13,w13,w17
1732c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#12
1733c0855eaaSJohn Baldwin	add	w14,w14,w19
1734c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#12
1735c0855eaaSJohn Baldwin	add	w15,w15,w20
1736c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#12
1737c0855eaaSJohn Baldwin	add	w16,w16,w21
1738c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#12
1739c0855eaaSJohn Baldwin	eor	w9,w9,w13
1740c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#12
1741c0855eaaSJohn Baldwin	eor	w10,w10,w14
1742c0855eaaSJohn Baldwin	add	v8.4s,v8.4s,v9.4s
1743c0855eaaSJohn Baldwin	eor	w11,w11,w15
1744c0855eaaSJohn Baldwin	add	v12.4s,v12.4s,v13.4s
1745c0855eaaSJohn Baldwin	eor	w12,w12,w16
1746c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v17.4s
1747bc3d5698SJohn Baldwin	ror	w9,w9,#25
1748c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v21.4s
1749c0855eaaSJohn Baldwin	ror	w10,w10,#25
1750c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v25.4s
1751c0855eaaSJohn Baldwin	ror	w11,w11,#25
1752c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v29.4s
1753c0855eaaSJohn Baldwin	ror	w12,w12,#25
1754c0855eaaSJohn Baldwin	eor	v11.16b,v11.16b,v8.16b
1755c0855eaaSJohn Baldwin	add	w5,w5,w10
1756c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v12.16b
1757c0855eaaSJohn Baldwin	add	w6,w6,w11
1758c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v16.16b
1759c0855eaaSJohn Baldwin	add	w7,w7,w12
1760c0855eaaSJohn Baldwin	eor	v23.16b,v23.16b,v20.16b
1761c0855eaaSJohn Baldwin	add	w8,w8,w9
1762c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v24.16b
1763c0855eaaSJohn Baldwin	eor	w21,w21,w5
1764c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v28.16b
1765c0855eaaSJohn Baldwin	eor	w17,w17,w6
1766c0855eaaSJohn Baldwin	tbl	v11.16b,{v11.16b},v6.16b
1767c0855eaaSJohn Baldwin	eor	w19,w19,w7
1768c0855eaaSJohn Baldwin	tbl	v15.16b,{v15.16b},v6.16b
1769c0855eaaSJohn Baldwin	eor	w20,w20,w8
1770c0855eaaSJohn Baldwin	tbl	v19.16b,{v19.16b},v6.16b
1771c0855eaaSJohn Baldwin	ror	w21,w21,#16
1772c0855eaaSJohn Baldwin	tbl	v23.16b,{v23.16b},v6.16b
1773c0855eaaSJohn Baldwin	ror	w17,w17,#16
1774c0855eaaSJohn Baldwin	tbl	v27.16b,{v27.16b},v6.16b
1775c0855eaaSJohn Baldwin	ror	w19,w19,#16
1776c0855eaaSJohn Baldwin	tbl	v31.16b,{v31.16b},v6.16b
1777c0855eaaSJohn Baldwin	ror	w20,w20,#16
1778c0855eaaSJohn Baldwin	add	v10.4s,v10.4s,v11.4s
1779c0855eaaSJohn Baldwin	add	w15,w15,w21
1780c0855eaaSJohn Baldwin	add	v14.4s,v14.4s,v15.4s
1781c0855eaaSJohn Baldwin	add	w16,w16,w17
1782c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v19.4s
1783c0855eaaSJohn Baldwin	add	w13,w13,w19
1784c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v23.4s
1785c0855eaaSJohn Baldwin	add	w14,w14,w20
1786c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v27.4s
1787c0855eaaSJohn Baldwin	eor	w10,w10,w15
1788c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v31.4s
1789c0855eaaSJohn Baldwin	eor	w11,w11,w16
1790c0855eaaSJohn Baldwin	eor	v0.16b,v9.16b,v10.16b
1791c0855eaaSJohn Baldwin	eor	w12,w12,w13
1792c0855eaaSJohn Baldwin	eor	v1.16b,v13.16b,v14.16b
1793c0855eaaSJohn Baldwin	eor	w9,w9,w14
1794c0855eaaSJohn Baldwin	eor	v2.16b,v17.16b,v18.16b
1795c0855eaaSJohn Baldwin	ror	w10,w10,#20
1796c0855eaaSJohn Baldwin	eor	v3.16b,v21.16b,v22.16b
1797c0855eaaSJohn Baldwin	ror	w11,w11,#20
1798c0855eaaSJohn Baldwin	eor	v4.16b,v25.16b,v26.16b
1799c0855eaaSJohn Baldwin	ror	w12,w12,#20
1800c0855eaaSJohn Baldwin	eor	v5.16b,v29.16b,v30.16b
1801c0855eaaSJohn Baldwin	ror	w9,w9,#20
1802c0855eaaSJohn Baldwin	ushr	v9.4s,v0.4s,#25
1803c0855eaaSJohn Baldwin	add	w5,w5,w10
1804c0855eaaSJohn Baldwin	ushr	v13.4s,v1.4s,#25
1805c0855eaaSJohn Baldwin	add	w6,w6,w11
1806c0855eaaSJohn Baldwin	ushr	v17.4s,v2.4s,#25
1807c0855eaaSJohn Baldwin	add	w7,w7,w12
1808c0855eaaSJohn Baldwin	ushr	v21.4s,v3.4s,#25
1809c0855eaaSJohn Baldwin	add	w8,w8,w9
1810c0855eaaSJohn Baldwin	ushr	v25.4s,v4.4s,#25
1811c0855eaaSJohn Baldwin	eor	w21,w21,w5
1812c0855eaaSJohn Baldwin	ushr	v29.4s,v5.4s,#25
1813c0855eaaSJohn Baldwin	eor	w17,w17,w6
1814c0855eaaSJohn Baldwin	sli	v9.4s,v0.4s,#7
1815c0855eaaSJohn Baldwin	eor	w19,w19,w7
1816c0855eaaSJohn Baldwin	sli	v13.4s,v1.4s,#7
1817c0855eaaSJohn Baldwin	eor	w20,w20,w8
1818c0855eaaSJohn Baldwin	sli	v17.4s,v2.4s,#7
1819c0855eaaSJohn Baldwin	ror	w21,w21,#24
1820c0855eaaSJohn Baldwin	sli	v21.4s,v3.4s,#7
1821c0855eaaSJohn Baldwin	ror	w17,w17,#24
1822c0855eaaSJohn Baldwin	sli	v25.4s,v4.4s,#7
1823c0855eaaSJohn Baldwin	ror	w19,w19,#24
1824c0855eaaSJohn Baldwin	sli	v29.4s,v5.4s,#7
1825c0855eaaSJohn Baldwin	ror	w20,w20,#24
1826c0855eaaSJohn Baldwin	ext	v10.16b,v10.16b,v10.16b,#8
1827c0855eaaSJohn Baldwin	add	w15,w15,w21
1828c0855eaaSJohn Baldwin	ext	v14.16b,v14.16b,v14.16b,#8
1829c0855eaaSJohn Baldwin	add	w16,w16,w17
1830c0855eaaSJohn Baldwin	ext	v18.16b,v18.16b,v18.16b,#8
1831c0855eaaSJohn Baldwin	add	w13,w13,w19
1832c0855eaaSJohn Baldwin	ext	v22.16b,v22.16b,v22.16b,#8
1833c0855eaaSJohn Baldwin	add	w14,w14,w20
1834c0855eaaSJohn Baldwin	ext	v26.16b,v26.16b,v26.16b,#8
1835c0855eaaSJohn Baldwin	eor	w10,w10,w15
1836c0855eaaSJohn Baldwin	ext	v30.16b,v30.16b,v30.16b,#8
1837c0855eaaSJohn Baldwin	eor	w11,w11,w16
1838bc3d5698SJohn Baldwin	ext	v11.16b,v11.16b,v11.16b,#4
1839c0855eaaSJohn Baldwin	eor	w12,w12,w13
1840bc3d5698SJohn Baldwin	ext	v15.16b,v15.16b,v15.16b,#4
1841c0855eaaSJohn Baldwin	eor	w9,w9,w14
1842bc3d5698SJohn Baldwin	ext	v19.16b,v19.16b,v19.16b,#4
1843c0855eaaSJohn Baldwin	ror	w10,w10,#25
1844bc3d5698SJohn Baldwin	ext	v23.16b,v23.16b,v23.16b,#4
1845c0855eaaSJohn Baldwin	ror	w11,w11,#25
1846c0855eaaSJohn Baldwin	ext	v27.16b,v27.16b,v27.16b,#4
1847c0855eaaSJohn Baldwin	ror	w12,w12,#25
1848c0855eaaSJohn Baldwin	ext	v31.16b,v31.16b,v31.16b,#4
1849c0855eaaSJohn Baldwin	ror	w9,w9,#25
1850bc3d5698SJohn Baldwin	ext	v9.16b,v9.16b,v9.16b,#12
1851bc3d5698SJohn Baldwin	ext	v13.16b,v13.16b,v13.16b,#12
1852bc3d5698SJohn Baldwin	ext	v17.16b,v17.16b,v17.16b,#12
1853bc3d5698SJohn Baldwin	ext	v21.16b,v21.16b,v21.16b,#12
1854c0855eaaSJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#12
1855c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#12
1856bc3d5698SJohn Baldwin	cbnz	x4,.Loop_lower_neon
1857bc3d5698SJohn Baldwin
1858bc3d5698SJohn Baldwin	add	w5,w5,w22		// accumulate key block
1859c0855eaaSJohn Baldwin	ldp	q0,q1,[sp,#0]
1860bc3d5698SJohn Baldwin	add	x6,x6,x22,lsr#32
1861c0855eaaSJohn Baldwin	ldp	q2,q3,[sp,#32]
1862bc3d5698SJohn Baldwin	add	w7,w7,w23
1863c0855eaaSJohn Baldwin	ldp	q4,q5,[sp,#64]
1864bc3d5698SJohn Baldwin	add	x8,x8,x23,lsr#32
1865c0855eaaSJohn Baldwin	ldr	q6,[sp,#96]
1866c0855eaaSJohn Baldwin	add	v8.4s,v8.4s,v0.4s
1867bc3d5698SJohn Baldwin	add	w9,w9,w24
1868c0855eaaSJohn Baldwin	add	v12.4s,v12.4s,v0.4s
1869bc3d5698SJohn Baldwin	add	x10,x10,x24,lsr#32
1870c0855eaaSJohn Baldwin	add	v16.4s,v16.4s,v0.4s
1871bc3d5698SJohn Baldwin	add	w11,w11,w25
1872c0855eaaSJohn Baldwin	add	v20.4s,v20.4s,v0.4s
1873bc3d5698SJohn Baldwin	add	x12,x12,x25,lsr#32
1874c0855eaaSJohn Baldwin	add	v24.4s,v24.4s,v0.4s
1875bc3d5698SJohn Baldwin	add	w13,w13,w26
1876c0855eaaSJohn Baldwin	add	v28.4s,v28.4s,v0.4s
1877bc3d5698SJohn Baldwin	add	x14,x14,x26,lsr#32
1878c0855eaaSJohn Baldwin	add	v10.4s,v10.4s,v2.4s
1879bc3d5698SJohn Baldwin	add	w15,w15,w27
1880c0855eaaSJohn Baldwin	add	v14.4s,v14.4s,v2.4s
1881bc3d5698SJohn Baldwin	add	x16,x16,x27,lsr#32
1882c0855eaaSJohn Baldwin	add	v18.4s,v18.4s,v2.4s
1883bc3d5698SJohn Baldwin	add	w17,w17,w28
1884c0855eaaSJohn Baldwin	add	v22.4s,v22.4s,v2.4s
1885bc3d5698SJohn Baldwin	add	x19,x19,x28,lsr#32
1886c0855eaaSJohn Baldwin	add	v26.4s,v26.4s,v2.4s
1887bc3d5698SJohn Baldwin	add	w20,w20,w30
1888c0855eaaSJohn Baldwin	add	v30.4s,v30.4s,v2.4s
1889bc3d5698SJohn Baldwin	add	x21,x21,x30,lsr#32
1890c0855eaaSJohn Baldwin	add	v27.4s,v27.4s,v7.4s			// +4
1891bc3d5698SJohn Baldwin	add	x5,x5,x6,lsl#32	// pack
1892c0855eaaSJohn Baldwin	add	v31.4s,v31.4s,v7.4s			// +4
1893bc3d5698SJohn Baldwin	add	x7,x7,x8,lsl#32
1894c0855eaaSJohn Baldwin	add	v11.4s,v11.4s,v3.4s
1895bc3d5698SJohn Baldwin	ldp	x6,x8,[x1,#0]		// load input
1896c0855eaaSJohn Baldwin	add	v15.4s,v15.4s,v4.4s
1897bc3d5698SJohn Baldwin	add	x9,x9,x10,lsl#32
1898c0855eaaSJohn Baldwin	add	v19.4s,v19.4s,v5.4s
1899bc3d5698SJohn Baldwin	add	x11,x11,x12,lsl#32
1900c0855eaaSJohn Baldwin	add	v23.4s,v23.4s,v6.4s
1901bc3d5698SJohn Baldwin	ldp	x10,x12,[x1,#16]
1902c0855eaaSJohn Baldwin	add	v27.4s,v27.4s,v3.4s
1903bc3d5698SJohn Baldwin	add	x13,x13,x14,lsl#32
1904c0855eaaSJohn Baldwin	add	v31.4s,v31.4s,v4.4s
1905bc3d5698SJohn Baldwin	add	x15,x15,x16,lsl#32
1906c0855eaaSJohn Baldwin	add	v9.4s,v9.4s,v1.4s
1907bc3d5698SJohn Baldwin	ldp	x14,x16,[x1,#32]
1908c0855eaaSJohn Baldwin	add	v13.4s,v13.4s,v1.4s
1909bc3d5698SJohn Baldwin	add	x17,x17,x19,lsl#32
1910c0855eaaSJohn Baldwin	add	v17.4s,v17.4s,v1.4s
1911bc3d5698SJohn Baldwin	add	x20,x20,x21,lsl#32
1912c0855eaaSJohn Baldwin	add	v21.4s,v21.4s,v1.4s
1913bc3d5698SJohn Baldwin	ldp	x19,x21,[x1,#48]
1914c0855eaaSJohn Baldwin	add	v25.4s,v25.4s,v1.4s
1915bc3d5698SJohn Baldwin	add	x1,x1,#64
1916c0855eaaSJohn Baldwin	add	v29.4s,v29.4s,v1.4s
1917bc3d5698SJohn Baldwin
1918c0855eaaSJohn Baldwin#ifdef	__AARCH64EB__
1919bc3d5698SJohn Baldwin	rev	x5,x5
1920bc3d5698SJohn Baldwin	rev	x7,x7
1921bc3d5698SJohn Baldwin	rev	x9,x9
1922bc3d5698SJohn Baldwin	rev	x11,x11
1923bc3d5698SJohn Baldwin	rev	x13,x13
1924bc3d5698SJohn Baldwin	rev	x15,x15
1925bc3d5698SJohn Baldwin	rev	x17,x17
1926bc3d5698SJohn Baldwin	rev	x20,x20
1927bc3d5698SJohn Baldwin#endif
1928c0855eaaSJohn Baldwin	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1929bc3d5698SJohn Baldwin	eor	x5,x5,x6
1930bc3d5698SJohn Baldwin	eor	x7,x7,x8
1931bc3d5698SJohn Baldwin	eor	x9,x9,x10
1932bc3d5698SJohn Baldwin	eor	x11,x11,x12
1933bc3d5698SJohn Baldwin	eor	x13,x13,x14
1934c0855eaaSJohn Baldwin	eor	v8.16b,v8.16b,v0.16b
1935bc3d5698SJohn Baldwin	eor	x15,x15,x16
1936c0855eaaSJohn Baldwin	eor	v9.16b,v9.16b,v1.16b
1937bc3d5698SJohn Baldwin	eor	x17,x17,x19
1938c0855eaaSJohn Baldwin	eor	v10.16b,v10.16b,v2.16b
1939bc3d5698SJohn Baldwin	eor	x20,x20,x21
1940c0855eaaSJohn Baldwin	eor	v11.16b,v11.16b,v3.16b
1941c0855eaaSJohn Baldwin	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1942bc3d5698SJohn Baldwin
1943bc3d5698SJohn Baldwin	stp	x5,x7,[x0,#0]		// store output
1944bc3d5698SJohn Baldwin	add	x28,x28,#7			// increment counter
1945bc3d5698SJohn Baldwin	stp	x9,x11,[x0,#16]
1946bc3d5698SJohn Baldwin	stp	x13,x15,[x0,#32]
1947bc3d5698SJohn Baldwin	stp	x17,x20,[x0,#48]
1948bc3d5698SJohn Baldwin	add	x0,x0,#64
1949bc3d5698SJohn Baldwin	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1950bc3d5698SJohn Baldwin
1951bc3d5698SJohn Baldwin	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1952c0855eaaSJohn Baldwin	eor	v12.16b,v12.16b,v0.16b
1953c0855eaaSJohn Baldwin	eor	v13.16b,v13.16b,v1.16b
1954c0855eaaSJohn Baldwin	eor	v14.16b,v14.16b,v2.16b
1955c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v3.16b
1956bc3d5698SJohn Baldwin	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1957bc3d5698SJohn Baldwin
1958bc3d5698SJohn Baldwin	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1959bc3d5698SJohn Baldwin	eor	v16.16b,v16.16b,v8.16b
1960c0855eaaSJohn Baldwin	ldp	q0,q1,[sp,#0]
1961bc3d5698SJohn Baldwin	eor	v17.16b,v17.16b,v9.16b
1962c0855eaaSJohn Baldwin	ldp	q2,q3,[sp,#32]
1963bc3d5698SJohn Baldwin	eor	v18.16b,v18.16b,v10.16b
1964bc3d5698SJohn Baldwin	eor	v19.16b,v19.16b,v11.16b
1965bc3d5698SJohn Baldwin	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1966bc3d5698SJohn Baldwin
1967c0855eaaSJohn Baldwin	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
1968bc3d5698SJohn Baldwin	eor	v20.16b,v20.16b,v12.16b
1969bc3d5698SJohn Baldwin	eor	v21.16b,v21.16b,v13.16b
1970bc3d5698SJohn Baldwin	eor	v22.16b,v22.16b,v14.16b
1971bc3d5698SJohn Baldwin	eor	v23.16b,v23.16b,v15.16b
1972bc3d5698SJohn Baldwin	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1973bc3d5698SJohn Baldwin
1974c0855eaaSJohn Baldwin	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1975c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v16.16b
1976c0855eaaSJohn Baldwin	eor	v25.16b,v25.16b,v17.16b
1977c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v18.16b
1978c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v19.16b
1979c0855eaaSJohn Baldwin	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
1980c0855eaaSJohn Baldwin
1981c0855eaaSJohn Baldwin	shl	v8.4s,v7.4s,#1			// 4 -> 8
1982c0855eaaSJohn Baldwin	eor	v28.16b,v28.16b,v20.16b
1983c0855eaaSJohn Baldwin	eor	v29.16b,v29.16b,v21.16b
1984c0855eaaSJohn Baldwin	eor	v30.16b,v30.16b,v22.16b
1985c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v23.16b
1986c0855eaaSJohn Baldwin	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
1987c0855eaaSJohn Baldwin
1988c0855eaaSJohn Baldwin	add	v3.4s,v3.4s,v8.4s			// += 8
1989c0855eaaSJohn Baldwin	add	v4.4s,v4.4s,v8.4s
1990c0855eaaSJohn Baldwin	add	v5.4s,v5.4s,v8.4s
1991c0855eaaSJohn Baldwin	add	v6.4s,v6.4s,v8.4s
1992bc3d5698SJohn Baldwin
1993bc3d5698SJohn Baldwin	b.hs	.Loop_outer_512_neon
1994bc3d5698SJohn Baldwin
1995bc3d5698SJohn Baldwin	adds	x2,x2,#512
1996c0855eaaSJohn Baldwin	ushr	v7.4s,v7.4s,#1			// 4 -> 2
1997bc3d5698SJohn Baldwin
1998c0855eaaSJohn Baldwin	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
1999bc3d5698SJohn Baldwin	ldp	d12,d13,[sp,#128+32]
2000bc3d5698SJohn Baldwin	ldp	d14,d15,[sp,#128+48]
2001bc3d5698SJohn Baldwin
2002c0855eaaSJohn Baldwin	stp	q0,q0,[sp,#0]		// wipe off-load area
2003c0855eaaSJohn Baldwin	stp	q0,q0,[sp,#32]
2004c0855eaaSJohn Baldwin	stp	q0,q0,[sp,#64]
2005bc3d5698SJohn Baldwin
2006bc3d5698SJohn Baldwin	b.eq	.Ldone_512_neon
2007bc3d5698SJohn Baldwin
2008c0855eaaSJohn Baldwin	sub	x3,x3,#16			// .Lone
2009bc3d5698SJohn Baldwin	cmp	x2,#192
2010bc3d5698SJohn Baldwin	add	sp,sp,#128
2011c0855eaaSJohn Baldwin	sub	v3.4s,v3.4s,v7.4s		// -= 2
2012c0855eaaSJohn Baldwin	ld1	{v8.4s,v9.4s},[x3]
2013bc3d5698SJohn Baldwin	b.hs	.Loop_outer_neon
2014bc3d5698SJohn Baldwin
2015c0855eaaSJohn Baldwin	ldp	d8,d9,[sp,#0]			// meet ABI requirements
2016c0855eaaSJohn Baldwin	eor	v1.16b,v1.16b,v1.16b
2017c0855eaaSJohn Baldwin	eor	v2.16b,v2.16b,v2.16b
2018c0855eaaSJohn Baldwin	eor	v3.16b,v3.16b,v3.16b
2019c0855eaaSJohn Baldwin	eor	v4.16b,v4.16b,v4.16b
2020c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v5.16b
2021c0855eaaSJohn Baldwin	eor	v6.16b,v6.16b,v6.16b
2022bc3d5698SJohn Baldwin	b	.Loop_outer
2023bc3d5698SJohn Baldwin
// .Ldone_512_neon: exit path of ChaCha20_512_neon.
// Reached by the b.eq that follows "adds x2,x2,#512", i.e. when that add
// leaves x2 == 0 (x2 is presumably the remaining byte count -- TODO confirm
// against the function entry, which is outside this excerpt).
// On arrival d10-d15 have already been reloaded from [sp,#128+16..#128+48]
// and the off-load area at [sp,#0..#95] has already been overwritten, so
// only d8/d9, x19-x28 and the frame record remain to be restored here.
2024bc3d5698SJohn Baldwin.Ldone_512_neon:
2025c0855eaaSJohn Baldwin	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements: AAPCS64 makes the low 64 bits of v8-v15 callee-saved
2026bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]	// x19-x28 are reloaded x29-relative, offsets 16..80
2027bc3d5698SJohn Baldwin	add	sp,sp,#128+64	// release 128-byte off-load area + 64-byte d8-d15 save area
2028bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
2029bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
2030bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
2031bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
2032bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#96	// post-indexed: reload x29/x30, then pop 96 bytes (presumably sp == x29 here; the prologue is not visible -- TODO confirm)
2033*bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER	// macro, presumably from arm_arch.h; NOTE(review): looks like the return-side counterpart of AARCH64_SIGN_LINK_REGISTER -- confirm
2034bc3d5698SJohn Baldwin	ret
// Record the symbol size as the distance from ChaCha20_512_neon to this point.
2035bc3d5698SJohn Baldwin.size	ChaCha20_512_neon,.-ChaCha20_512_neon
2036