xref: /freebsd/sys/crypto/openssl/arm/chacha-armv4.S (revision c0855eaa3ee9614804b6bd6a255aa9f71e095f43)
1bc3d5698SJohn Baldwin/* Do not modify. This file is auto-generated from chacha-armv4.pl. */
2bc3d5698SJohn Baldwin#include "arm_arch.h"
3bc3d5698SJohn Baldwin
4bc3d5698SJohn Baldwin#if defined(__thumb2__) || defined(__clang__)
5bc3d5698SJohn Baldwin.syntax	unified
6bc3d5698SJohn Baldwin#endif
7bc3d5698SJohn Baldwin#if defined(__thumb2__)
8bc3d5698SJohn Baldwin.thumb
9bc3d5698SJohn Baldwin#else
10bc3d5698SJohn Baldwin.code	32
11bc3d5698SJohn Baldwin#endif
12bc3d5698SJohn Baldwin
13bc3d5698SJohn Baldwin#if defined(__thumb2__) || defined(__clang__)
14bc3d5698SJohn Baldwin#define ldrhsb	ldrbhs
15bc3d5698SJohn Baldwin#endif
16bc3d5698SJohn Baldwin
17*c0855eaaSJohn Baldwin.text
18*c0855eaaSJohn Baldwin
19bc3d5698SJohn Baldwin.align	5
@ ChaCha20 "sigma" constant: the four 32-bit little-endian words spell the
@ ASCII string "expand 32-byte k" (0x61707865 = "expa", 0x3320646e = "nd 3",
@ 0x79622d32 = "2-by", 0x6b206574 = "te k"); loaded into state words 0-3.
20bc3d5698SJohn Baldwin.Lsigma:
21bc3d5698SJohn Baldwin.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
@ 128-bit constant 1 (little-endian limbs); presumably used by the NEON path
@ to step the block counter -- that code is outside this view, confirm there.
22bc3d5698SJohn Baldwin.Lone:
23bc3d5698SJohn Baldwin.long	1,0,0,0
24bc3d5698SJohn Baldwin#if __ARM_MAX_ARCH__>=7
@ Reference to the OPENSSL_armcap_P capability word, read at run time to
@ decide whether the NEON path may be taken (see the ARMV7_NEON test in
@ ChaCha20_ctr32).  On Win32 an absolute address is stored; elsewhere the
@ offset relative to .LChaCha20_ctr32 is stored and resolved via adr/ldr.
25bc3d5698SJohn Baldwin.LOPENSSL_armcap:
26*c0855eaaSJohn Baldwin# ifdef	_WIN32
27*c0855eaaSJohn Baldwin.word	OPENSSL_armcap_P
28*c0855eaaSJohn Baldwin# else
29bc3d5698SJohn Baldwin.word	OPENSSL_armcap_P-.LChaCha20_ctr32
30*c0855eaaSJohn Baldwin# endif
31bc3d5698SJohn Baldwin#else
@ No runtime capability detection below ARMv7: poison word so the slot
@ still occupies the expected offset ahead of ChaCha20_ctr32.
32bc3d5698SJohn Baldwin.word	-1
33bc3d5698SJohn Baldwin#endif
34bc3d5698SJohn Baldwin
35bc3d5698SJohn Baldwin.globl	ChaCha20_ctr32
36bc3d5698SJohn Baldwin.type	ChaCha20_ctr32,%function
37bc3d5698SJohn Baldwin.align	5
38bc3d5698SJohn BaldwinChaCha20_ctr32:
39bc3d5698SJohn Baldwin.LChaCha20_ctr32:
40bc3d5698SJohn Baldwin	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
41bc3d5698SJohn Baldwin	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
42bc3d5698SJohn Baldwin#if __ARM_ARCH__<7 && !defined(__thumb2__)
43bc3d5698SJohn Baldwin	sub	r14,pc,#16		@ ChaCha20_ctr32
44bc3d5698SJohn Baldwin#else
45bc3d5698SJohn Baldwin	adr	r14,.LChaCha20_ctr32
46bc3d5698SJohn Baldwin#endif
47bc3d5698SJohn Baldwin	cmp	r2,#0			@ len==0?
48bc3d5698SJohn Baldwin#ifdef	__thumb2__
49bc3d5698SJohn Baldwin	itt	eq
50bc3d5698SJohn Baldwin#endif
51bc3d5698SJohn Baldwin	addeq	sp,sp,#4*3
52bc3d5698SJohn Baldwin	beq	.Lno_data
53bc3d5698SJohn Baldwin#if __ARM_MAX_ARCH__>=7
54bc3d5698SJohn Baldwin	cmp	r2,#192			@ test len
55bc3d5698SJohn Baldwin	bls	.Lshort
56bc3d5698SJohn Baldwin	ldr	r4,[r14,#-32]
57*c0855eaaSJohn Baldwin# if !defined(_WIN32)
58bc3d5698SJohn Baldwin	ldr	r4,[r14,r4]
59*c0855eaaSJohn Baldwin# endif
60*c0855eaaSJohn Baldwin# if defined(__APPLE__) || defined(_WIN32)
61bc3d5698SJohn Baldwin	ldr	r4,[r4]
62bc3d5698SJohn Baldwin# endif
63bc3d5698SJohn Baldwin	tst	r4,#ARMV7_NEON
64bc3d5698SJohn Baldwin	bne	.LChaCha20_neon
65bc3d5698SJohn Baldwin.Lshort:
66bc3d5698SJohn Baldwin#endif
67bc3d5698SJohn Baldwin	ldmia	r12,{r4,r5,r6,r7}		@ load counter and nonce
68bc3d5698SJohn Baldwin	sub	sp,sp,#4*(16)		@ off-load area
69bc3d5698SJohn Baldwin	sub	r14,r14,#64		@ .Lsigma
70bc3d5698SJohn Baldwin	stmdb	sp!,{r4,r5,r6,r7}		@ copy counter and nonce
71bc3d5698SJohn Baldwin	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
72bc3d5698SJohn Baldwin	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
73bc3d5698SJohn Baldwin	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy key
74bc3d5698SJohn Baldwin	stmdb	sp!,{r0,r1,r2,r3}		@ copy sigma
75bc3d5698SJohn Baldwin	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
76bc3d5698SJohn Baldwin	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
77bc3d5698SJohn Baldwin	b	.Loop_outer_enter
78bc3d5698SJohn Baldwin
79bc3d5698SJohn Baldwin.align	4
80bc3d5698SJohn Baldwin.Loop_outer:
81bc3d5698SJohn Baldwin	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
82bc3d5698SJohn Baldwin	str	r11,[sp,#4*(32+2)]	@ save len
83bc3d5698SJohn Baldwin	str	r12,  [sp,#4*(32+1)]	@ save inp
84bc3d5698SJohn Baldwin	str	r14,  [sp,#4*(32+0)]	@ save out
85bc3d5698SJohn Baldwin.Loop_outer_enter:
86bc3d5698SJohn Baldwin	ldr	r11, [sp,#4*(15)]
87bc3d5698SJohn Baldwin	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
88bc3d5698SJohn Baldwin	ldr	r10, [sp,#4*(13)]
89bc3d5698SJohn Baldwin	ldr	r14,[sp,#4*(14)]
90bc3d5698SJohn Baldwin	str	r11, [sp,#4*(16+15)]
91bc3d5698SJohn Baldwin	mov	r11,#10
92bc3d5698SJohn Baldwin	b	.Loop
93bc3d5698SJohn Baldwin
94bc3d5698SJohn Baldwin.align	4
95bc3d5698SJohn Baldwin.Loop:
96bc3d5698SJohn Baldwin	subs	r11,r11,#1
97bc3d5698SJohn Baldwin	add	r0,r0,r4
98bc3d5698SJohn Baldwin	mov	r12,r12,ror#16
99bc3d5698SJohn Baldwin	add	r1,r1,r5
100bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
101bc3d5698SJohn Baldwin	eor	r12,r12,r0,ror#16
102bc3d5698SJohn Baldwin	eor	r10,r10,r1,ror#16
103bc3d5698SJohn Baldwin	add	r8,r8,r12
104bc3d5698SJohn Baldwin	mov	r4,r4,ror#20
105bc3d5698SJohn Baldwin	add	r9,r9,r10
106bc3d5698SJohn Baldwin	mov	r5,r5,ror#20
107bc3d5698SJohn Baldwin	eor	r4,r4,r8,ror#20
108bc3d5698SJohn Baldwin	eor	r5,r5,r9,ror#20
109bc3d5698SJohn Baldwin	add	r0,r0,r4
110bc3d5698SJohn Baldwin	mov	r12,r12,ror#24
111bc3d5698SJohn Baldwin	add	r1,r1,r5
112bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
113bc3d5698SJohn Baldwin	eor	r12,r12,r0,ror#24
114bc3d5698SJohn Baldwin	eor	r10,r10,r1,ror#24
115bc3d5698SJohn Baldwin	add	r8,r8,r12
116bc3d5698SJohn Baldwin	mov	r4,r4,ror#25
117bc3d5698SJohn Baldwin	add	r9,r9,r10
118bc3d5698SJohn Baldwin	mov	r5,r5,ror#25
119bc3d5698SJohn Baldwin	str	r10,[sp,#4*(16+13)]
120bc3d5698SJohn Baldwin	ldr	r10,[sp,#4*(16+15)]
121bc3d5698SJohn Baldwin	eor	r4,r4,r8,ror#25
122bc3d5698SJohn Baldwin	eor	r5,r5,r9,ror#25
123bc3d5698SJohn Baldwin	str	r8,[sp,#4*(16+8)]
124bc3d5698SJohn Baldwin	ldr	r8,[sp,#4*(16+10)]
125bc3d5698SJohn Baldwin	add	r2,r2,r6
126bc3d5698SJohn Baldwin	mov	r14,r14,ror#16
127bc3d5698SJohn Baldwin	str	r9,[sp,#4*(16+9)]
128bc3d5698SJohn Baldwin	ldr	r9,[sp,#4*(16+11)]
129bc3d5698SJohn Baldwin	add	r3,r3,r7
130bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
131bc3d5698SJohn Baldwin	eor	r14,r14,r2,ror#16
132bc3d5698SJohn Baldwin	eor	r10,r10,r3,ror#16
133bc3d5698SJohn Baldwin	add	r8,r8,r14
134bc3d5698SJohn Baldwin	mov	r6,r6,ror#20
135bc3d5698SJohn Baldwin	add	r9,r9,r10
136bc3d5698SJohn Baldwin	mov	r7,r7,ror#20
137bc3d5698SJohn Baldwin	eor	r6,r6,r8,ror#20
138bc3d5698SJohn Baldwin	eor	r7,r7,r9,ror#20
139bc3d5698SJohn Baldwin	add	r2,r2,r6
140bc3d5698SJohn Baldwin	mov	r14,r14,ror#24
141bc3d5698SJohn Baldwin	add	r3,r3,r7
142bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
143bc3d5698SJohn Baldwin	eor	r14,r14,r2,ror#24
144bc3d5698SJohn Baldwin	eor	r10,r10,r3,ror#24
145bc3d5698SJohn Baldwin	add	r8,r8,r14
146bc3d5698SJohn Baldwin	mov	r6,r6,ror#25
147bc3d5698SJohn Baldwin	add	r9,r9,r10
148bc3d5698SJohn Baldwin	mov	r7,r7,ror#25
149bc3d5698SJohn Baldwin	eor	r6,r6,r8,ror#25
150bc3d5698SJohn Baldwin	eor	r7,r7,r9,ror#25
151bc3d5698SJohn Baldwin	add	r0,r0,r5
152bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
153bc3d5698SJohn Baldwin	add	r1,r1,r6
154bc3d5698SJohn Baldwin	mov	r12,r12,ror#16
155bc3d5698SJohn Baldwin	eor	r10,r10,r0,ror#16
156bc3d5698SJohn Baldwin	eor	r12,r12,r1,ror#16
157bc3d5698SJohn Baldwin	add	r8,r8,r10
158bc3d5698SJohn Baldwin	mov	r5,r5,ror#20
159bc3d5698SJohn Baldwin	add	r9,r9,r12
160bc3d5698SJohn Baldwin	mov	r6,r6,ror#20
161bc3d5698SJohn Baldwin	eor	r5,r5,r8,ror#20
162bc3d5698SJohn Baldwin	eor	r6,r6,r9,ror#20
163bc3d5698SJohn Baldwin	add	r0,r0,r5
164bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
165bc3d5698SJohn Baldwin	add	r1,r1,r6
166bc3d5698SJohn Baldwin	mov	r12,r12,ror#24
167bc3d5698SJohn Baldwin	eor	r10,r10,r0,ror#24
168bc3d5698SJohn Baldwin	eor	r12,r12,r1,ror#24
169bc3d5698SJohn Baldwin	add	r8,r8,r10
170bc3d5698SJohn Baldwin	mov	r5,r5,ror#25
171bc3d5698SJohn Baldwin	str	r10,[sp,#4*(16+15)]
172bc3d5698SJohn Baldwin	ldr	r10,[sp,#4*(16+13)]
173bc3d5698SJohn Baldwin	add	r9,r9,r12
174bc3d5698SJohn Baldwin	mov	r6,r6,ror#25
175bc3d5698SJohn Baldwin	eor	r5,r5,r8,ror#25
176bc3d5698SJohn Baldwin	eor	r6,r6,r9,ror#25
177bc3d5698SJohn Baldwin	str	r8,[sp,#4*(16+10)]
178bc3d5698SJohn Baldwin	ldr	r8,[sp,#4*(16+8)]
179bc3d5698SJohn Baldwin	add	r2,r2,r7
180bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
181bc3d5698SJohn Baldwin	str	r9,[sp,#4*(16+11)]
182bc3d5698SJohn Baldwin	ldr	r9,[sp,#4*(16+9)]
183bc3d5698SJohn Baldwin	add	r3,r3,r4
184bc3d5698SJohn Baldwin	mov	r14,r14,ror#16
185bc3d5698SJohn Baldwin	eor	r10,r10,r2,ror#16
186bc3d5698SJohn Baldwin	eor	r14,r14,r3,ror#16
187bc3d5698SJohn Baldwin	add	r8,r8,r10
188bc3d5698SJohn Baldwin	mov	r7,r7,ror#20
189bc3d5698SJohn Baldwin	add	r9,r9,r14
190bc3d5698SJohn Baldwin	mov	r4,r4,ror#20
191bc3d5698SJohn Baldwin	eor	r7,r7,r8,ror#20
192bc3d5698SJohn Baldwin	eor	r4,r4,r9,ror#20
193bc3d5698SJohn Baldwin	add	r2,r2,r7
194bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
195bc3d5698SJohn Baldwin	add	r3,r3,r4
196bc3d5698SJohn Baldwin	mov	r14,r14,ror#24
197bc3d5698SJohn Baldwin	eor	r10,r10,r2,ror#24
198bc3d5698SJohn Baldwin	eor	r14,r14,r3,ror#24
199bc3d5698SJohn Baldwin	add	r8,r8,r10
200bc3d5698SJohn Baldwin	mov	r7,r7,ror#25
201bc3d5698SJohn Baldwin	add	r9,r9,r14
202bc3d5698SJohn Baldwin	mov	r4,r4,ror#25
203bc3d5698SJohn Baldwin	eor	r7,r7,r8,ror#25
204bc3d5698SJohn Baldwin	eor	r4,r4,r9,ror#25
205bc3d5698SJohn Baldwin	bne	.Loop
206bc3d5698SJohn Baldwin
207bc3d5698SJohn Baldwin	ldr	r11,[sp,#4*(32+2)]	@ load len
208bc3d5698SJohn Baldwin
209bc3d5698SJohn Baldwin	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
210bc3d5698SJohn Baldwin	str	r9, [sp,#4*(16+9)]
211bc3d5698SJohn Baldwin	str	r12,[sp,#4*(16+12)]
212bc3d5698SJohn Baldwin	str	r10, [sp,#4*(16+13)]
213bc3d5698SJohn Baldwin	str	r14,[sp,#4*(16+14)]
214bc3d5698SJohn Baldwin
215bc3d5698SJohn Baldwin	@ at this point we have first half of 512-bit result in
216bc3d5698SJohn Baldwin	@ rx and second half at sp+4*(16+8)
217bc3d5698SJohn Baldwin
218bc3d5698SJohn Baldwin	cmp	r11,#64		@ done yet?
219bc3d5698SJohn Baldwin#ifdef	__thumb2__
220bc3d5698SJohn Baldwin	itete	lo
221bc3d5698SJohn Baldwin#endif
222bc3d5698SJohn Baldwin	addlo	r12,sp,#4*(0)		@ shortcut or ...
223bc3d5698SJohn Baldwin	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
224bc3d5698SJohn Baldwin	addlo	r14,sp,#4*(0)		@ shortcut or ...
225bc3d5698SJohn Baldwin	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
226bc3d5698SJohn Baldwin
227bc3d5698SJohn Baldwin	ldr	r8,[sp,#4*(0)]	@ load key material
228bc3d5698SJohn Baldwin	ldr	r9,[sp,#4*(1)]
229bc3d5698SJohn Baldwin
230bc3d5698SJohn Baldwin#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
231bc3d5698SJohn Baldwin# if __ARM_ARCH__<7
232bc3d5698SJohn Baldwin	orr	r10,r12,r14
233bc3d5698SJohn Baldwin	tst	r10,#3		@ are input and output aligned?
234bc3d5698SJohn Baldwin	ldr	r10,[sp,#4*(2)]
235bc3d5698SJohn Baldwin	bne	.Lunaligned
236bc3d5698SJohn Baldwin	cmp	r11,#64		@ restore flags
237bc3d5698SJohn Baldwin# else
238bc3d5698SJohn Baldwin	ldr	r10,[sp,#4*(2)]
239bc3d5698SJohn Baldwin# endif
240bc3d5698SJohn Baldwin	ldr	r11,[sp,#4*(3)]
241bc3d5698SJohn Baldwin
242bc3d5698SJohn Baldwin	add	r0,r0,r8	@ accumulate key material
243bc3d5698SJohn Baldwin	add	r1,r1,r9
244bc3d5698SJohn Baldwin# ifdef	__thumb2__
245bc3d5698SJohn Baldwin	itt	hs
246bc3d5698SJohn Baldwin# endif
247bc3d5698SJohn Baldwin	ldrhs	r8,[r12],#16		@ load input
248bc3d5698SJohn Baldwin	ldrhs	r9,[r12,#-12]
249bc3d5698SJohn Baldwin
250bc3d5698SJohn Baldwin	add	r2,r2,r10
251bc3d5698SJohn Baldwin	add	r3,r3,r11
252bc3d5698SJohn Baldwin# ifdef	__thumb2__
253bc3d5698SJohn Baldwin	itt	hs
254bc3d5698SJohn Baldwin# endif
255bc3d5698SJohn Baldwin	ldrhs	r10,[r12,#-8]
256bc3d5698SJohn Baldwin	ldrhs	r11,[r12,#-4]
257bc3d5698SJohn Baldwin# if __ARM_ARCH__>=6 && defined(__ARMEB__)
258bc3d5698SJohn Baldwin	rev	r0,r0
259bc3d5698SJohn Baldwin	rev	r1,r1
260bc3d5698SJohn Baldwin	rev	r2,r2
261bc3d5698SJohn Baldwin	rev	r3,r3
262bc3d5698SJohn Baldwin# endif
263bc3d5698SJohn Baldwin# ifdef	__thumb2__
264bc3d5698SJohn Baldwin	itt	hs
265bc3d5698SJohn Baldwin# endif
266bc3d5698SJohn Baldwin	eorhs	r0,r0,r8	@ xor with input
267bc3d5698SJohn Baldwin	eorhs	r1,r1,r9
268bc3d5698SJohn Baldwin	add	r8,sp,#4*(4)
269bc3d5698SJohn Baldwin	str	r0,[r14],#16		@ store output
270bc3d5698SJohn Baldwin# ifdef	__thumb2__
271bc3d5698SJohn Baldwin	itt	hs
272bc3d5698SJohn Baldwin# endif
273bc3d5698SJohn Baldwin	eorhs	r2,r2,r10
274bc3d5698SJohn Baldwin	eorhs	r3,r3,r11
275bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
276bc3d5698SJohn Baldwin	str	r1,[r14,#-12]
277bc3d5698SJohn Baldwin	str	r2,[r14,#-8]
278bc3d5698SJohn Baldwin	str	r3,[r14,#-4]
279bc3d5698SJohn Baldwin
280bc3d5698SJohn Baldwin	add	r4,r4,r8	@ accumulate key material
281bc3d5698SJohn Baldwin	add	r5,r5,r9
282bc3d5698SJohn Baldwin# ifdef	__thumb2__
283bc3d5698SJohn Baldwin	itt	hs
284bc3d5698SJohn Baldwin# endif
285bc3d5698SJohn Baldwin	ldrhs	r8,[r12],#16		@ load input
286bc3d5698SJohn Baldwin	ldrhs	r9,[r12,#-12]
287bc3d5698SJohn Baldwin	add	r6,r6,r10
288bc3d5698SJohn Baldwin	add	r7,r7,r11
289bc3d5698SJohn Baldwin# ifdef	__thumb2__
290bc3d5698SJohn Baldwin	itt	hs
291bc3d5698SJohn Baldwin# endif
292bc3d5698SJohn Baldwin	ldrhs	r10,[r12,#-8]
293bc3d5698SJohn Baldwin	ldrhs	r11,[r12,#-4]
294bc3d5698SJohn Baldwin# if __ARM_ARCH__>=6 && defined(__ARMEB__)
295bc3d5698SJohn Baldwin	rev	r4,r4
296bc3d5698SJohn Baldwin	rev	r5,r5
297bc3d5698SJohn Baldwin	rev	r6,r6
298bc3d5698SJohn Baldwin	rev	r7,r7
299bc3d5698SJohn Baldwin# endif
300bc3d5698SJohn Baldwin# ifdef	__thumb2__
301bc3d5698SJohn Baldwin	itt	hs
302bc3d5698SJohn Baldwin# endif
303bc3d5698SJohn Baldwin	eorhs	r4,r4,r8
304bc3d5698SJohn Baldwin	eorhs	r5,r5,r9
305bc3d5698SJohn Baldwin	add	r8,sp,#4*(8)
306bc3d5698SJohn Baldwin	str	r4,[r14],#16		@ store output
307bc3d5698SJohn Baldwin# ifdef	__thumb2__
308bc3d5698SJohn Baldwin	itt	hs
309bc3d5698SJohn Baldwin# endif
310bc3d5698SJohn Baldwin	eorhs	r6,r6,r10
311bc3d5698SJohn Baldwin	eorhs	r7,r7,r11
312bc3d5698SJohn Baldwin	str	r5,[r14,#-12]
313bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
314bc3d5698SJohn Baldwin	str	r6,[r14,#-8]
315bc3d5698SJohn Baldwin	add	r0,sp,#4*(16+8)
316bc3d5698SJohn Baldwin	str	r7,[r14,#-4]
317bc3d5698SJohn Baldwin
318bc3d5698SJohn Baldwin	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
319bc3d5698SJohn Baldwin
320bc3d5698SJohn Baldwin	add	r0,r0,r8	@ accumulate key material
321bc3d5698SJohn Baldwin	add	r1,r1,r9
322bc3d5698SJohn Baldwin# ifdef	__thumb2__
323bc3d5698SJohn Baldwin	itt	hs
324bc3d5698SJohn Baldwin# endif
325bc3d5698SJohn Baldwin	ldrhs	r8,[r12],#16		@ load input
326bc3d5698SJohn Baldwin	ldrhs	r9,[r12,#-12]
327bc3d5698SJohn Baldwin# ifdef	__thumb2__
328bc3d5698SJohn Baldwin	itt	hi
329bc3d5698SJohn Baldwin# endif
330bc3d5698SJohn Baldwin	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
331bc3d5698SJohn Baldwin	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
332bc3d5698SJohn Baldwin	add	r2,r2,r10
333bc3d5698SJohn Baldwin	add	r3,r3,r11
334bc3d5698SJohn Baldwin# ifdef	__thumb2__
335bc3d5698SJohn Baldwin	itt	hs
336bc3d5698SJohn Baldwin# endif
337bc3d5698SJohn Baldwin	ldrhs	r10,[r12,#-8]
338bc3d5698SJohn Baldwin	ldrhs	r11,[r12,#-4]
339bc3d5698SJohn Baldwin# if __ARM_ARCH__>=6 && defined(__ARMEB__)
340bc3d5698SJohn Baldwin	rev	r0,r0
341bc3d5698SJohn Baldwin	rev	r1,r1
342bc3d5698SJohn Baldwin	rev	r2,r2
343bc3d5698SJohn Baldwin	rev	r3,r3
344bc3d5698SJohn Baldwin# endif
345bc3d5698SJohn Baldwin# ifdef	__thumb2__
346bc3d5698SJohn Baldwin	itt	hs
347bc3d5698SJohn Baldwin# endif
348bc3d5698SJohn Baldwin	eorhs	r0,r0,r8
349bc3d5698SJohn Baldwin	eorhs	r1,r1,r9
350bc3d5698SJohn Baldwin	add	r8,sp,#4*(12)
351bc3d5698SJohn Baldwin	str	r0,[r14],#16		@ store output
352bc3d5698SJohn Baldwin# ifdef	__thumb2__
353bc3d5698SJohn Baldwin	itt	hs
354bc3d5698SJohn Baldwin# endif
355bc3d5698SJohn Baldwin	eorhs	r2,r2,r10
356bc3d5698SJohn Baldwin	eorhs	r3,r3,r11
357bc3d5698SJohn Baldwin	str	r1,[r14,#-12]
358bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
359bc3d5698SJohn Baldwin	str	r2,[r14,#-8]
360bc3d5698SJohn Baldwin	str	r3,[r14,#-4]
361bc3d5698SJohn Baldwin
362bc3d5698SJohn Baldwin	add	r4,r4,r8	@ accumulate key material
363bc3d5698SJohn Baldwin	add	r5,r5,r9
364bc3d5698SJohn Baldwin# ifdef	__thumb2__
365bc3d5698SJohn Baldwin	itt	hi
366bc3d5698SJohn Baldwin# endif
367bc3d5698SJohn Baldwin	addhi	r8,r8,#1		@ next counter value
368bc3d5698SJohn Baldwin	strhi	r8,[sp,#4*(12)]	@ save next counter value
369bc3d5698SJohn Baldwin# ifdef	__thumb2__
370bc3d5698SJohn Baldwin	itt	hs
371bc3d5698SJohn Baldwin# endif
372bc3d5698SJohn Baldwin	ldrhs	r8,[r12],#16		@ load input
373bc3d5698SJohn Baldwin	ldrhs	r9,[r12,#-12]
374bc3d5698SJohn Baldwin	add	r6,r6,r10
375bc3d5698SJohn Baldwin	add	r7,r7,r11
376bc3d5698SJohn Baldwin# ifdef	__thumb2__
377bc3d5698SJohn Baldwin	itt	hs
378bc3d5698SJohn Baldwin# endif
379bc3d5698SJohn Baldwin	ldrhs	r10,[r12,#-8]
380bc3d5698SJohn Baldwin	ldrhs	r11,[r12,#-4]
381bc3d5698SJohn Baldwin# if __ARM_ARCH__>=6 && defined(__ARMEB__)
382bc3d5698SJohn Baldwin	rev	r4,r4
383bc3d5698SJohn Baldwin	rev	r5,r5
384bc3d5698SJohn Baldwin	rev	r6,r6
385bc3d5698SJohn Baldwin	rev	r7,r7
386bc3d5698SJohn Baldwin# endif
387bc3d5698SJohn Baldwin# ifdef	__thumb2__
388bc3d5698SJohn Baldwin	itt	hs
389bc3d5698SJohn Baldwin# endif
390bc3d5698SJohn Baldwin	eorhs	r4,r4,r8
391bc3d5698SJohn Baldwin	eorhs	r5,r5,r9
392bc3d5698SJohn Baldwin# ifdef	__thumb2__
393bc3d5698SJohn Baldwin	it	ne
394bc3d5698SJohn Baldwin# endif
395bc3d5698SJohn Baldwin	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
396bc3d5698SJohn Baldwin# ifdef	__thumb2__
397bc3d5698SJohn Baldwin	itt	hs
398bc3d5698SJohn Baldwin# endif
399bc3d5698SJohn Baldwin	eorhs	r6,r6,r10
400bc3d5698SJohn Baldwin	eorhs	r7,r7,r11
401bc3d5698SJohn Baldwin	str	r4,[r14],#16		@ store output
402bc3d5698SJohn Baldwin	str	r5,[r14,#-12]
403bc3d5698SJohn Baldwin# ifdef	__thumb2__
404bc3d5698SJohn Baldwin	it	hs
405bc3d5698SJohn Baldwin# endif
406bc3d5698SJohn Baldwin	subhs	r11,r8,#64		@ len-=64
407bc3d5698SJohn Baldwin	str	r6,[r14,#-8]
408bc3d5698SJohn Baldwin	str	r7,[r14,#-4]
409bc3d5698SJohn Baldwin	bhi	.Loop_outer
410bc3d5698SJohn Baldwin
411bc3d5698SJohn Baldwin	beq	.Ldone
412bc3d5698SJohn Baldwin# if __ARM_ARCH__<7
413bc3d5698SJohn Baldwin	b	.Ltail
414bc3d5698SJohn Baldwin
415bc3d5698SJohn Baldwin.align	4
416bc3d5698SJohn Baldwin.Lunaligned:@ unaligned endian-neutral path
417bc3d5698SJohn Baldwin	cmp	r11,#64		@ restore flags
418bc3d5698SJohn Baldwin# endif
419bc3d5698SJohn Baldwin#endif
420bc3d5698SJohn Baldwin#if __ARM_ARCH__<7
421bc3d5698SJohn Baldwin	ldr	r11,[sp,#4*(3)]
422bc3d5698SJohn Baldwin	add	r0,r0,r8		@ accumulate key material
423bc3d5698SJohn Baldwin	add	r1,r1,r9
424bc3d5698SJohn Baldwin	add	r2,r2,r10
425bc3d5698SJohn Baldwin# ifdef	__thumb2__
426bc3d5698SJohn Baldwin	itete	lo
427bc3d5698SJohn Baldwin# endif
428bc3d5698SJohn Baldwin	eorlo	r8,r8,r8		@ zero or ...
429bc3d5698SJohn Baldwin	ldrhsb	r8,[r12],#16			@ ... load input
430bc3d5698SJohn Baldwin	eorlo	r9,r9,r9
431bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-12]
432bc3d5698SJohn Baldwin
433bc3d5698SJohn Baldwin	add	r3,r3,r11
434bc3d5698SJohn Baldwin# ifdef	__thumb2__
435bc3d5698SJohn Baldwin	itete	lo
436bc3d5698SJohn Baldwin# endif
437bc3d5698SJohn Baldwin	eorlo	r10,r10,r10
438bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-8]
439bc3d5698SJohn Baldwin	eorlo	r11,r11,r11
440bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-4]
441bc3d5698SJohn Baldwin
442bc3d5698SJohn Baldwin	eor	r0,r8,r0		@ xor with input (or zero)
443bc3d5698SJohn Baldwin	eor	r1,r9,r1
444bc3d5698SJohn Baldwin# ifdef	__thumb2__
445bc3d5698SJohn Baldwin	itt	hs
446bc3d5698SJohn Baldwin# endif
447bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-15]		@ load more input
448bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-11]
449bc3d5698SJohn Baldwin	eor	r2,r10,r2
450bc3d5698SJohn Baldwin	strb	r0,[r14],#16		@ store output
451bc3d5698SJohn Baldwin	eor	r3,r11,r3
452bc3d5698SJohn Baldwin# ifdef	__thumb2__
453bc3d5698SJohn Baldwin	itt	hs
454bc3d5698SJohn Baldwin# endif
455bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-7]
456bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-3]
457bc3d5698SJohn Baldwin	strb	r1,[r14,#-12]
458bc3d5698SJohn Baldwin	eor	r0,r8,r0,lsr#8
459bc3d5698SJohn Baldwin	strb	r2,[r14,#-8]
460bc3d5698SJohn Baldwin	eor	r1,r9,r1,lsr#8
461bc3d5698SJohn Baldwin# ifdef	__thumb2__
462bc3d5698SJohn Baldwin	itt	hs
463bc3d5698SJohn Baldwin# endif
464bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-14]		@ load more input
465bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-10]
466bc3d5698SJohn Baldwin	strb	r3,[r14,#-4]
467bc3d5698SJohn Baldwin	eor	r2,r10,r2,lsr#8
468bc3d5698SJohn Baldwin	strb	r0,[r14,#-15]
469bc3d5698SJohn Baldwin	eor	r3,r11,r3,lsr#8
470bc3d5698SJohn Baldwin# ifdef	__thumb2__
471bc3d5698SJohn Baldwin	itt	hs
472bc3d5698SJohn Baldwin# endif
473bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-6]
474bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-2]
475bc3d5698SJohn Baldwin	strb	r1,[r14,#-11]
476bc3d5698SJohn Baldwin	eor	r0,r8,r0,lsr#8
477bc3d5698SJohn Baldwin	strb	r2,[r14,#-7]
478bc3d5698SJohn Baldwin	eor	r1,r9,r1,lsr#8
479bc3d5698SJohn Baldwin# ifdef	__thumb2__
480bc3d5698SJohn Baldwin	itt	hs
481bc3d5698SJohn Baldwin# endif
482bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-13]		@ load more input
483bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-9]
484bc3d5698SJohn Baldwin	strb	r3,[r14,#-3]
485bc3d5698SJohn Baldwin	eor	r2,r10,r2,lsr#8
486bc3d5698SJohn Baldwin	strb	r0,[r14,#-14]
487bc3d5698SJohn Baldwin	eor	r3,r11,r3,lsr#8
488bc3d5698SJohn Baldwin# ifdef	__thumb2__
489bc3d5698SJohn Baldwin	itt	hs
490bc3d5698SJohn Baldwin# endif
491bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-5]
492bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-1]
493bc3d5698SJohn Baldwin	strb	r1,[r14,#-10]
494bc3d5698SJohn Baldwin	strb	r2,[r14,#-6]
495bc3d5698SJohn Baldwin	eor	r0,r8,r0,lsr#8
496bc3d5698SJohn Baldwin	strb	r3,[r14,#-2]
497bc3d5698SJohn Baldwin	eor	r1,r9,r1,lsr#8
498bc3d5698SJohn Baldwin	strb	r0,[r14,#-13]
499bc3d5698SJohn Baldwin	eor	r2,r10,r2,lsr#8
500bc3d5698SJohn Baldwin	strb	r1,[r14,#-9]
501bc3d5698SJohn Baldwin	eor	r3,r11,r3,lsr#8
502bc3d5698SJohn Baldwin	strb	r2,[r14,#-5]
503bc3d5698SJohn Baldwin	strb	r3,[r14,#-1]
504bc3d5698SJohn Baldwin	add	r8,sp,#4*(4+0)
505bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}		@ load key material
506bc3d5698SJohn Baldwin	add	r0,sp,#4*(16+8)
507bc3d5698SJohn Baldwin	add	r4,r4,r8		@ accumulate key material
508bc3d5698SJohn Baldwin	add	r5,r5,r9
509bc3d5698SJohn Baldwin	add	r6,r6,r10
510bc3d5698SJohn Baldwin# ifdef	__thumb2__
511bc3d5698SJohn Baldwin	itete	lo
512bc3d5698SJohn Baldwin# endif
513bc3d5698SJohn Baldwin	eorlo	r8,r8,r8		@ zero or ...
514bc3d5698SJohn Baldwin	ldrhsb	r8,[r12],#16			@ ... load input
515bc3d5698SJohn Baldwin	eorlo	r9,r9,r9
516bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-12]
517bc3d5698SJohn Baldwin
518bc3d5698SJohn Baldwin	add	r7,r7,r11
519bc3d5698SJohn Baldwin# ifdef	__thumb2__
520bc3d5698SJohn Baldwin	itete	lo
521bc3d5698SJohn Baldwin# endif
522bc3d5698SJohn Baldwin	eorlo	r10,r10,r10
523bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-8]
524bc3d5698SJohn Baldwin	eorlo	r11,r11,r11
525bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-4]
526bc3d5698SJohn Baldwin
527bc3d5698SJohn Baldwin	eor	r4,r8,r4		@ xor with input (or zero)
528bc3d5698SJohn Baldwin	eor	r5,r9,r5
529bc3d5698SJohn Baldwin# ifdef	__thumb2__
530bc3d5698SJohn Baldwin	itt	hs
531bc3d5698SJohn Baldwin# endif
532bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-15]		@ load more input
533bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-11]
534bc3d5698SJohn Baldwin	eor	r6,r10,r6
535bc3d5698SJohn Baldwin	strb	r4,[r14],#16		@ store output
536bc3d5698SJohn Baldwin	eor	r7,r11,r7
537bc3d5698SJohn Baldwin# ifdef	__thumb2__
538bc3d5698SJohn Baldwin	itt	hs
539bc3d5698SJohn Baldwin# endif
540bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-7]
541bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-3]
542bc3d5698SJohn Baldwin	strb	r5,[r14,#-12]
543bc3d5698SJohn Baldwin	eor	r4,r8,r4,lsr#8
544bc3d5698SJohn Baldwin	strb	r6,[r14,#-8]
545bc3d5698SJohn Baldwin	eor	r5,r9,r5,lsr#8
546bc3d5698SJohn Baldwin# ifdef	__thumb2__
547bc3d5698SJohn Baldwin	itt	hs
548bc3d5698SJohn Baldwin# endif
549bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-14]		@ load more input
550bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-10]
551bc3d5698SJohn Baldwin	strb	r7,[r14,#-4]
552bc3d5698SJohn Baldwin	eor	r6,r10,r6,lsr#8
553bc3d5698SJohn Baldwin	strb	r4,[r14,#-15]
554bc3d5698SJohn Baldwin	eor	r7,r11,r7,lsr#8
555bc3d5698SJohn Baldwin# ifdef	__thumb2__
556bc3d5698SJohn Baldwin	itt	hs
557bc3d5698SJohn Baldwin# endif
558bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-6]
559bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-2]
560bc3d5698SJohn Baldwin	strb	r5,[r14,#-11]
561bc3d5698SJohn Baldwin	eor	r4,r8,r4,lsr#8
562bc3d5698SJohn Baldwin	strb	r6,[r14,#-7]
563bc3d5698SJohn Baldwin	eor	r5,r9,r5,lsr#8
564bc3d5698SJohn Baldwin# ifdef	__thumb2__
565bc3d5698SJohn Baldwin	itt	hs
566bc3d5698SJohn Baldwin# endif
567bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-13]		@ load more input
568bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-9]
569bc3d5698SJohn Baldwin	strb	r7,[r14,#-3]
570bc3d5698SJohn Baldwin	eor	r6,r10,r6,lsr#8
571bc3d5698SJohn Baldwin	strb	r4,[r14,#-14]
572bc3d5698SJohn Baldwin	eor	r7,r11,r7,lsr#8
573bc3d5698SJohn Baldwin# ifdef	__thumb2__
574bc3d5698SJohn Baldwin	itt	hs
575bc3d5698SJohn Baldwin# endif
576bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-5]
577bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-1]
578bc3d5698SJohn Baldwin	strb	r5,[r14,#-10]
579bc3d5698SJohn Baldwin	strb	r6,[r14,#-6]
580bc3d5698SJohn Baldwin	eor	r4,r8,r4,lsr#8
581bc3d5698SJohn Baldwin	strb	r7,[r14,#-2]
582bc3d5698SJohn Baldwin	eor	r5,r9,r5,lsr#8
583bc3d5698SJohn Baldwin	strb	r4,[r14,#-13]
584bc3d5698SJohn Baldwin	eor	r6,r10,r6,lsr#8
585bc3d5698SJohn Baldwin	strb	r5,[r14,#-9]
586bc3d5698SJohn Baldwin	eor	r7,r11,r7,lsr#8
587bc3d5698SJohn Baldwin	strb	r6,[r14,#-5]
588bc3d5698SJohn Baldwin	strb	r7,[r14,#-1]
589bc3d5698SJohn Baldwin	add	r8,sp,#4*(4+4)
590bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}		@ load key material
591bc3d5698SJohn Baldwin	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}		@ load second half
592bc3d5698SJohn Baldwin# ifdef	__thumb2__
593bc3d5698SJohn Baldwin	itt	hi
594bc3d5698SJohn Baldwin# endif
595bc3d5698SJohn Baldwin	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
596bc3d5698SJohn Baldwin	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
597bc3d5698SJohn Baldwin	add	r0,r0,r8		@ accumulate key material
598bc3d5698SJohn Baldwin	add	r1,r1,r9
599bc3d5698SJohn Baldwin	add	r2,r2,r10
600bc3d5698SJohn Baldwin# ifdef	__thumb2__
601bc3d5698SJohn Baldwin	itete	lo
602bc3d5698SJohn Baldwin# endif
603bc3d5698SJohn Baldwin	eorlo	r8,r8,r8		@ zero or ...
604bc3d5698SJohn Baldwin	ldrhsb	r8,[r12],#16			@ ... load input
605bc3d5698SJohn Baldwin	eorlo	r9,r9,r9
606bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-12]
607bc3d5698SJohn Baldwin
608bc3d5698SJohn Baldwin	add	r3,r3,r11
609bc3d5698SJohn Baldwin# ifdef	__thumb2__
610bc3d5698SJohn Baldwin	itete	lo
611bc3d5698SJohn Baldwin# endif
612bc3d5698SJohn Baldwin	eorlo	r10,r10,r10
613bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-8]
614bc3d5698SJohn Baldwin	eorlo	r11,r11,r11
615bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-4]
616bc3d5698SJohn Baldwin
617bc3d5698SJohn Baldwin	eor	r0,r8,r0		@ xor with input (or zero)
618bc3d5698SJohn Baldwin	eor	r1,r9,r1
619bc3d5698SJohn Baldwin# ifdef	__thumb2__
620bc3d5698SJohn Baldwin	itt	hs
621bc3d5698SJohn Baldwin# endif
622bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-15]		@ load more input
623bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-11]
624bc3d5698SJohn Baldwin	eor	r2,r10,r2
625bc3d5698SJohn Baldwin	strb	r0,[r14],#16		@ store output
626bc3d5698SJohn Baldwin	eor	r3,r11,r3
627bc3d5698SJohn Baldwin# ifdef	__thumb2__
628bc3d5698SJohn Baldwin	itt	hs
629bc3d5698SJohn Baldwin# endif
630bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-7]
631bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-3]
632bc3d5698SJohn Baldwin	strb	r1,[r14,#-12]
633bc3d5698SJohn Baldwin	eor	r0,r8,r0,lsr#8
634bc3d5698SJohn Baldwin	strb	r2,[r14,#-8]
635bc3d5698SJohn Baldwin	eor	r1,r9,r1,lsr#8
636bc3d5698SJohn Baldwin# ifdef	__thumb2__
637bc3d5698SJohn Baldwin	itt	hs
638bc3d5698SJohn Baldwin# endif
639bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-14]		@ load more input
640bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-10]
641bc3d5698SJohn Baldwin	strb	r3,[r14,#-4]
642bc3d5698SJohn Baldwin	eor	r2,r10,r2,lsr#8
643bc3d5698SJohn Baldwin	strb	r0,[r14,#-15]
644bc3d5698SJohn Baldwin	eor	r3,r11,r3,lsr#8
645bc3d5698SJohn Baldwin# ifdef	__thumb2__
646bc3d5698SJohn Baldwin	itt	hs
647bc3d5698SJohn Baldwin# endif
648bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-6]
649bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-2]
650bc3d5698SJohn Baldwin	strb	r1,[r14,#-11]
651bc3d5698SJohn Baldwin	eor	r0,r8,r0,lsr#8
652bc3d5698SJohn Baldwin	strb	r2,[r14,#-7]
653bc3d5698SJohn Baldwin	eor	r1,r9,r1,lsr#8
654bc3d5698SJohn Baldwin# ifdef	__thumb2__
655bc3d5698SJohn Baldwin	itt	hs
656bc3d5698SJohn Baldwin# endif
657bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-13]		@ load more input
658bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-9]
659bc3d5698SJohn Baldwin	strb	r3,[r14,#-3]
660bc3d5698SJohn Baldwin	eor	r2,r10,r2,lsr#8
661bc3d5698SJohn Baldwin	strb	r0,[r14,#-14]
662bc3d5698SJohn Baldwin	eor	r3,r11,r3,lsr#8
663bc3d5698SJohn Baldwin# ifdef	__thumb2__
664bc3d5698SJohn Baldwin	itt	hs
665bc3d5698SJohn Baldwin# endif
666bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-5]
667bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-1]
668bc3d5698SJohn Baldwin	strb	r1,[r14,#-10]
669bc3d5698SJohn Baldwin	strb	r2,[r14,#-6]
670bc3d5698SJohn Baldwin	eor	r0,r8,r0,lsr#8
671bc3d5698SJohn Baldwin	strb	r3,[r14,#-2]
672bc3d5698SJohn Baldwin	eor	r1,r9,r1,lsr#8
673bc3d5698SJohn Baldwin	strb	r0,[r14,#-13]
674bc3d5698SJohn Baldwin	eor	r2,r10,r2,lsr#8
675bc3d5698SJohn Baldwin	strb	r1,[r14,#-9]
676bc3d5698SJohn Baldwin	eor	r3,r11,r3,lsr#8
677bc3d5698SJohn Baldwin	strb	r2,[r14,#-5]
678bc3d5698SJohn Baldwin	strb	r3,[r14,#-1]
679bc3d5698SJohn Baldwin	add	r8,sp,#4*(4+8)
680bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}		@ load key material
681bc3d5698SJohn Baldwin	add	r4,r4,r8		@ accumulate key material
682bc3d5698SJohn Baldwin# ifdef	__thumb2__
683bc3d5698SJohn Baldwin	itt	hi
684bc3d5698SJohn Baldwin# endif
685bc3d5698SJohn Baldwin	addhi	r8,r8,#1			@ next counter value
686bc3d5698SJohn Baldwin	strhi	r8,[sp,#4*(12)]		@ save next counter value
687bc3d5698SJohn Baldwin	add	r5,r5,r9
688bc3d5698SJohn Baldwin	add	r6,r6,r10
689bc3d5698SJohn Baldwin# ifdef	__thumb2__
690bc3d5698SJohn Baldwin	itete	lo
691bc3d5698SJohn Baldwin# endif
692bc3d5698SJohn Baldwin	eorlo	r8,r8,r8		@ zero or ...
693bc3d5698SJohn Baldwin	ldrhsb	r8,[r12],#16			@ ... load input
694bc3d5698SJohn Baldwin	eorlo	r9,r9,r9
695bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-12]
696bc3d5698SJohn Baldwin
697bc3d5698SJohn Baldwin	add	r7,r7,r11
698bc3d5698SJohn Baldwin# ifdef	__thumb2__
699bc3d5698SJohn Baldwin	itete	lo
700bc3d5698SJohn Baldwin# endif
701bc3d5698SJohn Baldwin	eorlo	r10,r10,r10
702bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-8]
703bc3d5698SJohn Baldwin	eorlo	r11,r11,r11
704bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-4]
705bc3d5698SJohn Baldwin
706bc3d5698SJohn Baldwin	eor	r4,r8,r4		@ xor with input (or zero)
707bc3d5698SJohn Baldwin	eor	r5,r9,r5
708bc3d5698SJohn Baldwin# ifdef	__thumb2__
709bc3d5698SJohn Baldwin	itt	hs
710bc3d5698SJohn Baldwin# endif
711bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-15]		@ load more input
712bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-11]
713bc3d5698SJohn Baldwin	eor	r6,r10,r6
714bc3d5698SJohn Baldwin	strb	r4,[r14],#16		@ store output
715bc3d5698SJohn Baldwin	eor	r7,r11,r7
716bc3d5698SJohn Baldwin# ifdef	__thumb2__
717bc3d5698SJohn Baldwin	itt	hs
718bc3d5698SJohn Baldwin# endif
719bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-7]
720bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-3]
721bc3d5698SJohn Baldwin	strb	r5,[r14,#-12]
722bc3d5698SJohn Baldwin	eor	r4,r8,r4,lsr#8
723bc3d5698SJohn Baldwin	strb	r6,[r14,#-8]
724bc3d5698SJohn Baldwin	eor	r5,r9,r5,lsr#8
725bc3d5698SJohn Baldwin# ifdef	__thumb2__
726bc3d5698SJohn Baldwin	itt	hs
727bc3d5698SJohn Baldwin# endif
728bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-14]		@ load more input
729bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-10]
730bc3d5698SJohn Baldwin	strb	r7,[r14,#-4]
731bc3d5698SJohn Baldwin	eor	r6,r10,r6,lsr#8
732bc3d5698SJohn Baldwin	strb	r4,[r14,#-15]
733bc3d5698SJohn Baldwin	eor	r7,r11,r7,lsr#8
734bc3d5698SJohn Baldwin# ifdef	__thumb2__
735bc3d5698SJohn Baldwin	itt	hs
736bc3d5698SJohn Baldwin# endif
737bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-6]
738bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-2]
739bc3d5698SJohn Baldwin	strb	r5,[r14,#-11]
740bc3d5698SJohn Baldwin	eor	r4,r8,r4,lsr#8
741bc3d5698SJohn Baldwin	strb	r6,[r14,#-7]
742bc3d5698SJohn Baldwin	eor	r5,r9,r5,lsr#8
743bc3d5698SJohn Baldwin# ifdef	__thumb2__
744bc3d5698SJohn Baldwin	itt	hs
745bc3d5698SJohn Baldwin# endif
746bc3d5698SJohn Baldwin	ldrhsb	r8,[r12,#-13]		@ load more input
747bc3d5698SJohn Baldwin	ldrhsb	r9,[r12,#-9]
748bc3d5698SJohn Baldwin	strb	r7,[r14,#-3]
749bc3d5698SJohn Baldwin	eor	r6,r10,r6,lsr#8
750bc3d5698SJohn Baldwin	strb	r4,[r14,#-14]
751bc3d5698SJohn Baldwin	eor	r7,r11,r7,lsr#8
752bc3d5698SJohn Baldwin# ifdef	__thumb2__
753bc3d5698SJohn Baldwin	itt	hs
754bc3d5698SJohn Baldwin# endif
755bc3d5698SJohn Baldwin	ldrhsb	r10,[r12,#-5]
756bc3d5698SJohn Baldwin	ldrhsb	r11,[r12,#-1]
757bc3d5698SJohn Baldwin	strb	r5,[r14,#-10]
758bc3d5698SJohn Baldwin	strb	r6,[r14,#-6]
759bc3d5698SJohn Baldwin	eor	r4,r8,r4,lsr#8
760bc3d5698SJohn Baldwin	strb	r7,[r14,#-2]
761bc3d5698SJohn Baldwin	eor	r5,r9,r5,lsr#8
762bc3d5698SJohn Baldwin	strb	r4,[r14,#-13]
763bc3d5698SJohn Baldwin	eor	r6,r10,r6,lsr#8
764bc3d5698SJohn Baldwin	strb	r5,[r14,#-9]
765bc3d5698SJohn Baldwin	eor	r7,r11,r7,lsr#8
766bc3d5698SJohn Baldwin	strb	r6,[r14,#-5]
767bc3d5698SJohn Baldwin	strb	r7,[r14,#-1]
768bc3d5698SJohn Baldwin# ifdef	__thumb2__
769bc3d5698SJohn Baldwin	it	ne
770bc3d5698SJohn Baldwin# endif
771bc3d5698SJohn Baldwin	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
772bc3d5698SJohn Baldwin# ifdef	__thumb2__
773bc3d5698SJohn Baldwin	it	hs
774bc3d5698SJohn Baldwin# endif
775bc3d5698SJohn Baldwin	subhs	r11,r8,#64			@ len-=64
776bc3d5698SJohn Baldwin	bhi	.Loop_outer
777bc3d5698SJohn Baldwin
778bc3d5698SJohn Baldwin	beq	.Ldone
779bc3d5698SJohn Baldwin#endif
780bc3d5698SJohn Baldwin
781bc3d5698SJohn Baldwin.Ltail:
782bc3d5698SJohn Baldwin	ldr	r12,[sp,#4*(32+1)]	@ load inp
783bc3d5698SJohn Baldwin	add	r9,sp,#4*(0)
784bc3d5698SJohn Baldwin	ldr	r14,[sp,#4*(32+0)]	@ load out
785bc3d5698SJohn Baldwin
786bc3d5698SJohn Baldwin.Loop_tail:
787bc3d5698SJohn Baldwin	ldrb	r10,[r9],#1	@ read buffer on stack
788bc3d5698SJohn Baldwin	ldrb	r11,[r12],#1		@ read input
789bc3d5698SJohn Baldwin	subs	r8,r8,#1
790bc3d5698SJohn Baldwin	eor	r11,r11,r10
791bc3d5698SJohn Baldwin	strb	r11,[r14],#1		@ store output
792bc3d5698SJohn Baldwin	bne	.Loop_tail
793bc3d5698SJohn Baldwin
794bc3d5698SJohn Baldwin.Ldone:
795bc3d5698SJohn Baldwin	add	sp,sp,#4*(32+3)
796bc3d5698SJohn Baldwin.Lno_data:
797bc3d5698SJohn Baldwin	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
798bc3d5698SJohn Baldwin.size	ChaCha20_ctr32,.-ChaCha20_ctr32
799bc3d5698SJohn Baldwin#if __ARM_MAX_ARCH__>=7
800bc3d5698SJohn Baldwin.arch	armv7-a
801bc3d5698SJohn Baldwin.fpu	neon
802bc3d5698SJohn Baldwin
@ ChaCha20_neon: NEON-assisted ChaCha20 core (auto-generated from
@ chacha-armv4.pl; see the "Do not modify" notice at the top of the file).
@ Entry interface matches ChaCha20_ctr32: r0=out, r1=inp, r2=len, r3=key,
@ with [sp,#0] holding a pointer to the 16-byte counter|nonce block
@ (see the ldr/vld1.32 loads below).  Each outer-loop pass produces four
@ 64-byte ChaCha blocks: three in NEON registers (q0-q3, q4-q7, q8-q11,
@ using counter+0/+1/+2) and one in the integer registers (counter+3).
@ Inputs shorter than the NEON break-even are handed to the integer-only
@ path via .Lbreak_neon.
803bc3d5698SJohn Baldwin.type	ChaCha20_neon,%function
804bc3d5698SJohn Baldwin.align	5
805bc3d5698SJohn BaldwinChaCha20_neon:
806bc3d5698SJohn Baldwin	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
807bc3d5698SJohn Baldwin	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
808bc3d5698SJohn Baldwin.LChaCha20_neon:
809bc3d5698SJohn Baldwin	adr	r14,.Lsigma
810bc3d5698SJohn Baldwin	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI spec says so
811bc3d5698SJohn Baldwin	stmdb	sp!,{r0,r1,r2,r3}
812bc3d5698SJohn Baldwin
813bc3d5698SJohn Baldwin	vld1.32	{q1,q2},[r3]		@ load key
814bc3d5698SJohn Baldwin	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
815bc3d5698SJohn Baldwin
816bc3d5698SJohn Baldwin	sub	sp,sp,#4*(16+16)
817bc3d5698SJohn Baldwin	vld1.32	{q3},[r12]		@ load counter and nonce
818bc3d5698SJohn Baldwin	add	r12,sp,#4*8
819bc3d5698SJohn Baldwin	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
820bc3d5698SJohn Baldwin	vld1.32	{q0},[r14]!		@ load sigma
821bc3d5698SJohn Baldwin	vld1.32	{q12},[r14]		@ one
822bc3d5698SJohn Baldwin	vst1.32	{q2,q3},[r12]		@ copy 1/2key|counter|nonce
823bc3d5698SJohn Baldwin	vst1.32	{q0,q1},[sp]		@ copy sigma|1/2key
824bc3d5698SJohn Baldwin
825bc3d5698SJohn Baldwin	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
826bc3d5698SJohn Baldwin	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
827bc3d5698SJohn Baldwin	vshl.i32	d26,d24,#1	@ two
828bc3d5698SJohn Baldwin	vstr	d24,[sp,#4*(16+0)]
829bc3d5698SJohn Baldwin	vshl.i32	d28,d24,#2	@ four
830bc3d5698SJohn Baldwin	vstr	d26,[sp,#4*(16+2)]
831bc3d5698SJohn Baldwin	vmov	q4,q0
832bc3d5698SJohn Baldwin	vstr	d28,[sp,#4*(16+4)]
833bc3d5698SJohn Baldwin	vmov	q8,q0
834bc3d5698SJohn Baldwin	vmov	q5,q1
835bc3d5698SJohn Baldwin	vmov	q9,q1
836bc3d5698SJohn Baldwin	b	.Loop_neon_enter
837bc3d5698SJohn Baldwin
@ Outer loop: one pass per 256 bytes of output while len > 64*2; shorter
@ remainders switch to the integer-only code at .Lbreak_neon, which
@ harmonizes the two stack-frame layouts before jumping there.
838bc3d5698SJohn Baldwin.align	4
839bc3d5698SJohn Baldwin.Loop_neon_outer:
840bc3d5698SJohn Baldwin	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
841bc3d5698SJohn Baldwin	cmp	r11,#64*2		@ if len<=64*2
842bc3d5698SJohn Baldwin	bls	.Lbreak_neon		@ switch to integer-only
843bc3d5698SJohn Baldwin	vmov	q4,q0
844bc3d5698SJohn Baldwin	str	r11,[sp,#4*(32+2)]	@ save len
845bc3d5698SJohn Baldwin	vmov	q8,q0
846bc3d5698SJohn Baldwin	str	r12,  [sp,#4*(32+1)]	@ save inp
847bc3d5698SJohn Baldwin	vmov	q5,q1
848bc3d5698SJohn Baldwin	str	r14,  [sp,#4*(32+0)]	@ save out
849bc3d5698SJohn Baldwin	vmov	q9,q1
@ Common entry (first pass joins here): materialize counters +1/+2 in the
@ NEON blocks (q7/q11) and counter+3 in r12 for the integer block, then set
@ the round-loop trip count: 10 passes of .Loop_neon = 20 ChaCha rounds.
850bc3d5698SJohn Baldwin.Loop_neon_enter:
851bc3d5698SJohn Baldwin	ldr	r11, [sp,#4*(15)]
852bc3d5698SJohn Baldwin	vadd.i32	q7,q3,q12		@ counter+1
853bc3d5698SJohn Baldwin	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
854bc3d5698SJohn Baldwin	vmov	q6,q2
855bc3d5698SJohn Baldwin	ldr	r10, [sp,#4*(13)]
856bc3d5698SJohn Baldwin	vmov	q10,q2
857bc3d5698SJohn Baldwin	ldr	r14,[sp,#4*(14)]
858bc3d5698SJohn Baldwin	vadd.i32	q11,q7,q12		@ counter+2
859bc3d5698SJohn Baldwin	str	r11, [sp,#4*(16+15)]
860bc3d5698SJohn Baldwin	mov	r11,#10
861bc3d5698SJohn Baldwin	add	r12,r12,#3	@ counter+3
862bc3d5698SJohn Baldwin	b	.Loop_neon
863bc3d5698SJohn Baldwin
@ Main round loop: one ChaCha double round per pass.  The NEON instructions
@ carry three blocks in parallel (q0-q3, q4-q7, q8-q11), while the
@ interleaved integer instructions run a fourth block in r0-r12/r14, with
@ some of its words spilled to [sp,#4*(16+8..15)] (the str/ldr pairs below).
@ The vext.8 #4/#8/#12 shuffles between the two half-rounds rotate the NEON
@ lanes from column to diagonal arrangement and back.
864bc3d5698SJohn Baldwin.align	4
865bc3d5698SJohn Baldwin.Loop_neon:
866bc3d5698SJohn Baldwin	subs	r11,r11,#1
867bc3d5698SJohn Baldwin	vadd.i32	q0,q0,q1
868bc3d5698SJohn Baldwin	add	r0,r0,r4
869bc3d5698SJohn Baldwin	vadd.i32	q4,q4,q5
870bc3d5698SJohn Baldwin	mov	r12,r12,ror#16
871bc3d5698SJohn Baldwin	vadd.i32	q8,q8,q9
872bc3d5698SJohn Baldwin	add	r1,r1,r5
873bc3d5698SJohn Baldwin	veor	q3,q3,q0
874bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
875bc3d5698SJohn Baldwin	veor	q7,q7,q4
876bc3d5698SJohn Baldwin	eor	r12,r12,r0,ror#16
877bc3d5698SJohn Baldwin	veor	q11,q11,q8
878bc3d5698SJohn Baldwin	eor	r10,r10,r1,ror#16
879bc3d5698SJohn Baldwin	vrev32.16	q3,q3
880bc3d5698SJohn Baldwin	add	r8,r8,r12
881bc3d5698SJohn Baldwin	vrev32.16	q7,q7
882bc3d5698SJohn Baldwin	mov	r4,r4,ror#20
883bc3d5698SJohn Baldwin	vrev32.16	q11,q11
884bc3d5698SJohn Baldwin	add	r9,r9,r10
885bc3d5698SJohn Baldwin	vadd.i32	q2,q2,q3
886bc3d5698SJohn Baldwin	mov	r5,r5,ror#20
887bc3d5698SJohn Baldwin	vadd.i32	q6,q6,q7
888bc3d5698SJohn Baldwin	eor	r4,r4,r8,ror#20
889bc3d5698SJohn Baldwin	vadd.i32	q10,q10,q11
890bc3d5698SJohn Baldwin	eor	r5,r5,r9,ror#20
891bc3d5698SJohn Baldwin	veor	q12,q1,q2
892bc3d5698SJohn Baldwin	add	r0,r0,r4
893bc3d5698SJohn Baldwin	veor	q13,q5,q6
894bc3d5698SJohn Baldwin	mov	r12,r12,ror#24
895bc3d5698SJohn Baldwin	veor	q14,q9,q10
896bc3d5698SJohn Baldwin	add	r1,r1,r5
897bc3d5698SJohn Baldwin	vshr.u32	q1,q12,#20
898bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
899bc3d5698SJohn Baldwin	vshr.u32	q5,q13,#20
900bc3d5698SJohn Baldwin	eor	r12,r12,r0,ror#24
901bc3d5698SJohn Baldwin	vshr.u32	q9,q14,#20
902bc3d5698SJohn Baldwin	eor	r10,r10,r1,ror#24
903bc3d5698SJohn Baldwin	vsli.32	q1,q12,#12
904bc3d5698SJohn Baldwin	add	r8,r8,r12
905bc3d5698SJohn Baldwin	vsli.32	q5,q13,#12
906bc3d5698SJohn Baldwin	mov	r4,r4,ror#25
907bc3d5698SJohn Baldwin	vsli.32	q9,q14,#12
908bc3d5698SJohn Baldwin	add	r9,r9,r10
909bc3d5698SJohn Baldwin	vadd.i32	q0,q0,q1
910bc3d5698SJohn Baldwin	mov	r5,r5,ror#25
911bc3d5698SJohn Baldwin	vadd.i32	q4,q4,q5
912bc3d5698SJohn Baldwin	str	r10,[sp,#4*(16+13)]
913bc3d5698SJohn Baldwin	vadd.i32	q8,q8,q9
914bc3d5698SJohn Baldwin	ldr	r10,[sp,#4*(16+15)]
915bc3d5698SJohn Baldwin	veor	q12,q3,q0
916bc3d5698SJohn Baldwin	eor	r4,r4,r8,ror#25
917bc3d5698SJohn Baldwin	veor	q13,q7,q4
918bc3d5698SJohn Baldwin	eor	r5,r5,r9,ror#25
919bc3d5698SJohn Baldwin	veor	q14,q11,q8
920bc3d5698SJohn Baldwin	str	r8,[sp,#4*(16+8)]
921bc3d5698SJohn Baldwin	vshr.u32	q3,q12,#24
922bc3d5698SJohn Baldwin	ldr	r8,[sp,#4*(16+10)]
923bc3d5698SJohn Baldwin	vshr.u32	q7,q13,#24
924bc3d5698SJohn Baldwin	add	r2,r2,r6
925bc3d5698SJohn Baldwin	vshr.u32	q11,q14,#24
926bc3d5698SJohn Baldwin	mov	r14,r14,ror#16
927bc3d5698SJohn Baldwin	vsli.32	q3,q12,#8
928bc3d5698SJohn Baldwin	str	r9,[sp,#4*(16+9)]
929bc3d5698SJohn Baldwin	vsli.32	q7,q13,#8
930bc3d5698SJohn Baldwin	ldr	r9,[sp,#4*(16+11)]
931bc3d5698SJohn Baldwin	vsli.32	q11,q14,#8
932bc3d5698SJohn Baldwin	add	r3,r3,r7
933bc3d5698SJohn Baldwin	vadd.i32	q2,q2,q3
934bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
935bc3d5698SJohn Baldwin	vadd.i32	q6,q6,q7
936bc3d5698SJohn Baldwin	eor	r14,r14,r2,ror#16
937bc3d5698SJohn Baldwin	vadd.i32	q10,q10,q11
938bc3d5698SJohn Baldwin	eor	r10,r10,r3,ror#16
939bc3d5698SJohn Baldwin	veor	q12,q1,q2
940bc3d5698SJohn Baldwin	add	r8,r8,r14
941bc3d5698SJohn Baldwin	veor	q13,q5,q6
942bc3d5698SJohn Baldwin	mov	r6,r6,ror#20
943bc3d5698SJohn Baldwin	veor	q14,q9,q10
944bc3d5698SJohn Baldwin	add	r9,r9,r10
945bc3d5698SJohn Baldwin	vshr.u32	q1,q12,#25
946bc3d5698SJohn Baldwin	mov	r7,r7,ror#20
947bc3d5698SJohn Baldwin	vshr.u32	q5,q13,#25
948bc3d5698SJohn Baldwin	eor	r6,r6,r8,ror#20
949bc3d5698SJohn Baldwin	vshr.u32	q9,q14,#25
950bc3d5698SJohn Baldwin	eor	r7,r7,r9,ror#20
951bc3d5698SJohn Baldwin	vsli.32	q1,q12,#7
952bc3d5698SJohn Baldwin	add	r2,r2,r6
953bc3d5698SJohn Baldwin	vsli.32	q5,q13,#7
954bc3d5698SJohn Baldwin	mov	r14,r14,ror#24
955bc3d5698SJohn Baldwin	vsli.32	q9,q14,#7
956bc3d5698SJohn Baldwin	add	r3,r3,r7
957bc3d5698SJohn Baldwin	vext.8	q2,q2,q2,#8
958bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
959bc3d5698SJohn Baldwin	vext.8	q6,q6,q6,#8
960bc3d5698SJohn Baldwin	eor	r14,r14,r2,ror#24
961bc3d5698SJohn Baldwin	vext.8	q10,q10,q10,#8
962bc3d5698SJohn Baldwin	eor	r10,r10,r3,ror#24
963bc3d5698SJohn Baldwin	vext.8	q1,q1,q1,#4
964bc3d5698SJohn Baldwin	add	r8,r8,r14
965bc3d5698SJohn Baldwin	vext.8	q5,q5,q5,#4
966bc3d5698SJohn Baldwin	mov	r6,r6,ror#25
967bc3d5698SJohn Baldwin	vext.8	q9,q9,q9,#4
968bc3d5698SJohn Baldwin	add	r9,r9,r10
969bc3d5698SJohn Baldwin	vext.8	q3,q3,q3,#12
970bc3d5698SJohn Baldwin	mov	r7,r7,ror#25
971bc3d5698SJohn Baldwin	vext.8	q7,q7,q7,#12
972bc3d5698SJohn Baldwin	eor	r6,r6,r8,ror#25
973bc3d5698SJohn Baldwin	vext.8	q11,q11,q11,#12
974bc3d5698SJohn Baldwin	eor	r7,r7,r9,ror#25
975bc3d5698SJohn Baldwin	vadd.i32	q0,q0,q1
976bc3d5698SJohn Baldwin	add	r0,r0,r5
977bc3d5698SJohn Baldwin	vadd.i32	q4,q4,q5
978bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
979bc3d5698SJohn Baldwin	vadd.i32	q8,q8,q9
980bc3d5698SJohn Baldwin	add	r1,r1,r6
981bc3d5698SJohn Baldwin	veor	q3,q3,q0
982bc3d5698SJohn Baldwin	mov	r12,r12,ror#16
983bc3d5698SJohn Baldwin	veor	q7,q7,q4
984bc3d5698SJohn Baldwin	eor	r10,r10,r0,ror#16
985bc3d5698SJohn Baldwin	veor	q11,q11,q8
986bc3d5698SJohn Baldwin	eor	r12,r12,r1,ror#16
987bc3d5698SJohn Baldwin	vrev32.16	q3,q3
988bc3d5698SJohn Baldwin	add	r8,r8,r10
989bc3d5698SJohn Baldwin	vrev32.16	q7,q7
990bc3d5698SJohn Baldwin	mov	r5,r5,ror#20
991bc3d5698SJohn Baldwin	vrev32.16	q11,q11
992bc3d5698SJohn Baldwin	add	r9,r9,r12
993bc3d5698SJohn Baldwin	vadd.i32	q2,q2,q3
994bc3d5698SJohn Baldwin	mov	r6,r6,ror#20
995bc3d5698SJohn Baldwin	vadd.i32	q6,q6,q7
996bc3d5698SJohn Baldwin	eor	r5,r5,r8,ror#20
997bc3d5698SJohn Baldwin	vadd.i32	q10,q10,q11
998bc3d5698SJohn Baldwin	eor	r6,r6,r9,ror#20
999bc3d5698SJohn Baldwin	veor	q12,q1,q2
1000bc3d5698SJohn Baldwin	add	r0,r0,r5
1001bc3d5698SJohn Baldwin	veor	q13,q5,q6
1002bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
1003bc3d5698SJohn Baldwin	veor	q14,q9,q10
1004bc3d5698SJohn Baldwin	add	r1,r1,r6
1005bc3d5698SJohn Baldwin	vshr.u32	q1,q12,#20
1006bc3d5698SJohn Baldwin	mov	r12,r12,ror#24
1007bc3d5698SJohn Baldwin	vshr.u32	q5,q13,#20
1008bc3d5698SJohn Baldwin	eor	r10,r10,r0,ror#24
1009bc3d5698SJohn Baldwin	vshr.u32	q9,q14,#20
1010bc3d5698SJohn Baldwin	eor	r12,r12,r1,ror#24
1011bc3d5698SJohn Baldwin	vsli.32	q1,q12,#12
1012bc3d5698SJohn Baldwin	add	r8,r8,r10
1013bc3d5698SJohn Baldwin	vsli.32	q5,q13,#12
1014bc3d5698SJohn Baldwin	mov	r5,r5,ror#25
1015bc3d5698SJohn Baldwin	vsli.32	q9,q14,#12
1016bc3d5698SJohn Baldwin	str	r10,[sp,#4*(16+15)]
1017bc3d5698SJohn Baldwin	vadd.i32	q0,q0,q1
1018bc3d5698SJohn Baldwin	ldr	r10,[sp,#4*(16+13)]
1019bc3d5698SJohn Baldwin	vadd.i32	q4,q4,q5
1020bc3d5698SJohn Baldwin	add	r9,r9,r12
1021bc3d5698SJohn Baldwin	vadd.i32	q8,q8,q9
1022bc3d5698SJohn Baldwin	mov	r6,r6,ror#25
1023bc3d5698SJohn Baldwin	veor	q12,q3,q0
1024bc3d5698SJohn Baldwin	eor	r5,r5,r8,ror#25
1025bc3d5698SJohn Baldwin	veor	q13,q7,q4
1026bc3d5698SJohn Baldwin	eor	r6,r6,r9,ror#25
1027bc3d5698SJohn Baldwin	veor	q14,q11,q8
1028bc3d5698SJohn Baldwin	str	r8,[sp,#4*(16+10)]
1029bc3d5698SJohn Baldwin	vshr.u32	q3,q12,#24
1030bc3d5698SJohn Baldwin	ldr	r8,[sp,#4*(16+8)]
1031bc3d5698SJohn Baldwin	vshr.u32	q7,q13,#24
1032bc3d5698SJohn Baldwin	add	r2,r2,r7
1033bc3d5698SJohn Baldwin	vshr.u32	q11,q14,#24
1034bc3d5698SJohn Baldwin	mov	r10,r10,ror#16
1035bc3d5698SJohn Baldwin	vsli.32	q3,q12,#8
1036bc3d5698SJohn Baldwin	str	r9,[sp,#4*(16+11)]
1037bc3d5698SJohn Baldwin	vsli.32	q7,q13,#8
1038bc3d5698SJohn Baldwin	ldr	r9,[sp,#4*(16+9)]
1039bc3d5698SJohn Baldwin	vsli.32	q11,q14,#8
1040bc3d5698SJohn Baldwin	add	r3,r3,r4
1041bc3d5698SJohn Baldwin	vadd.i32	q2,q2,q3
1042bc3d5698SJohn Baldwin	mov	r14,r14,ror#16
1043bc3d5698SJohn Baldwin	vadd.i32	q6,q6,q7
1044bc3d5698SJohn Baldwin	eor	r10,r10,r2,ror#16
1045bc3d5698SJohn Baldwin	vadd.i32	q10,q10,q11
1046bc3d5698SJohn Baldwin	eor	r14,r14,r3,ror#16
1047bc3d5698SJohn Baldwin	veor	q12,q1,q2
1048bc3d5698SJohn Baldwin	add	r8,r8,r10
1049bc3d5698SJohn Baldwin	veor	q13,q5,q6
1050bc3d5698SJohn Baldwin	mov	r7,r7,ror#20
1051bc3d5698SJohn Baldwin	veor	q14,q9,q10
1052bc3d5698SJohn Baldwin	add	r9,r9,r14
1053bc3d5698SJohn Baldwin	vshr.u32	q1,q12,#25
1054bc3d5698SJohn Baldwin	mov	r4,r4,ror#20
1055bc3d5698SJohn Baldwin	vshr.u32	q5,q13,#25
1056bc3d5698SJohn Baldwin	eor	r7,r7,r8,ror#20
1057bc3d5698SJohn Baldwin	vshr.u32	q9,q14,#25
1058bc3d5698SJohn Baldwin	eor	r4,r4,r9,ror#20
1059bc3d5698SJohn Baldwin	vsli.32	q1,q12,#7
1060bc3d5698SJohn Baldwin	add	r2,r2,r7
1061bc3d5698SJohn Baldwin	vsli.32	q5,q13,#7
1062bc3d5698SJohn Baldwin	mov	r10,r10,ror#24
1063bc3d5698SJohn Baldwin	vsli.32	q9,q14,#7
1064bc3d5698SJohn Baldwin	add	r3,r3,r4
1065bc3d5698SJohn Baldwin	vext.8	q2,q2,q2,#8
1066bc3d5698SJohn Baldwin	mov	r14,r14,ror#24
1067bc3d5698SJohn Baldwin	vext.8	q6,q6,q6,#8
1068bc3d5698SJohn Baldwin	eor	r10,r10,r2,ror#24
1069bc3d5698SJohn Baldwin	vext.8	q10,q10,q10,#8
1070bc3d5698SJohn Baldwin	eor	r14,r14,r3,ror#24
1071bc3d5698SJohn Baldwin	vext.8	q1,q1,q1,#12
1072bc3d5698SJohn Baldwin	add	r8,r8,r10
1073bc3d5698SJohn Baldwin	vext.8	q5,q5,q5,#12
1074bc3d5698SJohn Baldwin	mov	r7,r7,ror#25
1075bc3d5698SJohn Baldwin	vext.8	q9,q9,q9,#12
1076bc3d5698SJohn Baldwin	add	r9,r9,r14
1077bc3d5698SJohn Baldwin	vext.8	q3,q3,q3,#4
1078bc3d5698SJohn Baldwin	mov	r4,r4,ror#25
1079bc3d5698SJohn Baldwin	vext.8	q7,q7,q7,#4
1080bc3d5698SJohn Baldwin	eor	r7,r7,r8,ror#25
1081bc3d5698SJohn Baldwin	vext.8	q11,q11,q11,#4
1082bc3d5698SJohn Baldwin	eor	r4,r4,r9,ror#25
1083bc3d5698SJohn Baldwin	bne	.Loop_neon
1084bc3d5698SJohn Baldwin
@ Rounds complete: reload the key/counter material saved on the stack at
@ entry (q12-q15) and add it into each block's state; d24/d26 hold the
@ "one"/"two" increments stashed in the prologue for the per-block counters.
1085bc3d5698SJohn Baldwin	add	r11,sp,#32
1086bc3d5698SJohn Baldwin	vld1.32	{q12,q13},[sp]		@ load key material
1087bc3d5698SJohn Baldwin	vld1.32	{q14,q15},[r11]
1088bc3d5698SJohn Baldwin
1089bc3d5698SJohn Baldwin	ldr	r11,[sp,#4*(32+2)]	@ load len
1090bc3d5698SJohn Baldwin
1091bc3d5698SJohn Baldwin	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
1092bc3d5698SJohn Baldwin	str	r9, [sp,#4*(16+9)]
1093bc3d5698SJohn Baldwin	str	r12,[sp,#4*(16+12)]
1094bc3d5698SJohn Baldwin	str	r10, [sp,#4*(16+13)]
1095bc3d5698SJohn Baldwin	str	r14,[sp,#4*(16+14)]
1096bc3d5698SJohn Baldwin
1097bc3d5698SJohn Baldwin	@ at this point we have first half of 512-bit result in
1098bc3d5698SJohn Baldwin	@ rx and second half at sp+4*(16+8)
1099bc3d5698SJohn Baldwin
1100bc3d5698SJohn Baldwin	ldr	r12,[sp,#4*(32+1)]	@ load inp
1101bc3d5698SJohn Baldwin	ldr	r14,[sp,#4*(32+0)]	@ load out
1102bc3d5698SJohn Baldwin
1103bc3d5698SJohn Baldwin	vadd.i32	q0,q0,q12		@ accumulate key material
1104bc3d5698SJohn Baldwin	vadd.i32	q4,q4,q12
1105bc3d5698SJohn Baldwin	vadd.i32	q8,q8,q12
1106bc3d5698SJohn Baldwin	vldr	d24,[sp,#4*(16+0)]	@ one
1107bc3d5698SJohn Baldwin
1108bc3d5698SJohn Baldwin	vadd.i32	q1,q1,q13
1109bc3d5698SJohn Baldwin	vadd.i32	q5,q5,q13
1110bc3d5698SJohn Baldwin	vadd.i32	q9,q9,q13
1111bc3d5698SJohn Baldwin	vldr	d26,[sp,#4*(16+2)]	@ two
1112bc3d5698SJohn Baldwin
1113bc3d5698SJohn Baldwin	vadd.i32	q2,q2,q14
1114bc3d5698SJohn Baldwin	vadd.i32	q6,q6,q14
1115bc3d5698SJohn Baldwin	vadd.i32	q10,q10,q14
1116bc3d5698SJohn Baldwin	vadd.i32	d14,d14,d24	@ counter+1
1117bc3d5698SJohn Baldwin	vadd.i32	d22,d22,d26	@ counter+2
1118bc3d5698SJohn Baldwin
1119bc3d5698SJohn Baldwin	vadd.i32	q3,q3,q15
1120bc3d5698SJohn Baldwin	vadd.i32	q7,q7,q15
1121bc3d5698SJohn Baldwin	vadd.i32	q11,q11,q15
1122bc3d5698SJohn Baldwin
@ A full 256-byte pass is only possible with len >= 64*4; otherwise fall
@ into the tail dispatcher below.
1123bc3d5698SJohn Baldwin	cmp	r11,#64*4
1124bc3d5698SJohn Baldwin	blo	.Ltail_neon
1125bc3d5698SJohn Baldwin
1126bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!	@ load input
1127bc3d5698SJohn Baldwin	mov	r11,sp
1128bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1129bc3d5698SJohn Baldwin	veor	q0,q0,q12		@ xor with input
1130bc3d5698SJohn Baldwin	veor	q1,q1,q13
1131bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1132bc3d5698SJohn Baldwin	veor	q2,q2,q14
1133bc3d5698SJohn Baldwin	veor	q3,q3,q15
1134bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1135bc3d5698SJohn Baldwin
1136bc3d5698SJohn Baldwin	veor	q4,q4,q12
1137bc3d5698SJohn Baldwin	vst1.8	{q0,q1},[r14]!	@ store output
1138bc3d5698SJohn Baldwin	veor	q5,q5,q13
1139bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1140bc3d5698SJohn Baldwin	veor	q6,q6,q14
1141bc3d5698SJohn Baldwin	vst1.8	{q2,q3},[r14]!
1142bc3d5698SJohn Baldwin	veor	q7,q7,q15
1143bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1144bc3d5698SJohn Baldwin
1145bc3d5698SJohn Baldwin	veor	q8,q8,q12
1146bc3d5698SJohn Baldwin	vld1.32	{q0,q1},[r11]!	@ load for next iteration
1147bc3d5698SJohn Baldwin	veor	d25,d25,d25
1148bc3d5698SJohn Baldwin	vldr	d24,[sp,#4*(16+4)]	@ four
1149bc3d5698SJohn Baldwin	veor	q9,q9,q13
1150bc3d5698SJohn Baldwin	vld1.32	{q2,q3},[r11]
1151bc3d5698SJohn Baldwin	veor	q10,q10,q14
1152bc3d5698SJohn Baldwin	vst1.8	{q4,q5},[r14]!
1153bc3d5698SJohn Baldwin	veor	q11,q11,q15
1154bc3d5698SJohn Baldwin	vst1.8	{q6,q7},[r14]!
1155bc3d5698SJohn Baldwin
1156bc3d5698SJohn Baldwin	vadd.i32	d6,d6,d24	@ next counter value
1157bc3d5698SJohn Baldwin	vldr	d24,[sp,#4*(16+0)]	@ one
1158bc3d5698SJohn Baldwin
@ Integer block: add key material, XOR with input word-by-word and store,
@ 4 words at a time; the interleaved vst1.8 flush the remaining NEON blocks.
1159bc3d5698SJohn Baldwin	ldmia	sp,{r8,r9,r10,r11}	@ load key material
1160bc3d5698SJohn Baldwin	add	r0,r0,r8	@ accumulate key material
1161bc3d5698SJohn Baldwin	ldr	r8,[r12],#16		@ load input
1162bc3d5698SJohn Baldwin	vst1.8	{q8,q9},[r14]!
1163bc3d5698SJohn Baldwin	add	r1,r1,r9
1164bc3d5698SJohn Baldwin	ldr	r9,[r12,#-12]
1165bc3d5698SJohn Baldwin	vst1.8	{q10,q11},[r14]!
1166bc3d5698SJohn Baldwin	add	r2,r2,r10
1167bc3d5698SJohn Baldwin	ldr	r10,[r12,#-8]
1168bc3d5698SJohn Baldwin	add	r3,r3,r11
1169bc3d5698SJohn Baldwin	ldr	r11,[r12,#-4]
1170bc3d5698SJohn Baldwin# ifdef	__ARMEB__
1171bc3d5698SJohn Baldwin	rev	r0,r0
1172bc3d5698SJohn Baldwin	rev	r1,r1
1173bc3d5698SJohn Baldwin	rev	r2,r2
1174bc3d5698SJohn Baldwin	rev	r3,r3
1175bc3d5698SJohn Baldwin# endif
1176bc3d5698SJohn Baldwin	eor	r0,r0,r8	@ xor with input
1177bc3d5698SJohn Baldwin	add	r8,sp,#4*(4)
1178bc3d5698SJohn Baldwin	eor	r1,r1,r9
1179bc3d5698SJohn Baldwin	str	r0,[r14],#16		@ store output
1180bc3d5698SJohn Baldwin	eor	r2,r2,r10
1181bc3d5698SJohn Baldwin	str	r1,[r14,#-12]
1182bc3d5698SJohn Baldwin	eor	r3,r3,r11
1183bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
1184bc3d5698SJohn Baldwin	str	r2,[r14,#-8]
1185bc3d5698SJohn Baldwin	str	r3,[r14,#-4]
1186bc3d5698SJohn Baldwin
1187bc3d5698SJohn Baldwin	add	r4,r4,r8	@ accumulate key material
1188bc3d5698SJohn Baldwin	ldr	r8,[r12],#16		@ load input
1189bc3d5698SJohn Baldwin	add	r5,r5,r9
1190bc3d5698SJohn Baldwin	ldr	r9,[r12,#-12]
1191bc3d5698SJohn Baldwin	add	r6,r6,r10
1192bc3d5698SJohn Baldwin	ldr	r10,[r12,#-8]
1193bc3d5698SJohn Baldwin	add	r7,r7,r11
1194bc3d5698SJohn Baldwin	ldr	r11,[r12,#-4]
1195bc3d5698SJohn Baldwin# ifdef	__ARMEB__
1196bc3d5698SJohn Baldwin	rev	r4,r4
1197bc3d5698SJohn Baldwin	rev	r5,r5
1198bc3d5698SJohn Baldwin	rev	r6,r6
1199bc3d5698SJohn Baldwin	rev	r7,r7
1200bc3d5698SJohn Baldwin# endif
1201bc3d5698SJohn Baldwin	eor	r4,r4,r8
1202bc3d5698SJohn Baldwin	add	r8,sp,#4*(8)
1203bc3d5698SJohn Baldwin	eor	r5,r5,r9
1204bc3d5698SJohn Baldwin	str	r4,[r14],#16		@ store output
1205bc3d5698SJohn Baldwin	eor	r6,r6,r10
1206bc3d5698SJohn Baldwin	str	r5,[r14,#-12]
1207bc3d5698SJohn Baldwin	eor	r7,r7,r11
1208bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
1209bc3d5698SJohn Baldwin	str	r6,[r14,#-8]
1210bc3d5698SJohn Baldwin	add	r0,sp,#4*(16+8)
1211bc3d5698SJohn Baldwin	str	r7,[r14,#-4]
1212bc3d5698SJohn Baldwin
1213bc3d5698SJohn Baldwin	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
1214bc3d5698SJohn Baldwin
1215bc3d5698SJohn Baldwin	add	r0,r0,r8	@ accumulate key material
1216bc3d5698SJohn Baldwin	ldr	r8,[r12],#16		@ load input
1217bc3d5698SJohn Baldwin	add	r1,r1,r9
1218bc3d5698SJohn Baldwin	ldr	r9,[r12,#-12]
1219bc3d5698SJohn Baldwin# ifdef	__thumb2__
1220bc3d5698SJohn Baldwin	it	hi
1221bc3d5698SJohn Baldwin# endif
1222bc3d5698SJohn Baldwin	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
1223bc3d5698SJohn Baldwin	add	r2,r2,r10
1224bc3d5698SJohn Baldwin	ldr	r10,[r12,#-8]
1225bc3d5698SJohn Baldwin# ifdef	__thumb2__
1226bc3d5698SJohn Baldwin	it	hi
1227bc3d5698SJohn Baldwin# endif
1228bc3d5698SJohn Baldwin	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
1229bc3d5698SJohn Baldwin	add	r3,r3,r11
1230bc3d5698SJohn Baldwin	ldr	r11,[r12,#-4]
1231bc3d5698SJohn Baldwin# ifdef	__ARMEB__
1232bc3d5698SJohn Baldwin	rev	r0,r0
1233bc3d5698SJohn Baldwin	rev	r1,r1
1234bc3d5698SJohn Baldwin	rev	r2,r2
1235bc3d5698SJohn Baldwin	rev	r3,r3
1236bc3d5698SJohn Baldwin# endif
1237bc3d5698SJohn Baldwin	eor	r0,r0,r8
1238bc3d5698SJohn Baldwin	add	r8,sp,#4*(12)
1239bc3d5698SJohn Baldwin	eor	r1,r1,r9
1240bc3d5698SJohn Baldwin	str	r0,[r14],#16		@ store output
1241bc3d5698SJohn Baldwin	eor	r2,r2,r10
1242bc3d5698SJohn Baldwin	str	r1,[r14,#-12]
1243bc3d5698SJohn Baldwin	eor	r3,r3,r11
1244bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
1245bc3d5698SJohn Baldwin	str	r2,[r14,#-8]
1246bc3d5698SJohn Baldwin	str	r3,[r14,#-4]
1247bc3d5698SJohn Baldwin
1248bc3d5698SJohn Baldwin	add	r4,r4,r8	@ accumulate key material
1249bc3d5698SJohn Baldwin	add	r8,r8,#4		@ next counter value
1250bc3d5698SJohn Baldwin	add	r5,r5,r9
1251bc3d5698SJohn Baldwin	str	r8,[sp,#4*(12)]	@ save next counter value
1252bc3d5698SJohn Baldwin	ldr	r8,[r12],#16		@ load input
1253bc3d5698SJohn Baldwin	add	r6,r6,r10
1254bc3d5698SJohn Baldwin	add	r4,r4,#3		@ counter+3
1255bc3d5698SJohn Baldwin	ldr	r9,[r12,#-12]
1256bc3d5698SJohn Baldwin	add	r7,r7,r11
1257bc3d5698SJohn Baldwin	ldr	r10,[r12,#-8]
1258bc3d5698SJohn Baldwin	ldr	r11,[r12,#-4]
1259bc3d5698SJohn Baldwin# ifdef	__ARMEB__
1260bc3d5698SJohn Baldwin	rev	r4,r4
1261bc3d5698SJohn Baldwin	rev	r5,r5
1262bc3d5698SJohn Baldwin	rev	r6,r6
1263bc3d5698SJohn Baldwin	rev	r7,r7
1264bc3d5698SJohn Baldwin# endif
1265bc3d5698SJohn Baldwin	eor	r4,r4,r8
1266bc3d5698SJohn Baldwin# ifdef	__thumb2__
1267bc3d5698SJohn Baldwin	it	hi
1268bc3d5698SJohn Baldwin# endif
1269bc3d5698SJohn Baldwin	ldrhi	r8,[sp,#4*(32+2)]	@ re-load len
1270bc3d5698SJohn Baldwin	eor	r5,r5,r9
1271bc3d5698SJohn Baldwin	eor	r6,r6,r10
1272bc3d5698SJohn Baldwin	str	r4,[r14],#16		@ store output
1273bc3d5698SJohn Baldwin	eor	r7,r7,r11
1274bc3d5698SJohn Baldwin	str	r5,[r14,#-12]
1275bc3d5698SJohn Baldwin	sub	r11,r8,#64*4	@ len-=64*4
1276bc3d5698SJohn Baldwin	str	r6,[r14,#-8]
1277bc3d5698SJohn Baldwin	str	r7,[r14,#-4]
1278bc3d5698SJohn Baldwin	bhi	.Loop_neon_outer
1279bc3d5698SJohn Baldwin
1280bc3d5698SJohn Baldwin	b	.Ldone_neon
1281bc3d5698SJohn Baldwin
1282bc3d5698SJohn Baldwin.align	4
1283bc3d5698SJohn Baldwin.Lbreak_neon:
1284bc3d5698SJohn Baldwin	@ harmonize NEON and integer-only stack frames: load data
1285bc3d5698SJohn Baldwin	@ from NEON frame, but save to integer-only one; distance
1286bc3d5698SJohn Baldwin	@ between the two is 4*(32+4+16-32)=4*(20).
1287bc3d5698SJohn Baldwin
1288bc3d5698SJohn Baldwin	str	r11, [sp,#4*(20+32+2)]	@ save len
1289bc3d5698SJohn Baldwin	add	r11,sp,#4*(32+4)
1290bc3d5698SJohn Baldwin	str	r12,   [sp,#4*(20+32+1)]	@ save inp
1291bc3d5698SJohn Baldwin	str	r14,   [sp,#4*(20+32+0)]	@ save out
1292bc3d5698SJohn Baldwin
1293bc3d5698SJohn Baldwin	ldr	r12,[sp,#4*(16+10)]
1294bc3d5698SJohn Baldwin	ldr	r14,[sp,#4*(16+11)]
1295bc3d5698SJohn Baldwin	vldmia	r11,{d8,d9,d10,d11,d12,d13,d14,d15}			@ fulfill ABI requirement
1296bc3d5698SJohn Baldwin	str	r12,[sp,#4*(20+16+10)]	@ copy "rx"
1297bc3d5698SJohn Baldwin	str	r14,[sp,#4*(20+16+11)]	@ copy "rx"
1298bc3d5698SJohn Baldwin
1299bc3d5698SJohn Baldwin	ldr	r11, [sp,#4*(15)]
1300bc3d5698SJohn Baldwin	ldr	r12,[sp,#4*(12)]		@ modulo-scheduled load
1301bc3d5698SJohn Baldwin	ldr	r10, [sp,#4*(13)]
1302bc3d5698SJohn Baldwin	ldr	r14,[sp,#4*(14)]
1303bc3d5698SJohn Baldwin	str	r11, [sp,#4*(20+16+15)]
1304bc3d5698SJohn Baldwin	add	r11,sp,#4*(20)
1305bc3d5698SJohn Baldwin	vst1.32	{q0,q1},[r11]!		@ copy key
1306bc3d5698SJohn Baldwin	add	sp,sp,#4*(20)			@ switch frame
1307bc3d5698SJohn Baldwin	vst1.32	{q2,q3},[r11]
1308bc3d5698SJohn Baldwin	mov	r11,#10
1309bc3d5698SJohn Baldwin	b	.Loop				@ go integer-only
1310bc3d5698SJohn Baldwin
@ Tail: fewer than four whole 64-byte blocks remain.  Dispatch on the count
@ of complete blocks; any final partial block is flushed to the stack and
@ XORed byte-by-byte in .Loop_tail_neon.
1311bc3d5698SJohn Baldwin.align	4
1312bc3d5698SJohn Baldwin.Ltail_neon:
1313bc3d5698SJohn Baldwin	cmp	r11,#64*3
1314bc3d5698SJohn Baldwin	bhs	.L192_or_more_neon
1315bc3d5698SJohn Baldwin	cmp	r11,#64*2
1316bc3d5698SJohn Baldwin	bhs	.L128_or_more_neon
1317bc3d5698SJohn Baldwin	cmp	r11,#64*1
1318bc3d5698SJohn Baldwin	bhs	.L64_or_more_neon
1319bc3d5698SJohn Baldwin
1320bc3d5698SJohn Baldwin	add	r8,sp,#4*(8)
1321bc3d5698SJohn Baldwin	vst1.8	{q0,q1},[sp]
1322bc3d5698SJohn Baldwin	add	r10,sp,#4*(0)
1323bc3d5698SJohn Baldwin	vst1.8	{q2,q3},[r8]
1324bc3d5698SJohn Baldwin	b	.Loop_tail_neon
1325bc3d5698SJohn Baldwin
1326bc3d5698SJohn Baldwin.align	4
1327bc3d5698SJohn Baldwin.L64_or_more_neon:
1328bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1329bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1330bc3d5698SJohn Baldwin	veor	q0,q0,q12
1331bc3d5698SJohn Baldwin	veor	q1,q1,q13
1332bc3d5698SJohn Baldwin	veor	q2,q2,q14
1333bc3d5698SJohn Baldwin	veor	q3,q3,q15
1334bc3d5698SJohn Baldwin	vst1.8	{q0,q1},[r14]!
1335bc3d5698SJohn Baldwin	vst1.8	{q2,q3},[r14]!
1336bc3d5698SJohn Baldwin
1337bc3d5698SJohn Baldwin	beq	.Ldone_neon
1338bc3d5698SJohn Baldwin
1339bc3d5698SJohn Baldwin	add	r8,sp,#4*(8)
1340bc3d5698SJohn Baldwin	vst1.8	{q4,q5},[sp]
1341bc3d5698SJohn Baldwin	add	r10,sp,#4*(0)
1342bc3d5698SJohn Baldwin	vst1.8	{q6,q7},[r8]
1343bc3d5698SJohn Baldwin	sub	r11,r11,#64*1	@ len-=64*1
1344bc3d5698SJohn Baldwin	b	.Loop_tail_neon
1345bc3d5698SJohn Baldwin
1346bc3d5698SJohn Baldwin.align	4
1347bc3d5698SJohn Baldwin.L128_or_more_neon:
1348bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1349bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1350bc3d5698SJohn Baldwin	veor	q0,q0,q12
1351bc3d5698SJohn Baldwin	veor	q1,q1,q13
1352bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1353bc3d5698SJohn Baldwin	veor	q2,q2,q14
1354bc3d5698SJohn Baldwin	veor	q3,q3,q15
1355bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1356bc3d5698SJohn Baldwin
1357bc3d5698SJohn Baldwin	veor	q4,q4,q12
1358bc3d5698SJohn Baldwin	veor	q5,q5,q13
1359bc3d5698SJohn Baldwin	vst1.8	{q0,q1},[r14]!
1360bc3d5698SJohn Baldwin	veor	q6,q6,q14
1361bc3d5698SJohn Baldwin	vst1.8	{q2,q3},[r14]!
1362bc3d5698SJohn Baldwin	veor	q7,q7,q15
1363bc3d5698SJohn Baldwin	vst1.8	{q4,q5},[r14]!
1364bc3d5698SJohn Baldwin	vst1.8	{q6,q7},[r14]!
1365bc3d5698SJohn Baldwin
1366bc3d5698SJohn Baldwin	beq	.Ldone_neon
1367bc3d5698SJohn Baldwin
1368bc3d5698SJohn Baldwin	add	r8,sp,#4*(8)
1369bc3d5698SJohn Baldwin	vst1.8	{q8,q9},[sp]
1370bc3d5698SJohn Baldwin	add	r10,sp,#4*(0)
1371bc3d5698SJohn Baldwin	vst1.8	{q10,q11},[r8]
1372bc3d5698SJohn Baldwin	sub	r11,r11,#64*2	@ len-=64*2
1373bc3d5698SJohn Baldwin	b	.Loop_tail_neon
1374bc3d5698SJohn Baldwin
1375bc3d5698SJohn Baldwin.align	4
1376bc3d5698SJohn Baldwin.L192_or_more_neon:
1377bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1378bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1379bc3d5698SJohn Baldwin	veor	q0,q0,q12
1380bc3d5698SJohn Baldwin	veor	q1,q1,q13
1381bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1382bc3d5698SJohn Baldwin	veor	q2,q2,q14
1383bc3d5698SJohn Baldwin	veor	q3,q3,q15
1384bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1385bc3d5698SJohn Baldwin
1386bc3d5698SJohn Baldwin	veor	q4,q4,q12
1387bc3d5698SJohn Baldwin	veor	q5,q5,q13
1388bc3d5698SJohn Baldwin	vld1.8	{q12,q13},[r12]!
1389bc3d5698SJohn Baldwin	veor	q6,q6,q14
1390bc3d5698SJohn Baldwin	vst1.8	{q0,q1},[r14]!
1391bc3d5698SJohn Baldwin	veor	q7,q7,q15
1392bc3d5698SJohn Baldwin	vld1.8	{q14,q15},[r12]!
1393bc3d5698SJohn Baldwin
1394bc3d5698SJohn Baldwin	veor	q8,q8,q12
1395bc3d5698SJohn Baldwin	vst1.8	{q2,q3},[r14]!
1396bc3d5698SJohn Baldwin	veor	q9,q9,q13
1397bc3d5698SJohn Baldwin	vst1.8	{q4,q5},[r14]!
1398bc3d5698SJohn Baldwin	veor	q10,q10,q14
1399bc3d5698SJohn Baldwin	vst1.8	{q6,q7},[r14]!
1400bc3d5698SJohn Baldwin	veor	q11,q11,q15
1401bc3d5698SJohn Baldwin	vst1.8	{q8,q9},[r14]!
1402bc3d5698SJohn Baldwin	vst1.8	{q10,q11},[r14]!
1403bc3d5698SJohn Baldwin
1404bc3d5698SJohn Baldwin	beq	.Ldone_neon
1405bc3d5698SJohn Baldwin
@ 3 full blocks consumed but bytes remain: finish the integer block's key
@ addition and stage its 64 bytes on the stack for the byte-wise tail loop.
1406bc3d5698SJohn Baldwin	ldmia	sp,{r8,r9,r10,r11}	@ load key material
1407bc3d5698SJohn Baldwin	add	r0,r0,r8	@ accumulate key material
1408bc3d5698SJohn Baldwin	add	r8,sp,#4*(4)
1409bc3d5698SJohn Baldwin	add	r1,r1,r9
1410bc3d5698SJohn Baldwin	add	r2,r2,r10
1411bc3d5698SJohn Baldwin	add	r3,r3,r11
1412bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
1413bc3d5698SJohn Baldwin
1414bc3d5698SJohn Baldwin	add	r4,r4,r8	@ accumulate key material
1415bc3d5698SJohn Baldwin	add	r8,sp,#4*(8)
1416bc3d5698SJohn Baldwin	add	r5,r5,r9
1417bc3d5698SJohn Baldwin	add	r6,r6,r10
1418bc3d5698SJohn Baldwin	add	r7,r7,r11
1419bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
1420bc3d5698SJohn Baldwin# ifdef	__ARMEB__
1421bc3d5698SJohn Baldwin	rev	r0,r0
1422bc3d5698SJohn Baldwin	rev	r1,r1
1423bc3d5698SJohn Baldwin	rev	r2,r2
1424bc3d5698SJohn Baldwin	rev	r3,r3
1425bc3d5698SJohn Baldwin	rev	r4,r4
1426bc3d5698SJohn Baldwin	rev	r5,r5
1427bc3d5698SJohn Baldwin	rev	r6,r6
1428bc3d5698SJohn Baldwin	rev	r7,r7
1429bc3d5698SJohn Baldwin# endif
1430bc3d5698SJohn Baldwin	stmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7}
1431bc3d5698SJohn Baldwin	add	r0,sp,#4*(16+8)
1432bc3d5698SJohn Baldwin
1433bc3d5698SJohn Baldwin	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
1434bc3d5698SJohn Baldwin
1435bc3d5698SJohn Baldwin	add	r0,r0,r8	@ accumulate key material
1436bc3d5698SJohn Baldwin	add	r8,sp,#4*(12)
1437bc3d5698SJohn Baldwin	add	r1,r1,r9
1438bc3d5698SJohn Baldwin	add	r2,r2,r10
1439bc3d5698SJohn Baldwin	add	r3,r3,r11
1440bc3d5698SJohn Baldwin	ldmia	r8,{r8,r9,r10,r11}	@ load key material
1441bc3d5698SJohn Baldwin
1442bc3d5698SJohn Baldwin	add	r4,r4,r8	@ accumulate key material
1443bc3d5698SJohn Baldwin	add	r8,sp,#4*(8)
1444bc3d5698SJohn Baldwin	add	r5,r5,r9
1445bc3d5698SJohn Baldwin	add	r4,r4,#3		@ counter+3
1446bc3d5698SJohn Baldwin	add	r6,r6,r10
1447bc3d5698SJohn Baldwin	add	r7,r7,r11
1448bc3d5698SJohn Baldwin	ldr	r11,[sp,#4*(32+2)]	@ re-load len
1449bc3d5698SJohn Baldwin# ifdef	__ARMEB__
1450bc3d5698SJohn Baldwin	rev	r0,r0
1451bc3d5698SJohn Baldwin	rev	r1,r1
1452bc3d5698SJohn Baldwin	rev	r2,r2
1453bc3d5698SJohn Baldwin	rev	r3,r3
1454bc3d5698SJohn Baldwin	rev	r4,r4
1455bc3d5698SJohn Baldwin	rev	r5,r5
1456bc3d5698SJohn Baldwin	rev	r6,r6
1457bc3d5698SJohn Baldwin	rev	r7,r7
1458bc3d5698SJohn Baldwin# endif
1459bc3d5698SJohn Baldwin	stmia	r8,{r0,r1,r2,r3,r4,r5,r6,r7}
1460bc3d5698SJohn Baldwin	add	r10,sp,#4*(0)
1461bc3d5698SJohn Baldwin	sub	r11,r11,#64*3	@ len-=64*3
1462bc3d5698SJohn Baldwin
@ Byte-wise tail: keystream was staged on the stack above; r10 walks the
@ stack copy, r12 the input, r14 the output, r11 counts remaining bytes.
1463bc3d5698SJohn Baldwin.Loop_tail_neon:
1464bc3d5698SJohn Baldwin	ldrb	r8,[r10],#1	@ read buffer on stack
1465bc3d5698SJohn Baldwin	ldrb	r9,[r12],#1		@ read input
1466bc3d5698SJohn Baldwin	subs	r11,r11,#1
1467bc3d5698SJohn Baldwin	eor	r8,r8,r9
1468bc3d5698SJohn Baldwin	strb	r8,[r14],#1		@ store output
1469bc3d5698SJohn Baldwin	bne	.Loop_tail_neon
1470bc3d5698SJohn Baldwin
@ Epilogue: drop the integer portion of the frame, restore the callee-saved
@ d8-d15 saved in the prologue ("ABI spec says so"), drop the rest of the
@ frame, then pop r4-r11 and return via pc.
1471bc3d5698SJohn Baldwin.Ldone_neon:
1472bc3d5698SJohn Baldwin	add	sp,sp,#4*(32+4)
1473bc3d5698SJohn Baldwin	vldmia	sp,{d8,d9,d10,d11,d12,d13,d14,d15}
1474bc3d5698SJohn Baldwin	add	sp,sp,#4*(16+3)
1475bc3d5698SJohn Baldwin	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
1476bc3d5698SJohn Baldwin.size	ChaCha20_neon,.-ChaCha20_neon
1477bc3d5698SJohn Baldwin.comm	OPENSSL_armcap_P,4,4
1478bc3d5698SJohn Baldwin#endif
1479