xref: /freebsd/sys/crypto/openssl/arm/keccak1600-armv4.S (revision c8e7f78a3d28ff6e6223ed136ada8e1e2f34965e)
1/* Do not modify. This file is auto-generated from keccak1600-armv4.pl. */
2#include "arm_arch.h"
3
4#if defined(__thumb2__)
5.syntax	unified
6.thumb
7#else
8.code	32
9#endif
10
11.text
12
@ iotas32 - Keccak-f[1600] round-constant table, one 8-byte entry per round.
@
@ Layout: 24 entries, each a pair of 32-bit words (192 bytes in total).
@ The round code forms the entry address as iotas32 + counter and reads
@ the first word as iotas[i].lo and the second as iotas[i].hi; the counter
@ is advanced by 16 (two entries) and compared against 192 = 24 * 8.
@ The values are consistent with the bit-interleaved form of the standard
@ 64-bit round constants (even-numbered bits in .lo, odd-numbered in .hi).
@ The table is emitted in .text, where the round code takes its address
@ with "adr".
.type	iotas32, %object
.p2align	5				@ 2^5 = 32-byte boundary
iotas32:
	.word	0x00000001, 0x00000000	@ round  0
	.word	0x00000000, 0x00000089	@ round  1
	.word	0x00000000, 0x8000008b	@ round  2
	.word	0x00000000, 0x80008080	@ round  3
	.word	0x00000001, 0x0000008b	@ round  4
	.word	0x00000001, 0x00008000	@ round  5
	.word	0x00000001, 0x80008088	@ round  6
	.word	0x00000001, 0x80000082	@ round  7
	.word	0x00000000, 0x0000000b	@ round  8
	.word	0x00000000, 0x0000000a	@ round  9
	.word	0x00000001, 0x00008082	@ round 10
	.word	0x00000000, 0x00008003	@ round 11
	.word	0x00000001, 0x0000808b	@ round 12
	.word	0x00000001, 0x8000000b	@ round 13
	.word	0x00000001, 0x8000008a	@ round 14
	.word	0x00000001, 0x80000081	@ round 15
	.word	0x00000000, 0x80000081	@ round 16
	.word	0x00000000, 0x80000008	@ round 17
	.word	0x00000000, 0x00000083	@ round 18
	.word	0x00000000, 0x80008003	@ round 19
	.word	0x00000001, 0x80008088	@ round 20
	.word	0x00000000, 0x80000088	@ round 21
	.word	0x00000001, 0x00008000	@ round 22
	.word	0x00000000, 0x80008082	@ round 23
.size	iotas32, . - iotas32
41
42.type	KeccakF1600_int, %function
43.align	5
44KeccakF1600_int:
45	add	r9,sp,#176
46	add	r12,sp,#0
47	add	r10,sp,#40
48	ldmia	r9,{r4,r5,r6,r7,r8,r9}		@ A[4][2..4]
49KeccakF1600_enter:
50	str	lr,[sp,#440]
51	eor	r11,r11,r11
52	str	r11,[sp,#444]
53	b	.Lround2x
54
55.align	4
56.Lround2x:
57	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
58	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
59#ifdef	__thumb2__
60	eor	r0,r0,r10
61	eor	r1,r1,r11
62	eor	r2,r2,r12
63	ldrd	r10,r11,[sp,#56]
64	eor	r3,r3,r14
65	ldrd	r12,r14,[sp,#64]
66	eor	r4,r4,r10
67	eor	r5,r5,r11
68	eor	r6,r6,r12
69	ldrd	r10,r11,[sp,#72]
70	eor	r7,r7,r14
71	ldrd	r12,r14,[sp,#80]
72	eor	r8,r8,r10
73	eor	r9,r9,r11
74	eor	r0,r0,r12
75	ldrd	r10,r11,[sp,#88]
76	eor	r1,r1,r14
77	ldrd	r12,r14,[sp,#96]
78	eor	r2,r2,r10
79	eor	r3,r3,r11
80	eor	r4,r4,r12
81	ldrd	r10,r11,[sp,#104]
82	eor	r5,r5,r14
83	ldrd	r12,r14,[sp,#112]
84	eor	r6,r6,r10
85	eor	r7,r7,r11
86	eor	r8,r8,r12
87	ldrd	r10,r11,[sp,#120]
88	eor	r9,r9,r14
89	ldrd	r12,r14,[sp,#128]
90	eor	r0,r0,r10
91	eor	r1,r1,r11
92	eor	r2,r2,r12
93	ldrd	r10,r11,[sp,#136]
94	eor	r3,r3,r14
95	ldrd	r12,r14,[sp,#144]
96	eor	r4,r4,r10
97	eor	r5,r5,r11
98	eor	r6,r6,r12
99	ldrd	r10,r11,[sp,#152]
100	eor	r7,r7,r14
101	ldrd	r12,r14,[sp,#160]
102	eor	r8,r8,r10
103	eor	r9,r9,r11
104	eor	r0,r0,r12
105	ldrd	r10,r11,[sp,#168]
106	eor	r1,r1,r14
107	ldrd	r12,r14,[sp,#16]
108	eor	r2,r2,r10
109	eor	r3,r3,r11
110	eor	r4,r4,r12
111	ldrd	r10,r11,[sp,#24]
112	eor	r5,r5,r14
113	ldrd	r12,r14,[sp,#32]
114#else
115	eor	r0,r0,r10
116	add	r10,sp,#56
117	eor	r1,r1,r11
118	eor	r2,r2,r12
119	eor	r3,r3,r14
120	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
121	eor	r4,r4,r10
122	add	r10,sp,#72
123	eor	r5,r5,r11
124	eor	r6,r6,r12
125	eor	r7,r7,r14
126	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
127	eor	r8,r8,r10
128	add	r10,sp,#88
129	eor	r9,r9,r11
130	eor	r0,r0,r12
131	eor	r1,r1,r14
132	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
133	eor	r2,r2,r10
134	add	r10,sp,#104
135	eor	r3,r3,r11
136	eor	r4,r4,r12
137	eor	r5,r5,r14
138	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
139	eor	r6,r6,r10
140	add	r10,sp,#120
141	eor	r7,r7,r11
142	eor	r8,r8,r12
143	eor	r9,r9,r14
144	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
145	eor	r0,r0,r10
146	add	r10,sp,#136
147	eor	r1,r1,r11
148	eor	r2,r2,r12
149	eor	r3,r3,r14
150	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
151	eor	r4,r4,r10
152	add	r10,sp,#152
153	eor	r5,r5,r11
154	eor	r6,r6,r12
155	eor	r7,r7,r14
156	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
157	eor	r8,r8,r10
158	ldr	r10,[sp,#168]		@ A[4][1]
159	eor	r9,r9,r11
160	ldr	r11,[sp,#168+4]
161	eor	r0,r0,r12
162	ldr	r12,[sp,#16]		@ A[0][2]
163	eor	r1,r1,r14
164	ldr	r14,[sp,#16+4]
165	eor	r2,r2,r10
166	add	r10,sp,#24
167	eor	r3,r3,r11
168	eor	r4,r4,r12
169	eor	r5,r5,r14
170	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
171#endif
172	eor	r6,r6,r10
173	eor	r7,r7,r11
174	eor	r8,r8,r12
175	eor	r9,r9,r14
176
177	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
178#ifndef	__thumb2__
179	str	r10,[sp,#208]		@ D[1] = E[0]
180#endif
181	eor	r11,r1,r4
182#ifndef	__thumb2__
183	str	r11,[sp,#208+4]
184#else
185	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
186#endif
187	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
188	eor	r14,r7,r0
189#ifndef	__thumb2__
190	str	r12,[sp,#232]		@ D[4] = E[1]
191#endif
192	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
193#ifndef	__thumb2__
194	str	r14,[sp,#232+4]
195#else
196	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
197#endif
198	eor	r1,r9,r2
199#ifndef	__thumb2__
200	str	r0,[sp,#200]		@ D[0] = C[0]
201#endif
202	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
203#ifndef	__thumb2__
204	ldr	r7,[sp,#144]
205#endif
206	eor	r3,r3,r6
207#ifndef	__thumb2__
208	str	r1,[sp,#200+4]
209#else
210	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
211#endif
212#ifndef	__thumb2__
213	ldr	r6,[sp,#144+4]
214#else
215	ldrd	r7,r6,[sp,#144]
216#endif
217#ifndef	__thumb2__
218	str	r2,[sp,#216]		@ D[2] = C[1]
219#endif
220	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
221#ifndef	__thumb2__
222	str	r3,[sp,#216+4]
223#else
224	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
225#endif
226	eor	r5,r5,r8
227
228#ifndef	__thumb2__
229	ldr	r8,[sp,#192]
230#endif
231#ifndef	__thumb2__
232	ldr	r9,[sp,#192+4]
233#else
234	ldrd	r8,r9,[sp,#192]
235#endif
236#ifndef	__thumb2__
237	str	r4,[sp,#224]		@ D[3] = C[2]
238#endif
239	eor	r7,r7,r4
240#ifndef	__thumb2__
241	str	r5,[sp,#224+4]
242#else
243	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
244#endif
245	eor	r6,r6,r5
246#ifndef	__thumb2__
247	ldr	r4,[sp,#0]
248#endif
249	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
250	@ mov	r6,r6,ror#32-11
251#ifndef	__thumb2__
252	ldr	r5,[sp,#0+4]
253#else
254	ldrd	r4,r5,[sp,#0]
255#endif
256	eor	r8,r8,r12
257	eor	r9,r9,r14
258#ifndef	__thumb2__
259	ldr	r12,[sp,#96]
260#endif
261	eor	r0,r0,r4
262#ifndef	__thumb2__
263	ldr	r14,[sp,#96+4]
264#else
265	ldrd	r12,r14,[sp,#96]
266#endif
267	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
268	@ mov	r9,r9,ror#32-7
269	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
270	eor	r12,r12,r2
271#ifndef	__thumb2__
272	ldr	r2,[sp,#48]
273#endif
274	eor	r14,r14,r3
275#ifndef	__thumb2__
276	ldr	r3,[sp,#48+4]
277#else
278	ldrd	r2,r3,[sp,#48]
279#endif
280	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
281	ldr	r12,[sp,#444]			@ load counter
282	eor	r2,r2,r10
283	adr	r10,iotas32
284	mov	r4,r14,ror#32-22
285	add	r14,r10,r12
286	eor	r3,r3,r11
287	ldmia	r14,{r10,r11}		@ iotas[i]
288	bic	r12,r4,r2,ror#32-22
289	bic	r14,r5,r3,ror#32-22
290	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
291	mov	r3,r3,ror#32-22
292	eor	r12,r12,r0
293	eor	r14,r14,r1
294	eor	r10,r10,r12
295	eor	r11,r11,r14
296#ifndef	__thumb2__
297	str	r10,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
298#endif
299	bic	r12,r6,r4,ror#11
300#ifndef	__thumb2__
301	str	r11,[sp,#240+4]
302#else
303	strd	r10,r11,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
304#endif
305	bic	r14,r7,r5,ror#10
306	bic	r10,r8,r6,ror#32-(11-7)
307	bic	r11,r9,r7,ror#32-(10-7)
308	eor	r12,r2,r12,ror#32-11
309#ifndef	__thumb2__
310	str	r12,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
311#endif
312	eor	r14,r3,r14,ror#32-10
313#ifndef	__thumb2__
314	str	r14,[sp,#248+4]
315#else
316	strd	r12,r14,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
317#endif
318	eor	r10,r4,r10,ror#32-7
319	eor	r11,r5,r11,ror#32-7
320#ifndef	__thumb2__
321	str	r10,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
322#endif
323	bic	r12,r0,r8,ror#32-7
324#ifndef	__thumb2__
325	str	r11,[sp,#256+4]
326#else
327	strd	r10,r11,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
328#endif
329	bic	r14,r1,r9,ror#32-7
330	eor	r12,r12,r6,ror#32-11
331#ifndef	__thumb2__
332	str	r12,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
333#endif
334	eor	r14,r14,r7,ror#32-10
335#ifndef	__thumb2__
336	str	r14,[sp,#264+4]
337#else
338	strd	r12,r14,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
339#endif
340	bic	r10,r2,r0
341	add	r14,sp,#224
342#ifndef	__thumb2__
343	ldr	r0,[sp,#24]		@ A[0][3]
344#endif
345	bic	r11,r3,r1
346#ifndef	__thumb2__
347	ldr	r1,[sp,#24+4]
348#else
349	ldrd	r0,r1,[sp,#24]		@ A[0][3]
350#endif
351	eor	r10,r10,r8,ror#32-7
352	eor	r11,r11,r9,ror#32-7
353#ifndef	__thumb2__
354	str	r10,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
355#endif
356	add	r9,sp,#200
357#ifndef	__thumb2__
358	str	r11,[sp,#272+4]
359#else
360	strd	r10,r11,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
361#endif
362
363	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
364	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
365
366#ifndef	__thumb2__
367	ldr	r2,[sp,#72]		@ A[1][4]
368#endif
369	eor	r0,r0,r10
370#ifndef	__thumb2__
371	ldr	r3,[sp,#72+4]
372#else
373	ldrd	r2,r3,[sp,#72]		@ A[1][4]
374#endif
375	eor	r1,r1,r11
376	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
377#ifndef	__thumb2__
378	ldr	r10,[sp,#128]		@ A[3][1]
379#endif
380	@ mov	r1,r1,ror#32-14
381#ifndef	__thumb2__
382	ldr	r11,[sp,#128+4]
383#else
384	ldrd	r10,r11,[sp,#128]		@ A[3][1]
385#endif
386
387	eor	r2,r2,r12
388#ifndef	__thumb2__
389	ldr	r4,[sp,#80]		@ A[2][0]
390#endif
391	eor	r3,r3,r14
392#ifndef	__thumb2__
393	ldr	r5,[sp,#80+4]
394#else
395	ldrd	r4,r5,[sp,#80]		@ A[2][0]
396#endif
397	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
398	@ mov	r3,r3,ror#32-10
399
400	eor	r6,r6,r4
401#ifndef	__thumb2__
402	ldr	r12,[sp,#216]		@ D[2]
403#endif
404	eor	r7,r7,r5
405#ifndef	__thumb2__
406	ldr	r14,[sp,#216+4]
407#else
408	ldrd	r12,r14,[sp,#216]		@ D[2]
409#endif
410	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
411	mov	r4,r7,ror#32-2
412
413	eor	r10,r10,r8
414#ifndef	__thumb2__
415	ldr	r8,[sp,#176]		@ A[4][2]
416#endif
417	eor	r11,r11,r9
418#ifndef	__thumb2__
419	ldr	r9,[sp,#176+4]
420#else
421	ldrd	r8,r9,[sp,#176]		@ A[4][2]
422#endif
423	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
424	mov	r6,r11,ror#32-23
425
426	bic	r10,r4,r2,ror#32-10
427	bic	r11,r5,r3,ror#32-10
428	eor	r12,r12,r8
429	eor	r14,r14,r9
430	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
431	mov	r8,r14,ror#32-31
432	eor	r10,r10,r0,ror#32-14
433	eor	r11,r11,r1,ror#32-14
434#ifndef	__thumb2__
435	str	r10,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
436#endif
437	bic	r12,r6,r4
438#ifndef	__thumb2__
439	str	r11,[sp,#280+4]
440#else
441	strd	r10,r11,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
442#endif
443	bic	r14,r7,r5
444	eor	r12,r12,r2,ror#32-10
445#ifndef	__thumb2__
446	str	r12,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
447#endif
448	eor	r14,r14,r3,ror#32-10
449#ifndef	__thumb2__
450	str	r14,[sp,#288+4]
451#else
452	strd	r12,r14,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
453#endif
454	bic	r10,r8,r6
455	bic	r11,r9,r7
456	bic	r12,r0,r8,ror#14
457	bic	r14,r1,r9,ror#14
458	eor	r10,r10,r4
459	eor	r11,r11,r5
460#ifndef	__thumb2__
461	str	r10,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
462#endif
463	bic	r2,r2,r0,ror#32-(14-10)
464#ifndef	__thumb2__
465	str	r11,[sp,#296+4]
466#else
467	strd	r10,r11,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
468#endif
469	eor	r12,r6,r12,ror#32-14
470	bic	r11,r3,r1,ror#32-(14-10)
471#ifndef	__thumb2__
472	str	r12,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
473#endif
474	eor	r14,r7,r14,ror#32-14
475#ifndef	__thumb2__
476	str	r14,[sp,#304+4]
477#else
478	strd	r12,r14,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
479#endif
480	add	r12,sp,#208
481#ifndef	__thumb2__
482	ldr	r1,[sp,#8]		@ A[0][1]
483#endif
484	eor	r10,r8,r2,ror#32-10
485#ifndef	__thumb2__
486	ldr	r0,[sp,#8+4]
487#else
488	ldrd	r1,r0,[sp,#8]		@ A[0][1]
489#endif
490	eor	r11,r9,r11,ror#32-10
491#ifndef	__thumb2__
492	str	r10,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
493#endif
494#ifndef	__thumb2__
495	str	r11,[sp,#312+4]
496#else
497	strd	r10,r11,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
498#endif
499
500	add	r9,sp,#224
501	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
502#ifndef	__thumb2__
503	ldr	r2,[sp,#56]		@ A[1][2]
504#endif
505#ifndef	__thumb2__
506	ldr	r3,[sp,#56+4]
507#else
508	ldrd	r2,r3,[sp,#56]		@ A[1][2]
509#endif
510	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
511
512	eor	r1,r1,r10
513#ifndef	__thumb2__
514	ldr	r4,[sp,#104]		@ A[2][3]
515#endif
516	eor	r0,r0,r11
517#ifndef	__thumb2__
518	ldr	r5,[sp,#104+4]
519#else
520	ldrd	r4,r5,[sp,#104]		@ A[2][3]
521#endif
522	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
523
524	eor	r2,r2,r12
525#ifndef	__thumb2__
526	ldr	r10,[sp,#152]		@ A[3][4]
527#endif
528	eor	r3,r3,r14
529#ifndef	__thumb2__
530	ldr	r11,[sp,#152+4]
531#else
532	ldrd	r10,r11,[sp,#152]		@ A[3][4]
533#endif
534	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
535#ifndef	__thumb2__
536	ldr	r12,[sp,#200]		@ D[0]
537#endif
538	@ mov	r3,r3,ror#32-3
539#ifndef	__thumb2__
540	ldr	r14,[sp,#200+4]
541#else
542	ldrd	r12,r14,[sp,#200]		@ D[0]
543#endif
544
545	eor	r4,r4,r6
546	eor	r5,r5,r7
547	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
548	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
549
550	eor	r10,r10,r8
551#ifndef	__thumb2__
552	ldr	r8,[sp,#160]		@ A[4][0]
553#endif
554	eor	r11,r11,r9
555#ifndef	__thumb2__
556	ldr	r9,[sp,#160+4]
557#else
558	ldrd	r8,r9,[sp,#160]		@ A[4][0]
559#endif
560	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
561	mov	r7,r11,ror#32-4
562
563	eor	r12,r12,r8
564	eor	r14,r14,r9
565	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
566	mov	r9,r14,ror#32-9
567
568	bic	r10,r5,r2,ror#13-3
569	bic	r11,r4,r3,ror#12-3
570	bic	r12,r6,r5,ror#32-13
571	bic	r14,r7,r4,ror#32-12
572	eor	r10,r0,r10,ror#32-13
573	eor	r11,r1,r11,ror#32-12
574#ifndef	__thumb2__
575	str	r10,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
576#endif
577	eor	r12,r12,r2,ror#32-3
578#ifndef	__thumb2__
579	str	r11,[sp,#320+4]
580#else
581	strd	r10,r11,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
582#endif
583	eor	r14,r14,r3,ror#32-3
584#ifndef	__thumb2__
585	str	r12,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
586#endif
587	bic	r10,r8,r6
588	bic	r11,r9,r7
589#ifndef	__thumb2__
590	str	r14,[sp,#328+4]
591#else
592	strd	r12,r14,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
593#endif
594	eor	r10,r10,r5,ror#32-13
595	eor	r11,r11,r4,ror#32-12
596#ifndef	__thumb2__
597	str	r10,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
598#endif
599	bic	r12,r0,r8
600#ifndef	__thumb2__
601	str	r11,[sp,#336+4]
602#else
603	strd	r10,r11,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
604#endif
605	bic	r14,r1,r9
606	eor	r12,r12,r6
607	eor	r14,r14,r7
608#ifndef	__thumb2__
609	str	r12,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
610#endif
611	bic	r10,r2,r0,ror#3
612#ifndef	__thumb2__
613	str	r14,[sp,#344+4]
614#else
615	strd	r12,r14,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
616#endif
617	bic	r11,r3,r1,ror#3
618#ifndef	__thumb2__
619	ldr	r1,[sp,#32]		@ A[0][4] [in reverse order]
620#endif
621	eor	r10,r8,r10,ror#32-3
622#ifndef	__thumb2__
623	ldr	r0,[sp,#32+4]
624#else
625	ldrd	r1,r0,[sp,#32]		@ A[0][4] [in reverse order]
626#endif
627	eor	r11,r9,r11,ror#32-3
628#ifndef	__thumb2__
629	str	r10,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
630#endif
631	add	r9,sp,#208
632#ifndef	__thumb2__
633	str	r11,[sp,#352+4]
634#else
635	strd	r10,r11,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
636#endif
637
638#ifndef	__thumb2__
639	ldr	r10,[sp,#232]		@ D[4]
640#endif
641#ifndef	__thumb2__
642	ldr	r11,[sp,#232+4]
643#else
644	ldrd	r10,r11,[sp,#232]		@ D[4]
645#endif
646#ifndef	__thumb2__
647	ldr	r12,[sp,#200]		@ D[0]
648#endif
649#ifndef	__thumb2__
650	ldr	r14,[sp,#200+4]
651#else
652	ldrd	r12,r14,[sp,#200]		@ D[0]
653#endif
654
655	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
656
657	eor	r1,r1,r10
658#ifndef	__thumb2__
659	ldr	r2,[sp,#40]		@ A[1][0]
660#endif
661	eor	r0,r0,r11
662#ifndef	__thumb2__
663	ldr	r3,[sp,#40+4]
664#else
665	ldrd	r2,r3,[sp,#40]		@ A[1][0]
666#endif
667	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
668#ifndef	__thumb2__
669	ldr	r4,[sp,#88]		@ A[2][1]
670#endif
671	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
672#ifndef	__thumb2__
673	ldr	r5,[sp,#88+4]
674#else
675	ldrd	r4,r5,[sp,#88]		@ A[2][1]
676#endif
677
678	eor	r2,r2,r12
679#ifndef	__thumb2__
680	ldr	r10,[sp,#136]		@ A[3][2]
681#endif
682	eor	r3,r3,r14
683#ifndef	__thumb2__
684	ldr	r11,[sp,#136+4]
685#else
686	ldrd	r10,r11,[sp,#136]		@ A[3][2]
687#endif
688	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
689#ifndef	__thumb2__
690	ldr	r12,[sp,#224]		@ D[3]
691#endif
692	@ mov	r3,r3,ror#32-18
693#ifndef	__thumb2__
694	ldr	r14,[sp,#224+4]
695#else
696	ldrd	r12,r14,[sp,#224]		@ D[3]
697#endif
698
699	eor	r6,r6,r4
700	eor	r7,r7,r5
701	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
702	mov	r5,r7,ror#32-5
703
704	eor	r10,r10,r8
705#ifndef	__thumb2__
706	ldr	r8,[sp,#184]		@ A[4][3]
707#endif
708	eor	r11,r11,r9
709#ifndef	__thumb2__
710	ldr	r9,[sp,#184+4]
711#else
712	ldrd	r8,r9,[sp,#184]		@ A[4][3]
713#endif
714	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
715	mov	r6,r11,ror#32-8
716
717	eor	r12,r12,r8
718	eor	r14,r14,r9
719	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
720	mov	r9,r14,ror#32-28
721
722	bic	r10,r4,r2,ror#32-18
723	bic	r11,r5,r3,ror#32-18
724	eor	r10,r10,r0,ror#32-14
725	eor	r11,r11,r1,ror#32-13
726#ifndef	__thumb2__
727	str	r10,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
728#endif
729	bic	r12,r6,r4
730#ifndef	__thumb2__
731	str	r11,[sp,#360+4]
732#else
733	strd	r10,r11,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
734#endif
735	bic	r14,r7,r5
736	eor	r12,r12,r2,ror#32-18
737#ifndef	__thumb2__
738	str	r12,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
739#endif
740	eor	r14,r14,r3,ror#32-18
741#ifndef	__thumb2__
742	str	r14,[sp,#368+4]
743#else
744	strd	r12,r14,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
745#endif
746	bic	r10,r8,r6
747	bic	r11,r9,r7
748	bic	r12,r0,r8,ror#14
749	bic	r14,r1,r9,ror#13
750	eor	r10,r10,r4
751	eor	r11,r11,r5
752#ifndef	__thumb2__
753	str	r10,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
754#endif
755	bic	r2,r2,r0,ror#18-14
756#ifndef	__thumb2__
757	str	r11,[sp,#376+4]
758#else
759	strd	r10,r11,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
760#endif
761	eor	r12,r6,r12,ror#32-14
762	bic	r11,r3,r1,ror#18-13
763	eor	r14,r7,r14,ror#32-13
764#ifndef	__thumb2__
765	str	r12,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
766#endif
767#ifndef	__thumb2__
768	str	r14,[sp,#384+4]
769#else
770	strd	r12,r14,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
771#endif
772	add	r14,sp,#216
773#ifndef	__thumb2__
774	ldr	r0,[sp,#16]		@ A[0][2]
775#endif
776	eor	r10,r8,r2,ror#32-18
777#ifndef	__thumb2__
778	ldr	r1,[sp,#16+4]
779#else
780	ldrd	r0,r1,[sp,#16]		@ A[0][2]
781#endif
782	eor	r11,r9,r11,ror#32-18
783#ifndef	__thumb2__
784	str	r10,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
785#endif
786#ifndef	__thumb2__
787	str	r11,[sp,#392+4]
788#else
789	strd	r10,r11,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
790#endif
791
792	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
793#ifndef	__thumb2__
794	ldr	r2,[sp,#64]		@ A[1][3]
795#endif
796#ifndef	__thumb2__
797	ldr	r3,[sp,#64+4]
798#else
799	ldrd	r2,r3,[sp,#64]		@ A[1][3]
800#endif
801#ifndef	__thumb2__
802	ldr	r6,[sp,#232]		@ D[4]
803#endif
804#ifndef	__thumb2__
805	ldr	r7,[sp,#232+4]
806#else
807	ldrd	r6,r7,[sp,#232]		@ D[4]
808#endif
809
810	eor	r0,r0,r10
811#ifndef	__thumb2__
812	ldr	r4,[sp,#112]		@ A[2][4]
813#endif
814	eor	r1,r1,r11
815#ifndef	__thumb2__
816	ldr	r5,[sp,#112+4]
817#else
818	ldrd	r4,r5,[sp,#112]		@ A[2][4]
819#endif
820	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
821#ifndef	__thumb2__
822	ldr	r8,[sp,#200]		@ D[0]
823#endif
824	@ mov	r1,r1,ror#32-31
825#ifndef	__thumb2__
826	ldr	r9,[sp,#200+4]
827#else
828	ldrd	r8,r9,[sp,#200]		@ D[0]
829#endif
830
831	eor	r12,r12,r2
832#ifndef	__thumb2__
833	ldr	r10,[sp,#120]		@ A[3][0]
834#endif
835	eor	r14,r14,r3
836#ifndef	__thumb2__
837	ldr	r11,[sp,#120+4]
838#else
839	ldrd	r10,r11,[sp,#120]		@ A[3][0]
840#endif
841	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
842#ifndef	__thumb2__
843	ldr	r12,[sp,#208]		@ D[1]
844#endif
845	mov	r2,r14,ror#32-28
846#ifndef	__thumb2__
847	ldr	r14,[sp,#208+4]
848#else
849	ldrd	r12,r14,[sp,#208]		@ D[1]
850#endif
851
852	eor	r6,r6,r4
853	eor	r7,r7,r5
854	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
855	mov	r4,r7,ror#32-20
856
857	eor	r10,r10,r8
858#ifndef	__thumb2__
859	ldr	r8,[sp,#168]		@ A[4][1]
860#endif
861	eor	r11,r11,r9
862#ifndef	__thumb2__
863	ldr	r9,[sp,#168+4]
864#else
865	ldrd	r8,r9,[sp,#168]		@ A[4][1]
866#endif
867	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
868	mov	r6,r11,ror#32-21
869
870	eor	r8,r8,r12
871	eor	r9,r9,r14
872	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
873	@ mov	r9,r3,ror#32-1
874
875	bic	r10,r4,r2
876	bic	r11,r5,r3
877	eor	r10,r10,r0,ror#32-31
878#ifndef	__thumb2__
879	str	r10,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
880#endif
881	eor	r11,r11,r1,ror#32-31
882#ifndef	__thumb2__
883	str	r11,[sp,#400+4]
884#else
885	strd	r10,r11,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
886#endif
887	bic	r12,r6,r4
888	bic	r14,r7,r5
889	eor	r12,r12,r2
890	eor	r14,r14,r3
891#ifndef	__thumb2__
892	str	r12,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
893#endif
894	bic	r10,r8,r6,ror#1
895#ifndef	__thumb2__
896	str	r14,[sp,#408+4]
897#else
898	strd	r12,r14,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
899#endif
900	bic	r11,r9,r7,ror#1
901	bic	r12,r0,r8,ror#31-1
902	bic	r14,r1,r9,ror#31-1
903	eor	r4,r4,r10,ror#32-1
904#ifndef	__thumb2__
905	str	r4,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
906#endif
907	eor	r5,r5,r11,ror#32-1
908#ifndef	__thumb2__
909	str	r5,[sp,#416+4]
910#else
911	strd	r4,r5,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
912#endif
913	eor	r6,r6,r12,ror#32-31
914	eor	r7,r7,r14,ror#32-31
915#ifndef	__thumb2__
916	str	r6,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
917#endif
918	bic	r10,r2,r0,ror#32-31
919#ifndef	__thumb2__
920	str	r7,[sp,#424+4]
921#else
922	strd	r6,r7,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
923#endif
924	bic	r11,r3,r1,ror#32-31
925	add	r12,sp,#240
926	eor	r8,r10,r8,ror#32-1
927	add	r10,sp,#280
928	eor	r9,r11,r9,ror#32-1
929#ifndef	__thumb2__
930	str	r8,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
931#endif
932#ifndef	__thumb2__
933	str	r9,[sp,#432+4]
934#else
935	strd	r8,r9,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
936#endif
937	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
938	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
939#ifdef	__thumb2__
940	eor	r0,r0,r10
941	eor	r1,r1,r11
942	eor	r2,r2,r12
943	ldrd	r10,r11,[sp,#296]
944	eor	r3,r3,r14
945	ldrd	r12,r14,[sp,#304]
946	eor	r4,r4,r10
947	eor	r5,r5,r11
948	eor	r6,r6,r12
949	ldrd	r10,r11,[sp,#312]
950	eor	r7,r7,r14
951	ldrd	r12,r14,[sp,#320]
952	eor	r8,r8,r10
953	eor	r9,r9,r11
954	eor	r0,r0,r12
955	ldrd	r10,r11,[sp,#328]
956	eor	r1,r1,r14
957	ldrd	r12,r14,[sp,#336]
958	eor	r2,r2,r10
959	eor	r3,r3,r11
960	eor	r4,r4,r12
961	ldrd	r10,r11,[sp,#344]
962	eor	r5,r5,r14
963	ldrd	r12,r14,[sp,#352]
964	eor	r6,r6,r10
965	eor	r7,r7,r11
966	eor	r8,r8,r12
967	ldrd	r10,r11,[sp,#360]
968	eor	r9,r9,r14
969	ldrd	r12,r14,[sp,#368]
970	eor	r0,r0,r10
971	eor	r1,r1,r11
972	eor	r2,r2,r12
973	ldrd	r10,r11,[sp,#376]
974	eor	r3,r3,r14
975	ldrd	r12,r14,[sp,#384]
976	eor	r4,r4,r10
977	eor	r5,r5,r11
978	eor	r6,r6,r12
979	ldrd	r10,r11,[sp,#392]
980	eor	r7,r7,r14
981	ldrd	r12,r14,[sp,#400]
982	eor	r8,r8,r10
983	eor	r9,r9,r11
984	eor	r0,r0,r12
985	ldrd	r10,r11,[sp,#408]
986	eor	r1,r1,r14
987	ldrd	r12,r14,[sp,#256]
988	eor	r2,r2,r10
989	eor	r3,r3,r11
990	eor	r4,r4,r12
991	ldrd	r10,r11,[sp,#264]
992	eor	r5,r5,r14
993	ldrd	r12,r14,[sp,#272]
994#else
995	eor	r0,r0,r10
996	add	r10,sp,#296
997	eor	r1,r1,r11
998	eor	r2,r2,r12
999	eor	r3,r3,r14
1000	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
1001	eor	r4,r4,r10
1002	add	r10,sp,#312
1003	eor	r5,r5,r11
1004	eor	r6,r6,r12
1005	eor	r7,r7,r14
1006	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
1007	eor	r8,r8,r10
1008	add	r10,sp,#328
1009	eor	r9,r9,r11
1010	eor	r0,r0,r12
1011	eor	r1,r1,r14
1012	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
1013	eor	r2,r2,r10
1014	add	r10,sp,#344
1015	eor	r3,r3,r11
1016	eor	r4,r4,r12
1017	eor	r5,r5,r14
1018	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
1019	eor	r6,r6,r10
1020	add	r10,sp,#360
1021	eor	r7,r7,r11
1022	eor	r8,r8,r12
1023	eor	r9,r9,r14
1024	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
1025	eor	r0,r0,r10
1026	add	r10,sp,#376
1027	eor	r1,r1,r11
1028	eor	r2,r2,r12
1029	eor	r3,r3,r14
1030	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
1031	eor	r4,r4,r10
1032	add	r10,sp,#392
1033	eor	r5,r5,r11
1034	eor	r6,r6,r12
1035	eor	r7,r7,r14
1036	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
1037	eor	r8,r8,r10
1038	ldr	r10,[sp,#408]		@ A[4][1]
1039	eor	r9,r9,r11
1040	ldr	r11,[sp,#408+4]
1041	eor	r0,r0,r12
1042	ldr	r12,[sp,#256]		@ A[0][2]
1043	eor	r1,r1,r14
1044	ldr	r14,[sp,#256+4]
1045	eor	r2,r2,r10
1046	add	r10,sp,#264
1047	eor	r3,r3,r11
1048	eor	r4,r4,r12
1049	eor	r5,r5,r14
1050	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
1051#endif
1052	eor	r6,r6,r10
1053	eor	r7,r7,r11
1054	eor	r8,r8,r12
1055	eor	r9,r9,r14
1056
1057	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
1058#ifndef	__thumb2__
1059	str	r10,[sp,#208]		@ D[1] = E[0]
1060#endif
1061	eor	r11,r1,r4
1062#ifndef	__thumb2__
1063	str	r11,[sp,#208+4]
1064#else
1065	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
1066#endif
1067	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
1068	eor	r14,r7,r0
1069#ifndef	__thumb2__
1070	str	r12,[sp,#232]		@ D[4] = E[1]
1071#endif
1072	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
1073#ifndef	__thumb2__
1074	str	r14,[sp,#232+4]
1075#else
1076	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
1077#endif
1078	eor	r1,r9,r2
1079#ifndef	__thumb2__
1080	str	r0,[sp,#200]		@ D[0] = C[0]
1081#endif
1082	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
1083#ifndef	__thumb2__
1084	ldr	r7,[sp,#384]
1085#endif
1086	eor	r3,r3,r6
1087#ifndef	__thumb2__
1088	str	r1,[sp,#200+4]
1089#else
1090	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
1091#endif
1092#ifndef	__thumb2__
1093	ldr	r6,[sp,#384+4]
1094#else
1095	ldrd	r7,r6,[sp,#384]
1096#endif
1097#ifndef	__thumb2__
1098	str	r2,[sp,#216]		@ D[2] = C[1]
1099#endif
1100	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
1101#ifndef	__thumb2__
1102	str	r3,[sp,#216+4]
1103#else
1104	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
1105#endif
1106	eor	r5,r5,r8
1107
1108#ifndef	__thumb2__
1109	ldr	r8,[sp,#432]
1110#endif
1111#ifndef	__thumb2__
1112	ldr	r9,[sp,#432+4]
1113#else
1114	ldrd	r8,r9,[sp,#432]
1115#endif
1116#ifndef	__thumb2__
1117	str	r4,[sp,#224]		@ D[3] = C[2]
1118#endif
1119	eor	r7,r7,r4
1120#ifndef	__thumb2__
1121	str	r5,[sp,#224+4]
1122#else
1123	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
1124#endif
1125	eor	r6,r6,r5
1126#ifndef	__thumb2__
1127	ldr	r4,[sp,#240]
1128#endif
1129	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
1130	@ mov	r6,r6,ror#32-11
1131#ifndef	__thumb2__
1132	ldr	r5,[sp,#240+4]
1133#else
1134	ldrd	r4,r5,[sp,#240]
1135#endif
1136	eor	r8,r8,r12
1137	eor	r9,r9,r14
1138#ifndef	__thumb2__
1139	ldr	r12,[sp,#336]
1140#endif
1141	eor	r0,r0,r4
1142#ifndef	__thumb2__
1143	ldr	r14,[sp,#336+4]
1144#else
1145	ldrd	r12,r14,[sp,#336]
1146#endif
1147	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
1148	@ mov	r9,r9,ror#32-7
1149	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
1150	eor	r12,r12,r2
1151#ifndef	__thumb2__
1152	ldr	r2,[sp,#288]
1153#endif
1154	eor	r14,r14,r3
1155#ifndef	__thumb2__
1156	ldr	r3,[sp,#288+4]
1157#else
1158	ldrd	r2,r3,[sp,#288]
1159#endif
1160	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
1161	ldr	r12,[sp,#444]			@ load counter
1162	eor	r2,r2,r10
1163	adr	r10,iotas32
1164	mov	r4,r14,ror#32-22
1165	add	r14,r10,r12
1166	eor	r3,r3,r11
1167#ifndef	__thumb2__
1168	ldr	r10,[r14,#8]		@ iotas[i].lo
1169#endif
1170	add	r12,r12,#16
1171#ifndef	__thumb2__
1172	ldr	r11,[r14,#12]		@ iotas[i].hi
1173#else
1174	ldrd	r10,r11,[r14,#8]		@ iotas[i].lo
1175#endif
1176	cmp	r12,#192
1177	str	r12,[sp,#444]			@ store counter
1178	bic	r12,r4,r2,ror#32-22
1179	bic	r14,r5,r3,ror#32-22
1180	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
1181	mov	r3,r3,ror#32-22
1182	eor	r12,r12,r0
1183	eor	r14,r14,r1
1184	eor	r10,r10,r12
1185	eor	r11,r11,r14
1186#ifndef	__thumb2__
1187	str	r10,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1188#endif
1189	bic	r12,r6,r4,ror#11
1190#ifndef	__thumb2__
1191	str	r11,[sp,#0+4]
1192#else
1193	strd	r10,r11,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1194#endif
1195	bic	r14,r7,r5,ror#10
1196	bic	r10,r8,r6,ror#32-(11-7)
1197	bic	r11,r9,r7,ror#32-(10-7)
1198	eor	r12,r2,r12,ror#32-11
1199#ifndef	__thumb2__
1200	str	r12,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1201#endif
1202	eor	r14,r3,r14,ror#32-10
1203#ifndef	__thumb2__
1204	str	r14,[sp,#8+4]
1205#else
1206	strd	r12,r14,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1207#endif
1208	eor	r10,r4,r10,ror#32-7
1209	eor	r11,r5,r11,ror#32-7
1210#ifndef	__thumb2__
1211	str	r10,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1212#endif
1213	bic	r12,r0,r8,ror#32-7
1214#ifndef	__thumb2__
1215	str	r11,[sp,#16+4]
1216#else
1217	strd	r10,r11,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1218#endif
1219	bic	r14,r1,r9,ror#32-7
1220	eor	r12,r12,r6,ror#32-11
1221#ifndef	__thumb2__
1222	str	r12,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1223#endif
1224	eor	r14,r14,r7,ror#32-10
1225#ifndef	__thumb2__
1226	str	r14,[sp,#24+4]
1227#else
1228	strd	r12,r14,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1229#endif
1230	bic	r10,r2,r0
1231	add	r14,sp,#224
1232#ifndef	__thumb2__
1233	ldr	r0,[sp,#264]		@ A[0][3]
1234#endif
1235	bic	r11,r3,r1
1236#ifndef	__thumb2__
1237	ldr	r1,[sp,#264+4]
1238#else
1239	ldrd	r0,r1,[sp,#264]		@ A[0][3]
1240#endif
1241	eor	r10,r10,r8,ror#32-7
1242	eor	r11,r11,r9,ror#32-7
1243#ifndef	__thumb2__
1244	str	r10,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1245#endif
1246	add	r9,sp,#200
1247#ifndef	__thumb2__
1248	str	r11,[sp,#32+4]
1249#else
1250	strd	r10,r11,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1251#endif
1252
1253	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
1254	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
1255
1256#ifndef	__thumb2__
1257	ldr	r2,[sp,#312]		@ A[1][4]
1258#endif
1259	eor	r0,r0,r10
1260#ifndef	__thumb2__
1261	ldr	r3,[sp,#312+4]
1262#else
1263	ldrd	r2,r3,[sp,#312]		@ A[1][4]
1264#endif
1265	eor	r1,r1,r11
1266	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
1267#ifndef	__thumb2__
1268	ldr	r10,[sp,#368]		@ A[3][1]
1269#endif
1270	@ mov	r1,r1,ror#32-14
1271#ifndef	__thumb2__
1272	ldr	r11,[sp,#368+4]
1273#else
1274	ldrd	r10,r11,[sp,#368]		@ A[3][1]
1275#endif
1276
1277	eor	r2,r2,r12
1278#ifndef	__thumb2__
1279	ldr	r4,[sp,#320]		@ A[2][0]
1280#endif
1281	eor	r3,r3,r14
1282#ifndef	__thumb2__
1283	ldr	r5,[sp,#320+4]
1284#else
1285	ldrd	r4,r5,[sp,#320]		@ A[2][0]
1286#endif
1287	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
1288	@ mov	r3,r3,ror#32-10
1289
1290	eor	r6,r6,r4
1291#ifndef	__thumb2__
1292	ldr	r12,[sp,#216]		@ D[2]
1293#endif
1294	eor	r7,r7,r5
1295#ifndef	__thumb2__
1296	ldr	r14,[sp,#216+4]
1297#else
1298	ldrd	r12,r14,[sp,#216]		@ D[2]
1299#endif
1300	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
1301	mov	r4,r7,ror#32-2
1302
1303	eor	r10,r10,r8
1304#ifndef	__thumb2__
1305	ldr	r8,[sp,#416]		@ A[4][2]
1306#endif
1307	eor	r11,r11,r9
1308#ifndef	__thumb2__
1309	ldr	r9,[sp,#416+4]
1310#else
1311	ldrd	r8,r9,[sp,#416]		@ A[4][2]
1312#endif
1313	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
1314	mov	r6,r11,ror#32-23
1315
1316	bic	r10,r4,r2,ror#32-10
1317	bic	r11,r5,r3,ror#32-10
1318	eor	r12,r12,r8
1319	eor	r14,r14,r9
1320	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
1321	mov	r8,r14,ror#32-31
1322	eor	r10,r10,r0,ror#32-14
1323	eor	r11,r11,r1,ror#32-14
1324#ifndef	__thumb2__
1325	str	r10,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1326#endif
1327	bic	r12,r6,r4
1328#ifndef	__thumb2__
1329	str	r11,[sp,#40+4]
1330#else
1331	strd	r10,r11,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1332#endif
1333	bic	r14,r7,r5
1334	eor	r12,r12,r2,ror#32-10
1335#ifndef	__thumb2__
1336	str	r12,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1337#endif
1338	eor	r14,r14,r3,ror#32-10
1339#ifndef	__thumb2__
1340	str	r14,[sp,#48+4]
1341#else
1342	strd	r12,r14,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1343#endif
1344	bic	r10,r8,r6
1345	bic	r11,r9,r7
1346	bic	r12,r0,r8,ror#14
1347	bic	r14,r1,r9,ror#14
1348	eor	r10,r10,r4
1349	eor	r11,r11,r5
1350#ifndef	__thumb2__
1351	str	r10,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1352#endif
1353	bic	r2,r2,r0,ror#32-(14-10)
1354#ifndef	__thumb2__
1355	str	r11,[sp,#56+4]
1356#else
1357	strd	r10,r11,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1358#endif
1359	eor	r12,r6,r12,ror#32-14
1360	bic	r11,r3,r1,ror#32-(14-10)
1361#ifndef	__thumb2__
1362	str	r12,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1363#endif
1364	eor	r14,r7,r14,ror#32-14
1365#ifndef	__thumb2__
1366	str	r14,[sp,#64+4]
1367#else
1368	strd	r12,r14,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1369#endif
1370	add	r12,sp,#208
1371#ifndef	__thumb2__
1372	ldr	r1,[sp,#248]		@ A[0][1]
1373#endif
1374	eor	r10,r8,r2,ror#32-10
1375#ifndef	__thumb2__
1376	ldr	r0,[sp,#248+4]
1377#else
1378	ldrd	r1,r0,[sp,#248]		@ A[0][1]
1379#endif
1380	eor	r11,r9,r11,ror#32-10
1381#ifndef	__thumb2__
1382	str	r10,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1383#endif
1384#ifndef	__thumb2__
1385	str	r11,[sp,#72+4]
1386#else
1387	strd	r10,r11,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1388#endif
1389
1390	add	r9,sp,#224
1391	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
1392#ifndef	__thumb2__
1393	ldr	r2,[sp,#296]		@ A[1][2]
1394#endif
1395#ifndef	__thumb2__
1396	ldr	r3,[sp,#296+4]
1397#else
1398	ldrd	r2,r3,[sp,#296]		@ A[1][2]
1399#endif
1400	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
1401
1402	eor	r1,r1,r10
1403#ifndef	__thumb2__
1404	ldr	r4,[sp,#344]		@ A[2][3]
1405#endif
1406	eor	r0,r0,r11
1407#ifndef	__thumb2__
1408	ldr	r5,[sp,#344+4]
1409#else
1410	ldrd	r4,r5,[sp,#344]		@ A[2][3]
1411#endif
1412	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
1413
1414	eor	r2,r2,r12
1415#ifndef	__thumb2__
1416	ldr	r10,[sp,#392]		@ A[3][4]
1417#endif
1418	eor	r3,r3,r14
1419#ifndef	__thumb2__
1420	ldr	r11,[sp,#392+4]
1421#else
1422	ldrd	r10,r11,[sp,#392]		@ A[3][4]
1423#endif
1424	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
1425#ifndef	__thumb2__
1426	ldr	r12,[sp,#200]		@ D[0]
1427#endif
1428	@ mov	r3,r3,ror#32-3
1429#ifndef	__thumb2__
1430	ldr	r14,[sp,#200+4]
1431#else
1432	ldrd	r12,r14,[sp,#200]		@ D[0]
1433#endif
1434
1435	eor	r4,r4,r6
1436	eor	r5,r5,r7
1437	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
1438	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
1439
1440	eor	r10,r10,r8
1441#ifndef	__thumb2__
1442	ldr	r8,[sp,#400]		@ A[4][0]
1443#endif
1444	eor	r11,r11,r9
1445#ifndef	__thumb2__
1446	ldr	r9,[sp,#400+4]
1447#else
1448	ldrd	r8,r9,[sp,#400]		@ A[4][0]
1449#endif
1450	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
1451	mov	r7,r11,ror#32-4
1452
1453	eor	r12,r12,r8
1454	eor	r14,r14,r9
1455	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
1456	mov	r9,r14,ror#32-9
1457
1458	bic	r10,r5,r2,ror#13-3
1459	bic	r11,r4,r3,ror#12-3
1460	bic	r12,r6,r5,ror#32-13
1461	bic	r14,r7,r4,ror#32-12
1462	eor	r10,r0,r10,ror#32-13
1463	eor	r11,r1,r11,ror#32-12
1464#ifndef	__thumb2__
1465	str	r10,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1466#endif
1467	eor	r12,r12,r2,ror#32-3
1468#ifndef	__thumb2__
1469	str	r11,[sp,#80+4]
1470#else
1471	strd	r10,r11,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1472#endif
1473	eor	r14,r14,r3,ror#32-3
1474#ifndef	__thumb2__
1475	str	r12,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1476#endif
1477	bic	r10,r8,r6
1478	bic	r11,r9,r7
1479#ifndef	__thumb2__
1480	str	r14,[sp,#88+4]
1481#else
1482	strd	r12,r14,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1483#endif
1484	eor	r10,r10,r5,ror#32-13
1485	eor	r11,r11,r4,ror#32-12
1486#ifndef	__thumb2__
1487	str	r10,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1488#endif
1489	bic	r12,r0,r8
1490#ifndef	__thumb2__
1491	str	r11,[sp,#96+4]
1492#else
1493	strd	r10,r11,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1494#endif
1495	bic	r14,r1,r9
1496	eor	r12,r12,r6
1497	eor	r14,r14,r7
1498#ifndef	__thumb2__
1499	str	r12,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1500#endif
1501	bic	r10,r2,r0,ror#3
1502#ifndef	__thumb2__
1503	str	r14,[sp,#104+4]
1504#else
1505	strd	r12,r14,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1506#endif
1507	bic	r11,r3,r1,ror#3
1508#ifndef	__thumb2__
1509	ldr	r1,[sp,#272]		@ A[0][4] [in reverse order]
1510#endif
1511	eor	r10,r8,r10,ror#32-3
1512#ifndef	__thumb2__
1513	ldr	r0,[sp,#272+4]
1514#else
1515	ldrd	r1,r0,[sp,#272]		@ A[0][4] [in reverse order]
1516#endif
1517	eor	r11,r9,r11,ror#32-3
1518#ifndef	__thumb2__
1519	str	r10,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1520#endif
1521	add	r9,sp,#208
1522#ifndef	__thumb2__
1523	str	r11,[sp,#112+4]
1524#else
1525	strd	r10,r11,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1526#endif
1527
1528#ifndef	__thumb2__
1529	ldr	r10,[sp,#232]		@ D[4]
1530#endif
1531#ifndef	__thumb2__
1532	ldr	r11,[sp,#232+4]
1533#else
1534	ldrd	r10,r11,[sp,#232]		@ D[4]
1535#endif
1536#ifndef	__thumb2__
1537	ldr	r12,[sp,#200]		@ D[0]
1538#endif
1539#ifndef	__thumb2__
1540	ldr	r14,[sp,#200+4]
1541#else
1542	ldrd	r12,r14,[sp,#200]		@ D[0]
1543#endif
1544
1545	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
1546
1547	eor	r1,r1,r10
1548#ifndef	__thumb2__
1549	ldr	r2,[sp,#280]		@ A[1][0]
1550#endif
1551	eor	r0,r0,r11
1552#ifndef	__thumb2__
1553	ldr	r3,[sp,#280+4]
1554#else
1555	ldrd	r2,r3,[sp,#280]		@ A[1][0]
1556#endif
1557	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
1558#ifndef	__thumb2__
1559	ldr	r4,[sp,#328]		@ A[2][1]
1560#endif
1561	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
1562#ifndef	__thumb2__
1563	ldr	r5,[sp,#328+4]
1564#else
1565	ldrd	r4,r5,[sp,#328]		@ A[2][1]
1566#endif
1567
1568	eor	r2,r2,r12
1569#ifndef	__thumb2__
1570	ldr	r10,[sp,#376]		@ A[3][2]
1571#endif
1572	eor	r3,r3,r14
1573#ifndef	__thumb2__
1574	ldr	r11,[sp,#376+4]
1575#else
1576	ldrd	r10,r11,[sp,#376]		@ A[3][2]
1577#endif
1578	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
1579#ifndef	__thumb2__
1580	ldr	r12,[sp,#224]		@ D[3]
1581#endif
1582	@ mov	r3,r3,ror#32-18
1583#ifndef	__thumb2__
1584	ldr	r14,[sp,#224+4]
1585#else
1586	ldrd	r12,r14,[sp,#224]		@ D[3]
1587#endif
1588
1589	eor	r6,r6,r4
1590	eor	r7,r7,r5
1591	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
1592	mov	r5,r7,ror#32-5
1593
1594	eor	r10,r10,r8
1595#ifndef	__thumb2__
1596	ldr	r8,[sp,#424]		@ A[4][3]
1597#endif
1598	eor	r11,r11,r9
1599#ifndef	__thumb2__
1600	ldr	r9,[sp,#424+4]
1601#else
1602	ldrd	r8,r9,[sp,#424]		@ A[4][3]
1603#endif
1604	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
1605	mov	r6,r11,ror#32-8
1606
1607	eor	r12,r12,r8
1608	eor	r14,r14,r9
1609	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
1610	mov	r9,r14,ror#32-28
1611
1612	bic	r10,r4,r2,ror#32-18
1613	bic	r11,r5,r3,ror#32-18
1614	eor	r10,r10,r0,ror#32-14
1615	eor	r11,r11,r1,ror#32-13
1616#ifndef	__thumb2__
1617	str	r10,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1618#endif
1619	bic	r12,r6,r4
1620#ifndef	__thumb2__
1621	str	r11,[sp,#120+4]
1622#else
1623	strd	r10,r11,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1624#endif
1625	bic	r14,r7,r5
1626	eor	r12,r12,r2,ror#32-18
1627#ifndef	__thumb2__
1628	str	r12,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1629#endif
1630	eor	r14,r14,r3,ror#32-18
1631#ifndef	__thumb2__
1632	str	r14,[sp,#128+4]
1633#else
1634	strd	r12,r14,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1635#endif
1636	bic	r10,r8,r6
1637	bic	r11,r9,r7
1638	bic	r12,r0,r8,ror#14
1639	bic	r14,r1,r9,ror#13
1640	eor	r10,r10,r4
1641	eor	r11,r11,r5
1642#ifndef	__thumb2__
1643	str	r10,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1644#endif
1645	bic	r2,r2,r0,ror#18-14
1646#ifndef	__thumb2__
1647	str	r11,[sp,#136+4]
1648#else
1649	strd	r10,r11,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1650#endif
1651	eor	r12,r6,r12,ror#32-14
1652	bic	r11,r3,r1,ror#18-13
1653	eor	r14,r7,r14,ror#32-13
1654#ifndef	__thumb2__
1655	str	r12,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1656#endif
1657#ifndef	__thumb2__
1658	str	r14,[sp,#144+4]
1659#else
1660	strd	r12,r14,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1661#endif
1662	add	r14,sp,#216
1663#ifndef	__thumb2__
1664	ldr	r0,[sp,#256]		@ A[0][2]
1665#endif
1666	eor	r10,r8,r2,ror#32-18
1667#ifndef	__thumb2__
1668	ldr	r1,[sp,#256+4]
1669#else
1670	ldrd	r0,r1,[sp,#256]		@ A[0][2]
1671#endif
1672	eor	r11,r9,r11,ror#32-18
1673#ifndef	__thumb2__
1674	str	r10,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1675#endif
1676#ifndef	__thumb2__
1677	str	r11,[sp,#152+4]
1678#else
1679	strd	r10,r11,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1680#endif
1681
1682	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
1683#ifndef	__thumb2__
1684	ldr	r2,[sp,#304]		@ A[1][3]
1685#endif
1686#ifndef	__thumb2__
1687	ldr	r3,[sp,#304+4]
1688#else
1689	ldrd	r2,r3,[sp,#304]		@ A[1][3]
1690#endif
1691#ifndef	__thumb2__
1692	ldr	r6,[sp,#232]		@ D[4]
1693#endif
1694#ifndef	__thumb2__
1695	ldr	r7,[sp,#232+4]
1696#else
1697	ldrd	r6,r7,[sp,#232]		@ D[4]
1698#endif
1699
1700	eor	r0,r0,r10
1701#ifndef	__thumb2__
1702	ldr	r4,[sp,#352]		@ A[2][4]
1703#endif
1704	eor	r1,r1,r11
1705#ifndef	__thumb2__
1706	ldr	r5,[sp,#352+4]
1707#else
1708	ldrd	r4,r5,[sp,#352]		@ A[2][4]
1709#endif
1710	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
1711#ifndef	__thumb2__
1712	ldr	r8,[sp,#200]		@ D[0]
1713#endif
1714	@ mov	r1,r1,ror#32-31
1715#ifndef	__thumb2__
1716	ldr	r9,[sp,#200+4]
1717#else
1718	ldrd	r8,r9,[sp,#200]		@ D[0]
1719#endif
1720
1721	eor	r12,r12,r2
1722#ifndef	__thumb2__
1723	ldr	r10,[sp,#360]		@ A[3][0]
1724#endif
1725	eor	r14,r14,r3
1726#ifndef	__thumb2__
1727	ldr	r11,[sp,#360+4]
1728#else
1729	ldrd	r10,r11,[sp,#360]		@ A[3][0]
1730#endif
1731	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
1732#ifndef	__thumb2__
1733	ldr	r12,[sp,#208]		@ D[1]
1734#endif
1735	mov	r2,r14,ror#32-28
1736#ifndef	__thumb2__
1737	ldr	r14,[sp,#208+4]
1738#else
1739	ldrd	r12,r14,[sp,#208]		@ D[1]
1740#endif
1741
1742	eor	r6,r6,r4
1743	eor	r7,r7,r5
1744	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
1745	mov	r4,r7,ror#32-20
1746
1747	eor	r10,r10,r8
1748#ifndef	__thumb2__
1749	ldr	r8,[sp,#408]		@ A[4][1]
1750#endif
1751	eor	r11,r11,r9
1752#ifndef	__thumb2__
1753	ldr	r9,[sp,#408+4]
1754#else
1755	ldrd	r8,r9,[sp,#408]		@ A[4][1]
1756#endif
1757	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
1758	mov	r6,r11,ror#32-21
1759
1760	eor	r8,r8,r12
1761	eor	r9,r9,r14
1762	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
1763	@ mov	r9,r3,ror#32-1
1764
1765	bic	r10,r4,r2
1766	bic	r11,r5,r3
1767	eor	r10,r10,r0,ror#32-31
1768#ifndef	__thumb2__
1769	str	r10,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1770#endif
1771	eor	r11,r11,r1,ror#32-31
1772#ifndef	__thumb2__
1773	str	r11,[sp,#160+4]
1774#else
1775	strd	r10,r11,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1776#endif
1777	bic	r12,r6,r4
1778	bic	r14,r7,r5
1779	eor	r12,r12,r2
1780	eor	r14,r14,r3
1781#ifndef	__thumb2__
1782	str	r12,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1783#endif
1784	bic	r10,r8,r6,ror#1
1785#ifndef	__thumb2__
1786	str	r14,[sp,#168+4]
1787#else
1788	strd	r12,r14,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1789#endif
1790	bic	r11,r9,r7,ror#1
1791	bic	r12,r0,r8,ror#31-1
1792	bic	r14,r1,r9,ror#31-1
1793	eor	r4,r4,r10,ror#32-1
1794#ifndef	__thumb2__
1795	str	r4,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1796#endif
1797	eor	r5,r5,r11,ror#32-1
1798#ifndef	__thumb2__
1799	str	r5,[sp,#176+4]
1800#else
1801	strd	r4,r5,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1802#endif
1803	eor	r6,r6,r12,ror#32-31
1804	eor	r7,r7,r14,ror#32-31
1805#ifndef	__thumb2__
1806	str	r6,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1807#endif
1808	bic	r10,r2,r0,ror#32-31
1809#ifndef	__thumb2__
1810	str	r7,[sp,#184+4]
1811#else
1812	strd	r6,r7,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1813#endif
1814	bic	r11,r3,r1,ror#32-31
1815	add	r12,sp,#0
1816	eor	r8,r10,r8,ror#32-1
1817	add	r10,sp,#40
1818	eor	r9,r11,r9,ror#32-1
1819#ifndef	__thumb2__
1820	str	r8,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1821#endif
1822#ifndef	__thumb2__
1823	str	r9,[sp,#192+4]
1824#else
1825	strd	r8,r9,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1826#endif
1827	blo	.Lround2x
1828
1829#if __ARM_ARCH__>=5
1830	ldr	pc,[sp,#440]
1831#else
1832	ldr	lr,[sp,#440]
1833	tst	lr,#1
1834	moveq	pc,lr		@ be binary compatible with V4, yet
1835.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
1836#endif
1837.size	KeccakF1600_int,.-KeccakF1600_int
1838
@ KeccakF1600 -- wrapper that runs the scalar permutation on a caller buffer.
@
@ In:	r0 = pointer to the 200-byte state A[5][5].  It is pushed together
@	with r4-r11 and lr so that it can be reloaded after the permutation.
@ Flow:	the state is copied into the frame at sp+0..199, KeccakF1600_enter is
@	called with r12 = sp+0 and r10 = sp+40, and sp+0..199 is then copied
@	back to the caller's buffer.
@ Frame:	440+16 bytes below the pushed registers, hence the saved r0 is found
@	at sp+440+16.  KeccakF1600_enter stores its return address at sp+440
@	and a zeroed word at sp+444.
@ Out:	r4-r11 are restored from the stack; return is through the saved lr.
1839.type	KeccakF1600, %function
1840.align	5
1841KeccakF1600:
1842	stmdb	sp!,{r0,r4-r11,lr}
1843	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...
1844
1845	add	r10,r0,#40			@ r10 = source cursor, second 40-byte row
1846	add	r11,sp,#40			@ r11 = destination cursor in the frame
1847	ldmia	r0,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ copy A[5][5] to stack
1848	stmia	sp,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1849	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1850	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1851	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1852	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1853	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1854	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1855	ldmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1856	add	r12,sp,#0			@ entry contract: r12 = sp+0 ...
1857	add	r10,sp,#40			@ ... and r10 = sp+40; r4-r9 hold the last words copied
1858	stmia	r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1859
1860	bl	KeccakF1600_enter
1861
1862	ldr	r11, [sp,#440+16]		@ restore pointer to A
1863	ldmia	sp,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1864	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ return A[5][5]
@ r10 is used as the read cursor without being reloaded here: the round loop
@ leaves it at sp+40 (its last "add r10,sp,#40") when it falls through.
1865	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1866	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1867	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1868	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1869	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1870	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1871	ldmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1872	stmia	r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1873
1874	add	sp,sp,#440+20			@ drop the frame plus the saved r0 slot
1875#if __ARM_ARCH__>=5
1876	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
1877#else
1878	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
1879	tst	lr,#1
1880	moveq	pc,lr		@ be binary compatible with V4, yet
1881.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
1882#endif
1883.size	KeccakF1600,.-KeccakF1600
@ SHA3_absorb -- XOR whole input blocks into the state, permuting after each.
@
@ In:	r0 = pointer to the 200-byte state A[5][5]
@	r1 = input pointer (read one byte at a time, so no alignment is needed)
@	r2 = len, number of input bytes available
@	r3 = bsz, block size in bytes; consumed 8 bytes per inner iteration
@ Out:	r0 = number of bytes left unabsorbed (len itself when len < bsz, in
@	which case the state is not touched).
@
@ Frame after the prologue (r0-r12,lr pushed, then 456+16 bytes reserved):
@	sp+0   .. sp+199	working copy of A[5][5]
@	sp+456 .. sp+468	masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@	sp+472			saved r0 (state pointer)
@	sp+476			saved r1, reused as the running input pointer
@	sp+480			saved r2, reused for the remaining length
@	sp+484			saved r3 (bsz)
@ The four mask words and the four saved argument words are adjacent, which
@ is what lets a single ldmia from sp+456 reload r6-r12 and r14 after each
@ call to KeccakF1600_int.
1884.globl	SHA3_absorb
1885.type	SHA3_absorb,%function
1886.align	5
1887SHA3_absorb:
1888	stmdb	sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
1889	sub	sp,sp,#456+16
1890
1891	add	r10,r0,#40
1892	@ mov	r11,r1
1893	mov	r12,r2
1894	mov	r14,r3
1895	cmp	r2,r3
1896	blo	.Labsorb_abort			@ len < bsz: return len, state untouched
1897
1898	add	r11,sp,#0
1899	ldmia	r0,      {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}	@ copy A[5][5] to stack
1900	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1901	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1902	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1903	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1904	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1905	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1906	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1907	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1908	stmia	r11,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1909
1910	ldr	r11,[sp,#476]		@ restore r11
1911#ifdef	__thumb2__
1912	mov	r9,#0x00ff00ff
1913	mov	r8,#0x0f0f0f0f
1914	mov	r7,#0x33333333
1915	mov	r6,#0x55555555
1916#else
1917	mov	r6,#0x11		@ compose constants
1918	mov	r8,#0x0f
1919	mov	r9,#0xff
1920	orr	r6,r6,r6,lsl#8
1921	orr	r8,r8,r8,lsl#8
1922	orr	r6,r6,r6,lsl#16		@ 0x11111111
1923	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
1924	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
1925	orr	r7,r6,r6,lsl#1		@ 0x33333333
1926	orr	r6,r6,r6,lsl#2		@ 0x55555555
1927#endif
1928	str	r9,[sp,#468]
1929	str	r8,[sp,#464]
1930	str	r7,[sp,#460]
1931	str	r6,[sp,#456]
1932	b	.Loop_absorb
1933
1934.align	4
@ Outer loop, one pass per block.  On entry r12 = bytes remaining and
@ r14 = bsz; r11 = input pointer; r6-r9 = masks.
1935.Loop_absorb:
1936	subs	r0,r12,r14
1937	blo	.Labsorbed
1938	add	r10,sp,#0		@ r10 walks the stack copy of the state
1939	str	r0,[sp,#480]		@ save len - bsz
1940
1941.align	4
@ Inner loop, one 64-bit lane per pass: 8 input bytes are assembled
@ little-endian into r0 (lo) / r1 (hi), bit-interleaved with the masks in
@ r6-r9, and XORed into the lane at [r10].  r14 counts the block down by 8.
1942.Loop_block:
1943	ldrb	r0,[r11],#1
1944	ldrb	r1,[r11],#1
1945	ldrb	r2,[r11],#1
1946	ldrb	r3,[r11],#1
1947	ldrb	r4,[r11],#1
1948	orr	r0,r0,r1,lsl#8
1949	ldrb	r1,[r11],#1
1950	orr	r0,r0,r2,lsl#16
1951	ldrb	r2,[r11],#1
1952	orr	r0,r0,r3,lsl#24		@ lo
1953	ldrb	r3,[r11],#1
1954	orr	r1,r4,r1,lsl#8
1955	orr	r1,r1,r2,lsl#16
1956	orr	r1,r1,r3,lsl#24		@ hi
1957
1958	and	r2,r0,r6		@ &=0x55555555
1959	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
1960	and	r3,r1,r6		@ &=0x55555555
1961	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
1962	orr	r2,r2,r2,lsr#1
1963	orr	r0,r0,r0,lsl#1
1964	orr	r3,r3,r3,lsr#1
1965	orr	r1,r1,r1,lsl#1
1966	and	r2,r2,r7		@ &=0x33333333
1967	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
1968	and	r3,r3,r7		@ &=0x33333333
1969	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
1970	orr	r2,r2,r2,lsr#2
1971	orr	r0,r0,r0,lsl#2
1972	orr	r3,r3,r3,lsr#2
1973	orr	r1,r1,r1,lsl#2
1974	and	r2,r2,r8		@ &=0x0f0f0f0f
1975	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
1976	and	r3,r3,r8		@ &=0x0f0f0f0f
1977	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
1978	ldmia	r10,{r4,r5}		@ A_flat[i]
1979	orr	r2,r2,r2,lsr#4
1980	orr	r0,r0,r0,lsl#4
1981	orr	r3,r3,r3,lsr#4
1982	orr	r1,r1,r1,lsl#4
1983	and	r2,r2,r9		@ &=0x00ff00ff
1984	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
1985	and	r3,r3,r9		@ &=0x00ff00ff
1986	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
1987	orr	r2,r2,r2,lsr#8
1988	orr	r0,r0,r0,lsl#8
1989	orr	r3,r3,r3,lsr#8
1990	orr	r1,r1,r1,lsl#8
1991
1992	mov	r2,r2,lsl#16
1993	mov	r1,r1,lsr#16
1994	eor	r4,r4,r3,lsl#16
1995	eor	r5,r5,r0,lsr#16
1996	eor	r4,r4,r2,lsr#16
1997	eor	r5,r5,r1,lsl#16
1998	stmia	r10!,{r4,r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])
1999
2000	subs	r14,r14,#8
2001	bhi	.Loop_block
2002
2003	str	r11,[sp,#476]		@ park the advanced input pointer in its frame slot
2004
2005	bl	KeccakF1600_int
2006
@ Reload from sp+456: r6-r9 = masks, r10 = state pointer (saved r0),
@ r11 = input pointer, r12 = len - bsz stored above, r14 = bsz (saved r3).
2007	add	r14,sp,#456
2008	ldmia	r14,{r6,r7,r8,r9,r10,r11,r12,r14}	@ restore constants and variables
2009	b	.Loop_absorb
2010
2011.align	4
@ Fewer than bsz bytes remain.  r10 holds the caller's state pointer here,
@ reloaded after the last permutation; the cmp/blo in the prologue ensures
@ at least one block was processed before this label is reached.
2012.Labsorbed:
2013	add	r11,sp,#40
2014	ldmia	sp,      {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2015	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}	@ return A[5][5]
2016	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2017	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2018	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2019	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2020	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2021	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2022	ldmia	r11,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2023	stmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2024
2025.Labsorb_abort:
2026	add	sp,sp,#456+32		@ drop frame, mask words and saved r0-r3
2027	mov	r0,r12			@ return value
2028#if __ARM_ARCH__>=5
2029	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
2030#else
2031	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
2032	tst	lr,#1
2033	moveq	pc,lr		@ be binary compatible with V4, yet
2034.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
2035#endif
2036.size	SHA3_absorb,.-SHA3_absorb
@ SHA3_squeeze -- de-interleave state lanes and write them out as bytes,
@ re-running the permutation whenever a full block has been emitted.
@
@ In:	r0 = pointer to the 200-byte state A[5][5] (bit-interleaved lanes)
@	r1 = output pointer (written one byte at a time)
@	r2 = len, number of output bytes requested
@	r3 = bsz, block size in bytes; consumed 8 bytes per lane
@ Out:	nothing is returned in a register; r4-r10 are restored.
@
@ Stack while looping (lowest address first):
@	sp+0 .. sp+12	masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@	sp+16		saved r0 (state pointer)
@	sp+20		saved r3 (bsz)
@	sp+24 ..	saved r4-r10, lr
@ which is why "ldmia sp,{r6-r10,r12}" restores masks, state pointer and bsz
@ after the call to KeccakF1600, and why the epilogue first skips 24 bytes.
@
@ len == 0 returns immediately without touching the output buffer.  Without
@ that check the first lane would take the .Lsqueeze_tail path, which stores
@ a byte before it tests the count, and the count would then wrap instead of
@ reaching zero, so seven bytes would be written to a zero-length buffer.
2037.globl	SHA3_squeeze
2038.type	SHA3_squeeze,%function
2039.align	5
2040SHA3_squeeze:
2041	stmdb	sp!,{r0,r3-r10,lr}
2042
2043	mov	r10,r0
2044	mov	r4,r1
2045	mov	r5,r2
2046	mov	r12,r3
2047
2048#ifdef	__thumb2__
2049	mov	r9,#0x00ff00ff
2050	mov	r8,#0x0f0f0f0f
2051	mov	r7,#0x33333333
2052	mov	r6,#0x55555555
2053#else
2054	mov	r6,#0x11		@ compose constants
2055	mov	r8,#0x0f
2056	mov	r9,#0xff
2057	orr	r6,r6,r6,lsl#8
2058	orr	r8,r8,r8,lsl#8
2059	orr	r6,r6,r6,lsl#16		@ 0x11111111
2060	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
2061	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
2062	orr	r7,r6,r6,lsl#1		@ 0x33333333
2063	orr	r6,r6,r6,lsl#2		@ 0x55555555
2064#endif
2065	stmdb	sp!,{r6,r7,r8,r9}
2066
	cmp	r5,#0			@ len == 0: nothing to squeeze; the frame is
	beq	.Lsqueeze_done		@ complete here, so the shared epilogue applies
2067	mov	r14,r10
2068	b	.Loop_squeeze
2069
2070.align	4
@ One 64-bit lane per pass.  r10 = lane cursor, r14 = start of the state,
@ r4 = output cursor, r5 = bytes still wanted (non-zero here),
@ r12 = bytes left in the current block.
2071.Loop_squeeze:
2072	ldmia	r10!,{r0,r1}	@ A_flat[i++]
2073
2074	mov	r2,r0,lsl#16
2075	mov	r3,r1,lsl#16		@ r3 = r1 << 16
2076	mov	r2,r2,lsr#16		@ r2 = r0 & 0x0000ffff
2077	mov	r1,r1,lsr#16
2078	mov	r0,r0,lsr#16		@ r0 = r0 >> 16
2079	mov	r1,r1,lsl#16		@ r1 = r1 & 0xffff0000
2080
2081	orr	r2,r2,r2,lsl#8
2082	orr	r3,r3,r3,lsr#8
2083	orr	r0,r0,r0,lsl#8
2084	orr	r1,r1,r1,lsr#8
2085	and	r2,r2,r9		@ &=0x00ff00ff
2086	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
2087	and	r0,r0,r9		@ &=0x00ff00ff
2088	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
2089	orr	r2,r2,r2,lsl#4
2090	orr	r3,r3,r3,lsr#4
2091	orr	r0,r0,r0,lsl#4
2092	orr	r1,r1,r1,lsr#4
2093	and	r2,r2,r8		@ &=0x0f0f0f0f
2094	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
2095	and	r0,r0,r8		@ &=0x0f0f0f0f
2096	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
2097	orr	r2,r2,r2,lsl#2
2098	orr	r3,r3,r3,lsr#2
2099	orr	r0,r0,r0,lsl#2
2100	orr	r1,r1,r1,lsr#2
2101	and	r2,r2,r7		@ &=0x33333333
2102	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
2103	and	r0,r0,r7		@ &=0x33333333
2104	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
2105	orr	r2,r2,r2,lsl#1
2106	orr	r3,r3,r3,lsr#1
2107	orr	r0,r0,r0,lsl#1
2108	orr	r1,r1,r1,lsr#1
2109	and	r2,r2,r6		@ &=0x55555555
2110	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
2111	and	r0,r0,r6		@ &=0x55555555
2112	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
2113
2114	orr	r2,r2,r3		@ r2 = low 32 bits of the de-interleaved lane
2115	orr	r0,r0,r1		@ r0 = high 32 bits
2116
2117	cmp	r5,#8
2118	blo	.Lsqueeze_tail		@ 1..7 bytes wanted: emit a partial lane
2119	mov	r1,r2,lsr#8
2120	strb	r2,[r4],#1
2121	mov	r3,r2,lsr#16
2122	strb	r1,[r4],#1
2123	mov	r2,r2,lsr#24
2124	strb	r3,[r4],#1
2125	strb	r2,[r4],#1
2126
2127	mov	r1,r0,lsr#8
2128	strb	r0,[r4],#1
2129	mov	r3,r0,lsr#16
2130	strb	r1,[r4],#1
2131	mov	r0,r0,lsr#24
2132	strb	r3,[r4],#1
2133	strb	r0,[r4],#1
2134	subs	r5,r5,#8
2135	beq	.Lsqueeze_done
2136
2137	subs	r12,r12,#8		@ bsz -= 8
2138	bhi	.Loop_squeeze
2139
2140	mov	r0,r14			@ original r10
2141
2142	bl	KeccakF1600
2143
2144	ldmia	sp,{r6,r7,r8,r9,r10,r12}		@ restore constants and variables
2145	mov	r14,r10
2146	b	.Loop_squeeze
2147
2148.align	4
@ Partial lane: reached only with 1 <= r5 <= 7.  Bytes are stored low to
@ high and the count is tested after each store, so at most seven stores
@ are made (the seventh needs no test).
2149.Lsqueeze_tail:
2150	strb	r2,[r4],#1
2151	mov	r2,r2,lsr#8
2152	subs	r5,r5,#1
2153	beq	.Lsqueeze_done
2154	strb	r2,[r4],#1
2155	mov	r2,r2,lsr#8
2156	subs	r5,r5,#1
2157	beq	.Lsqueeze_done
2158	strb	r2,[r4],#1
2159	mov	r2,r2,lsr#8
2160	subs	r5,r5,#1
2161	beq	.Lsqueeze_done
2162	strb	r2,[r4],#1
2163	subs	r5,r5,#1
2164	beq	.Lsqueeze_done
2165
2166	strb	r0,[r4],#1
2167	mov	r0,r0,lsr#8
2168	subs	r5,r5,#1
2169	beq	.Lsqueeze_done
2170	strb	r0,[r4],#1
2171	mov	r0,r0,lsr#8
2172	subs	r5,r5,#1
2173	beq	.Lsqueeze_done
2174	strb	r0,[r4]
2175	b	.Lsqueeze_done
2176
2177.align	4
2178.Lsqueeze_done:
2179	add	sp,sp,#24		@ skip the four masks and the saved r0/r3
2180#if __ARM_ARCH__>=5
2181	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
2182#else
2183	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
2184	tst	lr,#1
2185	moveq	pc,lr		@ be binary compatible with V4, yet
2186.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
2187#endif
2188.size	SHA3_squeeze,.-SHA3_squeeze
2189#if __ARM_MAX_ARCH__>=7
2190.fpu	neon
2191
@ iotas64 -- table of 24 64-bit constants, 32-byte aligned (.align 5).
@ KeccakF1600_neon takes its address with adr, loads one entry per loop
@ iteration with a post-incremented vld1.64 and XORs it into A[0][0]; its
@ loop counter is 24, matching the 24 entries below.
2192.type	iotas64, %object
2193.align	5
2194iotas64:
2195.quad	0x0000000000000001
2196.quad	0x0000000000008082
2197.quad	0x800000000000808a
2198.quad	0x8000000080008000
2199.quad	0x000000000000808b
2200.quad	0x0000000080000001
2201.quad	0x8000000080008081
2202.quad	0x8000000000008009
2203.quad	0x000000000000008a
2204.quad	0x0000000000000088
2205.quad	0x0000000080008009
2206.quad	0x000000008000000a
2207.quad	0x000000008000808b
2208.quad	0x800000000000008b
2209.quad	0x8000000000008089
2210.quad	0x8000000000008003
2211.quad	0x8000000000008002
2212.quad	0x8000000000000080
2213.quad	0x000000000000800a
2214.quad	0x800000008000000a
2215.quad	0x8000000080008081
2216.quad	0x8000000000008080
2217.quad	0x0000000080000001
2218.quad	0x8000000080008008
2219.size	iotas64,.-iotas64
2220
@ KeccakF1600_neon -- 24 rounds of the permutation on a state held entirely
@ in NEON registers.
@
@ In:	d0-d24 = state.  Per the load sequence in SHA3_absorb_neon, rows 0 and
@	1 are paired in q0-q4 (d0=A[0][0], d1=A[1][0], d2=A[0][1], ...), rows 2
@	and 3 are paired in q5-q9, and row 4 is d20-d24.
@	r0 = 64-bit aligned memory; bytes 0..23 are overwritten as a spill
@	area (q4 and later q13 at [r0], d18 at [r0+16]).
@	NOTE(review): the visible caller appears to pass the state buffer
@	itself after rewinding r0 -- confirm at the call site.
@ Out:	d0-d24 = permuted state.
@ Uses:	r1 (= r0+16), r2 (cursor into iotas64), r3 (round counter),
@	d25-d31 as temporaries, flags.  Returns with bx lr.
2221.type	KeccakF1600_neon, %function
2222.align	5
2223KeccakF1600_neon:
2224	add	r1, r0, #16
2225	adr	r2, iotas64
2226	mov	r3, #24			@ loop counter
2227	b	.Loop_neon
2228
2229.align	4
2230.Loop_neon:
2231	@ Theta
2232	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
2233	veor	q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
2234	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
2235	veor	q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
2236	veor	q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
2237	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
2238	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
2239	veor	q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
2240	veor	q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
2241	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
2242	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
2243	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
2244	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
2245	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
2246	veor	d25, d25, d24		@ C[4]^=A[4][4]
2247
	@ Rotate left by one: x+x gives x<<1, then vsri #63 inserts the old
	@ top bit into bit 0.  q4 and d18 are free here because their state
	@ values were offloaded to memory above.
2248	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
2249	vadd.u64	q15, q14, q14		@ C[2..3]<<1
2250	vadd.u64	d18, d25, d25		@ C[4]<<1
2251	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
2252	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
2253	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
2254	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
2255	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
2256	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
2257	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)
2258
2259	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
2260	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
2261	veor	d10, d10, d25		@ A[2][0] ^= C[4]
2262	veor	d11, d11, d25		@ A[3][0] ^= C[4]
2263	veor	d20, d20, d25		@ A[4][0] ^= C[4]
2264
2265	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
2266	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
2267	veor	d12, d12, d26		@ A[2][1] ^= D[1]
2268	veor	d13, d13, d26		@ A[3][1] ^= D[1]
2269	veor	d21, d21, d26		@ A[4][1] ^= D[1]
2270	vmov	d26, d27		@ q13 = {D[2],D[2]} for the q-wide XORs below
2271
2272	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
2273	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
2274	veor	d16, d16, d28		@ A[2][3] ^= C[2]
2275	veor	d17, d17, d28		@ A[3][3] ^= C[2]
2276	veor	d23, d23, d28		@ A[4][3] ^= C[2]
2277	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
2278	vmov	d28, d29		@ q14 = {D[4],D[4]} for the q-wide XORs below
2279
2280	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]
2281	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
2282	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
2283	veor	d22, d22, d27		@ A[4][2]    ^= D[2]
2284
2285	veor	q4,  q4,  q14		@ A[0..1][4] ^= C[3]
2286	veor	q9,  q9,  q14		@ A[2..3][4] ^= C[3]
2287	veor	d24, d24, d29		@ A[4][4]    ^= C[3]
2288
2289	@ Rho + Pi
	@ Each rotation is a vshl into the destination followed by a vsri of
	@ the same source by 64-n.  The four row-0 lanes are first saved in
	@ d26-d29 because their registers are overwritten at the start of the
	@ chains and are only consumed by the last group.
2290	vmov	d26, d2			@ C[1] = A[0][1]
2291	vshl.u64	d2,  d3,  #44
2292	vmov	d27, d4			@ C[2] = A[0][2]
2293	vshl.u64	d4,  d14, #43
2294	vmov	d28, d6			@ C[3] = A[0][3]
2295	vshl.u64	d6,  d17, #21
2296	vmov	d29, d8			@ C[4] = A[0][4]
2297	vshl.u64	d8,  d24, #14
2298	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
2299	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
2300	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
2301	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])
2302
2303	vshl.u64	d3,  d9,  #20
2304	vshl.u64	d14, d16, #25
2305	vshl.u64	d17, d15, #15
2306	vshl.u64	d24, d21, #2
2307	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
2308	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
2309	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
2310	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])
2311
	@ Rotations by a whole number of bytes (8 and 56 bits) are done with a
	@ single vext.8 of the register with itself instead of a vshl/vsri
	@ pair; the replaced vshl is left as a comment.
2312	vshl.u64	d9,  d22, #61
2313	@ vshl.u64	d16, d19, #8
2314	vshl.u64	d15, d12, #10
2315	vshl.u64	d21, d7,  #55
2316	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
2317	vext.8	d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
2318	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
2319	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])
2320
2321	vshl.u64	d22, d18, #39
2322	@ vshl.u64	d19, d23, #56
2323	vshl.u64	d12, d5,  #6
2324	vshl.u64	d7,  d13, #45
2325	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
2326	vext.8	d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
2327	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
2328	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])
2329
2330	vshl.u64	d18, d20, #18
2331	vshl.u64	d23, d11, #41
2332	vshl.u64	d5,  d10, #3
2333	vshl.u64	d13, d1,  #36
2334	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
2335	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
2336	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
2337	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])
2338
2339	vshl.u64	d1,  d28, #28
2340	vshl.u64	d10, d26, #1
2341	vshl.u64	d11, d29, #27
2342	vshl.u64	d20, d27, #62
2343	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
2344	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
2345	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
2346	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])
2347
2348	@ Chi + Iota
	@ vbic Qd,Qn,Qm computes Qn & ~Qm.  Rows 0/1 and rows 2/3 are handled
	@ two at a time in q registers, row 4 in d registers.  The new
	@ A[0..1][0] is parked at [r0] while the old q0 is still needed as an
	@ input, and is reloaded just before Iota is applied.
2349	vbic	q13, q2,  q1
2350	vbic	q14, q3,  q2
2351	vbic	q15, q4,  q3
2352	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
2353	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
2354	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
2355	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
2356	vbic	q13, q0,  q4
2357	vbic	q15, q1,  q0
2358	vmov	q1,  q14		@ A[0..1][1]
2359	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
2360	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
2361
2362	vbic	q13, q7,  q6
2363	vmov	q0,  q5			@ A[2..3][0]
2364	vbic	q14, q8,  q7
2365	vmov	q15, q6			@ A[2..3][1]
2366	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
2367	vbic	q13, q9,  q8
2368	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
2369	vbic	q14, q0,  q9
2370	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
2371	vbic	q13, q15, q0
2372	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
2373	vmov	q14, q10		@ A[4][0..1]
2374	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
2375
2376	vld1.64	d25, [r2,:64]!		@ Iota[i++]
2377	vbic	d26, d22, d21
2378	vbic	d27, d23, d22
2379	vld1.64	{q0}, [r0,:64]		@ restore A[0..1][0]
2380	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
2381	vbic	d26, d24, d23
2382	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
2383	vbic	d27, d28, d24
2384	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
2385	vbic	d26, d29, d28
2386	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
2387	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
2388	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])
2389
2390	subs	r3, r3, #1
2391	bne	.Loop_neon
2392
2393	bx	lr
2394.size	KeccakF1600_neon,.-KeccakF1600_neon
2395
/*
 * SHA3_absorb_neon: XOR whole input blocks into a Keccak-1600 state and run
 * KeccakF1600_neon after each block.
 *
 * In:   r0 = state, read and written back as 25 64-bit lanes A[0][0]..A[4][4]
 *       r1 = inp, r2 = len (bytes), r3 = bsz (block size in bytes)
 * Out:  r0 = len left over once fewer than bsz bytes remain
 * Saves and restores r4-r6 and d8-d15; returns by popping the saved lr into pc.
 *
 * While in registers, rows 0 and 1 of the state occupy the even and odd
 * members of d0-d9, rows 2 and 3 the even and odd members of d10-d19, and
 * row 4 occupies d20-d24. d31 is the scratch register for incoming data.
 *
 * NOTE(review): the compare ladder below only lines up with the input pointer
 * when bsz is a multiple of 8 in the range 8..200 - confirm callers guarantee it.
 */
2396.globl	SHA3_absorb_neon
2397.type	SHA3_absorb_neon, %function
2398.align	5
2399SHA3_absorb_neon:
2400	stmdb	sp!, {r4,r5,r6,lr}
2401	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2402
	/* Move the arguments out of r1-r3: KeccakF1600_neon counts its rounds in r3. */
2403	mov	r4, r1			@ inp
2404	mov	r5, r2			@ len
2405	mov	r6, r3			@ bsz
2406
	/* Load the 25-lane state, one 64-bit lane per d register. */
2407	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
2408	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
2409	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
2410	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
2411	vld1.32	{d8}, [r0,:64]!		@ A[0][4]
2412
2413	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
2414	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
2415	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
2416	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
2417	vld1.32	{d9}, [r0,:64]!		@ A[1][4]
2418
2419	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
2420	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
2421	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
2422	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
2423	vld1.32	{d18}, [r0,:64]!		@ A[2][4]
2424
2425	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
2426	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
2427	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
2428	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
2429	vld1.32	{d19}, [r0,:64]!		@ A[3][4]
2430
2431	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..3]
2432	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	/* 24 of the 25 lanes were loaded with post-increment; step r0 back to the start of the state. */
2433	sub	r0, r0, #24*8		@ rewind
2434	b	.Loop_absorb_neon
2435
2436.align	4
2437.Loop_absorb_neon:
	/* One pass per full block: leave once len < bsz, otherwise commit len -= bsz. */
2438	subs	r12, r5, r6		@ len - bsz
2439	blo	.Labsorbed_neon
2440	mov	r5, r12
2441
	/*
	 * XOR the block into the state 8 bytes at a time. Each cmp against an
	 * even multiple of 8 is consumed twice: blo leaves after an odd number
	 * of lanes, and the beq one lane later leaves after an even number.
	 * The vld1/veor placed between them do not alter the condition flags.
	 */
2442	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
2443	cmp	r6, #8*2
2444	veor	d0, d0, d31		@ A[0][0] ^= *inp++
2445	blo	.Lprocess_neon
2446	vld1.8	{d31}, [r4]!
2447	veor	d2, d2, d31		@ A[0][1] ^= *inp++
2448	beq	.Lprocess_neon
2449	vld1.8	{d31}, [r4]!
2450	cmp	r6, #8*4
2451	veor	d4, d4, d31		@ A[0][2] ^= *inp++
2452	blo	.Lprocess_neon
2453	vld1.8	{d31}, [r4]!
2454	veor	d6, d6, d31		@ A[0][3] ^= *inp++
2455	beq	.Lprocess_neon
2456	vld1.8	{d31},[r4]!
2457	cmp	r6, #8*6
2458	veor	d8, d8, d31		@ A[0][4] ^= *inp++
2459	blo	.Lprocess_neon
2460
2461	vld1.8	{d31}, [r4]!
2462	veor	d1, d1, d31		@ A[1][0] ^= *inp++
2463	beq	.Lprocess_neon
2464	vld1.8	{d31}, [r4]!
2465	cmp	r6, #8*8
2466	veor	d3, d3, d31		@ A[1][1] ^= *inp++
2467	blo	.Lprocess_neon
2468	vld1.8	{d31}, [r4]!
2469	veor	d5, d5, d31		@ A[1][2] ^= *inp++
2470	beq	.Lprocess_neon
2471	vld1.8	{d31}, [r4]!
2472	cmp	r6, #8*10
2473	veor	d7, d7, d31		@ A[1][3] ^= *inp++
2474	blo	.Lprocess_neon
2475	vld1.8	{d31}, [r4]!
2476	veor	d9, d9, d31		@ A[1][4] ^= *inp++
2477	beq	.Lprocess_neon
2478
2479	vld1.8	{d31}, [r4]!
2480	cmp	r6, #8*12
2481	veor	d10, d10, d31		@ A[2][0] ^= *inp++
2482	blo	.Lprocess_neon
2483	vld1.8	{d31}, [r4]!
2484	veor	d12, d12, d31		@ A[2][1] ^= *inp++
2485	beq	.Lprocess_neon
2486	vld1.8	{d31}, [r4]!
2487	cmp	r6, #8*14
2488	veor	d14, d14, d31		@ A[2][2] ^= *inp++
2489	blo	.Lprocess_neon
2490	vld1.8	{d31}, [r4]!
2491	veor	d16, d16, d31		@ A[2][3] ^= *inp++
2492	beq	.Lprocess_neon
2493	vld1.8	{d31}, [r4]!
2494	cmp	r6, #8*16
2495	veor	d18, d18, d31		@ A[2][4] ^= *inp++
2496	blo	.Lprocess_neon
2497
2498	vld1.8	{d31}, [r4]!
2499	veor	d11, d11, d31		@ A[3][0] ^= *inp++
2500	beq	.Lprocess_neon
2501	vld1.8	{d31}, [r4]!
2502	cmp	r6, #8*18
2503	veor	d13, d13, d31		@ A[3][1] ^= *inp++
2504	blo	.Lprocess_neon
2505	vld1.8	{d31}, [r4]!
2506	veor	d15, d15, d31		@ A[3][2] ^= *inp++
2507	beq	.Lprocess_neon
2508	vld1.8	{d31}, [r4]!
2509	cmp	r6, #8*20
2510	veor	d17, d17, d31		@ A[3][3] ^= *inp++
2511	blo	.Lprocess_neon
2512	vld1.8	{d31}, [r4]!
2513	veor	d19, d19, d31		@ A[3][4] ^= *inp++
2514	beq	.Lprocess_neon
2515
2516	vld1.8	{d31}, [r4]!
2517	cmp	r6, #8*22
2518	veor	d20, d20, d31		@ A[4][0] ^= *inp++
2519	blo	.Lprocess_neon
2520	vld1.8	{d31}, [r4]!
2521	veor	d21, d21, d31		@ A[4][1] ^= *inp++
2522	beq	.Lprocess_neon
2523	vld1.8	{d31}, [r4]!
2524	cmp	r6, #8*24
2525	veor	d22, d22, d31		@ A[4][2] ^= *inp++
2526	blo	.Lprocess_neon
2527	vld1.8	{d31}, [r4]!
2528	veor	d23, d23, d31		@ A[4][3] ^= *inp++
2529	beq	.Lprocess_neon
2530	vld1.8	{d31}, [r4]!
2531	veor	d24, d24, d31		@ A[4][4] ^= *inp++
2532
2533.Lprocess_neon:
	/* Block absorbed: run KeccakF1600_neon on the register-resident state, then try the next block. */
2534	bl	KeccakF1600_neon
2535	b	.Loop_absorb_neon
2536
2537.align	4
2538.Labsorbed_neon:
	/*
	 * Write the 25 lanes back in the order they were loaded.
	 * NOTE(review): relies on r0 still pointing at the state, i.e. on
	 * KeccakF1600_neon leaving r0 alone - confirm against its full body.
	 */
2539	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
2540	vst1.32	{d2}, [r0,:64]!
2541	vst1.32	{d4}, [r0,:64]!
2542	vst1.32	{d6}, [r0,:64]!
2543	vst1.32	{d8}, [r0,:64]!
2544
2545	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
2546	vst1.32	{d3}, [r0,:64]!
2547	vst1.32	{d5}, [r0,:64]!
2548	vst1.32	{d7}, [r0,:64]!
2549	vst1.32	{d9}, [r0,:64]!
2550
2551	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
2552	vst1.32	{d12}, [r0,:64]!
2553	vst1.32	{d14}, [r0,:64]!
2554	vst1.32	{d16}, [r0,:64]!
2555	vst1.32	{d18}, [r0,:64]!
2556
2557	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
2558	vst1.32	{d13}, [r0,:64]!
2559	vst1.32	{d15}, [r0,:64]!
2560	vst1.32	{d17}, [r0,:64]!
2561	vst1.32	{d19}, [r0,:64]!
2562
2563	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
2564	vst1.32	{d24}, [r0,:64]
2565
	/* r5 holds the byte count that did not fill a whole block. */
2566	mov	r0, r5			@ return value
2567	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2568	ldmia	sp!, {r4,r5,r6,pc}
2569.size	SHA3_absorb_neon,.-SHA3_absorb_neon
2570
/*
 * SHA3_squeeze_neon: copy len bytes of output out of a Keccak-1600 state,
 * running KeccakF1600_neon over the state each time bsz bytes have been
 * handed out and more output is still wanted.
 *
 * In:   r0 = A_flat (state, 25 64-bit lanes)
 *       r1 = out, r2 = len (bytes), r3 = bsz (block size in bytes)
 * Out:  no return value is set up in r0
 * Saves and restores r4-r6, plus d8-d15 around each permutation; returns by
 * popping the saved lr into pc.
 *
 * Register roles inside the function:
 *   r4  = output cursor          r5  = bytes still to produce
 *   r6  = bsz (kept for reload)  r12 = read cursor inside the state
 *   r14 = bytes left in the current block
 *
 * len == 0 returns at once without touching out. Without that guard the
 * byte tail below would store one byte before it ever tests the length.
 * This file is generated from keccak1600-armv4.pl, so the same guard has
 * to be mirrored in the generator.
 *
 * NOTE(review): bsz is assumed to be a multiple of 8 - confirm with callers.
 */
.globl	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4,r5,r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	cmp	r5, #0			@ nothing requested?
	beq	.Lsqueeze_neon_done	@ do not fall into the byte tail
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail	@ 1..7 bytes left: finish byte by byte
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon	@ block not exhausted yet

	@ Block exhausted and more output wanted: load the state into
	@ registers, permute it, and write it back. d8-d15 are saved
	@ around this because the state occupies them.
	vstmdb	sp!,  {d8,d9,d10,d11,d12,d13,d14,d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind (24 lanes were post-incremented)

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat: read cursor restarts at lane 0
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz: the call above overwrote lr (r14)
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	@ Here 1 <= r5 <= 7. The next lane is fetched as two words and
	@ emitted low byte first; each cmp feeds both a blo and a later beq.
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	mov	r2, r2, lsr#8
	blo	.Lsqueeze_neon_done	@ len was 1
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	beq	.Lsqueeze_neon_done	@ len was 2
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done	@ len was 3
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done	@ len was 4

	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done	@ len was 5
	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	beq	.Lsqueeze_neon_done	@ len was 6
	strb	r3, [r4], #1		@ len was 7

.Lsqueeze_neon_done:
	ldmia	sp!, {r4,r5,r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
2691#endif
/*
 * NUL-terminated ASCII identification string; the bytes spell
 * "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>".
 * The .align directives that follow pad to a 4-byte boundary after it.
 */
2692.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2693.align	2
2694.align	2
2695