xref: /freebsd/sys/crypto/openssl/aarch64/keccak1600-armv8.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from keccak1600-armv8.pl. */
2#include "arm_arch.h"
3
// Round-constant table for the permutation routines in this file.
// `iotas` holds 24 64-bit constants; KeccakF1600_int and KeccakF1600_ce each
// read one constant per round through a post-incremented pointer.
4.section	.rodata
5
6.align	8	// strategic alignment and padding that allows to use
7		// address value as loop termination condition...
// Layout arithmetic behind the comment above: with .align 8 taken as a
// 2^8 = 256-byte boundary (GNU as semantics on AArch64), the 64 bytes of zero
// padding plus the 24*8 = 192 bytes of table end exactly on the next 256-byte
// boundary.  KeccakF1600_int stops when (pointer & 255) == 0.
8.quad	0,0,0,0,0,0,0,0
9.type	iotas,%object
10iotas:
11.quad	0x0000000000000001
12.quad	0x0000000000008082
13.quad	0x800000000000808a
14.quad	0x8000000080008000
15.quad	0x000000000000808b
16.quad	0x0000000080000001
17.quad	0x8000000080008081
18.quad	0x8000000000008009
19.quad	0x000000000000008a
20.quad	0x0000000000000088
21.quad	0x0000000080008009
22.quad	0x000000008000000a
23.quad	0x000000008000808b
24.quad	0x800000000000008b
25.quad	0x8000000000008089
26.quad	0x8000000000008003
27.quad	0x8000000000008002
28.quad	0x8000000000000080
29.quad	0x000000000000800a
30.quad	0x800000008000000a
31.quad	0x8000000080008081
32.quad	0x8000000000008080
33.quad	0x0000000080000001
34.quad	0x8000000080008008
35.size	iotas,.-iotas
// Everything below is code.
36.text
37
// KeccakF1600_int -- file-local permutation core working on a state that is
// already held in general-purpose registers.
//
// State (read on entry, updated in place), 25 lanes in this order:
//     x0-x17, x25, x19-x24        (x18 is not used; x25 takes that slot)
// Scratch, clobbered: x26, x27, x28, x30, condition flags.
// Caller-provided stack, 32 bytes at the current sp:
//     [sp,#0], [sp,#8]   temporary home for x4 / x9 during Theta
//     [sp,#16]           running pointer into iotas
//     [sp,#24]           saved x30 (x30 doubles as a scratch register below)
// One pass of .Loop is one round; the loop ends when the iotas pointer,
// after being advanced, has its low 8 bits clear.
38.type	KeccakF1600_int,%function
39.align	5
40KeccakF1600_int:
41	AARCH64_SIGN_LINK_REGISTER
42	adrp	x28,iotas
43	add	x28,x28,#:lo12:iotas
44	stp	x28,x30,[sp,#16]		// 32 bytes on top are mine
45	b	.Loop
46.align	4
47.Loop:
48	////////////////////////////////////////// Theta
	// x26, x27, x28, x30 and x4 accumulate the XOR of the five lanes of
	// columns 0..4 respectively.  x4 and x9 are state lanes, so their
	// original values are parked on the stack first.
49	eor	x26,x0,x5
50	stp	x4,x9,[sp,#0]	// offload pair...
51	eor	x27,x1,x6
52	eor	x28,x2,x7
53	eor	x30,x3,x8
54	eor	x4,x4,x9
55	eor	x26,x26,x10
56	eor	x27,x27,x11
57	eor	x28,x28,x12
58	eor	x30,x30,x13
59	eor	x4,x4,x14
60	eor	x26,x26,x15
61	eor	x27,x27,x16
62	eor	x28,x28,x17
63	eor	x30,x30,x25
64	eor	x4,x4,x19
65	eor	x26,x26,x20
66	eor	x28,x28,x22
67	eor	x27,x27,x21
68	eor	x30,x30,x23
69	eor	x4,x4,x24
70
	// ror#63 is a left-rotate by one.  x9 is reused as the per-column
	// correction value; here it is applied to column 1.
71	eor	x9,x26,x28,ror#63
72
73	eor	x1,x1,x9
74	eor	x6,x6,x9
75	eor	x11,x11,x9
76	eor	x16,x16,x9
77	eor	x21,x21,x9
78
	// Corrections for columns 2, 3, 4 and 0 end up in x9, x28, x30, x4.
79	eor	x9,x27,x30,ror#63
80	eor	x28,x28,x4,ror#63
81	eor	x30,x30,x26,ror#63
82	eor	x4,x4,x27,ror#63
83
	// The corrected x2, x3 and x4 lanes are written to x27, x26 and x28
	// instead of in place, so that Rho+Pi below can overwrite x2/x3/x4.
84	eor	x27,   x2,x9		// mov	x27,x2
85	eor	x7,x7,x9
86	eor	x12,x12,x9
87	eor	x17,x17,x9
88	eor	x22,x22,x9
89
90	eor	x0,x0,x4
91	eor	x5,x5,x4
92	eor	x10,x10,x4
93	eor	x15,x15,x4
94	eor	x20,x20,x4
95	ldp	x4,x9,[sp,#0]	// re-load offloaded data
96	eor	x26,   x3,x28		// mov	x26,x3
97	eor	x8,x8,x28
98	eor	x13,x13,x28
99	eor	x25,x25,x28
100	eor	x23,x23,x28
101
102	eor	x28,   x4,x30		// mov	x28,x4
103	eor	x9,x9,x30
104	eor	x14,x14,x30
105	eor	x19,x19,x30
106	eor	x24,x24,x30
107
108	////////////////////////////////////////// Rho+Pi
	// "ror #64-n" is a left-rotate by n.  Each group of four moves lanes
	// along a chain, so every source is read before it is overwritten;
	// the order of these instructions must not change.  x30, x27, x26 and
	// x28 hold the values that belonged to x1, x2, x3 and x4.
109	mov	x30,x1
110	ror	x1,x6,#64-44
111	//mov	x27,x2
112	ror	x2,x12,#64-43
113	//mov	x26,x3
114	ror	x3,x25,#64-21
115	//mov	x28,x4
116	ror	x4,x24,#64-14
117
118	ror	x6,x9,#64-20
119	ror	x12,x13,#64-25
120	ror	x25,x17,#64-15
121	ror	x24,x21,#64-2
122
123	ror	x9,x22,#64-61
124	ror	x13,x19,#64-8
125	ror	x17,x11,#64-10
126	ror	x21,x8,#64-55
127
128	ror	x22,x14,#64-39
129	ror	x19,x23,#64-56
130	ror	x11,x7,#64-6
131	ror	x8,x16,#64-45
132
133	ror	x14,x20,#64-18
134	ror	x23,x15,#64-41
135	ror	x7,x10,#64-3
136	ror	x16,x5,#64-36
137
138	ror	x5,x26,#64-28
139	ror	x10,x30,#64-1
140	ror	x15,x28,#64-27
141	ror	x20,x27,#64-62
142
143	////////////////////////////////////////// Chi+Iota
	// Per row of five lanes: lane[i] ^= ~lane[i+1] & lane[i+2], built from
	// bic (and-not) into a scratch register followed by eor.  The iotas
	// pointer handling is interleaved with the first two rows.
144	bic	x26,x2,x1
145	bic	x27,x3,x2
146	bic	x28,x0,x4
147	bic	x30,x1,x0
148	eor	x0,x0,x26
149	bic	x26,x4,x3
150	eor	x1,x1,x27
151	ldr	x27,[sp,#16]
152	eor	x3,x3,x28
153	eor	x4,x4,x30
154	eor	x2,x2,x26
155	ldr	x30,[x27],#8		// Iota[i++]
156
	// The flags set by tst are consumed by the bne at the bottom of the
	// loop; nothing in between writes the flags.
157	bic	x26,x7,x6
158	tst	x27,#255			// are we done?
159	str	x27,[sp,#16]
160	bic	x27,x8,x7
161	bic	x28,x5,x9
162	eor	x0,x0,x30		// A[0][0] ^= Iota
163	bic	x30,x6,x5
164	eor	x5,x5,x26
165	bic	x26,x9,x8
166	eor	x6,x6,x27
167	eor	x8,x8,x28
168	eor	x9,x9,x30
169	eor	x7,x7,x26
170
171	bic	x26,x12,x11
172	bic	x27,x13,x12
173	bic	x28,x10,x14
174	bic	x30,x11,x10
175	eor	x10,x10,x26
176	bic	x26,x14,x13
177	eor	x11,x11,x27
178	eor	x13,x13,x28
179	eor	x14,x14,x30
180	eor	x12,x12,x26
181
182	bic	x26,x17,x16
183	bic	x27,x25,x17
184	bic	x28,x15,x19
185	bic	x30,x16,x15
186	eor	x15,x15,x26
187	bic	x26,x19,x25
188	eor	x16,x16,x27
189	eor	x25,x25,x28
190	eor	x19,x19,x30
191	eor	x17,x17,x26
192
193	bic	x26,x22,x21
194	bic	x27,x23,x22
195	bic	x28,x20,x24
196	bic	x30,x21,x20
197	eor	x20,x20,x26
198	bic	x26,x24,x23
199	eor	x21,x21,x27
200	eor	x23,x23,x28
201	eor	x24,x24,x30
202	eor	x22,x22,x26
203
204	bne	.Loop
205
	// x30 was used as scratch above; fetch the return address saved on entry.
206	ldr	x30,[sp,#24]
207	AARCH64_VALIDATE_LINK_REGISTER
208	ret
209.size	KeccakF1600_int,.-KeccakF1600_int
210
// KeccakF1600 -- file-local wrapper: permute a state held in memory.
//
// In:  x0 = address of 25 consecutive 64-bit lanes (offsets 0..192).
// Out: the same 25 lanes are overwritten with the permuted state.
// Saves and restores x19-x28, x29 and x30 in a 128-byte frame.  Below it,
// 48 more bytes are reserved: the bottom 32 are the scratch area that
// KeccakF1600_int writes to, and [sp,#32] keeps the state address.
// Memory order maps to registers x0-x17, x25, x19-x24.
211.type	KeccakF1600,%function
212.align	5
213KeccakF1600:
214	AARCH64_SIGN_LINK_REGISTER
215	stp	x29,x30,[sp,#-128]!
216	add	x29,sp,#0
217	stp	x19,x20,[sp,#16]
218	stp	x21,x22,[sp,#32]
219	stp	x23,x24,[sp,#48]
220	stp	x25,x26,[sp,#64]
221	stp	x27,x28,[sp,#80]
222	sub	sp,sp,#48
223
224	str	x0,[sp,#32]			// offload argument
	// x0 is about to become a state lane, so the base address moves to x26.
225	mov	x26,x0
226	ldp	x0,x1,[x0,#16*0]
227	ldp	x2,x3,[x26,#16*1]
228	ldp	x4,x5,[x26,#16*2]
229	ldp	x6,x7,[x26,#16*3]
230	ldp	x8,x9,[x26,#16*4]
231	ldp	x10,x11,[x26,#16*5]
232	ldp	x12,x13,[x26,#16*6]
233	ldp	x14,x15,[x26,#16*7]
234	ldp	x16,x17,[x26,#16*8]
235	ldp	x25,x19,[x26,#16*9]
236	ldp	x20,x21,[x26,#16*10]
237	ldp	x22,x23,[x26,#16*11]
238	ldr	x24,[x26,#16*12]
239
240	bl	KeccakF1600_int
241
	// x26 was clobbered by the callee; reload the state address from the stack.
242	ldr	x26,[sp,#32]
243	stp	x0,x1,[x26,#16*0]
244	stp	x2,x3,[x26,#16*1]
245	stp	x4,x5,[x26,#16*2]
246	stp	x6,x7,[x26,#16*3]
247	stp	x8,x9,[x26,#16*4]
248	stp	x10,x11,[x26,#16*5]
249	stp	x12,x13,[x26,#16*6]
250	stp	x14,x15,[x26,#16*7]
251	stp	x16,x17,[x26,#16*8]
252	stp	x25,x19,[x26,#16*9]
253	stp	x20,x21,[x26,#16*10]
254	stp	x22,x23,[x26,#16*11]
255	str	x24,[x26,#16*12]
256
	// Callee-saved registers are reloaded relative to x29 (the frame base),
	// so they do not depend on when sp is moved back.
257	ldp	x19,x20,[x29,#16]
258	add	sp,sp,#48
259	ldp	x21,x22,[x29,#32]
260	ldp	x23,x24,[x29,#48]
261	ldp	x25,x26,[x29,#64]
262	ldp	x27,x28,[x29,#80]
263	ldp	x29,x30,[sp],#128
264	AARCH64_VALIDATE_LINK_REGISTER
265	ret
266.size	KeccakF1600,.-KeccakF1600
267
// SHA3_absorb -- XOR whole input blocks into the state and permute after each.
//
// In:  x0 = state, uint64_t A[5][5]
//      x1 = inp, input bytes
//      x2 = len, number of input bytes available
//      x3 = bsz, number of bytes absorbed per block
// Out: x0 = number of input bytes left over (the first len value that is
//      smaller than bsz); the state in memory is updated.
//
// While len >= bsz: read bsz bytes as 64-bit words (byte-reversed when
// __AARCH64EB__ is defined), XOR them into the leading state lanes, then call
// KeccakF1600_int.  The ladder below checks bsz only against multiples of 8
// and stops after 25 words -- assumes bsz is a multiple of 8 in 8..200;
// TODO confirm against callers.
//
// Frame: 128 bytes for x19-x28/x29/x30, plus 64 bytes below:
//     [sp,#0..31]  scratch owned by KeccakF1600_int
//     [sp,#32]     state address      [sp,#40]  current inp
//     [sp,#48]     len (rewritten with len - bsz each block)
//     [sp,#56]     bsz
268.globl	SHA3_absorb
269.type	SHA3_absorb,%function
270.align	5
271SHA3_absorb:
272	AARCH64_SIGN_LINK_REGISTER
273	stp	x29,x30,[sp,#-128]!
274	add	x29,sp,#0
275	stp	x19,x20,[sp,#16]
276	stp	x21,x22,[sp,#32]
277	stp	x23,x24,[sp,#48]
278	stp	x25,x26,[sp,#64]
279	stp	x27,x28,[sp,#80]
280	sub	sp,sp,#64
281
282	stp	x0,x1,[sp,#32]			// offload arguments
283	stp	x2,x3,[sp,#48]
284
	// Arguments move to x26/x27/x28/x30 because x0-x3 become state lanes.
285	mov	x26,x0			// uint64_t A[5][5]
286	mov	x27,x1			// const void *inp
287	mov	x28,x2			// size_t len
288	mov	x30,x3			// size_t bsz
289	ldp	x0,x1,[x26,#16*0]
290	ldp	x2,x3,[x26,#16*1]
291	ldp	x4,x5,[x26,#16*2]
292	ldp	x6,x7,[x26,#16*3]
293	ldp	x8,x9,[x26,#16*4]
294	ldp	x10,x11,[x26,#16*5]
295	ldp	x12,x13,[x26,#16*6]
296	ldp	x14,x15,[x26,#16*7]
297	ldp	x16,x17,[x26,#16*8]
298	ldp	x25,x19,[x26,#16*9]
299	ldp	x20,x21,[x26,#16*10]
300	ldp	x22,x23,[x26,#16*11]
301	ldr	x24,[x26,#16*12]
302	b	.Loop_absorb
303
304.align	4
305.Loop_absorb:
306	subs	x26,x28,x30		// len - bsz
307	blo	.Labsorbed
308
309	str	x26,[sp,#48]			// save len - bsz
	// Word ladder.  After every even-numbered word, bsz is compared with
	// the byte count two words ahead: "blo" leaves when the block is
	// complete, and the same flags feed the "beq" after the next word
	// (ldr, rev and eor do not write the flags).
310	ldr	x26,[x27],#8		// *inp++
311#ifdef	__AARCH64EB__
312	rev	x26,x26
313#endif
314	eor	x0,x0,x26
315	cmp	x30,#8*(0+2)
316	blo	.Lprocess_block
317	ldr	x26,[x27],#8		// *inp++
318#ifdef	__AARCH64EB__
319	rev	x26,x26
320#endif
321	eor	x1,x1,x26
322	beq	.Lprocess_block
323	ldr	x26,[x27],#8		// *inp++
324#ifdef	__AARCH64EB__
325	rev	x26,x26
326#endif
327	eor	x2,x2,x26
328	cmp	x30,#8*(2+2)
329	blo	.Lprocess_block
330	ldr	x26,[x27],#8		// *inp++
331#ifdef	__AARCH64EB__
332	rev	x26,x26
333#endif
334	eor	x3,x3,x26
335	beq	.Lprocess_block
336	ldr	x26,[x27],#8		// *inp++
337#ifdef	__AARCH64EB__
338	rev	x26,x26
339#endif
340	eor	x4,x4,x26
341	cmp	x30,#8*(4+2)
342	blo	.Lprocess_block
343	ldr	x26,[x27],#8		// *inp++
344#ifdef	__AARCH64EB__
345	rev	x26,x26
346#endif
347	eor	x5,x5,x26
348	beq	.Lprocess_block
349	ldr	x26,[x27],#8		// *inp++
350#ifdef	__AARCH64EB__
351	rev	x26,x26
352#endif
353	eor	x6,x6,x26
354	cmp	x30,#8*(6+2)
355	blo	.Lprocess_block
356	ldr	x26,[x27],#8		// *inp++
357#ifdef	__AARCH64EB__
358	rev	x26,x26
359#endif
360	eor	x7,x7,x26
361	beq	.Lprocess_block
362	ldr	x26,[x27],#8		// *inp++
363#ifdef	__AARCH64EB__
364	rev	x26,x26
365#endif
366	eor	x8,x8,x26
367	cmp	x30,#8*(8+2)
368	blo	.Lprocess_block
369	ldr	x26,[x27],#8		// *inp++
370#ifdef	__AARCH64EB__
371	rev	x26,x26
372#endif
373	eor	x9,x9,x26
374	beq	.Lprocess_block
375	ldr	x26,[x27],#8		// *inp++
376#ifdef	__AARCH64EB__
377	rev	x26,x26
378#endif
379	eor	x10,x10,x26
380	cmp	x30,#8*(10+2)
381	blo	.Lprocess_block
382	ldr	x26,[x27],#8		// *inp++
383#ifdef	__AARCH64EB__
384	rev	x26,x26
385#endif
386	eor	x11,x11,x26
387	beq	.Lprocess_block
388	ldr	x26,[x27],#8		// *inp++
389#ifdef	__AARCH64EB__
390	rev	x26,x26
391#endif
392	eor	x12,x12,x26
393	cmp	x30,#8*(12+2)
394	blo	.Lprocess_block
395	ldr	x26,[x27],#8		// *inp++
396#ifdef	__AARCH64EB__
397	rev	x26,x26
398#endif
399	eor	x13,x13,x26
400	beq	.Lprocess_block
401	ldr	x26,[x27],#8		// *inp++
402#ifdef	__AARCH64EB__
403	rev	x26,x26
404#endif
405	eor	x14,x14,x26
406	cmp	x30,#8*(14+2)
407	blo	.Lprocess_block
408	ldr	x26,[x27],#8		// *inp++
409#ifdef	__AARCH64EB__
410	rev	x26,x26
411#endif
412	eor	x15,x15,x26
413	beq	.Lprocess_block
414	ldr	x26,[x27],#8		// *inp++
415#ifdef	__AARCH64EB__
416	rev	x26,x26
417#endif
418	eor	x16,x16,x26
419	cmp	x30,#8*(16+2)
420	blo	.Lprocess_block
421	ldr	x26,[x27],#8		// *inp++
422#ifdef	__AARCH64EB__
423	rev	x26,x26
424#endif
425	eor	x17,x17,x26
426	beq	.Lprocess_block
427	ldr	x26,[x27],#8		// *inp++
428#ifdef	__AARCH64EB__
429	rev	x26,x26
430#endif
431	eor	x25,x25,x26
432	cmp	x30,#8*(18+2)
433	blo	.Lprocess_block
434	ldr	x26,[x27],#8		// *inp++
435#ifdef	__AARCH64EB__
436	rev	x26,x26
437#endif
438	eor	x19,x19,x26
439	beq	.Lprocess_block
440	ldr	x26,[x27],#8		// *inp++
441#ifdef	__AARCH64EB__
442	rev	x26,x26
443#endif
444	eor	x20,x20,x26
445	cmp	x30,#8*(20+2)
446	blo	.Lprocess_block
447	ldr	x26,[x27],#8		// *inp++
448#ifdef	__AARCH64EB__
449	rev	x26,x26
450#endif
451	eor	x21,x21,x26
452	beq	.Lprocess_block
453	ldr	x26,[x27],#8		// *inp++
454#ifdef	__AARCH64EB__
455	rev	x26,x26
456#endif
457	eor	x22,x22,x26
458	cmp	x30,#8*(22+2)
459	blo	.Lprocess_block
460	ldr	x26,[x27],#8		// *inp++
461#ifdef	__AARCH64EB__
462	rev	x26,x26
463#endif
464	eor	x23,x23,x26
465	beq	.Lprocess_block
466	ldr	x26,[x27],#8		// *inp++
467#ifdef	__AARCH64EB__
468	rev	x26,x26
469#endif
470	eor	x24,x24,x26
471
472.Lprocess_block:
	// KeccakF1600_int clobbers x26-x28 and x30, so inp goes to the stack
	// and len/bsz are re-read from their slots afterwards.
473	str	x27,[sp,#40]			// save inp
474
475	bl	KeccakF1600_int
476
477	ldr	x27,[sp,#40]			// restore arguments
478	ldp	x28,x30,[sp,#48]
479	b	.Loop_absorb
480
481.align	4
482.Labsorbed:
	// Write the register-held state back to memory.
483	ldr	x27,[sp,#32]
484	stp	x0,x1,[x27,#16*0]
485	stp	x2,x3,[x27,#16*1]
486	stp	x4,x5,[x27,#16*2]
487	stp	x6,x7,[x27,#16*3]
488	stp	x8,x9,[x27,#16*4]
489	stp	x10,x11,[x27,#16*5]
490	stp	x12,x13,[x27,#16*6]
491	stp	x14,x15,[x27,#16*7]
492	stp	x16,x17,[x27,#16*8]
493	stp	x25,x19,[x27,#16*9]
494	stp	x20,x21,[x27,#16*10]
495	stp	x22,x23,[x27,#16*11]
496	str	x24,[x27,#16*12]
497
	// x28 still holds the len value that failed the len >= bsz test.
498	mov	x0,x28			// return value
499	ldp	x19,x20,[x29,#16]
500	add	sp,sp,#64
501	ldp	x21,x22,[x29,#32]
502	ldp	x23,x24,[x29,#48]
503	ldp	x25,x26,[x29,#64]
504	ldp	x27,x28,[x29,#80]
505	ldp	x29,x30,[sp],#128
506	AARCH64_VALIDATE_LINK_REGISTER
507	ret
508.size	SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze -- copy output bytes out of the state, permuting the state
// each time a full block has been consumed.
//
// In:  x0 = state, 25 consecutive 64-bit lanes     (kept in x19)
//      x1 = out, destination buffer                (x20, advanced per store)
//      x2 = len, number of bytes to write          (x21, counts down)
//      x3 = bsz, bytes taken from the state per block (kept in x22)
//      w4 = 'next': when non-zero the state is permuted before the first
//           byte is produced
// Out: no return value; exactly len bytes are written to out.
//
// x0 walks the state and x3 counts the bytes left in the current block; both
// are reset from x19/x22 after every call to KeccakF1600.  x4 doubles as the
// data word once 'next' has been tested.
//
// len == 0 returns immediately without touching out or the state.  Without
// this guard the loop would drop into .Lsqueeze_tail, where the byte counter
// wraps below zero and seven bytes are stored past the end of out.
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	x19,x0			// put aside arguments
	mov	x20,x1
	mov	x21,x2
	mov	x22,x3
	cbz	x21,.Lsqueeze_done	// len == 0: nothing to write, leave via the epilogue
	cmp	w4, #0				// w4 = 'next' argument
	bne	.Lnext_block

.Loop_squeeze:
	ldr	x4,[x0],#8		// next lane of the state
	cmp	x21,#8
	blo	.Lsqueeze_tail		// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x20],#8
	subs	x21,x21,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8		// bytes left in this block
	bhi	.Loop_squeeze
.Lnext_block:
	mov	x0,x19
	bl	KeccakF1600
	mov	x0,x19			// restart at the first lane
	mov	x3,x22			// and with a full block
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	// 1..7 bytes remain: emit x4 one byte at a time, lowest byte first.
	// At most seven stores are needed, so the last one is unconditional.
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce -- file-local permutation core working on a state held in
// vector registers.  The round body is emitted as raw .inst words; the
// trailing comment on each word gives the instruction it stands for
// (eor3 / rax1 / xar / bcax).
//
// State (updated in place): v0-v24, one lane per register.
// Scratch, clobbered: v25-v31, x9 (round counter, starts at 24),
//                     x10 (pointer into iotas), condition flags.
// Does not touch the stack or x30, and returns with a plain ret.
// v8-v15 are modified here; the callers in this file save and restore
// d8-d15 around it.
581.type	KeccakF1600_ce,%function
582.align	5
583KeccakF1600_ce:
584	mov	x9,#24
585	adrp	x10,iotas
586	add	x10,x10,#:lo12:iotas
587	b	.Loop_ce
588.align	4
589.Loop_ce:
590	////////////////////////////////////////////////// Theta
	// v25-v29 collect the XOR of five state registers each.
591.inst	0xce0f2a99	//eor3 v25.16b,v20.16b,v15.16b,v10.16b
592.inst	0xce102eba	//eor3 v26.16b,v21.16b,v16.16b,v11.16b
593.inst	0xce1132db	//eor3 v27.16b,v22.16b,v17.16b,v12.16b
594.inst	0xce1236fc	//eor3 v28.16b,v23.16b,v18.16b,v13.16b
595.inst	0xce133b1d	//eor3 v29.16b,v24.16b,v19.16b,v14.16b
596.inst	0xce050339	//eor3 v25.16b,v25.16b,   v5.16b,v0.16b
597.inst	0xce06075a	//eor3 v26.16b,v26.16b,   v6.16b,v1.16b
598.inst	0xce070b7b	//eor3 v27.16b,v27.16b,   v7.16b,v2.16b
599.inst	0xce080f9c	//eor3 v28.16b,v28.16b,   v8.16b,v3.16b
600.inst	0xce0913bd	//eor3 v29.16b,v29.16b,   v9.16b,v4.16b
601
	// The D[] values land in v30, v31, v27, v28, v29 (see the tags).
602.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b			// D[1]
603.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b			// D[2]
604.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b			// D[3]
605.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b			// D[4]
606.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b			// D[0]
607
608	////////////////////////////////////////////////// Theta+Rho+Pi
	// Each xar reads a state register that an earlier xar in the chain
	// has already copied elsewhere, so the instruction order is
	// significant.  Near the end the D[] registers themselves are
	// overwritten with state values; the "X=Y" tags record which state
	// position a register temporarily stands in for.
609.inst	0xce9efc39	//xar v25.16b,   v1.16b,v30.16b,#64-1 // C[0]=A[2][0]
610
611.inst	0xce9e50c1	//xar v1.16b,v6.16b,v30.16b,#64-44
612.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
613.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
614.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
615.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18
616
617.inst	0xce9f085a	//xar v26.16b,   v2.16b,v31.16b,#64-62 // C[1]=A[4][0]
618
619.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
620.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
621.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
622.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
623.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41
624
625.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27
626
627.inst	0xce9ccb1c	//xar v28.16b,   v24.16b,v28.16b,#64-14 // D[4]=A[0][4]
628.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
629.inst	0xce9b2508	//xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1]
630.inst	0xce9e4e04	//xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3]
631.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36
632
633.inst	0xce9b9065	//xar v5.16b,v3.16b,v27.16b,#64-28
634
	// v0 takes the correction in v29 with no rotation, hence a plain eor.
635	eor	v0.16b,v0.16b,v29.16b
636
637.inst	0xce9bae5b	//xar v27.16b,   v18.16b,v27.16b,#64-21 // D[3]=A[0][3]
638.inst	0xce9fc623	//xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3]
639.inst	0xce9ed97e	//xar v30.16b,   v11.16b,v30.16b,#64-10 // D[1]=A[3][2]
640.inst	0xce9fe8ff	//xar v31.16b,   v7.16b,v31.16b,#64-6 // D[2]=A[2][1]
641.inst	0xce9df55d	//xar v29.16b,   v10.16b,v29.16b,#64-3 // D[0]=A[1][2]
642
643	////////////////////////////////////////////////// Chi+Iota
	// Rows are processed from v20-v24 down to v0-v4, writing each result
	// to its final register.
644.inst	0xce362354	//bcax v20.16b,v26.16b,   v22.16b,v8.16b	// A[1][3]=A[4][1]
645.inst	0xce375915	//bcax v21.16b,v8.16b,v23.16b,v22.16b	// A[1][3]=A[4][1]
646.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
647.inst	0xce3a62f7	//bcax v23.16b,v23.16b,v26.16b,   v24.16b
648.inst	0xce286b18	//bcax v24.16b,v24.16b,v8.16b,v26.16b	// A[1][3]=A[4][1]
649
	// v26 is free from here on: load this round's constant into both
	// 64-bit elements and advance the iotas pointer.
650	ld1r	{v26.2d},[x10],#8
651
652.inst	0xce330fd1	//bcax v17.16b,v30.16b,   v19.16b,v3.16b	// A[0][3]=A[3][3]
653.inst	0xce2f4c72	//bcax v18.16b,v3.16b,v15.16b,v19.16b	// A[0][3]=A[3][3]
654.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b
655.inst	0xce3e41ef	//bcax v15.16b,v15.16b,v30.16b,   v16.16b
656.inst	0xce237a10	//bcax v16.16b,v16.16b,v3.16b,v30.16b	// A[0][3]=A[3][3]
657
658.inst	0xce2c7f2a	//bcax v10.16b,v25.16b,   v12.16b,v31.16b
659.inst	0xce2d33eb	//bcax v11.16b,v31.16b,   v13.16b,v12.16b
660.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
661.inst	0xce3939ad	//bcax v13.16b,v13.16b,v25.16b,   v14.16b
662.inst	0xce3f65ce	//bcax v14.16b,v14.16b,v31.16b,   v25.16b
663
664.inst	0xce2913a7	//bcax v7.16b,v29.16b,   v9.16b,v4.16b	// A[0][4]=A[1][3]
665.inst	0xce252488	//bcax v8.16b,v4.16b,v5.16b,v9.16b	// A[0][4]=A[1][3]
666.inst	0xce261529	//bcax v9.16b,v9.16b,v6.16b,v5.16b
667.inst	0xce3d18a5	//bcax v5.16b,v5.16b,v29.16b,   v6.16b
668.inst	0xce2474c6	//bcax v6.16b,v6.16b,v4.16b,v29.16b	// A[0][4]=A[1][3]
669
670.inst	0xce207363	//bcax v3.16b,v27.16b,   v0.16b,v28.16b
671.inst	0xce210384	//bcax v4.16b,v28.16b,   v1.16b,v0.16b
672.inst	0xce220400	//bcax v0.16b,v0.16b,v2.16b,v1.16b
673.inst	0xce3b0821	//bcax v1.16b,v1.16b,v27.16b,   v2.16b
674.inst	0xce3c6c42	//bcax v2.16b,v2.16b,v28.16b,   v27.16b
675
	// Iota: fold the round constant into v0.
676	eor	v0.16b,v0.16b,v26.16b
677
678	subs	x9,x9,#1
679	bne	.Loop_ce
680
681	ret
682.size	KeccakF1600_ce,.-KeccakF1600_ce
683
// KeccakF1600_cext -- file-local wrapper: permute a state held in memory
// using KeccakF1600_ce.
//
// In:  x0 = address of 25 consecutive 64-bit lanes.
// Out: the same 25 lanes are overwritten with the permuted state.
// 80-byte frame: x29/x30 at [sp,#0], d8-d15 at [sp,#16..79].
// x0 is used again after the call without being reloaded, i.e. this relies
// on KeccakF1600_ce leaving x0 untouched.
684.type	KeccakF1600_cext,%function
685.align	5
686KeccakF1600_cext:
687	AARCH64_SIGN_LINK_REGISTER
688	stp	x29,x30,[sp,#-80]!
689	add	x29,sp,#0
690	stp	d8,d9,[sp,#16]		// per ABI requirement
691	stp	d10,d11,[sp,#32]
692	stp	d12,d13,[sp,#48]
693	stp	d14,d15,[sp,#64]
694	ldp	d0,d1,[x0,#8*0]
695	ldp	d2,d3,[x0,#8*2]
696	ldp	d4,d5,[x0,#8*4]
697	ldp	d6,d7,[x0,#8*6]
698	ldp	d8,d9,[x0,#8*8]
699	ldp	d10,d11,[x0,#8*10]
700	ldp	d12,d13,[x0,#8*12]
701	ldp	d14,d15,[x0,#8*14]
702	ldp	d16,d17,[x0,#8*16]
703	ldp	d18,d19,[x0,#8*18]
704	ldp	d20,d21,[x0,#8*20]
705	ldp	d22,d23,[x0,#8*22]
706	ldr	d24,[x0,#8*24]
707	bl	KeccakF1600_ce
	// bl overwrote x30; fetch this function's own return address now, so
	// that only x29 is left to pop at the end.
708	ldr	x30,[sp,#8]
709	stp	d0,d1,[x0,#8*0]
710	stp	d2,d3,[x0,#8*2]
711	stp	d4,d5,[x0,#8*4]
712	stp	d6,d7,[x0,#8*6]
713	stp	d8,d9,[x0,#8*8]
714	stp	d10,d11,[x0,#8*10]
715	stp	d12,d13,[x0,#8*12]
716	stp	d14,d15,[x0,#8*14]
717	stp	d16,d17,[x0,#8*16]
718	stp	d18,d19,[x0,#8*18]
719	stp	d20,d21,[x0,#8*20]
720	stp	d22,d23,[x0,#8*22]
721	str	d24,[x0,#8*24]
722
723	ldp	d8,d9,[sp,#16]
724	ldp	d10,d11,[sp,#32]
725	ldp	d12,d13,[sp,#48]
726	ldp	d14,d15,[sp,#64]
727	ldr	x29,[sp],#80
728	AARCH64_VALIDATE_LINK_REGISTER
729	ret
730.size	KeccakF1600_cext,.-KeccakF1600_cext
// SHA3_absorb_cext -- same job as SHA3_absorb, with the state kept in
// v0-v24 and permuted by KeccakF1600_ce.
//
// In:  x0 = state, 25 consecutive 64-bit lanes
//      x1 = inp, input bytes (advanced in place)
//      x2 = len, number of input bytes available
//      x3 = bsz, number of bytes absorbed per block
// Out: x0 = number of input bytes left over; the state in memory is updated.
//
// x0-x3 stay live across the calls, i.e. this relies on KeccakF1600_ce
// touching only x9/x10 among the general-purpose registers.  The ladder
// checks bsz only against multiples of 8 and stops after 25 words --
// assumes bsz is a multiple of 8 in 8..200; TODO confirm against callers.
731.globl	SHA3_absorb_cext
732.type	SHA3_absorb_cext,%function
733.align	5
734SHA3_absorb_cext:
735	AARCH64_SIGN_LINK_REGISTER
736	stp	x29,x30,[sp,#-80]!
737	add	x29,sp,#0
738	stp	d8,d9,[sp,#16]		// per ABI requirement
739	stp	d10,d11,[sp,#32]
740	stp	d12,d13,[sp,#48]
741	stp	d14,d15,[sp,#64]
742	ldp	d0,d1,[x0,#8*0]
743	ldp	d2,d3,[x0,#8*2]
744	ldp	d4,d5,[x0,#8*4]
745	ldp	d6,d7,[x0,#8*6]
746	ldp	d8,d9,[x0,#8*8]
747	ldp	d10,d11,[x0,#8*10]
748	ldp	d12,d13,[x0,#8*12]
749	ldp	d14,d15,[x0,#8*14]
750	ldp	d16,d17,[x0,#8*16]
751	ldp	d18,d19,[x0,#8*18]
752	ldp	d20,d21,[x0,#8*20]
753	ldp	d22,d23,[x0,#8*22]
754	ldr	d24,[x0,#8*24]
755	b	.Loop_absorb_ce
756
757.align	4
758.Loop_absorb_ce:
	// len is reduced in place; when the subtraction borrows, x2 holds
	// len - bsz and the exit path adds bsz back.
759	subs	x2,x2,x3		// len - bsz
760	blo	.Labsorbed_ce
	// Word ladder: "blo" leaves after an even-numbered word when the
	// block is complete, and the same cmp flags feed the "beq" after the
	// following word (ldr, rev64 and eor do not write the flags).
761	ldr	d31,[x1],#8		// *inp++
762#ifdef	__AARCH64EB__
763	rev64	v31.16b,v31.16b
764#endif
765	eor	v0.16b,v0.16b,v31.16b
766	cmp	x3,#8*(0+2)
767	blo	.Lprocess_block_ce
768	ldr	d31,[x1],#8		// *inp++
769#ifdef	__AARCH64EB__
770	rev64	v31.16b,v31.16b
771#endif
772	eor	v1.16b,v1.16b,v31.16b
773	beq	.Lprocess_block_ce
774	ldr	d31,[x1],#8		// *inp++
775#ifdef	__AARCH64EB__
776	rev64	v31.16b,v31.16b
777#endif
778	eor	v2.16b,v2.16b,v31.16b
779	cmp	x3,#8*(2+2)
780	blo	.Lprocess_block_ce
781	ldr	d31,[x1],#8		// *inp++
782#ifdef	__AARCH64EB__
783	rev64	v31.16b,v31.16b
784#endif
785	eor	v3.16b,v3.16b,v31.16b
786	beq	.Lprocess_block_ce
787	ldr	d31,[x1],#8		// *inp++
788#ifdef	__AARCH64EB__
789	rev64	v31.16b,v31.16b
790#endif
791	eor	v4.16b,v4.16b,v31.16b
792	cmp	x3,#8*(4+2)
793	blo	.Lprocess_block_ce
794	ldr	d31,[x1],#8		// *inp++
795#ifdef	__AARCH64EB__
796	rev64	v31.16b,v31.16b
797#endif
798	eor	v5.16b,v5.16b,v31.16b
799	beq	.Lprocess_block_ce
800	ldr	d31,[x1],#8		// *inp++
801#ifdef	__AARCH64EB__
802	rev64	v31.16b,v31.16b
803#endif
804	eor	v6.16b,v6.16b,v31.16b
805	cmp	x3,#8*(6+2)
806	blo	.Lprocess_block_ce
807	ldr	d31,[x1],#8		// *inp++
808#ifdef	__AARCH64EB__
809	rev64	v31.16b,v31.16b
810#endif
811	eor	v7.16b,v7.16b,v31.16b
812	beq	.Lprocess_block_ce
813	ldr	d31,[x1],#8		// *inp++
814#ifdef	__AARCH64EB__
815	rev64	v31.16b,v31.16b
816#endif
817	eor	v8.16b,v8.16b,v31.16b
818	cmp	x3,#8*(8+2)
819	blo	.Lprocess_block_ce
820	ldr	d31,[x1],#8		// *inp++
821#ifdef	__AARCH64EB__
822	rev64	v31.16b,v31.16b
823#endif
824	eor	v9.16b,v9.16b,v31.16b
825	beq	.Lprocess_block_ce
826	ldr	d31,[x1],#8		// *inp++
827#ifdef	__AARCH64EB__
828	rev64	v31.16b,v31.16b
829#endif
830	eor	v10.16b,v10.16b,v31.16b
831	cmp	x3,#8*(10+2)
832	blo	.Lprocess_block_ce
833	ldr	d31,[x1],#8		// *inp++
834#ifdef	__AARCH64EB__
835	rev64	v31.16b,v31.16b
836#endif
837	eor	v11.16b,v11.16b,v31.16b
838	beq	.Lprocess_block_ce
839	ldr	d31,[x1],#8		// *inp++
840#ifdef	__AARCH64EB__
841	rev64	v31.16b,v31.16b
842#endif
843	eor	v12.16b,v12.16b,v31.16b
844	cmp	x3,#8*(12+2)
845	blo	.Lprocess_block_ce
846	ldr	d31,[x1],#8		// *inp++
847#ifdef	__AARCH64EB__
848	rev64	v31.16b,v31.16b
849#endif
850	eor	v13.16b,v13.16b,v31.16b
851	beq	.Lprocess_block_ce
852	ldr	d31,[x1],#8		// *inp++
853#ifdef	__AARCH64EB__
854	rev64	v31.16b,v31.16b
855#endif
856	eor	v14.16b,v14.16b,v31.16b
857	cmp	x3,#8*(14+2)
858	blo	.Lprocess_block_ce
859	ldr	d31,[x1],#8		// *inp++
860#ifdef	__AARCH64EB__
861	rev64	v31.16b,v31.16b
862#endif
863	eor	v15.16b,v15.16b,v31.16b
864	beq	.Lprocess_block_ce
865	ldr	d31,[x1],#8		// *inp++
866#ifdef	__AARCH64EB__
867	rev64	v31.16b,v31.16b
868#endif
869	eor	v16.16b,v16.16b,v31.16b
870	cmp	x3,#8*(16+2)
871	blo	.Lprocess_block_ce
872	ldr	d31,[x1],#8		// *inp++
873#ifdef	__AARCH64EB__
874	rev64	v31.16b,v31.16b
875#endif
876	eor	v17.16b,v17.16b,v31.16b
877	beq	.Lprocess_block_ce
878	ldr	d31,[x1],#8		// *inp++
879#ifdef	__AARCH64EB__
880	rev64	v31.16b,v31.16b
881#endif
882	eor	v18.16b,v18.16b,v31.16b
883	cmp	x3,#8*(18+2)
884	blo	.Lprocess_block_ce
885	ldr	d31,[x1],#8		// *inp++
886#ifdef	__AARCH64EB__
887	rev64	v31.16b,v31.16b
888#endif
889	eor	v19.16b,v19.16b,v31.16b
890	beq	.Lprocess_block_ce
891	ldr	d31,[x1],#8		// *inp++
892#ifdef	__AARCH64EB__
893	rev64	v31.16b,v31.16b
894#endif
895	eor	v20.16b,v20.16b,v31.16b
896	cmp	x3,#8*(20+2)
897	blo	.Lprocess_block_ce
898	ldr	d31,[x1],#8		// *inp++
899#ifdef	__AARCH64EB__
900	rev64	v31.16b,v31.16b
901#endif
902	eor	v21.16b,v21.16b,v31.16b
903	beq	.Lprocess_block_ce
904	ldr	d31,[x1],#8		// *inp++
905#ifdef	__AARCH64EB__
906	rev64	v31.16b,v31.16b
907#endif
908	eor	v22.16b,v22.16b,v31.16b
909	cmp	x3,#8*(22+2)
910	blo	.Lprocess_block_ce
911	ldr	d31,[x1],#8		// *inp++
912#ifdef	__AARCH64EB__
913	rev64	v31.16b,v31.16b
914#endif
915	eor	v23.16b,v23.16b,v31.16b
916	beq	.Lprocess_block_ce
917	ldr	d31,[x1],#8		// *inp++
918#ifdef	__AARCH64EB__
919	rev64	v31.16b,v31.16b
920#endif
921	eor	v24.16b,v24.16b,v31.16b
922
923.Lprocess_block_ce:
924
	// This bl overwrites x30; the epilogue reloads it from the frame.
925	bl	KeccakF1600_ce
926
927	b	.Loop_absorb_ce
928
929.align	4
930.Labsorbed_ce:
	// Write the register-held state back to memory.
931	stp	d0,d1,[x0,#8*0]
932	stp	d2,d3,[x0,#8*2]
933	stp	d4,d5,[x0,#8*4]
934	stp	d6,d7,[x0,#8*6]
935	stp	d8,d9,[x0,#8*8]
936	stp	d10,d11,[x0,#8*10]
937	stp	d12,d13,[x0,#8*12]
938	stp	d14,d15,[x0,#8*14]
939	stp	d16,d17,[x0,#8*16]
940	stp	d18,d19,[x0,#8*18]
941	stp	d20,d21,[x0,#8*20]
942	stp	d22,d23,[x0,#8*22]
943	str	d24,[x0,#8*24]
	// Undo the final (borrowing) subtraction to recover the leftover length.
944	add	x0,x2,x3		// return value
945
946	ldp	d8,d9,[sp,#16]
947	ldp	d10,d11,[sp,#32]
948	ldp	d12,d13,[sp,#48]
949	ldp	d14,d15,[sp,#64]
950	ldp	x29,x30,[sp],#80
951	AARCH64_VALIDATE_LINK_REGISTER
952	ret
953.size	SHA3_absorb_cext,.-SHA3_absorb_cext
// SHA3_squeeze_cext -- copy output bytes out of the state, permuting it with
// KeccakF1600_cext each time a full block has been consumed.
//
// In:  x0 = state, 25 consecutive 64-bit lanes (left unchanged as a pointer)
//      x1 = out, destination buffer (advanced per store)
//      x2 = len, number of bytes to write (counts down)
//      x3 = bsz, bytes taken from the state per block
// Out: no return value; exactly len bytes are written to out.
//
// x9 walks the state and x10 counts the bytes left in the current block;
// x4 holds the data word.  Unlike SHA3_squeeze in this file, no 'next'
// argument is examined here.  x0-x3 stay live across the call, i.e. this
// relies on KeccakF1600_cext/KeccakF1600_ce leaving them untouched.
//
// len == 0 returns immediately without touching out.  Without this guard the
// loop would drop into .Lsqueeze_tail_ce, where the byte counter wraps below
// zero and seven bytes are stored past the end of out.
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	cbz	x2,.Lsqueeze_done_ce	// len == 0: nothing to write, leave via the epilogue
	mov	x9,x0
	mov	x10,x3

.Loop_squeeze_ce:
	ldr	x4,[x9],#8		// next lane of the state
	cmp	x2,#8
	blo	.Lsqueeze_tail_ce	// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x1],#8
	beq	.Lsqueeze_done_ce	// flags still from the cmp: len was exactly 8

	sub	x2,x2,#8
	subs	x10,x10,#8		// bytes left in this block
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]		// bl overwrote x30; restore our return address
	mov	x9,x0			// restart at the first lane
	mov	x10,x3			// and with a full block
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	// 1..7 bytes remain: emit x4 one byte at a time, lowest byte first.
	// At most seven stores are needed, so the last one is unconditional.
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1

.Lsqueeze_done_ce:
	// x30 is already correct on every path here, so only x29 is popped.
	ldr	x29,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII identification string embedded in the object:
// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1017.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1018.align	2
1019