xref: /freebsd/sys/crypto/openssl/aarch64/keccak1600-armv8.S (revision 05427f4639bcf2703329a9be9d25ec09bb782742)
1/* Do not modify. This file is auto-generated from keccak1600-armv8.pl. */
2#include "arm_arch.h"
3
4.text
5
// iotas: table of 24 64-bit round constants, consumed one per round by the
// Iota step of both permutation routines in this file.
//
// Layout: the alignment directive below is followed by 8 zero quads (64 bytes
// of padding) and then the 24 constants (192 bytes), 256 bytes in total.
// KeccakF1600_int depends on the end of the table falling on a 256-byte
// boundary: it leaves its round loop when the post-incremented table pointer
// has its low eight bits clear (tst with mask 255).  KeccakF1600_ce reads the
// same table but counts its rounds explicitly.
6.align	8	// strategic alignment and padding that allows to use
7		// address value as loop termination condition...
8.quad	0,0,0,0,0,0,0,0
9.type	iotas,%object
10iotas:
11.quad	0x0000000000000001
12.quad	0x0000000000008082
13.quad	0x800000000000808a
14.quad	0x8000000080008000
15.quad	0x000000000000808b
16.quad	0x0000000080000001
17.quad	0x8000000080008081
18.quad	0x8000000000008009
19.quad	0x000000000000008a
20.quad	0x0000000000000088
21.quad	0x0000000080008009
22.quad	0x000000008000000a
23.quad	0x000000008000808b
24.quad	0x800000000000008b
25.quad	0x8000000000008089
26.quad	0x8000000000008003
27.quad	0x8000000000008002
28.quad	0x8000000000000080
29.quad	0x000000000000800a
30.quad	0x800000008000000a
31.quad	0x8000000080008081
32.quad	0x8000000000008080
33.quad	0x0000000080000001
34.quad	0x8000000080008008
35.size	iotas,.-iotas
// KeccakF1600_int: file-local round loop working on a state that is held
// entirely in general-purpose registers.  One iteration is executed per entry
// of the iotas table.
//
// In/out:  the 25 lanes live in x0-x17, x25, x19-x24, in that order (this is
//          the order in which the callers load and store them).  x25 sits
//          where x18 would be in the sequence; presumably x18 is avoided
//          because it is the platform register -- TODO confirm against the
//          generator script.
// Scratch: x26, x27, x28 and x30 are overwritten, as are the flags.
// Stack:   uses 32 bytes at [sp] that the caller must have reserved:
//          [sp,#0]  temporary home for the x4/x9 pair during Theta,
//          [sp,#16] running pointer into iotas,
//          [sp,#24] return address (x30 doubles as a temporary in the loop).
// Exit:    the loop ends once the post-incremented iotas pointer is a
//          multiple of 256, i.e. at the end of the padded table.
36.type	KeccakF1600_int,%function
37.align	5
38KeccakF1600_int:
39	AARCH64_SIGN_LINK_REGISTER
40	adr	x28,iotas
41	stp	x28,x30,[sp,#16]		// 32 bytes on top are mine
42	b	.Loop
43.align	4
44.Loop:
45	////////////////////////////////////////// Theta
	// Fold each column of five lanes into one word: x26, x27, x28, x30 and
	// x4 receive the XOR of the lanes that start at x0, x1, x2, x3 and x4.
	// x4 and x9 are parked on the stack first so both can be reused here.
46	eor	x26,x0,x5
47	stp	x4,x9,[sp,#0]	// offload pair...
48	eor	x27,x1,x6
49	eor	x28,x2,x7
50	eor	x30,x3,x8
51	eor	x4,x4,x9
52	eor	x26,x26,x10
53	eor	x27,x27,x11
54	eor	x28,x28,x12
55	eor	x30,x30,x13
56	eor	x4,x4,x14
57	eor	x26,x26,x15
58	eor	x27,x27,x16
59	eor	x28,x28,x17
60	eor	x30,x30,x25
61	eor	x4,x4,x19
62	eor	x26,x26,x20
63	eor	x28,x28,x22
64	eor	x27,x27,x21
65	eor	x30,x30,x23
66	eor	x4,x4,x24
67
	// Each correction word is one column sum XORed with another column sum
	// rotated left by one bit (ror by 63), then applied to a whole column.
68	eor	x9,x26,x28,ror#63
69
70	eor	x1,x1,x9
71	eor	x6,x6,x9
72	eor	x11,x11,x9
73	eor	x16,x16,x9
74	eor	x21,x21,x9
75
76	eor	x9,x27,x30,ror#63
77	eor	x28,x28,x4,ror#63
78	eor	x30,x30,x26,ror#63
79	eor	x4,x4,x27,ror#63
80
	// The updated x2, x3 and x4 lanes are written straight into x27, x26
	// and x28, which is where Rho+Pi below expects its saved copies.
81	eor	x27,   x2,x9		// mov	x27,x2
82	eor	x7,x7,x9
83	eor	x12,x12,x9
84	eor	x17,x17,x9
85	eor	x22,x22,x9
86
87	eor	x0,x0,x4
88	eor	x5,x5,x4
89	eor	x10,x10,x4
90	eor	x15,x15,x4
91	eor	x20,x20,x4
92	ldp	x4,x9,[sp,#0]	// re-load offloaded data
93	eor	x26,   x3,x28		// mov	x26,x3
94	eor	x8,x8,x28
95	eor	x13,x13,x28
96	eor	x25,x25,x28
97	eor	x23,x23,x28
98
99	eor	x28,   x4,x30		// mov	x28,x4
100	eor	x9,x9,x30
101	eor	x14,x14,x30
102	eor	x19,x19,x30
103	eor	x24,x24,x30
104
105	////////////////////////////////////////// Rho+Pi
	// Every lane is rotated and moved to a new register in one instruction;
	// a ror by 64-n is a left rotation by n.  The moves are ordered so that
	// each source is read before it is overwritten; x30 keeps the old x1.
106	mov	x30,x1
107	ror	x1,x6,#64-44
108	//mov	x27,x2
109	ror	x2,x12,#64-43
110	//mov	x26,x3
111	ror	x3,x25,#64-21
112	//mov	x28,x4
113	ror	x4,x24,#64-14
114
115	ror	x6,x9,#64-20
116	ror	x12,x13,#64-25
117	ror	x25,x17,#64-15
118	ror	x24,x21,#64-2
119
120	ror	x9,x22,#64-61
121	ror	x13,x19,#64-8
122	ror	x17,x11,#64-10
123	ror	x21,x8,#64-55
124
125	ror	x22,x14,#64-39
126	ror	x19,x23,#64-56
127	ror	x11,x7,#64-6
128	ror	x8,x16,#64-45
129
130	ror	x14,x20,#64-18
131	ror	x23,x15,#64-41
132	ror	x7,x10,#64-3
133	ror	x16,x5,#64-36
134
135	ror	x5,x26,#64-28
136	ror	x10,x30,#64-1
137	ror	x15,x28,#64-27
138	ror	x20,x27,#64-62
139
140	////////////////////////////////////////// Chi+Iota
	// Five groups of five lanes each.  Within a group the bic results are
	// all taken from the old lane values before any lane is updated by eor.
	// The round-constant fetch and pointer update are interleaved with the
	// first two groups.
141	bic	x26,x2,x1
142	bic	x27,x3,x2
143	bic	x28,x0,x4
144	bic	x30,x1,x0
145	eor	x0,x0,x26
146	bic	x26,x4,x3
147	eor	x1,x1,x27
148	ldr	x27,[sp,#16]
149	eor	x3,x3,x28
150	eor	x4,x4,x30
151	eor	x2,x2,x26
152	ldr	x30,[x27],#8		// Iota[i++]
153
	// The flags set by tst below are consumed by the bne at the bottom of
	// the loop; nothing in between writes the flags.
154	bic	x26,x7,x6
155	tst	x27,#255			// are we done?
156	str	x27,[sp,#16]
157	bic	x27,x8,x7
158	bic	x28,x5,x9
159	eor	x0,x0,x30		// A[0][0] ^= Iota
160	bic	x30,x6,x5
161	eor	x5,x5,x26
162	bic	x26,x9,x8
163	eor	x6,x6,x27
164	eor	x8,x8,x28
165	eor	x9,x9,x30
166	eor	x7,x7,x26
167
168	bic	x26,x12,x11
169	bic	x27,x13,x12
170	bic	x28,x10,x14
171	bic	x30,x11,x10
172	eor	x10,x10,x26
173	bic	x26,x14,x13
174	eor	x11,x11,x27
175	eor	x13,x13,x28
176	eor	x14,x14,x30
177	eor	x12,x12,x26
178
179	bic	x26,x17,x16
180	bic	x27,x25,x17
181	bic	x28,x15,x19
182	bic	x30,x16,x15
183	eor	x15,x15,x26
184	bic	x26,x19,x25
185	eor	x16,x16,x27
186	eor	x25,x25,x28
187	eor	x19,x19,x30
188	eor	x17,x17,x26
189
190	bic	x26,x22,x21
191	bic	x27,x23,x22
192	bic	x28,x20,x24
193	bic	x30,x21,x20
194	eor	x20,x20,x26
195	bic	x26,x24,x23
196	eor	x21,x21,x27
197	eor	x23,x23,x28
198	eor	x24,x24,x30
199	eor	x22,x22,x26
200
201	bne	.Loop
202
	// Recover the return address stored on entry.
203	ldr	x30,[sp,#24]
204	AARCH64_VALIDATE_LINK_REGISTER
205	ret
206.size	KeccakF1600_int,.-KeccakF1600_int
207
// KeccakF1600: file-local wrapper that applies KeccakF1600_int to a state
// stored in memory.
//
// In:      x0 = address of the state, read and written as 25 consecutive
//          64-bit words.
// Out:     the state in memory is replaced by the permuted state.
// Saved:   x19-x28, x29 and x30 are saved in a 128-byte frame and restored.
// Clobber: x0-x17 are left holding state lanes on return; they are not
//          restored, so the caller must not expect x0 to survive.
// Stack:   48 further bytes below the frame: [sp,#0..31] is the scratch area
//          KeccakF1600_int requires, [sp,#32] keeps the state address.
208.type	KeccakF1600,%function
209.align	5
210KeccakF1600:
211	AARCH64_SIGN_LINK_REGISTER
212	stp	x29,x30,[sp,#-128]!
213	add	x29,sp,#0
214	stp	x19,x20,[sp,#16]
215	stp	x21,x22,[sp,#32]
216	stp	x23,x24,[sp,#48]
217	stp	x25,x26,[sp,#64]
218	stp	x27,x28,[sp,#80]
219	sub	sp,sp,#48
220
221	str	x0,[sp,#32]			// offload argument
	// x26 takes over as base pointer because x0 itself receives lane 0.
	// Words 18 and 19 go to x25 and x19; x18 is never touched.
222	mov	x26,x0
223	ldp	x0,x1,[x0,#16*0]
224	ldp	x2,x3,[x26,#16*1]
225	ldp	x4,x5,[x26,#16*2]
226	ldp	x6,x7,[x26,#16*3]
227	ldp	x8,x9,[x26,#16*4]
228	ldp	x10,x11,[x26,#16*5]
229	ldp	x12,x13,[x26,#16*6]
230	ldp	x14,x15,[x26,#16*7]
231	ldp	x16,x17,[x26,#16*8]
232	ldp	x25,x19,[x26,#16*9]
233	ldp	x20,x21,[x26,#16*10]
234	ldp	x22,x23,[x26,#16*11]
235	ldr	x24,[x26,#16*12]
236
237	bl	KeccakF1600_int
238
	// x26 was used as scratch by the permutation, so reload the state
	// address before writing the lanes back.
239	ldr	x26,[sp,#32]
240	stp	x0,x1,[x26,#16*0]
241	stp	x2,x3,[x26,#16*1]
242	stp	x4,x5,[x26,#16*2]
243	stp	x6,x7,[x26,#16*3]
244	stp	x8,x9,[x26,#16*4]
245	stp	x10,x11,[x26,#16*5]
246	stp	x12,x13,[x26,#16*6]
247	stp	x14,x15,[x26,#16*7]
248	stp	x16,x17,[x26,#16*8]
249	stp	x25,x19,[x26,#16*9]
250	stp	x20,x21,[x26,#16*10]
251	stp	x22,x23,[x26,#16*11]
252	str	x24,[x26,#16*12]
253
	// Callee-saved registers are restored relative to x29, which still
	// points at the 128-byte frame.
254	ldp	x19,x20,[x29,#16]
255	add	sp,sp,#48
256	ldp	x21,x22,[x29,#32]
257	ldp	x23,x24,[x29,#48]
258	ldp	x25,x26,[x29,#64]
259	ldp	x27,x28,[x29,#80]
260	ldp	x29,x30,[sp],#128
261	AARCH64_VALIDATE_LINK_REGISTER
262	ret
263.size	KeccakF1600,.-KeccakF1600
264
// SHA3_absorb: XOR whole blocks of input into the state and run the
// permutation after each block.
//
// In:    x0 = state (25 64-bit words), x1 = inp, x2 = len, x3 = bsz, as named
//        by the comments on the register copies below.
// Out:   x0 = the first remaining length found to be smaller than bsz, i.e.
//        the number of trailing input bytes that were not absorbed.  The
//        updated state is stored back through the saved state address.
// Saved: x19-x28, x29 and x30 (128-byte frame).
// Stack: 64 further bytes below the frame: [sp,#0..31] scratch required by
//        KeccakF1600_int, [sp,#32] state address, [sp,#40] inp,
//        [sp,#48] len, [sp,#56] bsz.
// NOTE(review): the lane ladder below absorbs at least one 8-byte word per
//        block and at most 25, so it assumes bsz is a non-zero multiple of 8
//        not larger than 200 -- confirm against callers.
265.globl	SHA3_absorb
266.type	SHA3_absorb,%function
267.align	5
268SHA3_absorb:
269	AARCH64_SIGN_LINK_REGISTER
270	stp	x29,x30,[sp,#-128]!
271	add	x29,sp,#0
272	stp	x19,x20,[sp,#16]
273	stp	x21,x22,[sp,#32]
274	stp	x23,x24,[sp,#48]
275	stp	x25,x26,[sp,#64]
276	stp	x27,x28,[sp,#80]
277	sub	sp,sp,#64
278
279	stp	x0,x1,[sp,#32]			// offload arguments
280	stp	x2,x3,[sp,#48]
281
	// The arguments move to x26-x28 and x30 so that x0-x24 can hold the
	// state lanes.  All four are scratch for KeccakF1600_int, which is why
	// inp, len and bsz are re-read from the stack after every call.
282	mov	x26,x0			// uint64_t A[5][5]
283	mov	x27,x1			// const void *inp
284	mov	x28,x2			// size_t len
285	mov	x30,x3			// size_t bsz
286	ldp	x0,x1,[x26,#16*0]
287	ldp	x2,x3,[x26,#16*1]
288	ldp	x4,x5,[x26,#16*2]
289	ldp	x6,x7,[x26,#16*3]
290	ldp	x8,x9,[x26,#16*4]
291	ldp	x10,x11,[x26,#16*5]
292	ldp	x12,x13,[x26,#16*6]
293	ldp	x14,x15,[x26,#16*7]
294	ldp	x16,x17,[x26,#16*8]
295	ldp	x25,x19,[x26,#16*9]
296	ldp	x20,x21,[x26,#16*10]
297	ldp	x22,x23,[x26,#16*11]
298	ldr	x24,[x26,#16*12]
299	b	.Loop_absorb
300
301.align	4
302.Loop_absorb:
	// Unsigned test: leave the loop as soon as less than one block remains.
303	subs	x26,x28,x30		// len - bsz
304	blo	.Labsorbed
305
306	str	x26,[sp,#48]			// save len - bsz
	// Lane ladder.  Each step loads one 64-bit word of input (byte-swapped
	// on big-endian builds) and XORs it into the next lane.  One cmp serves
	// two steps: blo right after it leaves when bsz is below the compared
	// value, and beq one lane later leaves when bsz equals it; the ldr, rev
	// and eor in between do not modify the flags.
307	ldr	x26,[x27],#8		// *inp++
308#ifdef	__AARCH64EB__
309	rev	x26,x26
310#endif
311	eor	x0,x0,x26
312	cmp	x30,#8*(0+2)
313	blo	.Lprocess_block
314	ldr	x26,[x27],#8		// *inp++
315#ifdef	__AARCH64EB__
316	rev	x26,x26
317#endif
318	eor	x1,x1,x26
319	beq	.Lprocess_block
320	ldr	x26,[x27],#8		// *inp++
321#ifdef	__AARCH64EB__
322	rev	x26,x26
323#endif
324	eor	x2,x2,x26
325	cmp	x30,#8*(2+2)
326	blo	.Lprocess_block
327	ldr	x26,[x27],#8		// *inp++
328#ifdef	__AARCH64EB__
329	rev	x26,x26
330#endif
331	eor	x3,x3,x26
332	beq	.Lprocess_block
333	ldr	x26,[x27],#8		// *inp++
334#ifdef	__AARCH64EB__
335	rev	x26,x26
336#endif
337	eor	x4,x4,x26
338	cmp	x30,#8*(4+2)
339	blo	.Lprocess_block
340	ldr	x26,[x27],#8		// *inp++
341#ifdef	__AARCH64EB__
342	rev	x26,x26
343#endif
344	eor	x5,x5,x26
345	beq	.Lprocess_block
346	ldr	x26,[x27],#8		// *inp++
347#ifdef	__AARCH64EB__
348	rev	x26,x26
349#endif
350	eor	x6,x6,x26
351	cmp	x30,#8*(6+2)
352	blo	.Lprocess_block
353	ldr	x26,[x27],#8		// *inp++
354#ifdef	__AARCH64EB__
355	rev	x26,x26
356#endif
357	eor	x7,x7,x26
358	beq	.Lprocess_block
359	ldr	x26,[x27],#8		// *inp++
360#ifdef	__AARCH64EB__
361	rev	x26,x26
362#endif
363	eor	x8,x8,x26
364	cmp	x30,#8*(8+2)
365	blo	.Lprocess_block
366	ldr	x26,[x27],#8		// *inp++
367#ifdef	__AARCH64EB__
368	rev	x26,x26
369#endif
370	eor	x9,x9,x26
371	beq	.Lprocess_block
372	ldr	x26,[x27],#8		// *inp++
373#ifdef	__AARCH64EB__
374	rev	x26,x26
375#endif
376	eor	x10,x10,x26
377	cmp	x30,#8*(10+2)
378	blo	.Lprocess_block
379	ldr	x26,[x27],#8		// *inp++
380#ifdef	__AARCH64EB__
381	rev	x26,x26
382#endif
383	eor	x11,x11,x26
384	beq	.Lprocess_block
385	ldr	x26,[x27],#8		// *inp++
386#ifdef	__AARCH64EB__
387	rev	x26,x26
388#endif
389	eor	x12,x12,x26
390	cmp	x30,#8*(12+2)
391	blo	.Lprocess_block
392	ldr	x26,[x27],#8		// *inp++
393#ifdef	__AARCH64EB__
394	rev	x26,x26
395#endif
396	eor	x13,x13,x26
397	beq	.Lprocess_block
398	ldr	x26,[x27],#8		// *inp++
399#ifdef	__AARCH64EB__
400	rev	x26,x26
401#endif
402	eor	x14,x14,x26
403	cmp	x30,#8*(14+2)
404	blo	.Lprocess_block
405	ldr	x26,[x27],#8		// *inp++
406#ifdef	__AARCH64EB__
407	rev	x26,x26
408#endif
409	eor	x15,x15,x26
410	beq	.Lprocess_block
411	ldr	x26,[x27],#8		// *inp++
412#ifdef	__AARCH64EB__
413	rev	x26,x26
414#endif
415	eor	x16,x16,x26
416	cmp	x30,#8*(16+2)
417	blo	.Lprocess_block
418	ldr	x26,[x27],#8		// *inp++
419#ifdef	__AARCH64EB__
420	rev	x26,x26
421#endif
422	eor	x17,x17,x26
423	beq	.Lprocess_block
424	ldr	x26,[x27],#8		// *inp++
425#ifdef	__AARCH64EB__
426	rev	x26,x26
427#endif
428	eor	x25,x25,x26
429	cmp	x30,#8*(18+2)
430	blo	.Lprocess_block
431	ldr	x26,[x27],#8		// *inp++
432#ifdef	__AARCH64EB__
433	rev	x26,x26
434#endif
435	eor	x19,x19,x26
436	beq	.Lprocess_block
437	ldr	x26,[x27],#8		// *inp++
438#ifdef	__AARCH64EB__
439	rev	x26,x26
440#endif
441	eor	x20,x20,x26
442	cmp	x30,#8*(20+2)
443	blo	.Lprocess_block
444	ldr	x26,[x27],#8		// *inp++
445#ifdef	__AARCH64EB__
446	rev	x26,x26
447#endif
448	eor	x21,x21,x26
449	beq	.Lprocess_block
450	ldr	x26,[x27],#8		// *inp++
451#ifdef	__AARCH64EB__
452	rev	x26,x26
453#endif
454	eor	x22,x22,x26
455	cmp	x30,#8*(22+2)
456	blo	.Lprocess_block
457	ldr	x26,[x27],#8		// *inp++
458#ifdef	__AARCH64EB__
459	rev	x26,x26
460#endif
461	eor	x23,x23,x26
462	beq	.Lprocess_block
463	ldr	x26,[x27],#8		// *inp++
464#ifdef	__AARCH64EB__
465	rev	x26,x26
466#endif
467	eor	x24,x24,x26
468
469.Lprocess_block:
470	str	x27,[sp,#40]			// save inp
471
472	bl	KeccakF1600_int
473
	// [sp,#48] now holds the reduced length stored at the top of the loop,
	// [sp,#56] the unchanged bsz.
474	ldr	x27,[sp,#40]			// restore arguments
475	ldp	x28,x30,[sp,#48]
476	b	.Loop_absorb
477
478.align	4
479.Labsorbed:
	// x28 still holds the remaining length; x27 is free to carry the state
	// address because inp is no longer needed.
480	ldr	x27,[sp,#32]
481	stp	x0,x1,[x27,#16*0]
482	stp	x2,x3,[x27,#16*1]
483	stp	x4,x5,[x27,#16*2]
484	stp	x6,x7,[x27,#16*3]
485	stp	x8,x9,[x27,#16*4]
486	stp	x10,x11,[x27,#16*5]
487	stp	x12,x13,[x27,#16*6]
488	stp	x14,x15,[x27,#16*7]
489	stp	x16,x17,[x27,#16*8]
490	stp	x25,x19,[x27,#16*9]
491	stp	x20,x21,[x27,#16*10]
492	stp	x22,x23,[x27,#16*11]
493	str	x24,[x27,#16*12]
494
495	mov	x0,x28			// return value
496	ldp	x19,x20,[x29,#16]
497	add	sp,sp,#64
498	ldp	x21,x22,[x29,#32]
499	ldp	x23,x24,[x29,#48]
500	ldp	x25,x26,[x29,#64]
501	ldp	x27,x28,[x29,#80]
502	ldp	x29,x30,[sp],#128
503	AARCH64_VALIDATE_LINK_REGISTER
504	ret
505.size	SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze: copy output bytes out of the state, running the permutation
// again each time a whole block has been emitted and more output is wanted.
//
// In:    x0 = state (read as consecutive 64-bit words)
//        x1 = output pointer
//        x2 = number of bytes to produce (len)
//        x3 = block size in bytes (bsz)
// Out:   len bytes stored at the output pointer; the state in memory is
//        updated by every permutation that was needed.
// Saved: x19-x22, x29 and x30.  x0-x17 are clobbered whenever KeccakF1600 is
//        called, which is why the arguments are parked in x19-x22.
//
// A request for zero bytes returns immediately without touching the output.
// Without the guard the loop would fall into the byte tail with len == 0, the
// countdown would wrap past zero and seven bytes would be stored.
// This file is generated from keccak1600-armv8.pl (see the header), so the
// guard has to be mirrored in that script to survive regeneration.
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	cbz	x2,.Lsqueeze_done		// len == 0: nothing to emit

	mov	x19,x0			// put aside arguments
	mov	x20,x1
	mov	x21,x2
	mov	x22,x3

	// Loop state: x0 = read cursor inside the state, x20 = write cursor,
	// x21 = bytes still wanted, x3 = bytes left in the current block.
.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	x21,#8
	blo	.Lsqueeze_tail		// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x20],#8
	subs	x21,x21,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	// Block exhausted: permute, then restart from the first word.
	mov	x0,x19
	bl	KeccakF1600
	mov	x0,x19
	mov	x3,x22
	b	.Loop_squeeze

	// Final 1..7 bytes, least significant byte of the word first.  This path
	// is entered before the big-endian byte swap, so x4 is the native value.
.align	4
.Lsqueeze_tail:
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce: file-local round loop working on a state held in vector
// registers.  Runs 24 rounds, counted down in x9, reading one constant per
// round from iotas through x10.
//
// In/out:  v0-v24 hold the 25 lanes, one per register (the callers load and
//          store only the low 64 bits of each).
// Scratch: v25-v31, x9, x10 and the flags are overwritten.
// Note:    v8-v15 are modified, so callers save d8-d15 beforehand.
// Leaf routine: it touches neither the stack nor x30.
//
// The eor3, rax1, xar and bcax operations are emitted as raw .inst words with
// the intended mnemonic in the trailing comment; presumably this lets the
// file build with assemblers that do not know those mnemonics -- TODO
// confirm against the generator script.
576.type	KeccakF1600_ce,%function
577.align	5
578KeccakF1600_ce:
579	mov	x9,#24
580	adr	x10,iotas
581	b	.Loop_ce
582.align	4
583.Loop_ce:
584	////////////////////////////////////////////////// Theta
585.inst	0xce0f2a99	//eor3 v25.16b,v20.16b,v15.16b,v10.16b
586.inst	0xce102eba	//eor3 v26.16b,v21.16b,v16.16b,v11.16b
587.inst	0xce1132db	//eor3 v27.16b,v22.16b,v17.16b,v12.16b
588.inst	0xce1236fc	//eor3 v28.16b,v23.16b,v18.16b,v13.16b
589.inst	0xce133b1d	//eor3 v29.16b,v24.16b,v19.16b,v14.16b
590.inst	0xce050339	//eor3 v25.16b,v25.16b,   v5.16b,v0.16b
591.inst	0xce06075a	//eor3 v26.16b,v26.16b,   v6.16b,v1.16b
592.inst	0xce070b7b	//eor3 v27.16b,v27.16b,   v7.16b,v2.16b
593.inst	0xce080f9c	//eor3 v28.16b,v28.16b,   v8.16b,v3.16b
594.inst	0xce0913bd	//eor3 v29.16b,v29.16b,   v9.16b,v4.16b
595
596.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b			// D[1]
597.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b			// D[2]
598.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b			// D[3]
599.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b			// D[4]
600.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b			// D[0]
601
602	////////////////////////////////////////////////// Theta+Rho+Pi
	// Each xar applies a Theta correction word and the lane rotation in a
	// single step while moving the lane to its new register.  Several D
	// registers are themselves reused as lane holders once their last
	// reader has run, as the trailing comments record.
603.inst	0xce9efc39	//xar v25.16b,   v1.16b,v30.16b,#64-1 // C[0]=A[2][0]
604
605.inst	0xce9e50c1	//xar v1.16b,v6.16b,v30.16b,#64-44
606.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
607.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
608.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
609.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18
610
611.inst	0xce9f085a	//xar v26.16b,   v2.16b,v31.16b,#64-62 // C[1]=A[4][0]
612
613.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
614.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
615.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
616.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
617.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41
618
619.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27
620
621.inst	0xce9ccb1c	//xar v28.16b,   v24.16b,v28.16b,#64-14 // D[4]=A[0][4]
622.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
623.inst	0xce9b2508	//xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1]
624.inst	0xce9e4e04	//xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3]
625.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36
626
627.inst	0xce9b9065	//xar v5.16b,v3.16b,v27.16b,#64-28
628
	// Lane 0 only receives its correction word here; no xar is applied.
629	eor	v0.16b,v0.16b,v29.16b
630
631.inst	0xce9bae5b	//xar v27.16b,   v18.16b,v27.16b,#64-21 // D[3]=A[0][3]
632.inst	0xce9fc623	//xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3]
633.inst	0xce9ed97e	//xar v30.16b,   v11.16b,v30.16b,#64-10 // D[1]=A[3][2]
634.inst	0xce9fe8ff	//xar v31.16b,   v7.16b,v31.16b,#64-6 // D[2]=A[2][1]
635.inst	0xce9df55d	//xar v29.16b,   v10.16b,v29.16b,#64-3 // D[0]=A[1][2]
636
637	////////////////////////////////////////////////// Chi+Iota
638.inst	0xce362354	//bcax v20.16b,v26.16b,   v22.16b,v8.16b	// A[1][3]=A[4][1]
639.inst	0xce375915	//bcax v21.16b,v8.16b,v23.16b,v22.16b	// A[1][3]=A[4][1]
640.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
641.inst	0xce3a62f7	//bcax v23.16b,v23.16b,v26.16b,   v24.16b
642.inst	0xce286b18	//bcax v24.16b,v24.16b,v8.16b,v26.16b	// A[1][3]=A[4][1]
643
	// v26 is free from here on; load the next round constant into both of
	// its 64-bit elements and advance the table pointer.
644	ld1r	{v26.2d},[x10],#8
645
646.inst	0xce330fd1	//bcax v17.16b,v30.16b,   v19.16b,v3.16b	// A[0][3]=A[3][3]
647.inst	0xce2f4c72	//bcax v18.16b,v3.16b,v15.16b,v19.16b	// A[0][3]=A[3][3]
648.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b
649.inst	0xce3e41ef	//bcax v15.16b,v15.16b,v30.16b,   v16.16b
650.inst	0xce237a10	//bcax v16.16b,v16.16b,v3.16b,v30.16b	// A[0][3]=A[3][3]
651
652.inst	0xce2c7f2a	//bcax v10.16b,v25.16b,   v12.16b,v31.16b
653.inst	0xce2d33eb	//bcax v11.16b,v31.16b,   v13.16b,v12.16b
654.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
655.inst	0xce3939ad	//bcax v13.16b,v13.16b,v25.16b,   v14.16b
656.inst	0xce3f65ce	//bcax v14.16b,v14.16b,v31.16b,   v25.16b
657
658.inst	0xce2913a7	//bcax v7.16b,v29.16b,   v9.16b,v4.16b	// A[0][4]=A[1][3]
659.inst	0xce252488	//bcax v8.16b,v4.16b,v5.16b,v9.16b	// A[0][4]=A[1][3]
660.inst	0xce261529	//bcax v9.16b,v9.16b,v6.16b,v5.16b
661.inst	0xce3d18a5	//bcax v5.16b,v5.16b,v29.16b,   v6.16b
662.inst	0xce2474c6	//bcax v6.16b,v6.16b,v4.16b,v29.16b	// A[0][4]=A[1][3]
663
664.inst	0xce207363	//bcax v3.16b,v27.16b,   v0.16b,v28.16b
665.inst	0xce210384	//bcax v4.16b,v28.16b,   v1.16b,v0.16b
666.inst	0xce220400	//bcax v0.16b,v0.16b,v2.16b,v1.16b
667.inst	0xce3b0821	//bcax v1.16b,v1.16b,v27.16b,   v2.16b
668.inst	0xce3c6c42	//bcax v2.16b,v2.16b,v28.16b,   v27.16b
669
	// Iota: fold the round constant into lane 0.
670	eor	v0.16b,v0.16b,v26.16b
671
672	subs	x9,x9,#1
673	bne	.Loop_ce
674
675	ret
676.size	KeccakF1600_ce,.-KeccakF1600_ce
677
// KeccakF1600_cext: file-local wrapper that applies KeccakF1600_ce to a state
// stored in memory.
//
// In:      x0 = address of the state, read and written as 25 consecutive
//          64-bit words.
// Out:     the state in memory is replaced by the permuted state.
// Saved:   x29, x30 and d8-d15 (80-byte frame).
// Clobber: x9 and x10 (through KeccakF1600_ce) and every vector register
//          other than the restored d8-d15.  x0 is not written, which lets
//          it serve as the base for both the loads and the stores.
678.type	KeccakF1600_cext,%function
679.align	5
680KeccakF1600_cext:
681	AARCH64_SIGN_LINK_REGISTER
682	stp	x29,x30,[sp,#-80]!
683	add	x29,sp,#0
684	stp	d8,d9,[sp,#16]		// per ABI requirement
685	stp	d10,d11,[sp,#32]
686	stp	d12,d13,[sp,#48]
687	stp	d14,d15,[sp,#64]
688	ldp	d0,d1,[x0,#8*0]
689	ldp	d2,d3,[x0,#8*2]
690	ldp	d4,d5,[x0,#8*4]
691	ldp	d6,d7,[x0,#8*6]
692	ldp	d8,d9,[x0,#8*8]
693	ldp	d10,d11,[x0,#8*10]
694	ldp	d12,d13,[x0,#8*12]
695	ldp	d14,d15,[x0,#8*14]
696	ldp	d16,d17,[x0,#8*16]
697	ldp	d18,d19,[x0,#8*18]
698	ldp	d20,d21,[x0,#8*20]
699	ldp	d22,d23,[x0,#8*22]
700	ldr	d24,[x0,#8*24]
701	bl	KeccakF1600_ce
	// The call overwrote x30; fetch the return address saved in the frame.
702	ldr	x30,[sp,#8]
703	stp	d0,d1,[x0,#8*0]
704	stp	d2,d3,[x0,#8*2]
705	stp	d4,d5,[x0,#8*4]
706	stp	d6,d7,[x0,#8*6]
707	stp	d8,d9,[x0,#8*8]
708	stp	d10,d11,[x0,#8*10]
709	stp	d12,d13,[x0,#8*12]
710	stp	d14,d15,[x0,#8*14]
711	stp	d16,d17,[x0,#8*16]
712	stp	d18,d19,[x0,#8*18]
713	stp	d20,d21,[x0,#8*20]
714	stp	d22,d23,[x0,#8*22]
715	str	d24,[x0,#8*24]
716
717	ldp	d8,d9,[sp,#16]
718	ldp	d10,d11,[sp,#32]
719	ldp	d12,d13,[sp,#48]
720	ldp	d14,d15,[sp,#64]
	// Only x29 is popped here because x30 was already reloaded above.
721	ldr	x29,[sp],#80
722	AARCH64_VALIDATE_LINK_REGISTER
723	ret
724.size	KeccakF1600_cext,.-KeccakF1600_cext
// SHA3_absorb_cext: vector-register counterpart of SHA3_absorb.  XORs whole
// blocks of input into the state and calls KeccakF1600_ce after each block.
//
// In:    x0 = state (25 64-bit words), x1 = input pointer, x2 = len,
//        x3 = bsz (block size in bytes).
// Out:   x0 = number of trailing input bytes that were not absorbed.  The
//        updated state is stored back through x0 before x0 is overwritten.
// Saved: x29, x30 and d8-d15 (80-byte frame).
// Live across the permutation calls: x0-x3 stay in place because
//        KeccakF1600_ce only writes x9, x10 and vector registers.
// NOTE(review): as in SHA3_absorb, the lane ladder assumes bsz is a non-zero
//        multiple of 8 not larger than 200 -- confirm against callers.
725.globl	SHA3_absorb_cext
726.type	SHA3_absorb_cext,%function
727.align	5
728SHA3_absorb_cext:
729	AARCH64_SIGN_LINK_REGISTER
730	stp	x29,x30,[sp,#-80]!
731	add	x29,sp,#0
732	stp	d8,d9,[sp,#16]		// per ABI requirement
733	stp	d10,d11,[sp,#32]
734	stp	d12,d13,[sp,#48]
735	stp	d14,d15,[sp,#64]
736	ldp	d0,d1,[x0,#8*0]
737	ldp	d2,d3,[x0,#8*2]
738	ldp	d4,d5,[x0,#8*4]
739	ldp	d6,d7,[x0,#8*6]
740	ldp	d8,d9,[x0,#8*8]
741	ldp	d10,d11,[x0,#8*10]
742	ldp	d12,d13,[x0,#8*12]
743	ldp	d14,d15,[x0,#8*14]
744	ldp	d16,d17,[x0,#8*16]
745	ldp	d18,d19,[x0,#8*18]
746	ldp	d20,d21,[x0,#8*20]
747	ldp	d22,d23,[x0,#8*22]
748	ldr	d24,[x0,#8*24]
749	b	.Loop_absorb_ce
750
751.align	4
752.Loop_absorb_ce:
	// x2 is reduced in place; when the subtraction borrows, the exit path
	// adds bsz back to recover the remaining length.
753	subs	x2,x2,x3		// len - bsz
754	blo	.Labsorbed_ce
	// Lane ladder.  Each step loads one 64-bit word of input into v31
	// (byte-swapped on big-endian builds) and XORs it into the next lane.
	// One cmp serves two steps: blo right after it leaves when bsz is below
	// the compared value, and beq one lane later leaves when bsz equals it;
	// the vector instructions in between do not modify the flags.
755	ldr	d31,[x1],#8		// *inp++
756#ifdef	__AARCH64EB__
757	rev64	v31.16b,v31.16b
758#endif
759	eor	v0.16b,v0.16b,v31.16b
760	cmp	x3,#8*(0+2)
761	blo	.Lprocess_block_ce
762	ldr	d31,[x1],#8		// *inp++
763#ifdef	__AARCH64EB__
764	rev64	v31.16b,v31.16b
765#endif
766	eor	v1.16b,v1.16b,v31.16b
767	beq	.Lprocess_block_ce
768	ldr	d31,[x1],#8		// *inp++
769#ifdef	__AARCH64EB__
770	rev64	v31.16b,v31.16b
771#endif
772	eor	v2.16b,v2.16b,v31.16b
773	cmp	x3,#8*(2+2)
774	blo	.Lprocess_block_ce
775	ldr	d31,[x1],#8		// *inp++
776#ifdef	__AARCH64EB__
777	rev64	v31.16b,v31.16b
778#endif
779	eor	v3.16b,v3.16b,v31.16b
780	beq	.Lprocess_block_ce
781	ldr	d31,[x1],#8		// *inp++
782#ifdef	__AARCH64EB__
783	rev64	v31.16b,v31.16b
784#endif
785	eor	v4.16b,v4.16b,v31.16b
786	cmp	x3,#8*(4+2)
787	blo	.Lprocess_block_ce
788	ldr	d31,[x1],#8		// *inp++
789#ifdef	__AARCH64EB__
790	rev64	v31.16b,v31.16b
791#endif
792	eor	v5.16b,v5.16b,v31.16b
793	beq	.Lprocess_block_ce
794	ldr	d31,[x1],#8		// *inp++
795#ifdef	__AARCH64EB__
796	rev64	v31.16b,v31.16b
797#endif
798	eor	v6.16b,v6.16b,v31.16b
799	cmp	x3,#8*(6+2)
800	blo	.Lprocess_block_ce
801	ldr	d31,[x1],#8		// *inp++
802#ifdef	__AARCH64EB__
803	rev64	v31.16b,v31.16b
804#endif
805	eor	v7.16b,v7.16b,v31.16b
806	beq	.Lprocess_block_ce
807	ldr	d31,[x1],#8		// *inp++
808#ifdef	__AARCH64EB__
809	rev64	v31.16b,v31.16b
810#endif
811	eor	v8.16b,v8.16b,v31.16b
812	cmp	x3,#8*(8+2)
813	blo	.Lprocess_block_ce
814	ldr	d31,[x1],#8		// *inp++
815#ifdef	__AARCH64EB__
816	rev64	v31.16b,v31.16b
817#endif
818	eor	v9.16b,v9.16b,v31.16b
819	beq	.Lprocess_block_ce
820	ldr	d31,[x1],#8		// *inp++
821#ifdef	__AARCH64EB__
822	rev64	v31.16b,v31.16b
823#endif
824	eor	v10.16b,v10.16b,v31.16b
825	cmp	x3,#8*(10+2)
826	blo	.Lprocess_block_ce
827	ldr	d31,[x1],#8		// *inp++
828#ifdef	__AARCH64EB__
829	rev64	v31.16b,v31.16b
830#endif
831	eor	v11.16b,v11.16b,v31.16b
832	beq	.Lprocess_block_ce
833	ldr	d31,[x1],#8		// *inp++
834#ifdef	__AARCH64EB__
835	rev64	v31.16b,v31.16b
836#endif
837	eor	v12.16b,v12.16b,v31.16b
838	cmp	x3,#8*(12+2)
839	blo	.Lprocess_block_ce
840	ldr	d31,[x1],#8		// *inp++
841#ifdef	__AARCH64EB__
842	rev64	v31.16b,v31.16b
843#endif
844	eor	v13.16b,v13.16b,v31.16b
845	beq	.Lprocess_block_ce
846	ldr	d31,[x1],#8		// *inp++
847#ifdef	__AARCH64EB__
848	rev64	v31.16b,v31.16b
849#endif
850	eor	v14.16b,v14.16b,v31.16b
851	cmp	x3,#8*(14+2)
852	blo	.Lprocess_block_ce
853	ldr	d31,[x1],#8		// *inp++
854#ifdef	__AARCH64EB__
855	rev64	v31.16b,v31.16b
856#endif
857	eor	v15.16b,v15.16b,v31.16b
858	beq	.Lprocess_block_ce
859	ldr	d31,[x1],#8		// *inp++
860#ifdef	__AARCH64EB__
861	rev64	v31.16b,v31.16b
862#endif
863	eor	v16.16b,v16.16b,v31.16b
864	cmp	x3,#8*(16+2)
865	blo	.Lprocess_block_ce
866	ldr	d31,[x1],#8		// *inp++
867#ifdef	__AARCH64EB__
868	rev64	v31.16b,v31.16b
869#endif
870	eor	v17.16b,v17.16b,v31.16b
871	beq	.Lprocess_block_ce
872	ldr	d31,[x1],#8		// *inp++
873#ifdef	__AARCH64EB__
874	rev64	v31.16b,v31.16b
875#endif
876	eor	v18.16b,v18.16b,v31.16b
877	cmp	x3,#8*(18+2)
878	blo	.Lprocess_block_ce
879	ldr	d31,[x1],#8		// *inp++
880#ifdef	__AARCH64EB__
881	rev64	v31.16b,v31.16b
882#endif
883	eor	v19.16b,v19.16b,v31.16b
884	beq	.Lprocess_block_ce
885	ldr	d31,[x1],#8		// *inp++
886#ifdef	__AARCH64EB__
887	rev64	v31.16b,v31.16b
888#endif
889	eor	v20.16b,v20.16b,v31.16b
890	cmp	x3,#8*(20+2)
891	blo	.Lprocess_block_ce
892	ldr	d31,[x1],#8		// *inp++
893#ifdef	__AARCH64EB__
894	rev64	v31.16b,v31.16b
895#endif
896	eor	v21.16b,v21.16b,v31.16b
897	beq	.Lprocess_block_ce
898	ldr	d31,[x1],#8		// *inp++
899#ifdef	__AARCH64EB__
900	rev64	v31.16b,v31.16b
901#endif
902	eor	v22.16b,v22.16b,v31.16b
903	cmp	x3,#8*(22+2)
904	blo	.Lprocess_block_ce
905	ldr	d31,[x1],#8		// *inp++
906#ifdef	__AARCH64EB__
907	rev64	v31.16b,v31.16b
908#endif
909	eor	v23.16b,v23.16b,v31.16b
910	beq	.Lprocess_block_ce
911	ldr	d31,[x1],#8		// *inp++
912#ifdef	__AARCH64EB__
913	rev64	v31.16b,v31.16b
914#endif
915	eor	v24.16b,v24.16b,v31.16b
916
917.Lprocess_block_ce:
918
	// This call overwrites x30; the return address is recovered from the
	// frame by the final ldp of the epilogue.
919	bl	KeccakF1600_ce
920
921	b	.Loop_absorb_ce
922
923.align	4
924.Labsorbed_ce:
925	stp	d0,d1,[x0,#8*0]
926	stp	d2,d3,[x0,#8*2]
927	stp	d4,d5,[x0,#8*4]
928	stp	d6,d7,[x0,#8*6]
929	stp	d8,d9,[x0,#8*8]
930	stp	d10,d11,[x0,#8*10]
931	stp	d12,d13,[x0,#8*12]
932	stp	d14,d15,[x0,#8*14]
933	stp	d16,d17,[x0,#8*16]
934	stp	d18,d19,[x0,#8*18]
935	stp	d20,d21,[x0,#8*20]
936	stp	d22,d23,[x0,#8*22]
937	str	d24,[x0,#8*24]
	// Undo the subtraction that borrowed at the top of the loop.
938	add	x0,x2,x3		// return value
939
940	ldp	d8,d9,[sp,#16]
941	ldp	d10,d11,[sp,#32]
942	ldp	d12,d13,[sp,#48]
943	ldp	d14,d15,[sp,#64]
944	ldp	x29,x30,[sp],#80
945	AARCH64_VALIDATE_LINK_REGISTER
946	ret
947.size	SHA3_absorb_cext,.-SHA3_absorb_cext
// SHA3_squeeze_cext: counterpart of SHA3_squeeze that uses KeccakF1600_cext
// for the permutation.  Copies output bytes out of the state and permutes
// again each time a whole block has been emitted and more output is wanted.
//
// In:    x0 = state (read as consecutive 64-bit words)
//        x1 = output pointer
//        x2 = number of bytes to produce (len)
//        x3 = block size in bytes (bsz)
// Out:   len bytes stored at the output pointer; the state in memory is
//        updated by every permutation that was needed.
// Saved: x29 and x30 (16-byte frame).  x0-x3 stay live across the call
//        because KeccakF1600_cext and KeccakF1600_ce do not write them; the
//        cursors in x9 and x10 are rebuilt after every call instead.
//
// A request for zero bytes returns immediately without touching the output.
// Without the guard the loop would fall into the byte tail with len == 0, the
// countdown would wrap past zero and seven bytes would be stored.
// This file is generated from keccak1600-armv8.pl (see the header), so the
// guard has to be mirrored in that script to survive regeneration.
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cbz	x2,.Lsqueeze_done_ce		// len == 0: nothing to emit

	mov	x9,x0			// read cursor inside the state
	mov	x10,x3			// bytes left in the current block

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	x2,#8
	blo	.Lsqueeze_tail_ce	// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x1],#8
	beq	.Lsqueeze_done_ce	// flags still from cmp: len was exactly 8

	sub	x2,x2,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	// Block exhausted: permute, then restart from the first word.  The call
	// overwrites x30, so the return address is reloaded from the frame.
	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,x0
	mov	x10,x3
	b	.Loop_squeeze_ce

	// Final 1..7 bytes, least significant byte of the word first.  This path
	// is entered before the big-endian byte swap, so x4 is the native value.
.align	4
.Lsqueeze_tail_ce:
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1

	// Only x29 is popped: x30 is either untouched or was reloaded after the
	// last permutation call.
.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII identification string.  The bytes spell out
// Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by
// followed by the author e-mail address enclosed in angle brackets.
1011.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1012.align	2
1013