xref: /freebsd/sys/crypto/openssl/aarch64/aesv8-armx.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=7
5.arch	armv8-a+crypto
6.text
// Constant table used by the key-expansion code in this file.  It is placed
// in .rodata; the .previous directive afterwards switches back to the
// section that was active before.
7.section	.rodata
8.align	5
9.Lrcon:
// Row 0: 0x01 in every 32-bit lane.  The key setup loads it into v1, XORs it
// into the schedule and shifts it left by one after each use.
10.long	0x01,0x01,0x01,0x01
// Row 1: byte-index pattern loaded into v2 and used as the tbl mask.
11.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
// Row 2: 0x1b in every 32-bit lane.  The 128-bit path reloads v1 from here
// for its last two schedule steps.
12.long	0x1b,0x1b,0x1b,0x1b
13.previous
// aes_v8_set_encrypt_key: expand a user key into an AES encryption key
// schedule.
//
// Inputs as used by the code:
//   x0  pointer to the user key bytes
//   w1  key size in bits (128, 192 or 256)
//   x2  pointer to the output key schedule
// Result in x0:
//    0  success
//   -1  x0 or x2 was zero
//   -2  w1 was below 128, above 256, or not a multiple of 64
// On success the round keys are written starting at the original x2 and the
// round count (10, 12 or 14) is stored as a 32-bit word at offset 240.
// The routine also leaves x2 = original x2 + 240 and w12 = round count;
// aes_v8_set_decrypt_key enters at .Lenc_key and relies on both.
// Other registers written: x3, w1, v0-v6 and the condition flags.
14.globl	aes_v8_set_encrypt_key
15.type	aes_v8_set_encrypt_key,%function
16.align	5
17aes_v8_set_encrypt_key:
18.Lenc_key:
19	AARCH64_VALID_CALL_TARGET
20	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
21	stp	x29,x30,[sp,#-16]!
22	add	x29,sp,#0
	// Argument checks.  x3 holds the error code that .Lenc_key_abort
	// copies into x0.
23	mov	x3,#-1
24	cmp	x0,#0
25	b.eq	.Lenc_key_abort
26	cmp	x2,#0
27	b.eq	.Lenc_key_abort
28	mov	x3,#-2
29	cmp	w1,#128
30	b.lt	.Lenc_key_abort
31	cmp	w1,#256
32	b.gt	.Lenc_key_abort
33	tst	w1,#0x3f
34	b.ne	.Lenc_key_abort
35
	// x3 = address of .Lrcon.  The flags from the compare against 192
	// are consumed by the three branches further down; none of the
	// instructions in between writes the flags.
36	adrp	x3,.Lrcon
37	add	x3,x3,#:lo12:.Lrcon
38	cmp	w1,#192
39
	// v0 = 0 (stays zero for the rest of the routine), v3 = first 16 key
	// bytes, v1 = .Lrcon row 0, v2 = .Lrcon row 1 (tbl mask).  x3 is left
	// pointing at row 2.
40	eor	v0.16b,v0.16b,v0.16b
41	ld1	{v3.16b},[x0],#16
42	mov	w1,#8		// reuse w1
43	ld1	{v1.4s,v2.4s},[x3],#32
44
	// Below 192 bits -> 128-bit path, exactly 192 -> .L192, else .L256.
45	b.lt	.Loop128
46	b.eq	.L192
47	b	.L256
48
49.align	4
	// 128-bit key: eight iterations (w1 = 8).  Each iteration stores the
	// current round key v3 and derives the next one:
	//   tbl  - pick bytes of v3 according to the mask in v2
	//   aese - applied with the all-zero key in v0
	//   ext  - v5 = previous value moved up by one 32-bit word, zero filled
	// The subs result is consumed by the b.ne at the bottom of the loop.
50.Loop128:
51	tbl	v6.16b,{v3.16b},v2.16b
52	ext	v5.16b,v0.16b,v3.16b,#12
53	st1	{v3.4s},[x2],#16
54	aese	v6.16b,v0.16b
55	subs	w1,w1,#1
56
57	eor	v3.16b,v3.16b,v5.16b
58	ext	v5.16b,v0.16b,v5.16b,#12
59	eor	v3.16b,v3.16b,v5.16b
60	ext	v5.16b,v0.16b,v5.16b,#12
61	eor	v6.16b,v6.16b,v1.16b
62	eor	v3.16b,v3.16b,v5.16b
63	shl	v1.16b,v1.16b,#1
64	eor	v3.16b,v3.16b,v6.16b
65	b.ne	.Loop128
66
	// v1 has been shifted left eight times by now; fetch the next
	// constant from .Lrcon row 2 (0x1b) for the two remaining steps.
67	ld1	{v1.4s},[x3]
68
69	tbl	v6.16b,{v3.16b},v2.16b
70	ext	v5.16b,v0.16b,v3.16b,#12
71	st1	{v3.4s},[x2],#16
72	aese	v6.16b,v0.16b
73
74	eor	v3.16b,v3.16b,v5.16b
75	ext	v5.16b,v0.16b,v5.16b,#12
76	eor	v3.16b,v3.16b,v5.16b
77	ext	v5.16b,v0.16b,v5.16b,#12
78	eor	v6.16b,v6.16b,v1.16b
79	eor	v3.16b,v3.16b,v5.16b
80	shl	v1.16b,v1.16b,#1
81	eor	v3.16b,v3.16b,v6.16b
82
83	tbl	v6.16b,{v3.16b},v2.16b
84	ext	v5.16b,v0.16b,v3.16b,#12
85	st1	{v3.4s},[x2],#16
86	aese	v6.16b,v0.16b
87
88	eor	v3.16b,v3.16b,v5.16b
89	ext	v5.16b,v0.16b,v5.16b,#12
90	eor	v3.16b,v3.16b,v5.16b
91	ext	v5.16b,v0.16b,v5.16b,#12
92	eor	v6.16b,v6.16b,v1.16b
93	eor	v3.16b,v3.16b,v5.16b
94	eor	v3.16b,v3.16b,v6.16b
	// Last round key is stored without post-increment (x2 = start + 160
	// here); adding 0x50 brings x2 to start + 240 for the store at .Ldone.
95	st1	{v3.4s},[x2]
96	add	x2,x2,#0x50
97
98	mov	w12,#10
99	b	.Ldone
100
101.align	4
	// 192-bit key: load the remaining 8 key bytes into the low half of v4
	// and subtract 8 from every byte of the tbl mask.
102.L192:
103	ld1	{v4.8b},[x0],#8
104	movi	v6.16b,#8			// borrow v6.16b
105	st1	{v3.4s},[x2],#16
106	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
107
	// Eight iterations (w1 = 8); each one stores 8 bytes from v4 and then
	// 16 bytes from v3, i.e. 24 bytes of schedule per iteration.
108.Loop192:
109	tbl	v6.16b,{v4.16b},v2.16b
110	ext	v5.16b,v0.16b,v3.16b,#12
	// Big-endian builds store all of v4 and then step x2 back by 8, so
	// x2 advances by 8 in both variants.
111#ifdef __AARCH64EB__
112	st1	{v4.4s},[x2],#16
113	sub	x2,x2,#8
114#else
115	st1	{v4.8b},[x2],#8
116#endif
117	aese	v6.16b,v0.16b
118	subs	w1,w1,#1
119
120	eor	v3.16b,v3.16b,v5.16b
121	ext	v5.16b,v0.16b,v5.16b,#12
122	eor	v3.16b,v3.16b,v5.16b
123	ext	v5.16b,v0.16b,v5.16b,#12
124	eor	v3.16b,v3.16b,v5.16b
125
126	dup	v5.4s,v3.s[3]
127	eor	v5.16b,v5.16b,v4.16b
128	eor	v6.16b,v6.16b,v1.16b
129	ext	v4.16b,v0.16b,v4.16b,#12
130	shl	v1.16b,v1.16b,#1
131	eor	v4.16b,v4.16b,v5.16b
132	eor	v3.16b,v3.16b,v6.16b
133	eor	v4.16b,v4.16b,v6.16b
134	st1	{v3.4s},[x2],#16
135	b.ne	.Loop192
136
	// x2 = start + 208 after the loop; +0x20 brings it to start + 240.
137	mov	w12,#12
138	add	x2,x2,#0x20
139	b	.Ldone
140
141.align	4
	// 256-bit key: v4 = second 16 key bytes.  The loop body is entered
	// seven times (w1 = 7); it leaves through the b.eq in the middle once
	// w1 reaches zero.
142.L256:
143	ld1	{v4.16b},[x0]
144	mov	w1,#7
145	mov	w12,#14
146	st1	{v3.4s},[x2],#16
147
148.Loop256:
149	tbl	v6.16b,{v4.16b},v2.16b
150	ext	v5.16b,v0.16b,v3.16b,#12
151	st1	{v4.4s},[x2],#16
152	aese	v6.16b,v0.16b
153	subs	w1,w1,#1
154
155	eor	v3.16b,v3.16b,v5.16b
156	ext	v5.16b,v0.16b,v5.16b,#12
157	eor	v3.16b,v3.16b,v5.16b
158	ext	v5.16b,v0.16b,v5.16b,#12
159	eor	v6.16b,v6.16b,v1.16b
160	eor	v3.16b,v3.16b,v5.16b
161	shl	v1.16b,v1.16b,#1
162	eor	v3.16b,v3.16b,v6.16b
163	st1	{v3.4s},[x2],#16
164	b.eq	.Ldone
165
	// Second half of the step: update v4 from the new v3.  No tbl and no
	// round constant here, only a splat of the top word and aese.
166	dup	v6.4s,v3.s[3]		// just splat
167	ext	v5.16b,v0.16b,v4.16b,#12
168	aese	v6.16b,v0.16b
169
170	eor	v4.16b,v4.16b,v5.16b
171	ext	v5.16b,v0.16b,v5.16b,#12
172	eor	v4.16b,v4.16b,v5.16b
173	ext	v5.16b,v0.16b,v5.16b,#12
174	eor	v4.16b,v4.16b,v5.16b
175
176	eor	v4.16b,v4.16b,v6.16b
177	b	.Loop256
178
	// Common success exit: x2 is start + 240 on every path; store the
	// round count there and select return value 0.
179.Ldone:
180	str	w12,[x2]
181	mov	x3,#0
182
	// Shared exit.  Only x29 is reloaded; x30 is not modified anywhere in
	// this routine, so the ret uses the caller's value.
183.Lenc_key_abort:
184	mov	x0,x3			// return value
185	ldr	x29,[sp],#16
186	ret
187.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
188
// aes_v8_set_decrypt_key: build a decryption key schedule.
//
// x0, w1 and x2 are passed unchanged to .Lenc_key (the body of
// aes_v8_set_encrypt_key), so they have the same meaning as there.  If that
// call returns non-zero, the value is returned to the caller as is.
// Otherwise the schedule is reversed in place: the first and last round keys
// are swapped as they are, every other round key is swapped with its mirror
// position and passed through aesimc, and 0 is returned in x0.
// Registers written here in addition to those written by .Lenc_key:
// x0, x2, x4, v0, v1 and the condition flags.
// NOTE(review): AARCH64_SIGN_LINK_REGISTER / AARCH64_VALIDATE_LINK_REGISTER
// are macros defined outside this file (presumably arm_arch.h) - confirm.
189.globl	aes_v8_set_decrypt_key
190.type	aes_v8_set_decrypt_key,%function
191.align	5
192aes_v8_set_decrypt_key:
193	AARCH64_SIGN_LINK_REGISTER
194	stp	x29,x30,[sp,#-16]!
195	add	x29,sp,#0
196	bl	.Lenc_key
197
198	cmp	x0,#0
199	b.ne	.Ldec_key_abort
200
	// .Lenc_key returns with x2 = schedule + 240 and x12 = round count.
	// x2 -> first round key, x0 -> last round key, x4 = -16 is the
	// post-index step that walks x0 downwards.
201	sub	x2,x2,#240		// restore original x2
202	mov	x4,#-16
203	add	x0,x2,x12,lsl#4	// end of key schedule
204
	// Swap the first and last round keys without transforming them.
205	ld1	{v0.4s},[x2]
206	ld1	{v1.4s},[x0]
207	st1	{v0.4s},[x0],x4
208	st1	{v1.4s},[x2],#16
209
	// Swap the remaining pairs, applying aesimc to both, while the upper
	// pointer is still above the lower one (unsigned compare).
210.Loop_imc:
211	ld1	{v0.4s},[x2]
212	ld1	{v1.4s},[x0]
213	aesimc	v0.16b,v0.16b
214	aesimc	v1.16b,v1.16b
215	st1	{v0.4s},[x0],x4
216	st1	{v1.4s},[x2],#16
217	cmp	x0,x2
218	b.hi	.Loop_imc
219
	// Middle round key: for the round counts produced by .Lenc_key
	// (10, 12, 14) x0 and x2 are equal here, so this transforms it in place.
220	ld1	{v0.4s},[x2]
221	aesimc	v0.16b,v0.16b
222	st1	{v0.4s},[x0]
223
224	eor	x0,x0,x0		// return value
225.Ldec_key_abort:
226	ldp	x29,x30,[sp],#16
227	AARCH64_VALIDATE_LINK_REGISTER
228	ret
229.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
// aes_v8_encrypt: encrypt one 16-byte block.
//
// Inputs as used by the code:
//   x0  pointer to the 16-byte input block
//   x1  pointer to the 16-byte output block
//   x2  key schedule; the round count is read from offset 240
// Leaf routine with no stack frame.  Registers written: x2, w3, v0-v2 and
// the condition flags.
230.globl	aes_v8_encrypt
231.type	aes_v8_encrypt,%function
232.align	5
233aes_v8_encrypt:
234	AARCH64_VALID_CALL_TARGET
	// w3 = round count - 2; v0/v1 = first two round keys; v2 = the block.
235	ldr	w3,[x2,#240]
236	ld1	{v0.4s},[x2],#16
237	ld1	{v2.16b},[x0]
238	sub	w3,w3,#2
239	ld1	{v1.4s},[x2],#16
240
	// Two rounds per iteration; the next two round keys are loaded while
	// the current ones are being used.
241.Loop_enc:
242	aese	v2.16b,v0.16b
243	aesmc	v2.16b,v2.16b
244	ld1	{v0.4s},[x2],#16
245	subs	w3,w3,#2
246	aese	v2.16b,v1.16b
247	aesmc	v2.16b,v2.16b
248	ld1	{v1.4s},[x2],#16
249	b.gt	.Loop_enc
250
	// Last two rounds: the final aese is not followed by aesmc, and the
	// last round key (loaded into v0) is XORed in.
251	aese	v2.16b,v0.16b
252	aesmc	v2.16b,v2.16b
253	ld1	{v0.4s},[x2]
254	aese	v2.16b,v1.16b
255	eor	v2.16b,v2.16b,v0.16b
256
257	st1	{v2.16b},[x1]
258	ret
259.size	aes_v8_encrypt,.-aes_v8_encrypt
// aes_v8_decrypt: decrypt one 16-byte block.
//
// Inputs as used by the code:
//   x0  pointer to the 16-byte input block
//   x1  pointer to the 16-byte output block
//   x2  key schedule; the round count is read from offset 240
// Same structure as aes_v8_encrypt, with aesd/aesimc in place of aese/aesmc.
// Leaf routine with no stack frame.  Registers written: x2, w3, v0-v2 and
// the condition flags.
260.globl	aes_v8_decrypt
261.type	aes_v8_decrypt,%function
262.align	5
263aes_v8_decrypt:
264	AARCH64_VALID_CALL_TARGET
	// w3 = round count - 2; v0/v1 = first two round keys; v2 = the block.
265	ldr	w3,[x2,#240]
266	ld1	{v0.4s},[x2],#16
267	ld1	{v2.16b},[x0]
268	sub	w3,w3,#2
269	ld1	{v1.4s},[x2],#16
270
	// Two rounds per iteration; the next two round keys are loaded while
	// the current ones are being used.
271.Loop_dec:
272	aesd	v2.16b,v0.16b
273	aesimc	v2.16b,v2.16b
274	ld1	{v0.4s},[x2],#16
275	subs	w3,w3,#2
276	aesd	v2.16b,v1.16b
277	aesimc	v2.16b,v2.16b
278	ld1	{v1.4s},[x2],#16
279	b.gt	.Loop_dec
280
	// Last two rounds: the final aesd is not followed by aesimc, and the
	// last round key (loaded into v0) is XORed in.
281	aesd	v2.16b,v0.16b
282	aesimc	v2.16b,v2.16b
283	ld1	{v0.4s},[x2]
284	aesd	v2.16b,v1.16b
285	eor	v2.16b,v2.16b,v0.16b
286
287	st1	{v2.16b},[x1]
288	ret
289.size	aes_v8_decrypt,.-aes_v8_decrypt
// aes_v8_ecb_encrypt: process a buffer with the AES instructions, each
// 16-byte block independently of the others.
//
// Inputs as used by the code:
//   x0  source pointer
//   x1  destination pointer
//   x2  length in bytes
//   x3  key schedule; the round count is read from offset 240
//   w4  direction: non-zero selects the aese/aesmc (encrypt) paths,
//       zero selects the aesd/aesimc (decrypt) paths
// A length of exactly 16 takes a straight-line single-block path with no
// stack frame.  Any other length builds a frame: lengths below 16 return
// without touching the buffers, otherwise the length is rounded down to a
// multiple of 16 and blocks are processed five, three, two or one at a time.
// Registers written: x0-x3, x5-x8, v0-v7, v16-v31 and the condition flags.
290.globl	aes_v8_ecb_encrypt
291.type	aes_v8_ecb_encrypt,%function
292.align	5
293aes_v8_ecb_encrypt:
294	AARCH64_VALID_CALL_TARGET
295	subs	x2,x2,#16
296	// Length is not exactly 16: take the general path, which returns at once if the length is below 16.
297	b.ne	.Lecb_big_size
	// Single-block path.  v0 = the block, w5 = round count, v5/v6 = first
	// two round keys.  The flags from the cmp select the direction below.
298	ld1	{v0.16b},[x0]
299	cmp	w4,#0					// en- or decrypting?
300	ldr	w5,[x3,#240]
301	ld1	{v5.4s,v6.4s},[x3],#32			// load key schedule...
302
303	b.eq	.Lecb_small_dec
304	aese	v0.16b,v5.16b
305	aesmc	v0.16b,v0.16b
306	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
307	aese	v0.16b,v6.16b
308	aesmc	v0.16b,v0.16b
309	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb processing
310	b.eq	.Lecb_128_enc
	// Extra rounds for schedules with more than 10 rounds, two per
	// iteration.
311.Lecb_round_loop:
312	aese	v0.16b,v16.16b
313	aesmc	v0.16b,v0.16b
314	ld1	{v16.4s},[x3],#16				// load key schedule...
315	aese	v0.16b,v17.16b
316	aesmc	v0.16b,v0.16b
317	ld1	{v17.4s},[x3],#16				// load key schedule...
318	subs	w5,w5,#2			// bias
319	b.gt	.Lecb_round_loop
	// Final eight rounds (the last aese has no aesmc) followed by the XOR
	// with the last round key in v7.
320.Lecb_128_enc:
321	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
322	aese	v0.16b,v16.16b
323	aesmc	v0.16b,v0.16b
324	aese	v0.16b,v17.16b
325	aesmc	v0.16b,v0.16b
326	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
327	aese	v0.16b,v18.16b
328	aesmc	v0.16b,v0.16b
329	aese	v0.16b,v19.16b
330	aesmc	v0.16b,v0.16b
331	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
332	aese	v0.16b,v20.16b
333	aesmc	v0.16b,v0.16b
334	aese	v0.16b,v21.16b
335	aesmc	v0.16b,v0.16b
336	ld1	{v7.4s},[x3]
337	aese	v0.16b,v22.16b
338	aesmc	v0.16b,v0.16b
339	aese	v0.16b,v23.16b
340	eor	v0.16b,v0.16b,v7.16b
341	st1	{v0.16b},[x1]
342	b	.Lecb_Final_abort
	// Single-block decrypt: same shape as the encrypt path above with
	// aesd/aesimc.
343.Lecb_small_dec:
344	aesd	v0.16b,v5.16b
345	aesimc	v0.16b,v0.16b
346	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
347	aesd	v0.16b,v6.16b
348	aesimc	v0.16b,v0.16b
349	subs	w5,w5,#10			// bias
350	b.eq	.Lecb_128_dec
351.Lecb_dec_round_loop:
352	aesd	v0.16b,v16.16b
353	aesimc	v0.16b,v0.16b
354	ld1	{v16.4s},[x3],#16				// load key schedule...
355	aesd	v0.16b,v17.16b
356	aesimc	v0.16b,v0.16b
357	ld1	{v17.4s},[x3],#16				// load key schedule...
358	subs	w5,w5,#2			// bias
359	b.gt	.Lecb_dec_round_loop
360.Lecb_128_dec:
361	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
362	aesd	v0.16b,v16.16b
363	aesimc	v0.16b,v0.16b
364	aesd	v0.16b,v17.16b
365	aesimc	v0.16b,v0.16b
366	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
367	aesd	v0.16b,v18.16b
368	aesimc	v0.16b,v0.16b
369	aesd	v0.16b,v19.16b
370	aesimc	v0.16b,v0.16b
371	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
372	aesd	v0.16b,v20.16b
373	aesimc	v0.16b,v0.16b
374	aesd	v0.16b,v21.16b
375	aesimc	v0.16b,v0.16b
376	ld1	{v7.4s},[x3]
377	aesd	v0.16b,v22.16b
378	aesimc	v0.16b,v0.16b
379	aesd	v0.16b,v23.16b
380	eor	v0.16b,v0.16b,v7.16b
381	st1	{v0.16b},[x1]
382	b	.Lecb_Final_abort
	// General path.  The flags still come from the subs at function entry
	// (stp/add/mov do not write them): below 16 bytes -> return.  The
	// "eq" case was already diverted to the single-block path, so the
	// csel leaves x8 = 16 here.
383.Lecb_big_size:
384	stp	x29,x30,[sp,#-16]!
385	add	x29,sp,#0
386	mov	x8,#16
387	b.lo	.Lecb_done
388	csel	x8,xzr,x8,eq
389
	// The flags from this cmp are consumed by the b.eq .Lecb_dec below;
	// nothing in between writes the flags.  x2 is rounded down to a
	// multiple of 16, v0 = first input block.
390	cmp	w4,#0					// en- or decrypting?
391	ldr	w5,[x3,#240]
392	and	x2,x2,#-16
393	ld1	{v0.16b},[x0],x8
394
	// v16/v17 = first two round keys, v18-v23 and v7 = last seven round
	// keys.  w5 ends up as round count - 8, x7 -> third round key.
395	ld1	{v16.4s,v17.4s},[x3]				// load key schedule...
396	sub	w5,w5,#6
397	add	x7,x3,x5,lsl#4				// pointer to last 7 round keys
398	sub	w5,w5,#2
399	ld1	{v18.4s,v19.4s},[x7],#32
400	ld1	{v20.4s,v21.4s},[x7],#32
401	ld1	{v22.4s,v23.4s},[x7],#32
402	ld1	{v7.4s},[x7]
403
404	add	x7,x3,#32
405	mov	w6,w5
406	b.eq	.Lecb_dec
407
	// Encrypt: load the second block and decide between the two-block
	// tail, the three-block loop and the five-block loop.
408	ld1	{v1.16b},[x0],#16
409	subs	x2,x2,#32				// bias
410	add	w6,w5,#2
411	orr	v3.16b,v1.16b,v1.16b
412	orr	v24.16b,v1.16b,v1.16b
413	orr	v1.16b,v0.16b,v0.16b
414	b.lo	.Lecb_enc_tail
415
416	orr	v1.16b,v3.16b,v3.16b
417	ld1	{v24.16b},[x0],#16
418	cmp	x2,#32
419	b.lo	.Loop3x_ecb_enc
420
421	ld1	{v25.16b},[x0],#16
422	ld1	{v26.16b},[x0],#16
423	sub	x2,x2,#32				// bias
424	mov	w6,w5
425
	// Five blocks (v0, v1, v24, v25, v26) in parallel.  The loop applies
	// two rounds per iteration, reloading v16/v17 from x7.
426.Loop5x_ecb_enc:
427	aese	v0.16b,v16.16b
428	aesmc	v0.16b,v0.16b
429	aese	v1.16b,v16.16b
430	aesmc	v1.16b,v1.16b
431	aese	v24.16b,v16.16b
432	aesmc	v24.16b,v24.16b
433	aese	v25.16b,v16.16b
434	aesmc	v25.16b,v25.16b
435	aese	v26.16b,v16.16b
436	aesmc	v26.16b,v26.16b
437	ld1	{v16.4s},[x7],#16
438	subs	w6,w6,#2
439	aese	v0.16b,v17.16b
440	aesmc	v0.16b,v0.16b
441	aese	v1.16b,v17.16b
442	aesmc	v1.16b,v1.16b
443	aese	v24.16b,v17.16b
444	aesmc	v24.16b,v24.16b
445	aese	v25.16b,v17.16b
446	aesmc	v25.16b,v25.16b
447	aese	v26.16b,v17.16b
448	aesmc	v26.16b,v26.16b
449	ld1	{v17.4s},[x7],#16
450	b.gt	.Loop5x_ecb_enc
451
	// Remaining rounds, unrolled, with the length bookkeeping interleaved.
	// The flags set by the cmp below are used by the csel and by the
	// b.hs that closes the five-block loop.
452	aese	v0.16b,v16.16b
453	aesmc	v0.16b,v0.16b
454	aese	v1.16b,v16.16b
455	aesmc	v1.16b,v1.16b
456	aese	v24.16b,v16.16b
457	aesmc	v24.16b,v24.16b
458	aese	v25.16b,v16.16b
459	aesmc	v25.16b,v25.16b
460	aese	v26.16b,v16.16b
461	aesmc	v26.16b,v26.16b
462	cmp	x2,#0x40					// because .Lecb_enc_tail4x
463	sub	x2,x2,#0x50
464
465	aese	v0.16b,v17.16b
466	aesmc	v0.16b,v0.16b
467	aese	v1.16b,v17.16b
468	aesmc	v1.16b,v1.16b
469	aese	v24.16b,v17.16b
470	aesmc	v24.16b,v24.16b
471	aese	v25.16b,v17.16b
472	aesmc	v25.16b,v25.16b
473	aese	v26.16b,v17.16b
474	aesmc	v26.16b,v26.16b
475	csel	x6,xzr,x2,gt			// borrow x6, w6, "gt" is not typo
476	mov	x7,x3
477
478	aese	v0.16b,v18.16b
479	aesmc	v0.16b,v0.16b
480	aese	v1.16b,v18.16b
481	aesmc	v1.16b,v1.16b
482	aese	v24.16b,v18.16b
483	aesmc	v24.16b,v24.16b
484	aese	v25.16b,v18.16b
485	aesmc	v25.16b,v25.16b
486	aese	v26.16b,v18.16b
487	aesmc	v26.16b,v26.16b
488	add	x0,x0,x6				// x0 is adjusted in such way that
489							// at exit from the loop v1.16b-v26.16b
490							// are loaded with last "words"
491	add	x6,x2,#0x60		    // because .Lecb_enc_tail4x
492
493	aese	v0.16b,v19.16b
494	aesmc	v0.16b,v0.16b
495	aese	v1.16b,v19.16b
496	aesmc	v1.16b,v1.16b
497	aese	v24.16b,v19.16b
498	aesmc	v24.16b,v24.16b
499	aese	v25.16b,v19.16b
500	aesmc	v25.16b,v25.16b
501	aese	v26.16b,v19.16b
502	aesmc	v26.16b,v26.16b
503
504	aese	v0.16b,v20.16b
505	aesmc	v0.16b,v0.16b
506	aese	v1.16b,v20.16b
507	aesmc	v1.16b,v1.16b
508	aese	v24.16b,v20.16b
509	aesmc	v24.16b,v24.16b
510	aese	v25.16b,v20.16b
511	aesmc	v25.16b,v25.16b
512	aese	v26.16b,v20.16b
513	aesmc	v26.16b,v26.16b
514
515	aese	v0.16b,v21.16b
516	aesmc	v0.16b,v0.16b
517	aese	v1.16b,v21.16b
518	aesmc	v1.16b,v1.16b
519	aese	v24.16b,v21.16b
520	aesmc	v24.16b,v24.16b
521	aese	v25.16b,v21.16b
522	aesmc	v25.16b,v25.16b
523	aese	v26.16b,v21.16b
524	aesmc	v26.16b,v26.16b
525
526	aese	v0.16b,v22.16b
527	aesmc	v0.16b,v0.16b
528	aese	v1.16b,v22.16b
529	aesmc	v1.16b,v1.16b
530	aese	v24.16b,v22.16b
531	aesmc	v24.16b,v24.16b
532	aese	v25.16b,v22.16b
533	aesmc	v25.16b,v25.16b
534	aese	v26.16b,v22.16b
535	aesmc	v26.16b,v26.16b
536
	// Last round (no aesmc), interleaved with loading the next five input
	// blocks into v2, v3, v27, v28, v29.
537	aese	v0.16b,v23.16b
538	ld1	{v2.16b},[x0],#16
539	aese	v1.16b,v23.16b
540	ld1	{v3.16b},[x0],#16
541	aese	v24.16b,v23.16b
542	ld1	{v27.16b},[x0],#16
543	aese	v25.16b,v23.16b
544	ld1	{v28.16b},[x0],#16
545	aese	v26.16b,v23.16b
546	ld1	{v29.16b},[x0],#16
547	cbz	x6,.Lecb_enc_tail4x
	// XOR in the last round key (v7), store the five results and move the
	// freshly loaded blocks into the working registers.
548	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
549	eor	v4.16b,v7.16b,v0.16b
550	orr	v0.16b,v2.16b,v2.16b
551	eor	v5.16b,v7.16b,v1.16b
552	orr	v1.16b,v3.16b,v3.16b
553	eor	v17.16b,v7.16b,v24.16b
554	orr	v24.16b,v27.16b,v27.16b
555	eor	v30.16b,v7.16b,v25.16b
556	orr	v25.16b,v28.16b,v28.16b
557	eor	v31.16b,v7.16b,v26.16b
558	st1	{v4.16b},[x1],#16
559	orr	v26.16b,v29.16b,v29.16b
560	st1	{v5.16b},[x1],#16
561	mov	w6,w5
562	st1	{v17.16b},[x1],#16
563	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
564	st1	{v30.16b},[x1],#16
565	st1	{v31.16b},[x1],#16
566	b.hs	.Loop5x_ecb_enc
567
	// Fewer than five blocks remain: undo the bias, finish if nothing is
	// left, otherwise continue with the three-block loop or the tail.
568	add	x2,x2,#0x50
569	cbz	x2,.Lecb_done
570
571	add	w6,w5,#2
572	subs	x2,x2,#0x30
573	orr	v0.16b,v27.16b,v27.16b
574	orr	v1.16b,v28.16b,v28.16b
575	orr	v24.16b,v29.16b,v29.16b
576	b.lo	.Lecb_enc_tail
577
578	b	.Loop3x_ecb_enc
579
580.align	4
	// Exit from the five-block loop that stores four results (from v1,
	// v24, v25, v26); the value in v0 is not stored on this path.
581.Lecb_enc_tail4x:
582	eor	v5.16b,v7.16b,v1.16b
583	eor	v17.16b,v7.16b,v24.16b
584	eor	v30.16b,v7.16b,v25.16b
585	eor	v31.16b,v7.16b,v26.16b
586	st1	{v5.16b},[x1],#16
587	st1	{v17.16b},[x1],#16
588	st1	{v30.16b},[x1],#16
589	st1	{v31.16b},[x1],#16
590
591	b	.Lecb_done
592.align	4
	// Three blocks (v0, v1, v24) in parallel, two rounds per iteration.
593.Loop3x_ecb_enc:
594	aese	v0.16b,v16.16b
595	aesmc	v0.16b,v0.16b
596	aese	v1.16b,v16.16b
597	aesmc	v1.16b,v1.16b
598	aese	v24.16b,v16.16b
599	aesmc	v24.16b,v24.16b
600	ld1	{v16.4s},[x7],#16
601	subs	w6,w6,#2
602	aese	v0.16b,v17.16b
603	aesmc	v0.16b,v0.16b
604	aese	v1.16b,v17.16b
605	aesmc	v1.16b,v1.16b
606	aese	v24.16b,v17.16b
607	aesmc	v24.16b,v24.16b
608	ld1	{v17.4s},[x7],#16
609	b.gt	.Loop3x_ecb_enc
610
	// Remaining rounds, unrolled.  The flags from the subs on x2 below
	// feed the csel and the b.hs that closes the three-block loop.
611	aese	v0.16b,v16.16b
612	aesmc	v0.16b,v0.16b
613	aese	v1.16b,v16.16b
614	aesmc	v1.16b,v1.16b
615	aese	v24.16b,v16.16b
616	aesmc	v24.16b,v24.16b
617	subs	x2,x2,#0x30
618	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
619	aese	v0.16b,v17.16b
620	aesmc	v0.16b,v0.16b
621	aese	v1.16b,v17.16b
622	aesmc	v1.16b,v1.16b
623	aese	v24.16b,v17.16b
624	aesmc	v24.16b,v24.16b
625	add	x0,x0,x6			// x0 is adjusted in such way that
626						// at exit from the loop v1.16b-v24.16b
627						// are loaded with last "words"
628	mov	x7,x3
629	aese	v0.16b,v20.16b
630	aesmc	v0.16b,v0.16b
631	aese	v1.16b,v20.16b
632	aesmc	v1.16b,v1.16b
633	aese	v24.16b,v20.16b
634	aesmc	v24.16b,v24.16b
635	ld1	{v2.16b},[x0],#16
636	aese	v0.16b,v21.16b
637	aesmc	v0.16b,v0.16b
638	aese	v1.16b,v21.16b
639	aesmc	v1.16b,v1.16b
640	aese	v24.16b,v21.16b
641	aesmc	v24.16b,v24.16b
642	ld1	{v3.16b},[x0],#16
643	aese	v0.16b,v22.16b
644	aesmc	v0.16b,v0.16b
645	aese	v1.16b,v22.16b
646	aesmc	v1.16b,v1.16b
647	aese	v24.16b,v22.16b
648	aesmc	v24.16b,v24.16b
649	ld1	{v27.16b},[x0],#16
650	aese	v0.16b,v23.16b
651	aese	v1.16b,v23.16b
652	aese	v24.16b,v23.16b
653	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
654	add	w6,w5,#2
655	eor	v4.16b,v7.16b,v0.16b
656	eor	v5.16b,v7.16b,v1.16b
657	eor	v24.16b,v24.16b,v7.16b
658	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
659	st1	{v4.16b},[x1],#16
660	orr	v0.16b,v2.16b,v2.16b
661	st1	{v5.16b},[x1],#16
662	orr	v1.16b,v3.16b,v3.16b
663	st1	{v24.16b},[x1],#16
664	orr	v24.16b,v27.16b,v27.16b
665	b.hs	.Loop3x_ecb_enc
666
	// x2 == -0x30 means nothing is left; otherwise fall into the tail.
667	cmn	x2,#0x30
668	b.eq	.Lecb_done
669	nop
670
	// Tail: one or two blocks, carried in v1 and v24.
671.Lecb_enc_tail:
672	aese	v1.16b,v16.16b
673	aesmc	v1.16b,v1.16b
674	aese	v24.16b,v16.16b
675	aesmc	v24.16b,v24.16b
676	ld1	{v16.4s},[x7],#16
677	subs	w6,w6,#2
678	aese	v1.16b,v17.16b
679	aesmc	v1.16b,v1.16b
680	aese	v24.16b,v17.16b
681	aesmc	v24.16b,v24.16b
682	ld1	{v17.4s},[x7],#16
683	b.gt	.Lecb_enc_tail
684
685	aese	v1.16b,v16.16b
686	aesmc	v1.16b,v1.16b
687	aese	v24.16b,v16.16b
688	aesmc	v24.16b,v24.16b
689	aese	v1.16b,v17.16b
690	aesmc	v1.16b,v1.16b
691	aese	v24.16b,v17.16b
692	aesmc	v24.16b,v24.16b
693	aese	v1.16b,v20.16b
694	aesmc	v1.16b,v1.16b
695	aese	v24.16b,v20.16b
696	aesmc	v24.16b,v24.16b
	// The flags from this cmn select one block (eq) or two blocks at the
	// b.eq below.
697	cmn	x2,#0x20
698	aese	v1.16b,v21.16b
699	aesmc	v1.16b,v1.16b
700	aese	v24.16b,v21.16b
701	aesmc	v24.16b,v24.16b
702	aese	v1.16b,v22.16b
703	aesmc	v1.16b,v1.16b
704	aese	v24.16b,v22.16b
705	aesmc	v24.16b,v24.16b
706	aese	v1.16b,v23.16b
707	aese	v24.16b,v23.16b
708	b.eq	.Lecb_enc_one
709	eor	v5.16b,v7.16b,v1.16b
710	eor	v17.16b,v7.16b,v24.16b
711	st1	{v5.16b},[x1],#16
712	st1	{v17.16b},[x1],#16
713	b	.Lecb_done
714
	// One block left: only the result derived from v24 is stored.
715.Lecb_enc_one:
716	eor	v5.16b,v7.16b,v24.16b
717	st1	{v5.16b},[x1],#16
718	b	.Lecb_done
719.align	5
	// Decrypt: mirrors the encrypt code above instruction for instruction,
	// with aesd/aesimc in place of aese/aesmc.
720.Lecb_dec:
721	ld1	{v1.16b},[x0],#16
722	subs	x2,x2,#32			// bias
723	add	w6,w5,#2
724	orr	v3.16b,v1.16b,v1.16b
725	orr	v24.16b,v1.16b,v1.16b
726	orr	v1.16b,v0.16b,v0.16b
727	b.lo	.Lecb_dec_tail
728
729	orr	v1.16b,v3.16b,v3.16b
730	ld1	{v24.16b},[x0],#16
731	cmp	x2,#32
732	b.lo	.Loop3x_ecb_dec
733
734	ld1	{v25.16b},[x0],#16
735	ld1	{v26.16b},[x0],#16
736	sub	x2,x2,#32				// bias
737	mov	w6,w5
738
	// Five blocks (v0, v1, v24, v25, v26) in parallel, two rounds per
	// iteration.
739.Loop5x_ecb_dec:
740	aesd	v0.16b,v16.16b
741	aesimc	v0.16b,v0.16b
742	aesd	v1.16b,v16.16b
743	aesimc	v1.16b,v1.16b
744	aesd	v24.16b,v16.16b
745	aesimc	v24.16b,v24.16b
746	aesd	v25.16b,v16.16b
747	aesimc	v25.16b,v25.16b
748	aesd	v26.16b,v16.16b
749	aesimc	v26.16b,v26.16b
750	ld1	{v16.4s},[x7],#16
751	subs	w6,w6,#2
752	aesd	v0.16b,v17.16b
753	aesimc	v0.16b,v0.16b
754	aesd	v1.16b,v17.16b
755	aesimc	v1.16b,v1.16b
756	aesd	v24.16b,v17.16b
757	aesimc	v24.16b,v24.16b
758	aesd	v25.16b,v17.16b
759	aesimc	v25.16b,v25.16b
760	aesd	v26.16b,v17.16b
761	aesimc	v26.16b,v26.16b
762	ld1	{v17.4s},[x7],#16
763	b.gt	.Loop5x_ecb_dec
764
	// Remaining rounds, unrolled, with the length bookkeeping interleaved.
765	aesd	v0.16b,v16.16b
766	aesimc	v0.16b,v0.16b
767	aesd	v1.16b,v16.16b
768	aesimc	v1.16b,v1.16b
769	aesd	v24.16b,v16.16b
770	aesimc	v24.16b,v24.16b
771	aesd	v25.16b,v16.16b
772	aesimc	v25.16b,v25.16b
773	aesd	v26.16b,v16.16b
774	aesimc	v26.16b,v26.16b
775	cmp	x2,#0x40				// because .Lecb_tail4x
776	sub	x2,x2,#0x50
777
778	aesd	v0.16b,v17.16b
779	aesimc	v0.16b,v0.16b
780	aesd	v1.16b,v17.16b
781	aesimc	v1.16b,v1.16b
782	aesd	v24.16b,v17.16b
783	aesimc	v24.16b,v24.16b
784	aesd	v25.16b,v17.16b
785	aesimc	v25.16b,v25.16b
786	aesd	v26.16b,v17.16b
787	aesimc	v26.16b,v26.16b
788	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
789	mov	x7,x3
790
791	aesd	v0.16b,v18.16b
792	aesimc	v0.16b,v0.16b
793	aesd	v1.16b,v18.16b
794	aesimc	v1.16b,v1.16b
795	aesd	v24.16b,v18.16b
796	aesimc	v24.16b,v24.16b
797	aesd	v25.16b,v18.16b
798	aesimc	v25.16b,v25.16b
799	aesd	v26.16b,v18.16b
800	aesimc	v26.16b,v26.16b
801	add	x0,x0,x6				// x0 is adjusted in such way that
802							// at exit from the loop v1.16b-v26.16b
803							// are loaded with last "words"
804	add	x6,x2,#0x60			// because .Lecb_tail4x
805
806	aesd	v0.16b,v19.16b
807	aesimc	v0.16b,v0.16b
808	aesd	v1.16b,v19.16b
809	aesimc	v1.16b,v1.16b
810	aesd	v24.16b,v19.16b
811	aesimc	v24.16b,v24.16b
812	aesd	v25.16b,v19.16b
813	aesimc	v25.16b,v25.16b
814	aesd	v26.16b,v19.16b
815	aesimc	v26.16b,v26.16b
816
817	aesd	v0.16b,v20.16b
818	aesimc	v0.16b,v0.16b
819	aesd	v1.16b,v20.16b
820	aesimc	v1.16b,v1.16b
821	aesd	v24.16b,v20.16b
822	aesimc	v24.16b,v24.16b
823	aesd	v25.16b,v20.16b
824	aesimc	v25.16b,v25.16b
825	aesd	v26.16b,v20.16b
826	aesimc	v26.16b,v26.16b
827
828	aesd	v0.16b,v21.16b
829	aesimc	v0.16b,v0.16b
830	aesd	v1.16b,v21.16b
831	aesimc	v1.16b,v1.16b
832	aesd	v24.16b,v21.16b
833	aesimc	v24.16b,v24.16b
834	aesd	v25.16b,v21.16b
835	aesimc	v25.16b,v25.16b
836	aesd	v26.16b,v21.16b
837	aesimc	v26.16b,v26.16b
838
839	aesd	v0.16b,v22.16b
840	aesimc	v0.16b,v0.16b
841	aesd	v1.16b,v22.16b
842	aesimc	v1.16b,v1.16b
843	aesd	v24.16b,v22.16b
844	aesimc	v24.16b,v24.16b
845	aesd	v25.16b,v22.16b
846	aesimc	v25.16b,v25.16b
847	aesd	v26.16b,v22.16b
848	aesimc	v26.16b,v26.16b
849
	// Last round (no aesimc), interleaved with loading the next five
	// input blocks into v2, v3, v27, v28, v29.
850	aesd	v0.16b,v23.16b
851	ld1	{v2.16b},[x0],#16
852	aesd	v1.16b,v23.16b
853	ld1	{v3.16b},[x0],#16
854	aesd	v24.16b,v23.16b
855	ld1	{v27.16b},[x0],#16
856	aesd	v25.16b,v23.16b
857	ld1	{v28.16b},[x0],#16
858	aesd	v26.16b,v23.16b
859	ld1	{v29.16b},[x0],#16
860	cbz	x6,.Lecb_tail4x
861	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
862	eor	v4.16b,v7.16b,v0.16b
863	orr	v0.16b,v2.16b,v2.16b
864	eor	v5.16b,v7.16b,v1.16b
865	orr	v1.16b,v3.16b,v3.16b
866	eor	v17.16b,v7.16b,v24.16b
867	orr	v24.16b,v27.16b,v27.16b
868	eor	v30.16b,v7.16b,v25.16b
869	orr	v25.16b,v28.16b,v28.16b
870	eor	v31.16b,v7.16b,v26.16b
871	st1	{v4.16b},[x1],#16
872	orr	v26.16b,v29.16b,v29.16b
873	st1	{v5.16b},[x1],#16
874	mov	w6,w5
875	st1	{v17.16b},[x1],#16
876	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
877	st1	{v30.16b},[x1],#16
878	st1	{v31.16b},[x1],#16
879	b.hs	.Loop5x_ecb_dec
880
	// Fewer than five blocks remain: undo the bias, finish if nothing is
	// left, otherwise continue with the three-block loop or the tail.
881	add	x2,x2,#0x50
882	cbz	x2,.Lecb_done
883
884	add	w6,w5,#2
885	subs	x2,x2,#0x30
886	orr	v0.16b,v27.16b,v27.16b
887	orr	v1.16b,v28.16b,v28.16b
888	orr	v24.16b,v29.16b,v29.16b
889	b.lo	.Lecb_dec_tail
890
891	b	.Loop3x_ecb_dec
892
893.align	4
	// Exit from the five-block loop that stores four results (from v1,
	// v24, v25, v26); the value in v0 is not stored on this path.
894.Lecb_tail4x:
895	eor	v5.16b,v7.16b,v1.16b
896	eor	v17.16b,v7.16b,v24.16b
897	eor	v30.16b,v7.16b,v25.16b
898	eor	v31.16b,v7.16b,v26.16b
899	st1	{v5.16b},[x1],#16
900	st1	{v17.16b},[x1],#16
901	st1	{v30.16b},[x1],#16
902	st1	{v31.16b},[x1],#16
903
904	b	.Lecb_done
905.align	4
	// Three blocks (v0, v1, v24) in parallel, two rounds per iteration.
906.Loop3x_ecb_dec:
907	aesd	v0.16b,v16.16b
908	aesimc	v0.16b,v0.16b
909	aesd	v1.16b,v16.16b
910	aesimc	v1.16b,v1.16b
911	aesd	v24.16b,v16.16b
912	aesimc	v24.16b,v24.16b
913	ld1	{v16.4s},[x7],#16
914	subs	w6,w6,#2
915	aesd	v0.16b,v17.16b
916	aesimc	v0.16b,v0.16b
917	aesd	v1.16b,v17.16b
918	aesimc	v1.16b,v1.16b
919	aesd	v24.16b,v17.16b
920	aesimc	v24.16b,v24.16b
921	ld1	{v17.4s},[x7],#16
922	b.gt	.Loop3x_ecb_dec
923
	// Remaining rounds, unrolled.  The flags from the subs on x2 below
	// feed the csel and the b.hs that closes the three-block loop.
924	aesd	v0.16b,v16.16b
925	aesimc	v0.16b,v0.16b
926	aesd	v1.16b,v16.16b
927	aesimc	v1.16b,v1.16b
928	aesd	v24.16b,v16.16b
929	aesimc	v24.16b,v24.16b
930	subs	x2,x2,#0x30
931	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
932	aesd	v0.16b,v17.16b
933	aesimc	v0.16b,v0.16b
934	aesd	v1.16b,v17.16b
935	aesimc	v1.16b,v1.16b
936	aesd	v24.16b,v17.16b
937	aesimc	v24.16b,v24.16b
938	add	x0,x0,x6 			// x0 is adjusted in such way that
939						// at exit from the loop v1.16b-v24.16b
940						// are loaded with last "words"
941	mov	x7,x3
942	aesd	v0.16b,v20.16b
943	aesimc	v0.16b,v0.16b
944	aesd	v1.16b,v20.16b
945	aesimc	v1.16b,v1.16b
946	aesd	v24.16b,v20.16b
947	aesimc	v24.16b,v24.16b
948	ld1	{v2.16b},[x0],#16
949	aesd	v0.16b,v21.16b
950	aesimc	v0.16b,v0.16b
951	aesd	v1.16b,v21.16b
952	aesimc	v1.16b,v1.16b
953	aesd	v24.16b,v21.16b
954	aesimc	v24.16b,v24.16b
955	ld1	{v3.16b},[x0],#16
956	aesd	v0.16b,v22.16b
957	aesimc	v0.16b,v0.16b
958	aesd	v1.16b,v22.16b
959	aesimc	v1.16b,v1.16b
960	aesd	v24.16b,v22.16b
961	aesimc	v24.16b,v24.16b
962	ld1	{v27.16b},[x0],#16
963	aesd	v0.16b,v23.16b
964	aesd	v1.16b,v23.16b
965	aesd	v24.16b,v23.16b
966	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
967	add	w6,w5,#2
968	eor	v4.16b,v7.16b,v0.16b
969	eor	v5.16b,v7.16b,v1.16b
970	eor	v24.16b,v24.16b,v7.16b
971	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
972	st1	{v4.16b},[x1],#16
973	orr	v0.16b,v2.16b,v2.16b
974	st1	{v5.16b},[x1],#16
975	orr	v1.16b,v3.16b,v3.16b
976	st1	{v24.16b},[x1],#16
977	orr	v24.16b,v27.16b,v27.16b
978	b.hs	.Loop3x_ecb_dec
979
	// x2 == -0x30 means nothing is left; otherwise fall into the tail.
980	cmn	x2,#0x30
981	b.eq	.Lecb_done
982	nop
983
	// Tail: one or two blocks, carried in v1 and v24.
984.Lecb_dec_tail:
985	aesd	v1.16b,v16.16b
986	aesimc	v1.16b,v1.16b
987	aesd	v24.16b,v16.16b
988	aesimc	v24.16b,v24.16b
989	ld1	{v16.4s},[x7],#16
990	subs	w6,w6,#2
991	aesd	v1.16b,v17.16b
992	aesimc	v1.16b,v1.16b
993	aesd	v24.16b,v17.16b
994	aesimc	v24.16b,v24.16b
995	ld1	{v17.4s},[x7],#16
996	b.gt	.Lecb_dec_tail
997
998	aesd	v1.16b,v16.16b
999	aesimc	v1.16b,v1.16b
1000	aesd	v24.16b,v16.16b
1001	aesimc	v24.16b,v24.16b
1002	aesd	v1.16b,v17.16b
1003	aesimc	v1.16b,v1.16b
1004	aesd	v24.16b,v17.16b
1005	aesimc	v24.16b,v24.16b
1006	aesd	v1.16b,v20.16b
1007	aesimc	v1.16b,v1.16b
1008	aesd	v24.16b,v20.16b
1009	aesimc	v24.16b,v24.16b
	// The flags from this cmn select one block (eq) or two blocks at the
	// b.eq below.
1010	cmn	x2,#0x20
1011	aesd	v1.16b,v21.16b
1012	aesimc	v1.16b,v1.16b
1013	aesd	v24.16b,v21.16b
1014	aesimc	v24.16b,v24.16b
1015	aesd	v1.16b,v22.16b
1016	aesimc	v1.16b,v1.16b
1017	aesd	v24.16b,v22.16b
1018	aesimc	v24.16b,v24.16b
1019	aesd	v1.16b,v23.16b
1020	aesd	v24.16b,v23.16b
1021	b.eq	.Lecb_dec_one
1022	eor	v5.16b,v7.16b,v1.16b
1023	eor	v17.16b,v7.16b,v24.16b
1024	st1	{v5.16b},[x1],#16
1025	st1	{v17.16b},[x1],#16
1026	b	.Lecb_done
1027
	// One block left: only the result derived from v24 is stored.
1028.Lecb_dec_one:
1029	eor	v5.16b,v7.16b,v24.16b
1030	st1	{v5.16b},[x1],#16
1031
	// Exit for the general path: pop the frame, reloading only x29.
	// The single-block path has no frame and joins at .Lecb_Final_abort.
1032.Lecb_done:
1033	ldr	x29,[sp],#16
1034.Lecb_Final_abort:
1035	ret
1036.size	aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
1037.globl	aes_v8_cbc_encrypt
1038.type	aes_v8_cbc_encrypt,%function
1039.align	5
1040aes_v8_cbc_encrypt:
1041	AARCH64_VALID_CALL_TARGET
1042	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1043	stp	x29,x30,[sp,#-16]!
1044	add	x29,sp,#0
1045	subs	x2,x2,#16
1046	mov	x8,#16
1047	b.lo	.Lcbc_abort
1048	csel	x8,xzr,x8,eq
1049
1050	cmp	w5,#0			// en- or decrypting?
1051	ldr	w5,[x3,#240]
1052	and	x2,x2,#-16
1053	ld1	{v6.16b},[x4]
1054	ld1	{v0.16b},[x0],x8
1055
1056	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1057	sub	w5,w5,#6
1058	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
1059	sub	w5,w5,#2
1060	ld1	{v18.4s,v19.4s},[x7],#32
1061	ld1	{v20.4s,v21.4s},[x7],#32
1062	ld1	{v22.4s,v23.4s},[x7],#32
1063	ld1	{v7.4s},[x7]
1064
1065	add	x7,x3,#32
1066	mov	w6,w5
1067	b.eq	.Lcbc_dec
1068
1069	cmp	w5,#2
1070	eor	v0.16b,v0.16b,v6.16b
1071	eor	v5.16b,v16.16b,v7.16b
1072	b.eq	.Lcbc_enc128
1073
1074	ld1	{v2.4s,v3.4s},[x7]
1075	add	x7,x3,#16
1076	add	x6,x3,#16*4
1077	add	x12,x3,#16*5
1078	aese	v0.16b,v16.16b
1079	aesmc	v0.16b,v0.16b
1080	add	x14,x3,#16*6
1081	add	x3,x3,#16*7
1082	b	.Lenter_cbc_enc
1083
1084.align	4
1085.Loop_cbc_enc:
1086	aese	v0.16b,v16.16b
1087	aesmc	v0.16b,v0.16b
1088	st1	{v6.16b},[x1],#16
1089.Lenter_cbc_enc:
1090	aese	v0.16b,v17.16b
1091	aesmc	v0.16b,v0.16b
1092	aese	v0.16b,v2.16b
1093	aesmc	v0.16b,v0.16b
1094	ld1	{v16.4s},[x6]
1095	cmp	w5,#4
1096	aese	v0.16b,v3.16b
1097	aesmc	v0.16b,v0.16b
1098	ld1	{v17.4s},[x12]
1099	b.eq	.Lcbc_enc192
1100
1101	aese	v0.16b,v16.16b
1102	aesmc	v0.16b,v0.16b
1103	ld1	{v16.4s},[x14]
1104	aese	v0.16b,v17.16b
1105	aesmc	v0.16b,v0.16b
1106	ld1	{v17.4s},[x3]
1107	nop
1108
1109.Lcbc_enc192:
1110	aese	v0.16b,v16.16b
1111	aesmc	v0.16b,v0.16b
1112	subs	x2,x2,#16
1113	aese	v0.16b,v17.16b
1114	aesmc	v0.16b,v0.16b
1115	csel	x8,xzr,x8,eq
1116	aese	v0.16b,v18.16b
1117	aesmc	v0.16b,v0.16b
1118	aese	v0.16b,v19.16b
1119	aesmc	v0.16b,v0.16b
1120	ld1	{v16.16b},[x0],x8
1121	aese	v0.16b,v20.16b
1122	aesmc	v0.16b,v0.16b
1123	eor	v16.16b,v16.16b,v5.16b
1124	aese	v0.16b,v21.16b
1125	aesmc	v0.16b,v0.16b
1126	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
1127	aese	v0.16b,v22.16b
1128	aesmc	v0.16b,v0.16b
1129	aese	v0.16b,v23.16b
1130	eor	v6.16b,v0.16b,v7.16b
1131	b.hs	.Loop_cbc_enc
1132
1133	st1	{v6.16b},[x1],#16
1134	b	.Lcbc_done
1135
1136.align	5
1137.Lcbc_enc128:
1138	ld1	{v2.4s,v3.4s},[x7]
1139	aese	v0.16b,v16.16b
1140	aesmc	v0.16b,v0.16b
1141	b	.Lenter_cbc_enc128
1142.Loop_cbc_enc128:
1143	aese	v0.16b,v16.16b
1144	aesmc	v0.16b,v0.16b
1145	st1	{v6.16b},[x1],#16
1146.Lenter_cbc_enc128:
1147	aese	v0.16b,v17.16b
1148	aesmc	v0.16b,v0.16b
1149	subs	x2,x2,#16
1150	aese	v0.16b,v2.16b
1151	aesmc	v0.16b,v0.16b
1152	csel	x8,xzr,x8,eq
1153	aese	v0.16b,v3.16b
1154	aesmc	v0.16b,v0.16b
1155	aese	v0.16b,v18.16b
1156	aesmc	v0.16b,v0.16b
1157	aese	v0.16b,v19.16b
1158	aesmc	v0.16b,v0.16b
1159	ld1	{v16.16b},[x0],x8
1160	aese	v0.16b,v20.16b
1161	aesmc	v0.16b,v0.16b
1162	aese	v0.16b,v21.16b
1163	aesmc	v0.16b,v0.16b
1164	aese	v0.16b,v22.16b
1165	aesmc	v0.16b,v0.16b
1166	eor	v16.16b,v16.16b,v5.16b
1167	aese	v0.16b,v23.16b
1168	eor	v6.16b,v0.16b,v7.16b
1169	b.hs	.Loop_cbc_enc128
1170
1171	st1	{v6.16b},[x1],#16
1172	b	.Lcbc_done
1173.align	5
1174.Lcbc_dec:
1175	ld1	{v24.16b},[x0],#16
1176	subs	x2,x2,#32		// bias
1177	add	w6,w5,#2
1178	orr	v3.16b,v0.16b,v0.16b
1179	orr	v1.16b,v0.16b,v0.16b
1180	orr	v27.16b,v24.16b,v24.16b
1181	b.lo	.Lcbc_dec_tail
1182
1183	orr	v1.16b,v24.16b,v24.16b
1184	ld1	{v24.16b},[x0],#16
1185	orr	v2.16b,v0.16b,v0.16b
1186	orr	v3.16b,v1.16b,v1.16b
1187	orr	v27.16b,v24.16b,v24.16b
1188	cmp	x2,#32
1189	b.lo	.Loop3x_cbc_dec
1190
1191	ld1	{v25.16b},[x0],#16
1192	ld1	{v26.16b},[x0],#16
1193	sub	x2,x2,#32		// bias
1194	mov	w6,w5
1195	orr	v28.16b,v25.16b,v25.16b
1196	orr	v29.16b,v26.16b,v26.16b
1197
1198.Loop5x_cbc_dec:
1199	aesd	v0.16b,v16.16b
1200	aesimc	v0.16b,v0.16b
1201	aesd	v1.16b,v16.16b
1202	aesimc	v1.16b,v1.16b
1203	aesd	v24.16b,v16.16b
1204	aesimc	v24.16b,v24.16b
1205	aesd	v25.16b,v16.16b
1206	aesimc	v25.16b,v25.16b
1207	aesd	v26.16b,v16.16b
1208	aesimc	v26.16b,v26.16b
1209	ld1	{v16.4s},[x7],#16
1210	subs	w6,w6,#2
1211	aesd	v0.16b,v17.16b
1212	aesimc	v0.16b,v0.16b
1213	aesd	v1.16b,v17.16b
1214	aesimc	v1.16b,v1.16b
1215	aesd	v24.16b,v17.16b
1216	aesimc	v24.16b,v24.16b
1217	aesd	v25.16b,v17.16b
1218	aesimc	v25.16b,v25.16b
1219	aesd	v26.16b,v17.16b
1220	aesimc	v26.16b,v26.16b
1221	ld1	{v17.4s},[x7],#16
1222	b.gt	.Loop5x_cbc_dec
1223
1224	aesd	v0.16b,v16.16b
1225	aesimc	v0.16b,v0.16b
1226	aesd	v1.16b,v16.16b
1227	aesimc	v1.16b,v1.16b
1228	aesd	v24.16b,v16.16b
1229	aesimc	v24.16b,v24.16b
1230	aesd	v25.16b,v16.16b
1231	aesimc	v25.16b,v25.16b
1232	aesd	v26.16b,v16.16b
1233	aesimc	v26.16b,v26.16b
1234	cmp	x2,#0x40		// because .Lcbc_tail4x
1235	sub	x2,x2,#0x50
1236
1237	aesd	v0.16b,v17.16b
1238	aesimc	v0.16b,v0.16b
1239	aesd	v1.16b,v17.16b
1240	aesimc	v1.16b,v1.16b
1241	aesd	v24.16b,v17.16b
1242	aesimc	v24.16b,v24.16b
1243	aesd	v25.16b,v17.16b
1244	aesimc	v25.16b,v25.16b
1245	aesd	v26.16b,v17.16b
1246	aesimc	v26.16b,v26.16b
1247	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
1248	mov	x7,x3
1249
1250	aesd	v0.16b,v18.16b
1251	aesimc	v0.16b,v0.16b
1252	aesd	v1.16b,v18.16b
1253	aesimc	v1.16b,v1.16b
1254	aesd	v24.16b,v18.16b
1255	aesimc	v24.16b,v24.16b
1256	aesd	v25.16b,v18.16b
1257	aesimc	v25.16b,v25.16b
1258	aesd	v26.16b,v18.16b
1259	aesimc	v26.16b,v26.16b
1260	add	x0,x0,x6		// x0 is adjusted in such way that
1261					// at exit from the loop v1.16b-v26.16b
1262					// are loaded with last "words"
1263	add	x6,x2,#0x60		// because .Lcbc_tail4x
1264
1265	aesd	v0.16b,v19.16b
1266	aesimc	v0.16b,v0.16b
1267	aesd	v1.16b,v19.16b
1268	aesimc	v1.16b,v1.16b
1269	aesd	v24.16b,v19.16b
1270	aesimc	v24.16b,v24.16b
1271	aesd	v25.16b,v19.16b
1272	aesimc	v25.16b,v25.16b
1273	aesd	v26.16b,v19.16b
1274	aesimc	v26.16b,v26.16b
1275
1276	aesd	v0.16b,v20.16b
1277	aesimc	v0.16b,v0.16b
1278	aesd	v1.16b,v20.16b
1279	aesimc	v1.16b,v1.16b
1280	aesd	v24.16b,v20.16b
1281	aesimc	v24.16b,v24.16b
1282	aesd	v25.16b,v20.16b
1283	aesimc	v25.16b,v25.16b
1284	aesd	v26.16b,v20.16b
1285	aesimc	v26.16b,v26.16b
1286
1287	aesd	v0.16b,v21.16b
1288	aesimc	v0.16b,v0.16b
1289	aesd	v1.16b,v21.16b
1290	aesimc	v1.16b,v1.16b
1291	aesd	v24.16b,v21.16b
1292	aesimc	v24.16b,v24.16b
1293	aesd	v25.16b,v21.16b
1294	aesimc	v25.16b,v25.16b
1295	aesd	v26.16b,v21.16b
1296	aesimc	v26.16b,v26.16b
1297
1298	aesd	v0.16b,v22.16b
1299	aesimc	v0.16b,v0.16b
1300	aesd	v1.16b,v22.16b
1301	aesimc	v1.16b,v1.16b
1302	aesd	v24.16b,v22.16b
1303	aesimc	v24.16b,v24.16b
1304	aesd	v25.16b,v22.16b
1305	aesimc	v25.16b,v25.16b
1306	aesd	v26.16b,v22.16b
1307	aesimc	v26.16b,v26.16b
1308
1309	eor	v4.16b,v6.16b,v7.16b
1310	aesd	v0.16b,v23.16b
1311	eor	v5.16b,v2.16b,v7.16b
1312	ld1	{v2.16b},[x0],#16
1313	aesd	v1.16b,v23.16b
1314	eor	v17.16b,v3.16b,v7.16b
1315	ld1	{v3.16b},[x0],#16
1316	aesd	v24.16b,v23.16b
1317	eor	v30.16b,v27.16b,v7.16b
1318	ld1	{v27.16b},[x0],#16
1319	aesd	v25.16b,v23.16b
1320	eor	v31.16b,v28.16b,v7.16b
1321	ld1	{v28.16b},[x0],#16
1322	aesd	v26.16b,v23.16b
1323	orr	v6.16b,v29.16b,v29.16b
1324	ld1	{v29.16b},[x0],#16
1325	cbz	x6,.Lcbc_tail4x
1326	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1327	eor	v4.16b,v4.16b,v0.16b
1328	orr	v0.16b,v2.16b,v2.16b
1329	eor	v5.16b,v5.16b,v1.16b
1330	orr	v1.16b,v3.16b,v3.16b
1331	eor	v17.16b,v17.16b,v24.16b
1332	orr	v24.16b,v27.16b,v27.16b
1333	eor	v30.16b,v30.16b,v25.16b
1334	orr	v25.16b,v28.16b,v28.16b
1335	eor	v31.16b,v31.16b,v26.16b
1336	st1	{v4.16b},[x1],#16
1337	orr	v26.16b,v29.16b,v29.16b
1338	st1	{v5.16b},[x1],#16
1339	mov	w6,w5
1340	st1	{v17.16b},[x1],#16
1341	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1342	st1	{v30.16b},[x1],#16
1343	st1	{v31.16b},[x1],#16
1344	b.hs	.Loop5x_cbc_dec
1345
1346	add	x2,x2,#0x50
1347	cbz	x2,.Lcbc_done
1348
1349	add	w6,w5,#2
1350	subs	x2,x2,#0x30
1351	orr	v0.16b,v27.16b,v27.16b
1352	orr	v2.16b,v27.16b,v27.16b
1353	orr	v1.16b,v28.16b,v28.16b
1354	orr	v3.16b,v28.16b,v28.16b
1355	orr	v24.16b,v29.16b,v29.16b
1356	orr	v27.16b,v29.16b,v29.16b
1357	b.lo	.Lcbc_dec_tail
1358
1359	b	.Loop3x_cbc_dec
1360
1361.align	4
1362.Lcbc_tail4x:
1363	eor	v5.16b,v4.16b,v1.16b
1364	eor	v17.16b,v17.16b,v24.16b
1365	eor	v30.16b,v30.16b,v25.16b
1366	eor	v31.16b,v31.16b,v26.16b
1367	st1	{v5.16b},[x1],#16
1368	st1	{v17.16b},[x1],#16
1369	st1	{v30.16b},[x1],#16
1370	st1	{v31.16b},[x1],#16
1371
1372	b	.Lcbc_done
1373.align	4
1374.Loop3x_cbc_dec:
1375	aesd	v0.16b,v16.16b
1376	aesimc	v0.16b,v0.16b
1377	aesd	v1.16b,v16.16b
1378	aesimc	v1.16b,v1.16b
1379	aesd	v24.16b,v16.16b
1380	aesimc	v24.16b,v24.16b
1381	ld1	{v16.4s},[x7],#16
1382	subs	w6,w6,#2
1383	aesd	v0.16b,v17.16b
1384	aesimc	v0.16b,v0.16b
1385	aesd	v1.16b,v17.16b
1386	aesimc	v1.16b,v1.16b
1387	aesd	v24.16b,v17.16b
1388	aesimc	v24.16b,v24.16b
1389	ld1	{v17.4s},[x7],#16
1390	b.gt	.Loop3x_cbc_dec
1391
1392	aesd	v0.16b,v16.16b
1393	aesimc	v0.16b,v0.16b
1394	aesd	v1.16b,v16.16b
1395	aesimc	v1.16b,v1.16b
1396	aesd	v24.16b,v16.16b
1397	aesimc	v24.16b,v24.16b
1398	eor	v4.16b,v6.16b,v7.16b
1399	subs	x2,x2,#0x30
1400	eor	v5.16b,v2.16b,v7.16b
1401	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
1402	aesd	v0.16b,v17.16b
1403	aesimc	v0.16b,v0.16b
1404	aesd	v1.16b,v17.16b
1405	aesimc	v1.16b,v1.16b
1406	aesd	v24.16b,v17.16b
1407	aesimc	v24.16b,v24.16b
1408	eor	v17.16b,v3.16b,v7.16b
1409	add	x0,x0,x6		// x0 is adjusted in such way that
1410					// at exit from the loop v1.16b-v24.16b
1411					// are loaded with last "words"
1412	orr	v6.16b,v27.16b,v27.16b
1413	mov	x7,x3
1414	aesd	v0.16b,v20.16b
1415	aesimc	v0.16b,v0.16b
1416	aesd	v1.16b,v20.16b
1417	aesimc	v1.16b,v1.16b
1418	aesd	v24.16b,v20.16b
1419	aesimc	v24.16b,v24.16b
1420	ld1	{v2.16b},[x0],#16
1421	aesd	v0.16b,v21.16b
1422	aesimc	v0.16b,v0.16b
1423	aesd	v1.16b,v21.16b
1424	aesimc	v1.16b,v1.16b
1425	aesd	v24.16b,v21.16b
1426	aesimc	v24.16b,v24.16b
1427	ld1	{v3.16b},[x0],#16
1428	aesd	v0.16b,v22.16b
1429	aesimc	v0.16b,v0.16b
1430	aesd	v1.16b,v22.16b
1431	aesimc	v1.16b,v1.16b
1432	aesd	v24.16b,v22.16b
1433	aesimc	v24.16b,v24.16b
1434	ld1	{v27.16b},[x0],#16
1435	aesd	v0.16b,v23.16b
1436	aesd	v1.16b,v23.16b
1437	aesd	v24.16b,v23.16b
1438	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1439	add	w6,w5,#2
1440	eor	v4.16b,v4.16b,v0.16b
1441	eor	v5.16b,v5.16b,v1.16b
1442	eor	v24.16b,v24.16b,v17.16b
1443	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1444	st1	{v4.16b},[x1],#16
1445	orr	v0.16b,v2.16b,v2.16b
1446	st1	{v5.16b},[x1],#16
1447	orr	v1.16b,v3.16b,v3.16b
1448	st1	{v24.16b},[x1],#16
1449	orr	v24.16b,v27.16b,v27.16b
1450	b.hs	.Loop3x_cbc_dec
1451
1452	cmn	x2,#0x30
1453	b.eq	.Lcbc_done
1454	nop
1455
1456.Lcbc_dec_tail:
1457	aesd	v1.16b,v16.16b
1458	aesimc	v1.16b,v1.16b
1459	aesd	v24.16b,v16.16b
1460	aesimc	v24.16b,v24.16b
1461	ld1	{v16.4s},[x7],#16
1462	subs	w6,w6,#2
1463	aesd	v1.16b,v17.16b
1464	aesimc	v1.16b,v1.16b
1465	aesd	v24.16b,v17.16b
1466	aesimc	v24.16b,v24.16b
1467	ld1	{v17.4s},[x7],#16
1468	b.gt	.Lcbc_dec_tail
1469
1470	aesd	v1.16b,v16.16b
1471	aesimc	v1.16b,v1.16b
1472	aesd	v24.16b,v16.16b
1473	aesimc	v24.16b,v24.16b
1474	aesd	v1.16b,v17.16b
1475	aesimc	v1.16b,v1.16b
1476	aesd	v24.16b,v17.16b
1477	aesimc	v24.16b,v24.16b
1478	aesd	v1.16b,v20.16b
1479	aesimc	v1.16b,v1.16b
1480	aesd	v24.16b,v20.16b
1481	aesimc	v24.16b,v24.16b
1482	cmn	x2,#0x20
1483	aesd	v1.16b,v21.16b
1484	aesimc	v1.16b,v1.16b
1485	aesd	v24.16b,v21.16b
1486	aesimc	v24.16b,v24.16b
1487	eor	v5.16b,v6.16b,v7.16b
1488	aesd	v1.16b,v22.16b
1489	aesimc	v1.16b,v1.16b
1490	aesd	v24.16b,v22.16b
1491	aesimc	v24.16b,v24.16b
1492	eor	v17.16b,v3.16b,v7.16b
1493	aesd	v1.16b,v23.16b
1494	aesd	v24.16b,v23.16b
1495	b.eq	.Lcbc_dec_one
1496	eor	v5.16b,v5.16b,v1.16b
1497	eor	v17.16b,v17.16b,v24.16b
1498	orr	v6.16b,v27.16b,v27.16b
1499	st1	{v5.16b},[x1],#16
1500	st1	{v17.16b},[x1],#16
1501	b	.Lcbc_done
1502
1503.Lcbc_dec_one:
1504	eor	v5.16b,v5.16b,v24.16b
1505	orr	v6.16b,v27.16b,v27.16b
1506	st1	{v5.16b},[x1],#16
1507
1508.Lcbc_done:
1509	st1	{v6.16b},[x4]
1510.Lcbc_abort:
1511	ldr	x29,[sp],#16
1512	ret
1513.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
// aes_v8_ctr32_encrypt_blocks_unroll12_eor3: CTR-mode keystream generation
// XORed with the input, interleaving 12, 6 or 3 counter blocks at a time
// and finishing with a 1- or 2-block tail.
//
// Register use as established by the code in this function:
//   x0  input pointer (read with post-increment)
//   x1  output pointer (written with post-increment)
//   x2  number of 16-byte blocks still to do (biased inside the loops)
//   x3  key schedule; the 32-bit word at offset 240 seeds the round counter
//   x4  16-byte counter block; its last 32-bit word is the counter
//   w8  running counter value kept in a form that plain adds can increment
//   v0  untouched copy of the counter block, used as a template
//   v1  last round key; v2/v3 the two round keys currently in use
//   x7  cursor over the round keys; w5/w6 round-pair loop counters
1514.globl	aes_v8_ctr32_encrypt_blocks_unroll12_eor3
1515.type	aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function
1516.align	5
1517aes_v8_ctr32_encrypt_blocks_unroll12_eor3:
1518	AARCH64_VALID_CALL_TARGET
1519	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1520	stp	x29,x30,[sp,#-80]!	// 80-byte frame: x29/x30 followed by d8-d15
1521	stp	d8,d9,[sp, #16]
1522	stp	d10,d11,[sp, #32]
1523	stp	d12,d13,[sp, #48]
1524	stp	d14,d15,[sp, #64]
1525	add	x29,sp,#0
1526
1527	ldr	w5,[x3,#240]	// w5 = 32-bit word stored at key schedule + 240
1528
1529	ldr	w8, [x4, #12]	// w8 = last 32-bit word of the counter block
1530#ifdef __AARCH64EB__
1531	ld1	{v24.16b},[x4]
1532#else
1533	ld1	{v24.4s},[x4]
1534#endif
1535	ld1	{v2.4s,v3.4s},[x3]		// load key schedule...
1536	sub	w5,w5,#4
1537	cmp	x2,#2	// consumed by the b.ls below; nothing in between writes the flags
1538	add	x7,x3,x5,lsl#4	// pointer to last round key
1539	sub	w5,w5,#2
1540	add	x7, x7, #64	// x7 = x3 + 16 * (word loaded from [x3,#240])
1541	ld1	{v1.4s},[x7]	// v1 = round key at that address (the last one)
1542	add	x7,x3,#32	// x7 -> third round key; v2/v3 already hold the first two
1543	mov	w6,w5	// w6 = working copy of the round counter
1544#ifndef __AARCH64EB__
1545	rev	w8, w8
1546#endif
1547
	// Build the first counter blocks: v24 = counter, v25 = counter+1,
	// v26 = counter+2 (lane 3 of v26 is only filled in on the non-tail path).
1548	orr	v25.16b,v24.16b,v24.16b
1549	add	w10, w8, #1
1550	orr	v26.16b,v24.16b,v24.16b
1551	add	w8, w8, #2
1552	orr	v0.16b,v24.16b,v24.16b	// v0 = template for rebuilding counter blocks
1553	rev	w10, w10
1554	mov	v25.s[3],w10
1555	b.ls	.Lctr32_tail_unroll	// 2 blocks or fewer (flags from cmp x2,#2)
1556	cmp	x2,#6
1557	rev	w12, w8
1558	sub	x2,x2,#3		// bias
1559	mov	v26.s[3],w12
1560	b.lo	.Loop3x_ctr32_unroll	// fewer than 6 blocks: 3-block path
1561	cmp	x2,#9	// x2 is biased by 3, so this compares the block count with 12
	// v27..v29 = counter+3..counter+5; w8 ends up as counter+5.
1562	orr	v27.16b,v24.16b,v24.16b
1563	add	w11, w8, #1
1564	orr	v28.16b,v24.16b,v24.16b
1565	add	w13, w8, #2
1566	rev	w11, w11
1567	orr	v29.16b,v24.16b,v24.16b
1568	add	w8, w8, #3
1569	rev	w13, w13
1570	mov	v27.s[3],w11
1571	rev	w14, w8
1572	mov	v28.s[3],w13
1573	mov	v29.s[3],w14
1574	sub	x2,x2,#3	// x2 is now biased by 6
1575	b.lo	.Loop6x_ctr32_unroll	// fewer than 12 blocks: 6-block path
1576
1577	// push regs to stack when 12 data chunks are interleaved
	// NOTE(review): d8-d11 were already stored in the frame by the prologue,
	// and each of the two d-register pushes below reserves 32 bytes for a
	// 16-byte pair. The matching pops after the 12-block loop use the same sizes.
1578	stp	x19,x20,[sp,#-16]!
1579	stp	x21,x22,[sp,#-16]!
1580	stp	x23,x24,[sp,#-16]!
1581	stp	d8,d9,[sp,#-32]!
1582	stp	d10,d11,[sp,#-32]!
1583
	// v30, v31, v8..v11 = counter+6..counter+11; w8 ends up as counter+11.
1584	add	w15,w8,#1
1585	add	w19,w8,#2
1586	add	w20,w8,#3
1587	add	w21,w8,#4
1588	add	w22,w8,#5
1589	add	w8,w8,#6
1590	orr	v30.16b,v24.16b,v24.16b
1591	rev	w15,w15
1592	orr	v31.16b,v24.16b,v24.16b
1593	rev	w19,w19
1594	orr	v8.16b,v24.16b,v24.16b
1595	rev	w20,w20
1596	orr	v9.16b,v24.16b,v24.16b
1597	rev	w21,w21
1598	orr	v10.16b,v24.16b,v24.16b
1599	rev	w22,w22
1600	orr	v11.16b,v24.16b,v24.16b
1601	rev	w23,w8
1602
1603	sub	x2,x2,#6		// bias
1604	mov	v30.s[3],w15
1605	mov	v31.s[3],w19
1606	mov	v8.s[3],w20
1607	mov	v9.s[3],w21
1608	mov	v10.s[3],w22
1609	mov	v11.s[3],w23
1610	b	.Loop12x_ctr32_unroll
1611
// 12-block interleaved path. v24-v31 and v8-v11 hold twelve counter blocks.
// The labelled loop applies two AES rounds per iteration (keys in v2, then
// v3) to all twelve and fetches the next two round keys through x7; w6 counts
// the rounds handled this way. On entry x2 is biased by 12 and w8 is the
// counter value already placed in v11.
1612.align	4
1613.Loop12x_ctr32_unroll:
1614	aese	v24.16b,v2.16b
1615	aesmc	v24.16b,v24.16b
1616	aese	v25.16b,v2.16b
1617	aesmc	v25.16b,v25.16b
1618	aese	v26.16b,v2.16b
1619	aesmc	v26.16b,v26.16b
1620	aese	v27.16b,v2.16b
1621	aesmc	v27.16b,v27.16b
1622	aese	v28.16b,v2.16b
1623	aesmc	v28.16b,v28.16b
1624	aese	v29.16b,v2.16b
1625	aesmc	v29.16b,v29.16b
1626	aese	v30.16b,v2.16b
1627	aesmc	v30.16b,v30.16b
1628	aese	v31.16b,v2.16b
1629	aesmc	v31.16b,v31.16b
1630	aese	v8.16b,v2.16b
1631	aesmc	v8.16b,v8.16b
1632	aese	v9.16b,v2.16b
1633	aesmc	v9.16b,v9.16b
1634	aese	v10.16b,v2.16b
1635	aesmc	v10.16b,v10.16b
1636	aese	v11.16b,v2.16b
1637	aesmc	v11.16b,v11.16b
1638	ld1	{v2.4s},[x7],#16	// next round key into v2
1639	subs	w6,w6,#2	// two rounds per iteration
1640	aese	v24.16b,v3.16b
1641	aesmc	v24.16b,v24.16b
1642	aese	v25.16b,v3.16b
1643	aesmc	v25.16b,v25.16b
1644	aese	v26.16b,v3.16b
1645	aesmc	v26.16b,v26.16b
1646	aese	v27.16b,v3.16b
1647	aesmc	v27.16b,v27.16b
1648	aese	v28.16b,v3.16b
1649	aesmc	v28.16b,v28.16b
1650	aese	v29.16b,v3.16b
1651	aesmc	v29.16b,v29.16b
1652	aese	v30.16b,v3.16b
1653	aesmc	v30.16b,v30.16b
1654	aese	v31.16b,v3.16b
1655	aesmc	v31.16b,v31.16b
1656	aese	v8.16b,v3.16b
1657	aesmc	v8.16b,v8.16b
1658	aese	v9.16b,v3.16b
1659	aesmc	v9.16b,v9.16b
1660	aese	v10.16b,v3.16b
1661	aesmc	v10.16b,v10.16b
1662	aese	v11.16b,v3.16b
1663	aesmc	v11.16b,v11.16b
1664	ld1	{v3.4s},[x7],#16	// next round key into v3
1665	b.gt	.Loop12x_ctr32_unroll
1666
	// Remaining rounds, unrolled: keys keep alternating between v2 and v3.
1667	aese	v24.16b,v2.16b
1668	aesmc	v24.16b,v24.16b
1669	aese	v25.16b,v2.16b
1670	aesmc	v25.16b,v25.16b
1671	aese	v26.16b,v2.16b
1672	aesmc	v26.16b,v26.16b
1673	aese	v27.16b,v2.16b
1674	aesmc	v27.16b,v27.16b
1675	aese	v28.16b,v2.16b
1676	aesmc	v28.16b,v28.16b
1677	aese	v29.16b,v2.16b
1678	aesmc	v29.16b,v29.16b
1679	aese	v30.16b,v2.16b
1680	aesmc	v30.16b,v30.16b
1681	aese	v31.16b,v2.16b
1682	aesmc	v31.16b,v31.16b
1683	aese	v8.16b,v2.16b
1684	aesmc	v8.16b,v8.16b
1685	aese	v9.16b,v2.16b
1686	aesmc	v9.16b,v9.16b
1687	aese	v10.16b,v2.16b
1688	aesmc	v10.16b,v10.16b
1689	aese	v11.16b,v2.16b
1690	aesmc	v11.16b,v11.16b
1691	ld1	{v2.4s},[x7],#16
1692
1693	aese	v24.16b,v3.16b
1694	aesmc	v24.16b,v24.16b
1695	aese	v25.16b,v3.16b
1696	aesmc	v25.16b,v25.16b
1697	aese	v26.16b,v3.16b
1698	aesmc	v26.16b,v26.16b
1699	aese	v27.16b,v3.16b
1700	aesmc	v27.16b,v27.16b
1701	aese	v28.16b,v3.16b
1702	aesmc	v28.16b,v28.16b
1703	aese	v29.16b,v3.16b
1704	aesmc	v29.16b,v29.16b
1705	aese	v30.16b,v3.16b
1706	aesmc	v30.16b,v30.16b
1707	aese	v31.16b,v3.16b
1708	aesmc	v31.16b,v31.16b
1709	aese	v8.16b,v3.16b
1710	aesmc	v8.16b,v8.16b
1711	aese	v9.16b,v3.16b
1712	aesmc	v9.16b,v9.16b
1713	aese	v10.16b,v3.16b
1714	aesmc	v10.16b,v10.16b
1715	aese	v11.16b,v3.16b
1716	aesmc	v11.16b,v11.16b
1717	ld1	{v3.4s},[x7],#16
1718
	// This round is interleaved with computing the next twelve counter
	// values, w8+1 .. w8+12, each byte-reversed for insertion into lane 3.
1719	aese	v24.16b,v2.16b
1720	aesmc	v24.16b,v24.16b
1721	add	w9,w8,#1
1722	add	w10,w8,#2
1723	aese	v25.16b,v2.16b
1724	aesmc	v25.16b,v25.16b
1725	add	w12,w8,#3
1726	add	w11,w8,#4
1727	aese	v26.16b,v2.16b
1728	aesmc	v26.16b,v26.16b
1729	add	w13,w8,#5
1730	add	w14,w8,#6
1731	rev	w9,w9
1732	aese	v27.16b,v2.16b
1733	aesmc	v27.16b,v27.16b
1734	add	w15,w8,#7
1735	add	w19,w8,#8
1736	rev	w10,w10
1737	rev	w12,w12
1738	aese	v28.16b,v2.16b
1739	aesmc	v28.16b,v28.16b
1740	add	w20,w8,#9
1741	add	w21,w8,#10
1742	rev	w11,w11
1743	rev	w13,w13
1744	aese	v29.16b,v2.16b
1745	aesmc	v29.16b,v29.16b
1746	add	w22,w8,#11
1747	add	w23,w8,#12
1748	rev	w14,w14
1749	rev	w15,w15
1750	aese	v30.16b,v2.16b
1751	aesmc	v30.16b,v30.16b
1752	rev	w19,w19
1753	rev	w20,w20
1754	aese	v31.16b,v2.16b
1755	aesmc	v31.16b,v31.16b
1756	rev	w21,w21
1757	rev	w22,w22
1758	aese	v8.16b,v2.16b
1759	aesmc	v8.16b,v8.16b
1760	rev	w23,w23
1761	aese	v9.16b,v2.16b
1762	aesmc	v9.16b,v9.16b
1763	aese	v10.16b,v2.16b
1764	aesmc	v10.16b,v10.16b
1765	aese	v11.16b,v2.16b
1766	aesmc	v11.16b,v11.16b
1767	ld1	{v2.4s},[x7],#16
1768
	// This round is interleaved with loading the twelve input blocks
	// (3 x 64 bytes) into v4-v7, v16-v19 and v20-v23.
1769	aese	v24.16b,v3.16b
1770	aesmc	v24.16b,v24.16b
1771	aese	v25.16b,v3.16b
1772	aesmc	v25.16b,v25.16b
1773	aese	v26.16b,v3.16b
1774	aesmc	v26.16b,v26.16b
1775	aese	v27.16b,v3.16b
1776	aesmc	v27.16b,v27.16b
1777	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1778	aese	v28.16b,v3.16b
1779	aesmc	v28.16b,v28.16b
1780	aese	v29.16b,v3.16b
1781	aesmc	v29.16b,v29.16b
1782	aese	v30.16b,v3.16b
1783	aesmc	v30.16b,v30.16b
1784	aese	v31.16b,v3.16b
1785	aesmc	v31.16b,v31.16b
1786	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1787	aese	v8.16b,v3.16b
1788	aesmc	v8.16b,v8.16b
1789	aese	v9.16b,v3.16b
1790	aesmc	v9.16b,v9.16b
1791	aese	v10.16b,v3.16b
1792	aesmc	v10.16b,v10.16b
1793	aese	v11.16b,v3.16b
1794	aesmc	v11.16b,v11.16b
1795	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1796	ld1	{v3.4s},[x7],#16
1797
1798	mov	x7, x3	// rewind the key cursor for the next batch
1799	aese	v24.16b,v2.16b
1800	aesmc	v24.16b,v24.16b
1801	aese	v25.16b,v2.16b
1802	aesmc	v25.16b,v25.16b
1803	aese	v26.16b,v2.16b
1804	aesmc	v26.16b,v26.16b
1805	aese	v27.16b,v2.16b
1806	aesmc	v27.16b,v27.16b
1807	aese	v28.16b,v2.16b
1808	aesmc	v28.16b,v28.16b
1809	aese	v29.16b,v2.16b
1810	aesmc	v29.16b,v29.16b
1811	aese	v30.16b,v2.16b
1812	aesmc	v30.16b,v30.16b
1813	aese	v31.16b,v2.16b
1814	aesmc	v31.16b,v31.16b
1815	aese	v8.16b,v2.16b
1816	aesmc	v8.16b,v8.16b
1817	aese	v9.16b,v2.16b
1818	aesmc	v9.16b,v9.16b
1819	aese	v10.16b,v2.16b
1820	aesmc	v10.16b,v10.16b
1821	aese	v11.16b,v2.16b
1822	aesmc	v11.16b,v11.16b
1823	ld1	{v2.4s},[x7],#16	// re-pre-load rndkey[0]
1824
	// Final round: aese without aesmc, then one EOR3 (emitted as a raw .inst
	// word) combines input block, last round key v1 and the cipher state.
	// Each counter register is reset from the template v0 right after use.
1825	aese	v24.16b,v3.16b
1826.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
1827	orr	v24.16b,v0.16b,v0.16b
1828	aese	v25.16b,v3.16b
1829.inst	0xce0164a5	//eor3 v5.16b,v5.16b,v1.16b,v25.16b
1830	orr	v25.16b,v0.16b,v0.16b
1831	aese	v26.16b,v3.16b
1832.inst	0xce0168c6	//eor3 v6.16b,v6.16b,v1.16b,v26.16b
1833	orr	v26.16b,v0.16b,v0.16b
1834	aese	v27.16b,v3.16b
1835.inst	0xce016ce7	//eor3 v7.16b,v7.16b,v1.16b,v27.16b
1836	orr	v27.16b,v0.16b,v0.16b
1837	aese	v28.16b,v3.16b
1838.inst	0xce017210	//eor3 v16.16b,v16.16b,v1.16b,v28.16b
1839	orr	v28.16b,v0.16b,v0.16b
1840	aese	v29.16b,v3.16b
1841.inst	0xce017631	//eor3 v17.16b,v17.16b,v1.16b,v29.16b
1842	orr	v29.16b,v0.16b,v0.16b
1843	aese	v30.16b,v3.16b
1844.inst	0xce017a52	//eor3 v18.16b,v18.16b,v1.16b,v30.16b
1845	orr	v30.16b,v0.16b,v0.16b
1846	aese	v31.16b,v3.16b
1847.inst	0xce017e73	//eor3 v19.16b,v19.16b,v1.16b,v31.16b
1848	orr	v31.16b,v0.16b,v0.16b
1849	aese	v8.16b,v3.16b
1850.inst	0xce012294	//eor3 v20.16b,v20.16b,v1.16b,v8.16b
1851	orr	v8.16b,v0.16b,v0.16b
1852	aese	v9.16b,v3.16b
1853.inst	0xce0126b5	//eor3 v21.16b,v21.16b,v1.16b,v9.16b
1854	orr	v9.16b,v0.16b,v0.16b
1855	aese	v10.16b,v3.16b
1856.inst	0xce012ad6	//eor3 v22.16b,v22.16b,v1.16b,v10.16b
1857	orr	v10.16b,v0.16b,v0.16b
1858	aese	v11.16b,v3.16b
1859.inst	0xce012ef7	//eor3 v23.16b,v23.16b,v1.16b,v11.16b
1860	orr	v11.16b,v0.16b,v0.16b
1861	ld1	{v3.4s},[x7],#16	// re-pre-load rndkey[1]
1862
	// Insert the precomputed counters for the next batch and store the
	// twelve output blocks.
1863	mov	v24.s[3],w9
1864	mov	v25.s[3],w10
1865	mov	v26.s[3],w12
1866	mov	v27.s[3],w11
1867	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1868	mov	v28.s[3],w13
1869	mov	v29.s[3],w14
1870	mov	v30.s[3],w15
1871	mov	v31.s[3],w19
1872	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
1873	mov	v8.s[3],w20
1874	mov	v9.s[3],w21
1875	mov	v10.s[3],w22
1876	mov	v11.s[3],w23
1877	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1878
1879	mov	w6,w5	// reload the round counter
1880
1881	add	w8,w8,#12	// w8 = counter value now held in v11
1882	subs	x2,x2,#12
1883	b.hs	.Loop12x_ctr32_unroll	// at least 12 more blocks
1884
1885	// pop regs from stack when 12 data chunks are interleaved
1886	ldp	d10,d11,[sp],#32
1887	ldp	d8,d9,[sp],#32
1888	ldp	x23,x24,[sp],#16
1889	ldp	x21,x22,[sp],#16
1890	ldp	x19,x20,[sp],#16
1891
1892	add	x2,x2,#12	// remove the bias: x2 = blocks left
1893	cbz	x2,.Lctr32_done_unroll
1894	sub	w8,w8,#12	// undo the add w8,#12 done just before leaving the loop
1895
1896	cmp	x2,#2
1897	b.ls	.Lctr32_tail_unroll
1898
1899	cmp	x2,#6
1900	sub	x2,x2,#3		// bias
1901	add	w8,w8,#3	// w8 = counter value held in v26
1902	b.lo	.Loop3x_ctr32_unroll
1903
1904	sub	x2,x2,#3	// x2 biased by 6
1905	add	w8,w8,#3	// w8 = counter value held in v29
	// The flags are still those of cmp x2,#6 above. Taken or not, execution
	// continues at .Loop6x_ctr32_unroll, which is also the fall-through.
1906	b.lo	.Loop6x_ctr32_unroll
1907
// 6-block interleaved path. v24-v29 hold six counter blocks; on entry x2 is
// biased by 6 and w8 is the counter value held in v29. The labelled loop only
// iterates over pairs of AES rounds; the six-block batch itself runs once,
// after which the code exits, goes to the tail, or goes to the 3-block path.
1908.align	4
1909.Loop6x_ctr32_unroll:
1910	aese	v24.16b,v2.16b
1911	aesmc	v24.16b,v24.16b
1912	aese	v25.16b,v2.16b
1913	aesmc	v25.16b,v25.16b
1914	aese	v26.16b,v2.16b
1915	aesmc	v26.16b,v26.16b
1916	aese	v27.16b,v2.16b
1917	aesmc	v27.16b,v27.16b
1918	aese	v28.16b,v2.16b
1919	aesmc	v28.16b,v28.16b
1920	aese	v29.16b,v2.16b
1921	aesmc	v29.16b,v29.16b
1922	ld1	{v2.4s},[x7],#16	// next round key into v2
1923	subs	w6,w6,#2	// two rounds per iteration
1924	aese	v24.16b,v3.16b
1925	aesmc	v24.16b,v24.16b
1926	aese	v25.16b,v3.16b
1927	aesmc	v25.16b,v25.16b
1928	aese	v26.16b,v3.16b
1929	aesmc	v26.16b,v26.16b
1930	aese	v27.16b,v3.16b
1931	aesmc	v27.16b,v27.16b
1932	aese	v28.16b,v3.16b
1933	aesmc	v28.16b,v28.16b
1934	aese	v29.16b,v3.16b
1935	aesmc	v29.16b,v29.16b
1936	ld1	{v3.4s},[x7],#16	// next round key into v3
1937	b.gt	.Loop6x_ctr32_unroll
1938
	// Remaining rounds, unrolled.
1939	aese	v24.16b,v2.16b
1940	aesmc	v24.16b,v24.16b
1941	aese	v25.16b,v2.16b
1942	aesmc	v25.16b,v25.16b
1943	aese	v26.16b,v2.16b
1944	aesmc	v26.16b,v26.16b
1945	aese	v27.16b,v2.16b
1946	aesmc	v27.16b,v27.16b
1947	aese	v28.16b,v2.16b
1948	aesmc	v28.16b,v28.16b
1949	aese	v29.16b,v2.16b
1950	aesmc	v29.16b,v29.16b
1951	ld1	{v2.4s},[x7],#16
1952
1953	aese	v24.16b,v3.16b
1954	aesmc	v24.16b,v24.16b
1955	aese	v25.16b,v3.16b
1956	aesmc	v25.16b,v25.16b
1957	aese	v26.16b,v3.16b
1958	aesmc	v26.16b,v26.16b
1959	aese	v27.16b,v3.16b
1960	aesmc	v27.16b,v27.16b
1961	aese	v28.16b,v3.16b
1962	aesmc	v28.16b,v28.16b
1963	aese	v29.16b,v3.16b
1964	aesmc	v29.16b,v29.16b
1965	ld1	{v3.4s},[x7],#16
1966
	// Round interleaved with computing the next six counter values,
	// w8+1 .. w8+6, byte-reversed for insertion into lane 3.
1967	aese	v24.16b,v2.16b
1968	aesmc	v24.16b,v24.16b
1969	add	w9,w8,#1
1970	add	w10,w8,#2
1971	aese	v25.16b,v2.16b
1972	aesmc	v25.16b,v25.16b
1973	add	w12,w8,#3
1974	add	w11,w8,#4
1975	aese	v26.16b,v2.16b
1976	aesmc	v26.16b,v26.16b
1977	add	w13,w8,#5
1978	add	w14,w8,#6
1979	rev	w9,w9
1980	aese	v27.16b,v2.16b
1981	aesmc	v27.16b,v27.16b
1982	rev	w10,w10
1983	rev	w12,w12
1984	aese	v28.16b,v2.16b
1985	aesmc	v28.16b,v28.16b
1986	rev	w11,w11
1987	rev	w13,w13
1988	aese	v29.16b,v2.16b
1989	aesmc	v29.16b,v29.16b
1990	rev	w14,w14
1991	ld1	{v2.4s},[x7],#16
1992
	// Round interleaved with loading six input blocks (64 + 32 bytes).
1993	aese	v24.16b,v3.16b
1994	aesmc	v24.16b,v24.16b
1995	aese	v25.16b,v3.16b
1996	aesmc	v25.16b,v25.16b
1997	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1998	aese	v26.16b,v3.16b
1999	aesmc	v26.16b,v26.16b
2000	aese	v27.16b,v3.16b
2001	aesmc	v27.16b,v27.16b
2002	ld1	{v16.16b,v17.16b},[x0],#32
2003	aese	v28.16b,v3.16b
2004	aesmc	v28.16b,v28.16b
2005	aese	v29.16b,v3.16b
2006	aesmc	v29.16b,v29.16b
2007	ld1	{v3.4s},[x7],#16
2008
2009	mov	x7, x3	// rewind the key cursor
2010	aese	v24.16b,v2.16b
2011	aesmc	v24.16b,v24.16b
2012	aese	v25.16b,v2.16b
2013	aesmc	v25.16b,v25.16b
2014	aese	v26.16b,v2.16b
2015	aesmc	v26.16b,v26.16b
2016	aese	v27.16b,v2.16b
2017	aesmc	v27.16b,v27.16b
2018	aese	v28.16b,v2.16b
2019	aesmc	v28.16b,v28.16b
2020	aese	v29.16b,v2.16b
2021	aesmc	v29.16b,v29.16b
2022	ld1	{v2.4s},[x7],#16	// re-pre-load rndkey[0]
2023
	// Final round: aese without aesmc, then EOR3 (raw .inst word) of input
	// block, last round key v1 and cipher state.
2024	aese	v24.16b,v3.16b
2025.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
2026	aese	v25.16b,v3.16b
2027.inst	0xce0164a5	//eor3 v5.16b,v5.16b,v1.16b,v25.16b
2028	aese	v26.16b,v3.16b
2029.inst	0xce0168c6	//eor3 v6.16b,v6.16b,v1.16b,v26.16b
2030	aese	v27.16b,v3.16b
2031.inst	0xce016ce7	//eor3 v7.16b,v7.16b,v1.16b,v27.16b
2032	aese	v28.16b,v3.16b
2033.inst	0xce017210	//eor3 v16.16b,v16.16b,v1.16b,v28.16b
2034	aese	v29.16b,v3.16b
2035.inst	0xce017631	//eor3 v17.16b,v17.16b,v1.16b,v29.16b
2036	ld1	{v3.4s},[x7],#16	// re-pre-load rndkey[1]
2037
	// Reset the counter registers from the template v0 ...
2038	orr	v24.16b,v0.16b,v0.16b
2039	orr	v25.16b,v0.16b,v0.16b
2040	orr	v26.16b,v0.16b,v0.16b
2041	orr	v27.16b,v0.16b,v0.16b
2042	orr	v28.16b,v0.16b,v0.16b
2043	orr	v29.16b,v0.16b,v0.16b
2044
	// ... insert the next counter values and store the six output blocks.
2045	mov	v24.s[3],w9
2046	mov	v25.s[3],w10
2047	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
2048	mov	v26.s[3],w12
2049	mov	v27.s[3],w11
2050	st1	{v16.16b,v17.16b},[x1],#32
2051	mov	v28.s[3],w13
2052	mov	v29.s[3],w14
2053
2054	cbz	x2,.Lctr32_done_unroll	// x2 was biased by 6: zero means nothing is left
2055	mov	w6,w5	// reload the round counter
2056
2057	cmp	x2,#2
2058	b.ls	.Lctr32_tail_unroll
2059
2060	sub	x2,x2,#3		// bias
2061	add	w8,w8,#3	// w8 = counter value now held in v26
2062	b	.Loop3x_ctr32_unroll
2063
// 3-block interleaved path. v24-v26 hold three counter blocks; on entry x2 is
// biased by 3 and w8 is the counter value held in v26. The labelled loop
// iterates over pairs of AES rounds only; the three-block batch runs once and
// then either finishes or falls through into the tail.
2064.align	4
2065.Loop3x_ctr32_unroll:
2066	aese	v24.16b,v2.16b
2067	aesmc	v24.16b,v24.16b
2068	aese	v25.16b,v2.16b
2069	aesmc	v25.16b,v25.16b
2070	aese	v26.16b,v2.16b
2071	aesmc	v26.16b,v26.16b
2072	ld1	{v2.4s},[x7],#16	// next round key into v2
2073	subs	w6,w6,#2	// two rounds per iteration
2074	aese	v24.16b,v3.16b
2075	aesmc	v24.16b,v24.16b
2076	aese	v25.16b,v3.16b
2077	aesmc	v25.16b,v25.16b
2078	aese	v26.16b,v3.16b
2079	aesmc	v26.16b,v26.16b
2080	ld1	{v3.4s},[x7],#16	// next round key into v3
2081	b.gt	.Loop3x_ctr32_unroll
2082
	// Remaining rounds. The cipher state is moved into v9/v10/v11 so that
	// v24/v25/v26 can be reset from the template v0 while the rounds go on.
2083	aese	v24.16b,v2.16b
2084	aesmc	v9.16b,v24.16b
2085	aese	v25.16b,v2.16b
2086	aesmc	v10.16b,v25.16b
2087	ld1	{v4.16b,v5.16b,v6.16b},[x0],#48	// three input blocks
2088	orr	v24.16b,v0.16b,v0.16b
2089	aese	v26.16b,v2.16b
2090	aesmc	v26.16b,v26.16b
2091	ld1	{v2.4s},[x7],#16
2092	orr	v25.16b,v0.16b,v0.16b
2093	aese	v9.16b,v3.16b
2094	aesmc	v9.16b,v9.16b
2095	aese	v10.16b,v3.16b
2096	aesmc	v10.16b,v10.16b
2097	aese	v26.16b,v3.16b
2098	aesmc	v11.16b,v26.16b
2099	ld1	{v3.4s},[x7],#16
2100	orr	v26.16b,v0.16b,v0.16b
2101	add	w9,w8,#1	// next counter for v24
2102	aese	v9.16b,v2.16b
2103	aesmc	v9.16b,v9.16b
2104	aese	v10.16b,v2.16b
2105	aesmc	v10.16b,v10.16b
2106	add	w10,w8,#2	// next counter for v25
2107	aese	v11.16b,v2.16b
2108	aesmc	v11.16b,v11.16b
2109	ld1	{v2.4s},[x7],#16
2110	add	w8,w8,#3	// next counter for v26; also the new running value
2111	aese	v9.16b,v3.16b
2112	aesmc	v9.16b,v9.16b
2113	aese	v10.16b,v3.16b
2114	aesmc	v10.16b,v10.16b
2115
2116	rev	w9,w9
2117	aese	v11.16b,v3.16b
2118	aesmc	v11.16b,v11.16b
2119	ld1	{v3.4s},[x7],#16
2120	mov	v24.s[3], w9
2121	mov	x7,x3	// rewind the key cursor
2122	rev	w10,w10
2123	aese	v9.16b,v2.16b
2124	aesmc	v9.16b,v9.16b
2125
2126	aese	v10.16b,v2.16b
2127	aesmc	v10.16b,v10.16b
2128	mov	v25.s[3], w10
2129	rev	w12,w8
2130	aese	v11.16b,v2.16b
2131	aesmc	v11.16b,v11.16b
2132	mov	v26.s[3], w12
2133
	// Final round: aese without aesmc, then EOR3 (raw .inst word) of input
	// block, last round key v1 and cipher state.
2134	aese	v9.16b,v3.16b
2135	aese	v10.16b,v3.16b
2136	aese	v11.16b,v3.16b
2137
2138.inst	0xce012484	//eor3 v4.16b,v4.16b,v1.16b,v9.16b
2139	ld1	{v2.4s},[x7],#16	// re-pre-load rndkey[0]
2140.inst	0xce0128a5	//eor3 v5.16b,v5.16b,v1.16b,v10.16b
2141	mov	w6,w5	// reload the round counter
2142.inst	0xce012cc6	//eor3 v6.16b,v6.16b,v1.16b,v11.16b
2143	ld1	{v3.4s},[x7],#16	// re-pre-load rndkey[1]
2144	st1	{v4.16b,v5.16b,v6.16b},[x1],#48
2145
2146	cbz	x2,.Lctr32_done_unroll	// x2 was biased by 3: zero means nothing is left
2147
// Tail: x2 holds the unbiased number of blocks left. Exactly one block goes
// to .Lctr32_tail_1_unroll; anything else runs the two-block code below using
// the counter blocks in v24 and v25.
// NOTE(review): the function entry also branches here when x2 is 0 (b.ls
// after cmp x2,#2), and that case runs the two-block path - confirm callers
// never pass a zero block count.
2148.Lctr32_tail_unroll:
2149	cmp	x2,#1
2150	b.eq	.Lctr32_tail_1_unroll
2151
2152.Lctr32_tail_2_unroll:
2153	aese	v24.16b,v2.16b
2154	aesmc	v24.16b,v24.16b
2155	aese	v25.16b,v2.16b
2156	aesmc	v25.16b,v25.16b
2157	ld1	{v2.4s},[x7],#16	// next round key into v2
2158	subs	w6,w6,#2	// two rounds per iteration
2159	aese	v24.16b,v3.16b
2160	aesmc	v24.16b,v24.16b
2161	aese	v25.16b,v3.16b
2162	aesmc	v25.16b,v25.16b
2163	ld1	{v3.4s},[x7],#16	// next round key into v3
2164	b.gt	.Lctr32_tail_2_unroll
2165
	// Remaining rounds, unrolled, with the two input blocks loaded midway.
2166	aese	v24.16b,v2.16b
2167	aesmc	v24.16b,v24.16b
2168	aese	v25.16b,v2.16b
2169	aesmc	v25.16b,v25.16b
2170	ld1	{v2.4s},[x7],#16
2171	aese	v24.16b,v3.16b
2172	aesmc	v24.16b,v24.16b
2173	aese	v25.16b,v3.16b
2174	aesmc	v25.16b,v25.16b
2175	ld1	{v3.4s},[x7],#16
2176	ld1	{v4.16b,v5.16b},[x0],#32
2177	aese	v24.16b,v2.16b
2178	aesmc	v24.16b,v24.16b
2179	aese	v25.16b,v2.16b
2180	aesmc	v25.16b,v25.16b
2181	ld1	{v2.4s},[x7],#16
2182	aese	v24.16b,v3.16b
2183	aesmc	v24.16b,v24.16b
2184	aese	v25.16b,v3.16b
2185	aesmc	v25.16b,v25.16b
2186	ld1	{v3.4s},[x7],#16
2187	aese	v24.16b,v2.16b
2188	aesmc	v24.16b,v24.16b
2189	aese	v25.16b,v2.16b
2190	aesmc	v25.16b,v25.16b
2191	aese	v24.16b,v3.16b	// final round: no aesmc
2192	aese	v25.16b,v3.16b
2193
	// EOR3 (raw .inst word): input block ^ last round key v1 ^ cipher state.
2194.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
2195.inst	0xce0164a5	//eor3 v5.16b,v5.16b,v1.16b,v25.16b
2196	st1	{v4.16b,v5.16b},[x1],#32
2197	b	.Lctr32_done_unroll
2198
// Single-block tail: encrypts the counter block in v24 and XORs it with one
// 16-byte input block. The labelled loop handles two rounds per iteration;
// the remaining rounds follow unrolled.
2199.Lctr32_tail_1_unroll:
2200	aese	v24.16b,v2.16b
2201	aesmc	v24.16b,v24.16b
2202	ld1	{v2.4s},[x7],#16	// next round key into v2
2203	subs	w6,w6,#2	// two rounds per iteration
2204	aese	v24.16b,v3.16b
2205	aesmc	v24.16b,v24.16b
2206	ld1	{v3.4s},[x7],#16	// next round key into v3
2207	b.gt	.Lctr32_tail_1_unroll
2208
2209	aese	v24.16b,v2.16b
2210	aesmc	v24.16b,v24.16b
2211	ld1	{v2.4s},[x7],#16
2212	aese	v24.16b,v3.16b
2213	aesmc	v24.16b,v24.16b
2214	ld1	{v3.4s},[x7],#16
2215	ld1	{v4.16b},[x0]	// last input block; x0 is not advanced
2216	aese	v24.16b,v2.16b
2217	aesmc	v24.16b,v24.16b
2218	ld1	{v2.4s},[x7],#16
2219	aese	v24.16b,v3.16b
2220	aesmc	v24.16b,v24.16b
2221	ld1	{v3.4s},[x7],#16
2222	aese	v24.16b,v2.16b
2223	aesmc	v24.16b,v24.16b
2224	aese	v24.16b,v3.16b	// final round: no aesmc
2225
	// EOR3 (raw .inst word): input block ^ last round key v1 ^ cipher state.
2226.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
2227	st1	{v4.16b},[x1],#16
2228
// Common exit: reload d8-d15 from the 80-byte frame created at function
// entry, then pop the frame while restoring x29 only (x30 is not reloaded).
2229.Lctr32_done_unroll:
2230	ldp	d8,d9,[sp, #16]
2231	ldp	d10,d11,[sp, #32]
2232	ldp	d12,d13,[sp, #48]
2233	ldp	d14,d15,[sp, #64]
2234	ldr	x29,[sp],#80
2235	ret
2236.size	aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3
// aes_v8_ctr32_encrypt_blocks: CTR-mode keystream generation XORed with the
// input, five blocks at a time for long inputs, then three at a time, then a
// one-or-two block tail.
//
// Register use as established by the code in this function:
//   x0  input pointer, x1 output pointer (both advanced as data is consumed)
//   x2  number of 16-byte blocks still to do (biased inside the loops)
//   x3  key schedule; the 32-bit word at offset 240 seeds the round counter
//   x4  16-byte counter block; its last 32-bit word is the counter
//   w8  running counter value kept in a form that plain adds can increment
//   v6  untouched copy of the counter block, used as a template
//   v16/v17 round keys currently in use; v20-v23 and v7 the last five keys
//   x7  cursor over the round keys; w5/w6 round-pair loop counters
//   x12 post-increment for the tail's first load (16, or 0 for one block)
2237.globl	aes_v8_ctr32_encrypt_blocks
2238.type	aes_v8_ctr32_encrypt_blocks,%function
2239.align	5
2240aes_v8_ctr32_encrypt_blocks:
2241	AARCH64_VALID_CALL_TARGET
2242	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
2243	stp	x29,x30,[sp,#-16]!
2244	add	x29,sp,#0
2245	ldr	w5,[x3,#240]	// w5 = 32-bit word stored at key schedule + 240
2246
2247	ldr	w8, [x4, #12]	// w8 = last 32-bit word of the counter block
2248#ifdef __AARCH64EB__
2249	ld1	{v0.16b},[x4]
2250#else
2251	ld1	{v0.4s},[x4]
2252#endif
2253	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
2254	sub	w5,w5,#4
2255	mov	x12,#16
2256	cmp	x2,#2	// consumed by csel and b.ls below; nothing in between writes the flags
2257	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
2258	sub	w5,w5,#2
2259	ld1	{v20.4s,v21.4s},[x7],#32
2260	ld1	{v22.4s,v23.4s},[x7],#32
2261	ld1	{v7.4s},[x7]	// v7 = last round key
2262	add	x7,x3,#32	// x7 -> third round key; v16/v17 already hold the first two
2263	mov	w6,w5	// w6 = working copy of the round counter
2264	csel	x12,xzr,x12,lo	// x12 = 0 when fewer than 2 blocks, else 16
2265#ifndef __AARCH64EB__
2266	rev	w8, w8
2267#endif
	// v0 = counter, v1 = counter+1, v18 = counter+2 (lane 3 of v18 is only
	// filled in on the non-tail path); v6 keeps a template copy.
2268	orr	v1.16b,v0.16b,v0.16b
2269	add	w10, w8, #1
2270	orr	v18.16b,v0.16b,v0.16b
2271	add	w8, w8, #2
2272	orr	v6.16b,v0.16b,v0.16b
2273	rev	w10, w10
2274	mov	v1.s[3],w10
2275	b.ls	.Lctr32_tail	// 2 blocks or fewer (flags from cmp x2,#2)
2276	rev	w12, w8	// w12 reused as scratch; x12 is set again before each later tail entry
2277	sub	x2,x2,#3		// bias
2278	mov	v18.s[3],w12
2279	cmp	x2,#32	// x2 is biased by 3: the 5-block path needs at least 35 blocks
2280	b.lo	.Loop3x_ctr32
2281
	// v24 = counter+3, v25 = counter+4; w8 ends up as counter+4.
2282	add	w13,w8,#1
2283	add	w14,w8,#2
2284	orr	v24.16b,v0.16b,v0.16b
2285	rev	w13,w13
2286	orr	v25.16b,v0.16b,v0.16b
2287	rev	w14,w14
2288	mov	v24.s[3],w13
2289	sub	x2,x2,#2		// bias
2290	mov	v25.s[3],w14
2291	add	w8,w8,#2
2292	b	.Loop5x_ctr32
2293
// 5-block interleaved path. v0, v1, v18, v24, v25 hold five counter blocks;
// on entry x2 is biased by 5 and w8 is the counter value held in v25. The
// labelled loop applies two AES rounds per iteration (keys in v16, then v17)
// and fetches the next two round keys through x7; it is also re-entered from
// the bottom of this block for each further batch of five.
2294.align	4
2295.Loop5x_ctr32:
2296	aese	v0.16b,v16.16b
2297	aesmc	v0.16b,v0.16b
2298	aese	v1.16b,v16.16b
2299	aesmc	v1.16b,v1.16b
2300	aese	v18.16b,v16.16b
2301	aesmc	v18.16b,v18.16b
2302	aese	v24.16b,v16.16b
2303	aesmc	v24.16b,v24.16b
2304	aese	v25.16b,v16.16b
2305	aesmc	v25.16b,v25.16b
2306	ld1	{v16.4s},[x7],#16	// next round key into v16
2307	subs	w6,w6,#2	// two rounds per iteration
2308	aese	v0.16b,v17.16b
2309	aesmc	v0.16b,v0.16b
2310	aese	v1.16b,v17.16b
2311	aesmc	v1.16b,v1.16b
2312	aese	v18.16b,v17.16b
2313	aesmc	v18.16b,v18.16b
2314	aese	v24.16b,v17.16b
2315	aesmc	v24.16b,v24.16b
2316	aese	v25.16b,v17.16b
2317	aesmc	v25.16b,v25.16b
2318	ld1	{v17.4s},[x7],#16	// next round key into v17
2319	b.gt	.Loop5x_ctr32
2320
2321	mov	x7,x3	// rewind the key cursor
2322	aese	v0.16b,v16.16b
2323	aesmc	v0.16b,v0.16b
2324	aese	v1.16b,v16.16b
2325	aesmc	v1.16b,v1.16b
2326	aese	v18.16b,v16.16b
2327	aesmc	v18.16b,v18.16b
2328	aese	v24.16b,v16.16b
2329	aesmc	v24.16b,v24.16b
2330	aese	v25.16b,v16.16b
2331	aesmc	v25.16b,v25.16b
2332	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
2333
2334	aese	v0.16b,v17.16b
2335	aesmc	v0.16b,v0.16b
2336	aese	v1.16b,v17.16b
2337	aesmc	v1.16b,v1.16b
2338	aese	v18.16b,v17.16b
2339	aesmc	v18.16b,v18.16b
2340	aese	v24.16b,v17.16b
2341	aesmc	v24.16b,v24.16b
2342	aese	v25.16b,v17.16b
2343	aesmc	v25.16b,v25.16b
2344	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
2345
	// Rounds with the preloaded keys v20..v23. This one is interleaved with
	// computing the next five counter values, w8+1 .. w8+5, byte-reversed
	// for insertion into lane 3.
2346	aese	v0.16b,v20.16b
2347	aesmc	v0.16b,v0.16b
2348	add	w9,w8,#1
2349	add	w10,w8,#2
2350	aese	v1.16b,v20.16b
2351	aesmc	v1.16b,v1.16b
2352	add	w12,w8,#3
2353	add	w13,w8,#4
2354	aese	v18.16b,v20.16b
2355	aesmc	v18.16b,v18.16b
2356	add	w14,w8,#5
2357	rev	w9,w9
2358	aese	v24.16b,v20.16b
2359	aesmc	v24.16b,v24.16b
2360	rev	w10,w10
2361	rev	w12,w12
2362	aese	v25.16b,v20.16b
2363	aesmc	v25.16b,v25.16b
2364	rev	w13,w13
2365	rev	w14,w14
2366
2367	aese	v0.16b,v21.16b
2368	aesmc	v0.16b,v0.16b
2369	aese	v1.16b,v21.16b
2370	aesmc	v1.16b,v1.16b
2371	aese	v18.16b,v21.16b
2372	aesmc	v18.16b,v18.16b
2373	aese	v24.16b,v21.16b
2374	aesmc	v24.16b,v24.16b
2375	aese	v25.16b,v21.16b
2376	aesmc	v25.16b,v25.16b
2377
	// Round interleaved with loading five 16-byte input blocks.
2378	aese	v0.16b,v22.16b
2379	aesmc	v0.16b,v0.16b
2380	ld1	{v2.16b},[x0],#16
2381	aese	v1.16b,v22.16b
2382	aesmc	v1.16b,v1.16b
2383	ld1	{v3.16b},[x0],#16
2384	aese	v18.16b,v22.16b
2385	aesmc	v18.16b,v18.16b
2386	ld1	{v19.16b},[x0],#16
2387	aese	v24.16b,v22.16b
2388	aesmc	v24.16b,v24.16b
2389	ld1	{v26.16b},[x0],#16
2390	aese	v25.16b,v22.16b
2391	aesmc	v25.16b,v25.16b
2392	ld1	{v27.16b},[x0],#16
2393
	// Final round (aese without aesmc), interleaved with XORing the last
	// round key v7 into the input blocks.
2394	aese	v0.16b,v23.16b
2395	eor	v2.16b,v2.16b,v7.16b
2396	aese	v1.16b,v23.16b
2397	eor	v3.16b,v3.16b,v7.16b
2398	aese	v18.16b,v23.16b
2399	eor	v19.16b,v19.16b,v7.16b
2400	aese	v24.16b,v23.16b
2401	eor	v26.16b,v26.16b,v7.16b
2402	aese	v25.16b,v23.16b
2403	eor	v27.16b,v27.16b,v7.16b
2404
	// Combine with the cipher state and reset each counter register from
	// the template v6.
2405	eor	v2.16b,v2.16b,v0.16b
2406	orr	v0.16b,v6.16b,v6.16b
2407	eor	v3.16b,v3.16b,v1.16b
2408	orr	v1.16b,v6.16b,v6.16b
2409	eor	v19.16b,v19.16b,v18.16b
2410	orr	v18.16b,v6.16b,v6.16b
2411	eor	v26.16b,v26.16b,v24.16b
2412	orr	v24.16b,v6.16b,v6.16b
2413	eor	v27.16b,v27.16b,v25.16b
2414	orr	v25.16b,v6.16b,v6.16b
2415
	// Store the five output blocks and insert the next counter values.
2416	st1	{v2.16b},[x1],#16
2417	mov	v0.s[3],w9
2418	st1	{v3.16b},[x1],#16
2419	mov	v1.s[3],w10
2420	st1	{v19.16b},[x1],#16
2421	mov	v18.s[3],w12
2422	st1	{v26.16b},[x1],#16
2423	mov	v24.s[3],w13
2424	st1	{v27.16b},[x1],#16
2425	mov	v25.s[3],w14
2426
2427	mov	w6,w5	// reload the round counter
2428	cbz	x2,.Lctr32_done	// x2 was biased by 5: zero means nothing is left
2429
2430	add	w8,w8,#5	// w8 = counter value now held in v25
2431	subs	x2,x2,#5
2432	b.hs	.Loop5x_ctr32	// at least 5 more blocks
2433
2434	add	x2,x2,#5	// remove the bias: x2 = blocks left
2435	sub	w8,w8,#5	// undo the add w8,#5 above
2436
2437	cmp	x2,#2
2438	mov	x12,#16
2439	csel	x12,xzr,x12,lo	// x12 = 0 when fewer than 2 blocks are left, else 16
2440	b.ls	.Lctr32_tail
2441
2442	sub	x2,x2,#3		// bias
2443	add	w8,w8,#3	// w8 = counter value held in v18
2444	b	.Loop3x_ctr32
2445
// 3-block interleaved path. v0, v1, v18 hold three counter blocks; on entry
// x2 is biased by 3 and w8 is the counter value held in v18. The labelled
// loop applies two AES rounds per iteration and is re-entered from the bottom
// of this block while at least three more blocks remain.
2446.align	4
2447.Loop3x_ctr32:
2448	aese	v0.16b,v16.16b
2449	aesmc	v0.16b,v0.16b
2450	aese	v1.16b,v16.16b
2451	aesmc	v1.16b,v1.16b
2452	aese	v18.16b,v16.16b
2453	aesmc	v18.16b,v18.16b
2454	ld1	{v16.4s},[x7],#16	// next round key into v16
2455	subs	w6,w6,#2	// two rounds per iteration
2456	aese	v0.16b,v17.16b
2457	aesmc	v0.16b,v0.16b
2458	aese	v1.16b,v17.16b
2459	aesmc	v1.16b,v1.16b
2460	aese	v18.16b,v17.16b
2461	aesmc	v18.16b,v18.16b
2462	ld1	{v17.4s},[x7],#16	// next round key into v17
2463	b.gt	.Loop3x_ctr32
2464
	// Remaining rounds. The cipher state moves into v4, v5 and v17 so that
	// v0, v1 and v18 can be reset from the template v6 early.
2465	aese	v0.16b,v16.16b
2466	aesmc	v4.16b,v0.16b
2467	aese	v1.16b,v16.16b
2468	aesmc	v5.16b,v1.16b
2469	ld1	{v2.16b},[x0],#16
2470	orr	v0.16b,v6.16b,v6.16b
2471	aese	v18.16b,v16.16b
2472	aesmc	v18.16b,v18.16b
2473	ld1	{v3.16b},[x0],#16
2474	orr	v1.16b,v6.16b,v6.16b
2475	aese	v4.16b,v17.16b
2476	aesmc	v4.16b,v4.16b
2477	aese	v5.16b,v17.16b
2478	aesmc	v5.16b,v5.16b
2479	ld1	{v19.16b},[x0],#16
2480	mov	x7,x3	// rewind the key cursor
2481	aese	v18.16b,v17.16b
2482	aesmc	v17.16b,v18.16b	// v17 now carries cipher state; it is reloaded as a key below
2483	orr	v18.16b,v6.16b,v6.16b
2484	add	w9,w8,#1	// next counter for v0
2485	aese	v4.16b,v20.16b
2486	aesmc	v4.16b,v4.16b
2487	aese	v5.16b,v20.16b
2488	aesmc	v5.16b,v5.16b
2489	eor	v2.16b,v2.16b,v7.16b	// pre-XOR the input with the last round key
2490	add	w10,w8,#2	// next counter for v1
2491	aese	v17.16b,v20.16b
2492	aesmc	v17.16b,v17.16b
2493	eor	v3.16b,v3.16b,v7.16b
2494	add	w8,w8,#3	// next counter for v18; also the new running value
2495	aese	v4.16b,v21.16b
2496	aesmc	v4.16b,v4.16b
2497	aese	v5.16b,v21.16b
2498	aesmc	v5.16b,v5.16b
2499	eor	v19.16b,v19.16b,v7.16b
2500	rev	w9,w9
2501	aese	v17.16b,v21.16b
2502	aesmc	v17.16b,v17.16b
2503	mov	v0.s[3], w9
2504	rev	w10,w10
2505	aese	v4.16b,v22.16b
2506	aesmc	v4.16b,v4.16b
2507	aese	v5.16b,v22.16b
2508	aesmc	v5.16b,v5.16b
2509	mov	v1.s[3], w10
2510	rev	w12,w8
2511	aese	v17.16b,v22.16b
2512	aesmc	v17.16b,v17.16b
2513	mov	v18.s[3], w12
2514	subs	x2,x2,#3	// flags consumed by the b.hs at the bottom; nothing in between writes them
2515	aese	v4.16b,v23.16b	// final round: no aesmc
2516	aese	v5.16b,v23.16b
2517	aese	v17.16b,v23.16b
2518
2519	eor	v2.16b,v2.16b,v4.16b
2520	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
2521	st1	{v2.16b},[x1],#16
2522	eor	v3.16b,v3.16b,v5.16b
2523	mov	w6,w5	// reload the round counter
2524	st1	{v3.16b},[x1],#16
2525	eor	v19.16b,v19.16b,v17.16b
2526	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
2527	st1	{v19.16b},[x1],#16
2528	b.hs	.Loop3x_ctr32	// at least 3 more blocks
2529
2530	adds	x2,x2,#3	// remove the bias: x2 = blocks left
2531	b.eq	.Lctr32_done
2532	cmp	x2,#1
2533	mov	x12,#16
2534	csel	x12,xzr,x12,eq	// x12 = 0 when exactly one block is left, else 16
2535
// Tail for the last one or two blocks. Keystream is always computed for both
// v0 and v1; x2 (unbiased block count) decides whether the second output
// block is stored, and x12 (0 or 16, set before every branch here) is the
// post-increment applied after loading the first input block.
// NOTE(review): the function entry also branches here when x2 is 0 (b.ls
// after cmp x2,#2); the cmp x2,#1 below then stores both blocks - confirm
// callers never pass a zero block count.
2536.Lctr32_tail:
2537	aese	v0.16b,v16.16b
2538	aesmc	v0.16b,v0.16b
2539	aese	v1.16b,v16.16b
2540	aesmc	v1.16b,v1.16b
2541	ld1	{v16.4s},[x7],#16	// next round key into v16
2542	subs	w6,w6,#2	// two rounds per iteration
2543	aese	v0.16b,v17.16b
2544	aesmc	v0.16b,v0.16b
2545	aese	v1.16b,v17.16b
2546	aesmc	v1.16b,v1.16b
2547	ld1	{v17.4s},[x7],#16	// next round key into v17
2548	b.gt	.Lctr32_tail
2549
	// Remaining rounds: v16, v17, then the preloaded keys v20..v23.
2550	aese	v0.16b,v16.16b
2551	aesmc	v0.16b,v0.16b
2552	aese	v1.16b,v16.16b
2553	aesmc	v1.16b,v1.16b
2554	aese	v0.16b,v17.16b
2555	aesmc	v0.16b,v0.16b
2556	aese	v1.16b,v17.16b
2557	aesmc	v1.16b,v1.16b
2558	ld1	{v2.16b},[x0],x12	// x12 == 0 keeps x0 in place, so the next load re-reads this block
2559	aese	v0.16b,v20.16b
2560	aesmc	v0.16b,v0.16b
2561	aese	v1.16b,v20.16b
2562	aesmc	v1.16b,v1.16b
2563	ld1	{v3.16b},[x0]
2564	aese	v0.16b,v21.16b
2565	aesmc	v0.16b,v0.16b
2566	aese	v1.16b,v21.16b
2567	aesmc	v1.16b,v1.16b
2568	eor	v2.16b,v2.16b,v7.16b	// pre-XOR the input with the last round key
2569	aese	v0.16b,v22.16b
2570	aesmc	v0.16b,v0.16b
2571	aese	v1.16b,v22.16b
2572	aesmc	v1.16b,v1.16b
2573	eor	v3.16b,v3.16b,v7.16b
2574	aese	v0.16b,v23.16b	// final round: no aesmc
2575	aese	v1.16b,v23.16b
2576
2577	cmp	x2,#1
2578	eor	v2.16b,v2.16b,v0.16b
2579	eor	v3.16b,v3.16b,v1.16b
2580	st1	{v2.16b},[x1],#16
2581	b.eq	.Lctr32_done	// exactly one block: skip the second store
2582	st1	{v3.16b},[x1]
2583
// Common exit: pop the 16-byte frame created at function entry, restoring
// x29 only (x30 is not reloaded), and return.
2584.Lctr32_done:
2585	ldr	x29,[sp],#16
2586	ret
2587.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
2588.globl	aes_v8_xts_encrypt
2589.type	aes_v8_xts_encrypt,%function
2590.align	5
// aes_v8_xts_encrypt: AES-XTS encryption with the ARMv8 AES instructions.
//
// Registers read on entry (as used by the code below):
//   x0 = input pointer            x1 = output pointer
//   x2 = length in bytes          x3 = key schedule for the data blocks
//   x4 = key schedule used to encrypt the iv ("key2")
//   x5 = pointer to the 16-byte iv
// Both key schedules keep their round count at byte offset 240.
//
// A length of exactly 16 bytes takes a single-block path that does not
// touch the stack. Every other length goes through .Lxts_enc_big_size,
// which saves x19-x22 and d8-d11 in a 64-byte frame, returns without
// writing output when fewer than 16 bytes are supplied, encrypts whole
// blocks up to five at a time, and finishes a trailing partial block
// (length%16 != 0) with cipher stealing.
// The argument registers x0-x5 are modified along the way.
2591aes_v8_xts_encrypt:
2592	AARCH64_VALID_CALL_TARGET
2593	cmp	x2,#16
2594	// Input size is not exactly 16 bytes, jump to big size processing.
2595	b.ne	.Lxts_enc_big_size
2596	// Encrypt the iv with key2, as the first XEX iv.
2597	ldr	w6,[x4,#240]
2598	ld1	{v0.4s},[x4],#16
2599	ld1	{v6.16b},[x5]
2600	sub	w6,w6,#2
2601	ld1	{v1.4s},[x4],#16
2602
2603.Loop_enc_iv_enc:
2604	aese	v6.16b,v0.16b
2605	aesmc	v6.16b,v6.16b
2606	ld1	{v0.4s},[x4],#16
2607	subs	w6,w6,#2
2608	aese	v6.16b,v1.16b
2609	aesmc	v6.16b,v6.16b
2610	ld1	{v1.4s},[x4],#16
2611	b.gt	.Loop_enc_iv_enc
2612
2613	aese	v6.16b,v0.16b
2614	aesmc	v6.16b,v6.16b
2615	ld1	{v0.4s},[x4]
2616	aese	v6.16b,v1.16b
2617	eor	v6.16b,v6.16b,v0.16b
2618
2619	ld1	{v0.16b},[x0]	// the single input block
2620	eor	v0.16b,v6.16b,v0.16b	// xor with the encrypted iv before encryption
2621
2622	ldr	w6,[x3,#240]
2623	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...
2624
2625	aese	v0.16b,v28.16b
2626	aesmc	v0.16b,v0.16b
2627	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
2628	aese	v0.16b,v29.16b
2629	aesmc	v0.16b,v0.16b
2630	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
2631	b.eq	.Lxts_128_enc
2632.Lxts_enc_round_loop:
2633	aese	v0.16b,v16.16b
2634	aesmc	v0.16b,v0.16b
2635	ld1	{v16.4s},[x3],#16		// load key schedule...
2636	aese	v0.16b,v17.16b
2637	aesmc	v0.16b,v0.16b
2638	ld1	{v17.4s},[x3],#16		// load key schedule...
2639	subs	w6,w6,#2		// bias
2640	b.gt	.Lxts_enc_round_loop
2641.Lxts_128_enc:
2642	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
2643	aese	v0.16b,v16.16b
2644	aesmc	v0.16b,v0.16b
2645	aese	v0.16b,v17.16b
2646	aesmc	v0.16b,v0.16b
2647	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
2648	aese	v0.16b,v18.16b
2649	aesmc	v0.16b,v0.16b
2650	aese	v0.16b,v19.16b
2651	aesmc	v0.16b,v0.16b
2652	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
2653	aese	v0.16b,v20.16b
2654	aesmc	v0.16b,v0.16b
2655	aese	v0.16b,v21.16b
2656	aesmc	v0.16b,v0.16b
2657	ld1	{v7.4s},[x3]
2658	aese	v0.16b,v22.16b
2659	aesmc	v0.16b,v0.16b
2660	aese	v0.16b,v23.16b
2661	eor	v0.16b,v0.16b,v7.16b	// last round key
2662	eor	v0.16b,v0.16b,v6.16b	// xor with the encrypted iv again
2663	st1	{v0.16b},[x1]
2664	b	.Lxts_enc_final_abort	// nothing was saved on this path, plain ret
2665
2666.align	4
2667.Lxts_enc_big_size:
	// Save the callee-saved registers used below in a 64-byte frame.
2668	stp	x19,x20,[sp,#-64]!
2669	stp	x21,x22,[sp,#48]
2670	stp	d8,d9,[sp,#32]
2671	stp	d10,d11,[sp,#16]
2672
2673	// tailcnt (x21) stores the tail value of length%16.
2674	and	x21,x2,#0xf
2675	and	x2,x2,#-16	// length rounded down to whole blocks
2676	subs	x2,x2,#16
2677	mov	x8,#16
2678	b.lo	.Lxts_abort	// fewer than 16 bytes of input, nothing is written
2679	csel	x8,xzr,x8,eq	// x8 = 0 when exactly one whole block, else 16
2680
2681	// Firstly, encrypt the iv with key2, as the first iv of XEX.
2682	ldr	w6,[x4,#240]
2683	ld1	{v0.4s},[x4],#16
2684	ld1	{v6.16b},[x5]
2685	sub	w6,w6,#2
2686	ld1	{v1.4s},[x4],#16
2687
2688.Loop_iv_enc:
2689	aese	v6.16b,v0.16b
2690	aesmc	v6.16b,v6.16b
2691	ld1	{v0.4s},[x4],#16
2692	subs	w6,w6,#2
2693	aese	v6.16b,v1.16b
2694	aesmc	v6.16b,v6.16b
2695	ld1	{v1.4s},[x4],#16
2696	b.gt	.Loop_iv_enc
2697
2698	aese	v6.16b,v0.16b
2699	aesmc	v6.16b,v6.16b
2700	ld1	{v0.4s},[x4]
2701	aese	v6.16b,v1.16b
2702	eor	v6.16b,v6.16b,v0.16b
2703
2704	// The iv for second block
2705	// x9- iv(low), x10 - iv(high)
2706	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	// Each iv is derived from the previous one: the 128-bit value x10:x9 is
	// shifted left by one bit, and 0x87 (kept in w19) is xored into the low
	// half when the bit shifted out of x10 was set. The same six-instruction
	// sequence is repeated for every following iv.
2707	fmov	x9,d6
2708	fmov	x10,v6.d[1]
2709	mov	w19,#0x87
2710	extr	x22,x10,x10,#32
2711	extr	x10,x10,x9,#63
2712	and	w11,w19,w22,asr#31
2713	eor	x9,x11,x9,lsl#1
2714	fmov	d8,x9
2715	fmov	v8.d[1],x10
2716
2717	ldr	w5,[x3,#240]		// next starting point
2718	ld1	{v0.16b},[x0],x8	// first block, input advances by x8 (0 or 16)
2719
2720	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
2721	sub	w5,w5,#6
2722	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
2723	sub	w5,w5,#2
2724	ld1	{v18.4s,v19.4s},[x7],#32
2725	ld1	{v20.4s,v21.4s},[x7],#32
2726	ld1	{v22.4s,v23.4s},[x7],#32
2727	ld1	{v7.4s},[x7]
2728
2729	add	x7,x3,#32
2730	mov	w6,w5
2731
2732	// Encryption
2733.Lxts_enc:
2734	ld1	{v24.16b},[x0],#16
2735	subs	x2,x2,#32			// bias
2736	add	w6,w5,#2
2737	orr	v3.16b,v0.16b,v0.16b
2738	orr	v1.16b,v0.16b,v0.16b
2739	orr	v28.16b,v0.16b,v0.16b
2740	orr	v27.16b,v24.16b,v24.16b
2741	orr	v29.16b,v24.16b,v24.16b
2742	b.lo	.Lxts_inner_enc_tail	// fewer than three whole blocks
2743	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
2744	eor	v24.16b,v24.16b,v8.16b
2745
2746	// The iv for third block
2747	extr	x22,x10,x10,#32
2748	extr	x10,x10,x9,#63
2749	and	w11,w19,w22,asr#31
2750	eor	x9,x11,x9,lsl#1
2751	fmov	d9,x9
2752	fmov	v9.d[1],x10
2753
2754
2755	orr	v1.16b,v24.16b,v24.16b
2756	ld1	{v24.16b},[x0],#16
2757	orr	v2.16b,v0.16b,v0.16b
2758	orr	v3.16b,v1.16b,v1.16b
2759	eor	v27.16b,v24.16b,v9.16b 		// the third block
2760	eor	v24.16b,v24.16b,v9.16b
2761	cmp	x2,#32
2762	b.lo	.Lxts_outer_enc_tail	// fewer than five whole blocks
2763
2764	// The iv for fourth block
2765	extr	x22,x10,x10,#32
2766	extr	x10,x10,x9,#63
2767	and	w11,w19,w22,asr#31
2768	eor	x9,x11,x9,lsl#1
2769	fmov	d10,x9
2770	fmov	v10.d[1],x10
2771
2772	ld1	{v25.16b},[x0],#16
2773	// The iv for fifth block
2774	extr	x22,x10,x10,#32
2775	extr	x10,x10,x9,#63
2776	and	w11,w19,w22,asr#31
2777	eor	x9,x11,x9,lsl#1
2778	fmov	d11,x9
2779	fmov	v11.d[1],x10
2780
2781	ld1	{v26.16b},[x0],#16
2782	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2783	eor	v26.16b,v26.16b,v11.16b
2784	sub	x2,x2,#32			// bias
2785	mov	w6,w5
2786	b	.Loop5x_xts_enc
2787
	// Main loop: five blocks held in v0, v1, v24, v25 and v26 are run
	// through the rounds together, with round keys streamed through
	// v16/v17 and the last seven round keys kept in v18-v23 and v7.
2788.align	4
2789.Loop5x_xts_enc:
2790	aese	v0.16b,v16.16b
2791	aesmc	v0.16b,v0.16b
2792	aese	v1.16b,v16.16b
2793	aesmc	v1.16b,v1.16b
2794	aese	v24.16b,v16.16b
2795	aesmc	v24.16b,v24.16b
2796	aese	v25.16b,v16.16b
2797	aesmc	v25.16b,v25.16b
2798	aese	v26.16b,v16.16b
2799	aesmc	v26.16b,v26.16b
2800	ld1	{v16.4s},[x7],#16
2801	subs	w6,w6,#2
2802	aese	v0.16b,v17.16b
2803	aesmc	v0.16b,v0.16b
2804	aese	v1.16b,v17.16b
2805	aesmc	v1.16b,v1.16b
2806	aese	v24.16b,v17.16b
2807	aesmc	v24.16b,v24.16b
2808	aese	v25.16b,v17.16b
2809	aesmc	v25.16b,v25.16b
2810	aese	v26.16b,v17.16b
2811	aesmc	v26.16b,v26.16b
2812	ld1	{v17.4s},[x7],#16
2813	b.gt	.Loop5x_xts_enc
2814
2815	aese	v0.16b,v16.16b
2816	aesmc	v0.16b,v0.16b
2817	aese	v1.16b,v16.16b
2818	aesmc	v1.16b,v1.16b
2819	aese	v24.16b,v16.16b
2820	aesmc	v24.16b,v24.16b
2821	aese	v25.16b,v16.16b
2822	aesmc	v25.16b,v25.16b
2823	aese	v26.16b,v16.16b
2824	aesmc	v26.16b,v26.16b
2825	subs	x2,x2,#0x50			// because .Lxts_enc_tail4x
	// The flags set by the subs above stay live until the csel and the
	// b.hs at the bottom of the loop; nothing in between sets flags.
2826
2827	aese	v0.16b,v17.16b
2828	aesmc	v0.16b,v0.16b
2829	aese	v1.16b,v17.16b
2830	aesmc	v1.16b,v1.16b
2831	aese	v24.16b,v17.16b
2832	aesmc	v24.16b,v24.16b
2833	aese	v25.16b,v17.16b
2834	aesmc	v25.16b,v25.16b
2835	aese	v26.16b,v17.16b
2836	aesmc	v26.16b,v26.16b
2837	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2838	mov	x7,x3
2839
2840	aese	v0.16b,v18.16b
2841	aesmc	v0.16b,v0.16b
2842	aese	v1.16b,v18.16b
2843	aesmc	v1.16b,v1.16b
2844	aese	v24.16b,v18.16b
2845	aesmc	v24.16b,v24.16b
2846	aese	v25.16b,v18.16b
2847	aesmc	v25.16b,v25.16b
2848	aese	v26.16b,v18.16b
2849	aesmc	v26.16b,v26.16b
2850	add	x0,x0,x6		// x0 is adjusted in such way that
2851						// at exit from the loop v1.16b-v26.16b
2852						// are loaded with last "words"
2853	add	x6,x2,#0x60		// because .Lxts_enc_tail4x
2854
2855	aese	v0.16b,v19.16b
2856	aesmc	v0.16b,v0.16b
2857	aese	v1.16b,v19.16b
2858	aesmc	v1.16b,v1.16b
2859	aese	v24.16b,v19.16b
2860	aesmc	v24.16b,v24.16b
2861	aese	v25.16b,v19.16b
2862	aesmc	v25.16b,v25.16b
2863	aese	v26.16b,v19.16b
2864	aesmc	v26.16b,v26.16b
2865
2866	aese	v0.16b,v20.16b
2867	aesmc	v0.16b,v0.16b
2868	aese	v1.16b,v20.16b
2869	aesmc	v1.16b,v1.16b
2870	aese	v24.16b,v20.16b
2871	aesmc	v24.16b,v24.16b
2872	aese	v25.16b,v20.16b
2873	aesmc	v25.16b,v25.16b
2874	aese	v26.16b,v20.16b
2875	aesmc	v26.16b,v26.16b
2876
2877	aese	v0.16b,v21.16b
2878	aesmc	v0.16b,v0.16b
2879	aese	v1.16b,v21.16b
2880	aesmc	v1.16b,v1.16b
2881	aese	v24.16b,v21.16b
2882	aesmc	v24.16b,v24.16b
2883	aese	v25.16b,v21.16b
2884	aesmc	v25.16b,v25.16b
2885	aese	v26.16b,v21.16b
2886	aesmc	v26.16b,v26.16b
2887
2888	aese	v0.16b,v22.16b
2889	aesmc	v0.16b,v0.16b
2890	aese	v1.16b,v22.16b
2891	aesmc	v1.16b,v1.16b
2892	aese	v24.16b,v22.16b
2893	aesmc	v24.16b,v24.16b
2894	aese	v25.16b,v22.16b
2895	aesmc	v25.16b,v25.16b
2896	aese	v26.16b,v22.16b
2897	aesmc	v26.16b,v26.16b
2898
	// Final round, interleaved with three other jobs: combining the last
	// round key (v7) with each current iv, computing the five ivs for the
	// next iteration, and loading the next five input blocks.
2899	eor	v4.16b,v7.16b,v6.16b
2900	aese	v0.16b,v23.16b
2901	// The iv for first block of one iteration
2902	extr	x22,x10,x10,#32
2903	extr	x10,x10,x9,#63
2904	and	w11,w19,w22,asr#31
2905	eor	x9,x11,x9,lsl#1
2906	fmov	d6,x9
2907	fmov	v6.d[1],x10
2908	eor	v5.16b,v7.16b,v8.16b
2909	ld1	{v2.16b},[x0],#16
2910	aese	v1.16b,v23.16b
2911	// The iv for second block
2912	extr	x22,x10,x10,#32
2913	extr	x10,x10,x9,#63
2914	and	w11,w19,w22,asr#31
2915	eor	x9,x11,x9,lsl#1
2916	fmov	d8,x9
2917	fmov	v8.d[1],x10
2918	eor	v17.16b,v7.16b,v9.16b
2919	ld1	{v3.16b},[x0],#16
2920	aese	v24.16b,v23.16b
2921	// The iv for third block
2922	extr	x22,x10,x10,#32
2923	extr	x10,x10,x9,#63
2924	and	w11,w19,w22,asr#31
2925	eor	x9,x11,x9,lsl#1
2926	fmov	d9,x9
2927	fmov	v9.d[1],x10
2928	eor	v30.16b,v7.16b,v10.16b
2929	ld1	{v27.16b},[x0],#16
2930	aese	v25.16b,v23.16b
2931	// The iv for fourth block
2932	extr	x22,x10,x10,#32
2933	extr	x10,x10,x9,#63
2934	and	w11,w19,w22,asr#31
2935	eor	x9,x11,x9,lsl#1
2936	fmov	d10,x9
2937	fmov	v10.d[1],x10
2938	eor	v31.16b,v7.16b,v11.16b
2939	ld1	{v28.16b},[x0],#16
2940	aese	v26.16b,v23.16b
2941
2942	// The iv for fifth block
2943	extr	x22,x10,x10,#32
2944	extr	x10,x10,x9,#63
2945	and	w11,w19,w22,asr #31
2946	eor	x9,x11,x9,lsl #1
2947	fmov	d11,x9
2948	fmov	v11.d[1],x10
2949
2950	ld1	{v29.16b},[x0],#16
2951	cbz	x6,.Lxts_enc_tail4x	// x6 = x2+0x60 is zero on the borrowed 4-block pass
2952	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2953	eor	v4.16b,v4.16b,v0.16b
2954	eor	v0.16b,v2.16b,v6.16b
2955	eor	v5.16b,v5.16b,v1.16b
2956	eor	v1.16b,v3.16b,v8.16b
2957	eor	v17.16b,v17.16b,v24.16b
2958	eor	v24.16b,v27.16b,v9.16b
2959	eor	v30.16b,v30.16b,v25.16b
2960	eor	v25.16b,v28.16b,v10.16b
2961	eor	v31.16b,v31.16b,v26.16b
2962	st1	{v4.16b},[x1],#16
2963	eor	v26.16b,v29.16b,v11.16b
2964	st1	{v5.16b},[x1],#16
2965	mov	w6,w5
2966	st1	{v17.16b},[x1],#16
2967	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2968	st1	{v30.16b},[x1],#16
2969	st1	{v31.16b},[x1],#16
2970	b.hs	.Loop5x_xts_enc	// tests the flags of subs x2,x2,#0x50 above
2971
2972
2973	// If left 4 blocks, borrow the five block's processing.
	// The ivs are shifted up by one register (v6 is duplicated into v8) and
	// x9/x10 are reloaded from the new v11, so one more pass of the
	// five-block loop can be used; that pass exits via .Lxts_enc_tail4x.
2974	cmn	x2,#0x10
2975	b.ne	.Loop5x_enc_after
2976	orr	v11.16b,v10.16b,v10.16b
2977	orr	v10.16b,v9.16b,v9.16b
2978	orr	v9.16b,v8.16b,v8.16b
2979	orr	v8.16b,v6.16b,v6.16b
2980	fmov	x9,d11
2981	fmov	x10,v11.d[1]
2982	eor	v0.16b,v6.16b,v2.16b
2983	eor	v1.16b,v8.16b,v3.16b
2984	eor	v24.16b,v27.16b,v9.16b
2985	eor	v25.16b,v28.16b,v10.16b
2986	eor	v26.16b,v29.16b,v11.16b
2987	b.eq	.Loop5x_xts_enc	// flags still from the cmn above, always taken here
2988
2989.Loop5x_enc_after:
2990	add	x2,x2,#0x50	// undo the bias of the loop
2991	cbz	x2,.Lxts_enc_done
2992
2993	add	w6,w5,#2
2994	subs	x2,x2,#0x30
2995	b.lo	.Lxts_inner_enc_tail
2996
2997	eor	v0.16b,v6.16b,v27.16b
2998	eor	v1.16b,v8.16b,v28.16b
2999	eor	v24.16b,v29.16b,v9.16b
3000	b	.Lxts_outer_enc_tail
3001
	// Exit of the borrowed 4-block pass: only the results held in v1, v24,
	// v25 and v26 are written out; v5, v17, v30 and v31 already hold the
	// matching iv xored with the last round key.
3002.align	4
3003.Lxts_enc_tail4x:
3004	add	x0,x0,#16
3005	eor	v5.16b,v1.16b,v5.16b
3006	st1	{v5.16b},[x1],#16
3007	eor	v17.16b,v24.16b,v17.16b
3008	st1	{v17.16b},[x1],#16
3009	eor	v30.16b,v25.16b,v30.16b
3010	eor	v31.16b,v26.16b,v31.16b
3011	st1	{v30.16b,v31.16b},[x1],#32
3012
3013	b	.Lxts_enc_done
	// Three-block path: v0, v1 and v24 are encrypted together.
3014.align	4
3015.Lxts_outer_enc_tail:
3016	aese	v0.16b,v16.16b
3017	aesmc	v0.16b,v0.16b
3018	aese	v1.16b,v16.16b
3019	aesmc	v1.16b,v1.16b
3020	aese	v24.16b,v16.16b
3021	aesmc	v24.16b,v24.16b
3022	ld1	{v16.4s},[x7],#16
3023	subs	w6,w6,#2
3024	aese	v0.16b,v17.16b
3025	aesmc	v0.16b,v0.16b
3026	aese	v1.16b,v17.16b
3027	aesmc	v1.16b,v1.16b
3028	aese	v24.16b,v17.16b
3029	aesmc	v24.16b,v24.16b
3030	ld1	{v17.4s},[x7],#16
3031	b.gt	.Lxts_outer_enc_tail
3032
3033	aese	v0.16b,v16.16b
3034	aesmc	v0.16b,v0.16b
3035	aese	v1.16b,v16.16b
3036	aesmc	v1.16b,v1.16b
3037	aese	v24.16b,v16.16b
3038	aesmc	v24.16b,v24.16b
3039	eor	v4.16b,v6.16b,v7.16b
3040	subs	x2,x2,#0x30
3041	// The iv for first block
3042	fmov	x9,d9
3043	fmov	x10,v9.d[1]
3044	//mov	w19,#0x87
3045	extr	x22,x10,x10,#32
3046	extr	x10,x10,x9,#63
3047	and	w11,w19,w22,asr#31
3048	eor	x9,x11,x9,lsl#1
3049	fmov	d6,x9
3050	fmov	v6.d[1],x10
3051	eor	v5.16b,v8.16b,v7.16b
3052	csel	x6,x2,x6,lo       // x6, w6, is zero at this point
3053	aese	v0.16b,v17.16b
3054	aesmc	v0.16b,v0.16b
3055	aese	v1.16b,v17.16b
3056	aesmc	v1.16b,v1.16b
3057	aese	v24.16b,v17.16b
3058	aesmc	v24.16b,v24.16b
3059	eor	v17.16b,v9.16b,v7.16b
3060
3061	add	x6,x6,#0x20
3062	add	x0,x0,x6
3063	mov	x7,x3
3064
3065	aese	v0.16b,v20.16b
3066	aesmc	v0.16b,v0.16b
3067	aese	v1.16b,v20.16b
3068	aesmc	v1.16b,v1.16b
3069	aese	v24.16b,v20.16b
3070	aesmc	v24.16b,v24.16b
3071	aese	v0.16b,v21.16b
3072	aesmc	v0.16b,v0.16b
3073	aese	v1.16b,v21.16b
3074	aesmc	v1.16b,v1.16b
3075	aese	v24.16b,v21.16b
3076	aesmc	v24.16b,v24.16b
3077	aese	v0.16b,v22.16b
3078	aesmc	v0.16b,v0.16b
3079	aese	v1.16b,v22.16b
3080	aesmc	v1.16b,v1.16b
3081	aese	v24.16b,v22.16b
3082	aesmc	v24.16b,v24.16b
3083	aese	v0.16b,v23.16b
3084	aese	v1.16b,v23.16b
3085	aese	v24.16b,v23.16b
3086	ld1	{v27.16b},[x0],#16
3087	add	w6,w5,#2
3088	ld1	{v16.4s},[x7],#16                // re-pre-load rndkey[0]
3089	eor	v4.16b,v4.16b,v0.16b
3090	eor	v5.16b,v5.16b,v1.16b
3091	eor	v24.16b,v24.16b,v17.16b
3092	ld1	{v17.4s},[x7],#16                // re-pre-load rndkey[1]
3093	st1	{v4.16b},[x1],#16
3094	st1	{v5.16b},[x1],#16
3095	st1	{v24.16b},[x1],#16
3096	cmn	x2,#0x30
3097	b.eq	.Lxts_enc_done
3098.Lxts_encxor_one:
3099	orr	v28.16b,v3.16b,v3.16b
3100	orr	v29.16b,v27.16b,v27.16b
3101	nop
3102
	// One- or two-block path working on v1 and v24.
	// x2 == -0x10 keeps the second iv (v8) for v24 and ends by storing two
	// blocks; x2 == -0x20 ends in .Lxts_enc_one, which stores one block.
3103.Lxts_inner_enc_tail:
3104	cmn	x2,#0x10
3105	eor	v1.16b,v28.16b,v6.16b
3106	eor	v24.16b,v29.16b,v8.16b
3107	b.eq	.Lxts_enc_tail_loop
3108	eor	v24.16b,v29.16b,v6.16b
3109.Lxts_enc_tail_loop:
3110	aese	v1.16b,v16.16b
3111	aesmc	v1.16b,v1.16b
3112	aese	v24.16b,v16.16b
3113	aesmc	v24.16b,v24.16b
3114	ld1	{v16.4s},[x7],#16
3115	subs	w6,w6,#2
3116	aese	v1.16b,v17.16b
3117	aesmc	v1.16b,v1.16b
3118	aese	v24.16b,v17.16b
3119	aesmc	v24.16b,v24.16b
3120	ld1	{v17.4s},[x7],#16
3121	b.gt	.Lxts_enc_tail_loop
3122
3123	aese	v1.16b,v16.16b
3124	aesmc	v1.16b,v1.16b
3125	aese	v24.16b,v16.16b
3126	aesmc	v24.16b,v24.16b
3127	aese	v1.16b,v17.16b
3128	aesmc	v1.16b,v1.16b
3129	aese	v24.16b,v17.16b
3130	aesmc	v24.16b,v24.16b
3131	aese	v1.16b,v20.16b
3132	aesmc	v1.16b,v1.16b
3133	aese	v24.16b,v20.16b
3134	aesmc	v24.16b,v24.16b
3135	cmn	x2,#0x20	// flags consumed by the b.eq .Lxts_enc_one below
3136	aese	v1.16b,v21.16b
3137	aesmc	v1.16b,v1.16b
3138	aese	v24.16b,v21.16b
3139	aesmc	v24.16b,v24.16b
3140	eor	v5.16b,v6.16b,v7.16b
3141	aese	v1.16b,v22.16b
3142	aesmc	v1.16b,v1.16b
3143	aese	v24.16b,v22.16b
3144	aesmc	v24.16b,v24.16b
3145	eor	v17.16b,v8.16b,v7.16b
3146	aese	v1.16b,v23.16b
3147	aese	v24.16b,v23.16b
3148	b.eq	.Lxts_enc_one
3149	eor	v5.16b,v5.16b,v1.16b
3150	st1	{v5.16b},[x1],#16
3151	eor	v17.16b,v17.16b,v24.16b
3152	orr	v6.16b,v8.16b,v8.16b
3153	st1	{v17.16b},[x1],#16
	// Advance the iv once more (from v8) into v6 for the cipher stealing step.
3154	fmov	x9,d8
3155	fmov	x10,v8.d[1]
3156	mov	w19,#0x87
3157	extr	x22,x10,x10,#32
3158	extr	x10,x10,x9,#63
3159	and	w11,w19,w22,asr #31
3160	eor	x9,x11,x9,lsl #1
3161	fmov	d6,x9
3162	fmov	v6.d[1],x10
3163	b	.Lxts_enc_done
3164
3165.Lxts_enc_one:
3166	eor	v5.16b,v5.16b,v24.16b
3167	orr	v6.16b,v6.16b,v6.16b
3168	st1	{v5.16b},[x1],#16
	// Advance the iv once more (from v6) into v6 for the cipher stealing step.
3169	fmov	x9,d6
3170	fmov	x10,v6.d[1]
3171	mov	w19,#0x87
3172	extr	x22,x10,x10,#32
3173	extr	x10,x10,x9,#63
3174	and	w11,w19,w22,asr #31
3175	eor	x9,x11,x9,lsl #1
3176	fmov	d6,x9
3177	fmov	v6.d[1],x10
3178	b	.Lxts_enc_done
3179.align	5
3180.Lxts_enc_done:
3181	// Process the tail block with cipher stealing.
3182	tst	x21,#0xf
3183	b.eq	.Lxts_abort	// length was a multiple of 16, no partial block
3184
3185	mov	x20,x0	// x20 = input position used for the partial block
3186	mov	x13,x1	// x13 = output position of the partial block
3187	sub	x1,x1,#16	// x1 = last full block already written
	// For each of the x21 tail bytes, from the highest index down to 0:
	// copy the byte of the last written block to the partial output block
	// and replace it in place with the corresponding input tail byte.
3188.composite_enc_loop:
3189	subs	x21,x21,#1
3190	ldrb	w15,[x1,x21]
3191	ldrb	w14,[x20,x21]
3192	strb	w15,[x13,x21]
3193	strb	w14,[x1,x21]
3194	b.gt	.composite_enc_loop
3195.Lxts_enc_load_done:
3196	ld1	{v26.16b},[x1]
3197	eor	v26.16b,v26.16b,v6.16b
3198
3199	// Encrypt the composite block to get the second-to-last ciphertext block
3200	ldr	w6,[x3,#240]		// load rounds number
3201	ld1	{v0.4s},[x3],#16
3202	sub	w6,w6,#2
3203	ld1	{v1.4s},[x3],#16		// load key schedule...
3204.Loop_final_enc:
3205	aese	v26.16b,v0.16b
3206	aesmc	v26.16b,v26.16b
3207	ld1	{v0.4s},[x3],#16
3208	subs	w6,w6,#2
3209	aese	v26.16b,v1.16b
3210	aesmc	v26.16b,v26.16b
3211	ld1	{v1.4s},[x3],#16
3212	b.gt	.Loop_final_enc
3213
3214	aese	v26.16b,v0.16b
3215	aesmc	v26.16b,v26.16b
3216	ld1	{v0.4s},[x3]
3217	aese	v26.16b,v1.16b
3218	eor	v26.16b,v26.16b,v0.16b
3219	eor	v26.16b,v26.16b,v6.16b
3220	st1	{v26.16b},[x1]
3221
	// Common exit of the big size path: restore the saved registers and
	// release the 64-byte frame.
3222.Lxts_abort:
3223	ldp	x21,x22,[sp,#48]
3224	ldp	d8,d9,[sp,#32]
3225	ldp	d10,d11,[sp,#16]
3226	ldp	x19,x20,[sp],#64
3227.Lxts_enc_final_abort:
3228	ret
3229.size	aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
3230.globl	aes_v8_xts_decrypt
3231.type	aes_v8_xts_decrypt,%function
3232.align	5
3233aes_v8_xts_decrypt:
3234	AARCH64_VALID_CALL_TARGET
3235	cmp	x2,#16
3236	// Original input data size bigger than 16, jump to big size processing.
3237	b.ne	.Lxts_dec_big_size
3238	// Encrypt the iv with key2, as the first XEX iv.
3239	ldr	w6,[x4,#240]
3240	ld1	{v0.4s},[x4],#16
3241	ld1	{v6.16b},[x5]
3242	sub	w6,w6,#2
3243	ld1	{v1.4s},[x4],#16
3244
3245.Loop_dec_small_iv_enc:
3246	aese	v6.16b,v0.16b
3247	aesmc	v6.16b,v6.16b
3248	ld1	{v0.4s},[x4],#16
3249	subs	w6,w6,#2
3250	aese	v6.16b,v1.16b
3251	aesmc	v6.16b,v6.16b
3252	ld1	{v1.4s},[x4],#16
3253	b.gt	.Loop_dec_small_iv_enc
3254
3255	aese	v6.16b,v0.16b
3256	aesmc	v6.16b,v6.16b
3257	ld1	{v0.4s},[x4]
3258	aese	v6.16b,v1.16b
3259	eor	v6.16b,v6.16b,v0.16b
3260
3261	ld1	{v0.16b},[x0]
3262	eor	v0.16b,v6.16b,v0.16b
3263
3264	ldr	w6,[x3,#240]
3265	ld1	{v28.4s,v29.4s},[x3],#32			// load key schedule...
3266
3267	aesd	v0.16b,v28.16b
3268	aesimc	v0.16b,v0.16b
3269	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
3270	aesd	v0.16b,v29.16b
3271	aesimc	v0.16b,v0.16b
3272	subs	w6,w6,#10			// bias
3273	b.eq	.Lxts_128_dec
3274.Lxts_dec_round_loop:
3275	aesd	v0.16b,v16.16b
3276	aesimc	v0.16b,v0.16b
3277	ld1	{v16.4s},[x3],#16			// load key schedule...
3278	aesd	v0.16b,v17.16b
3279	aesimc	v0.16b,v0.16b
3280	ld1	{v17.4s},[x3],#16			// load key schedule...
3281	subs	w6,w6,#2			// bias
3282	b.gt	.Lxts_dec_round_loop
3283.Lxts_128_dec:
3284	ld1	{v18.4s,v19.4s},[x3],#32			// load key schedule...
3285	aesd	v0.16b,v16.16b
3286	aesimc	v0.16b,v0.16b
3287	aesd	v0.16b,v17.16b
3288	aesimc	v0.16b,v0.16b
3289	ld1	{v20.4s,v21.4s},[x3],#32			// load key schedule...
3290	aesd	v0.16b,v18.16b
3291	aesimc	v0.16b,v0.16b
3292	aesd	v0.16b,v19.16b
3293	aesimc	v0.16b,v0.16b
3294	ld1	{v22.4s,v23.4s},[x3],#32			// load key schedule...
3295	aesd	v0.16b,v20.16b
3296	aesimc	v0.16b,v0.16b
3297	aesd	v0.16b,v21.16b
3298	aesimc	v0.16b,v0.16b
3299	ld1	{v7.4s},[x3]
3300	aesd	v0.16b,v22.16b
3301	aesimc	v0.16b,v0.16b
3302	aesd	v0.16b,v23.16b
3303	eor	v0.16b,v0.16b,v7.16b
3304	eor	v0.16b,v6.16b,v0.16b
3305	st1	{v0.16b},[x1]
3306	b	.Lxts_dec_final_abort
3307.Lxts_dec_big_size:
3308	stp	x19,x20,[sp,#-64]!
3309	stp	x21,x22,[sp,#48]
3310	stp	d8,d9,[sp,#32]
3311	stp	d10,d11,[sp,#16]
3312
3313	and	x21,x2,#0xf
3314	and	x2,x2,#-16
3315	subs	x2,x2,#16
3316	mov	x8,#16
3317	b.lo	.Lxts_dec_abort
3318
3319	// Encrypt the iv with key2, as the first XEX iv
3320	ldr	w6,[x4,#240]
3321	ld1	{v0.4s},[x4],#16
3322	ld1	{v6.16b},[x5]
3323	sub	w6,w6,#2
3324	ld1	{v1.4s},[x4],#16
3325
3326.Loop_dec_iv_enc:
3327	aese	v6.16b,v0.16b
3328	aesmc	v6.16b,v6.16b
3329	ld1	{v0.4s},[x4],#16
3330	subs	w6,w6,#2
3331	aese	v6.16b,v1.16b
3332	aesmc	v6.16b,v6.16b
3333	ld1	{v1.4s},[x4],#16
3334	b.gt	.Loop_dec_iv_enc
3335
3336	aese	v6.16b,v0.16b
3337	aesmc	v6.16b,v6.16b
3338	ld1	{v0.4s},[x4]
3339	aese	v6.16b,v1.16b
3340	eor	v6.16b,v6.16b,v0.16b
3341
3342	// The iv for second block
3343	// x9- iv(low), x10 - iv(high)
3344	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
3345	fmov	x9,d6
3346	fmov	x10,v6.d[1]
3347	mov	w19,#0x87
3348	extr	x22,x10,x10,#32
3349	extr	x10,x10,x9,#63
3350	and	w11,w19,w22,asr #31
3351	eor	x9,x11,x9,lsl #1
3352	fmov	d8,x9
3353	fmov	v8.d[1],x10
3354
3355	ldr	w5,[x3,#240]		// load rounds number
3356
3357	// The iv for third block
3358	extr	x22,x10,x10,#32
3359	extr	x10,x10,x9,#63
3360	and	w11,w19,w22,asr #31
3361	eor	x9,x11,x9,lsl #1
3362	fmov	d9,x9
3363	fmov	v9.d[1],x10
3364
3365	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
3366	sub	w5,w5,#6
3367	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
3368	sub	w5,w5,#2
3369	ld1	{v18.4s,v19.4s},[x7],#32		// load key schedule...
3370	ld1	{v20.4s,v21.4s},[x7],#32
3371	ld1	{v22.4s,v23.4s},[x7],#32
3372	ld1	{v7.4s},[x7]
3373
3374	// The iv for fourth block
3375	extr	x22,x10,x10,#32
3376	extr	x10,x10,x9,#63
3377	and	w11,w19,w22,asr #31
3378	eor	x9,x11,x9,lsl #1
3379	fmov	d10,x9
3380	fmov	v10.d[1],x10
3381
3382	add	x7,x3,#32
3383	mov	w6,w5
3384	b	.Lxts_dec
3385
3386	// Decryption
3387.align	5
3388.Lxts_dec:
3389	tst	x21,#0xf
3390	b.eq	.Lxts_dec_begin
3391	subs	x2,x2,#16
3392	csel	x8,xzr,x8,eq
3393	ld1	{v0.16b},[x0],#16
3394	b.lo	.Lxts_done
3395	sub	x0,x0,#16
3396.Lxts_dec_begin:
3397	ld1	{v0.16b},[x0],x8
3398	subs	x2,x2,#32			// bias
3399	add	w6,w5,#2
3400	orr	v3.16b,v0.16b,v0.16b
3401	orr	v1.16b,v0.16b,v0.16b
3402	orr	v28.16b,v0.16b,v0.16b
3403	ld1	{v24.16b},[x0],#16
3404	orr	v27.16b,v24.16b,v24.16b
3405	orr	v29.16b,v24.16b,v24.16b
3406	b.lo	.Lxts_inner_dec_tail
3407	eor	v0.16b,v0.16b,v6.16b			// before decryt, xor with iv
3408	eor	v24.16b,v24.16b,v8.16b
3409
3410	orr	v1.16b,v24.16b,v24.16b
3411	ld1	{v24.16b},[x0],#16
3412	orr	v2.16b,v0.16b,v0.16b
3413	orr	v3.16b,v1.16b,v1.16b
3414	eor	v27.16b,v24.16b,v9.16b			// third block xox with third iv
3415	eor	v24.16b,v24.16b,v9.16b
3416	cmp	x2,#32
3417	b.lo	.Lxts_outer_dec_tail
3418
3419	ld1	{v25.16b},[x0],#16
3420
3421	// The iv for fifth block
3422	extr	x22,x10,x10,#32
3423	extr	x10,x10,x9,#63
3424	and	w11,w19,w22,asr #31
3425	eor	x9,x11,x9,lsl #1
3426	fmov	d11,x9
3427	fmov	v11.d[1],x10
3428
3429	ld1	{v26.16b},[x0],#16
3430	eor	v25.16b,v25.16b,v10.16b		// the fourth block
3431	eor	v26.16b,v26.16b,v11.16b
3432	sub	x2,x2,#32			// bias
3433	mov	w6,w5
3434	b	.Loop5x_xts_dec
3435
3436.align	4
3437.Loop5x_xts_dec:
3438	aesd	v0.16b,v16.16b
3439	aesimc	v0.16b,v0.16b
3440	aesd	v1.16b,v16.16b
3441	aesimc	v1.16b,v1.16b
3442	aesd	v24.16b,v16.16b
3443	aesimc	v24.16b,v24.16b
3444	aesd	v25.16b,v16.16b
3445	aesimc	v25.16b,v25.16b
3446	aesd	v26.16b,v16.16b
3447	aesimc	v26.16b,v26.16b
3448	ld1	{v16.4s},[x7],#16		// load key schedule...
3449	subs	w6,w6,#2
3450	aesd	v0.16b,v17.16b
3451	aesimc	v0.16b,v0.16b
3452	aesd	v1.16b,v17.16b
3453	aesimc	v1.16b,v1.16b
3454	aesd	v24.16b,v17.16b
3455	aesimc	v24.16b,v24.16b
3456	aesd	v25.16b,v17.16b
3457	aesimc	v25.16b,v25.16b
3458	aesd	v26.16b,v17.16b
3459	aesimc	v26.16b,v26.16b
3460	ld1	{v17.4s},[x7],#16		// load key schedule...
3461	b.gt	.Loop5x_xts_dec
3462
3463	aesd	v0.16b,v16.16b
3464	aesimc	v0.16b,v0.16b
3465	aesd	v1.16b,v16.16b
3466	aesimc	v1.16b,v1.16b
3467	aesd	v24.16b,v16.16b
3468	aesimc	v24.16b,v24.16b
3469	aesd	v25.16b,v16.16b
3470	aesimc	v25.16b,v25.16b
3471	aesd	v26.16b,v16.16b
3472	aesimc	v26.16b,v26.16b
3473	subs	x2,x2,#0x50			// because .Lxts_dec_tail4x
3474
3475	aesd	v0.16b,v17.16b
3476	aesimc	v0.16b,v0.16b
3477	aesd	v1.16b,v17.16b
3478	aesimc	v1.16b,v1.16b
3479	aesd	v24.16b,v17.16b
3480	aesimc	v24.16b,v24.16b
3481	aesd	v25.16b,v17.16b
3482	aesimc	v25.16b,v25.16b
3483	aesd	v26.16b,v17.16b
3484	aesimc	v26.16b,v26.16b
3485	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
3486	mov	x7,x3
3487
3488	aesd	v0.16b,v18.16b
3489	aesimc	v0.16b,v0.16b
3490	aesd	v1.16b,v18.16b
3491	aesimc	v1.16b,v1.16b
3492	aesd	v24.16b,v18.16b
3493	aesimc	v24.16b,v24.16b
3494	aesd	v25.16b,v18.16b
3495	aesimc	v25.16b,v25.16b
3496	aesd	v26.16b,v18.16b
3497	aesimc	v26.16b,v26.16b
3498	add	x0,x0,x6		// x0 is adjusted in such way that
3499						// at exit from the loop v1.16b-v26.16b
3500						// are loaded with last "words"
3501	add	x6,x2,#0x60		// because .Lxts_dec_tail4x
3502
3503	aesd	v0.16b,v19.16b
3504	aesimc	v0.16b,v0.16b
3505	aesd	v1.16b,v19.16b
3506	aesimc	v1.16b,v1.16b
3507	aesd	v24.16b,v19.16b
3508	aesimc	v24.16b,v24.16b
3509	aesd	v25.16b,v19.16b
3510	aesimc	v25.16b,v25.16b
3511	aesd	v26.16b,v19.16b
3512	aesimc	v26.16b,v26.16b
3513
3514	aesd	v0.16b,v20.16b
3515	aesimc	v0.16b,v0.16b
3516	aesd	v1.16b,v20.16b
3517	aesimc	v1.16b,v1.16b
3518	aesd	v24.16b,v20.16b
3519	aesimc	v24.16b,v24.16b
3520	aesd	v25.16b,v20.16b
3521	aesimc	v25.16b,v25.16b
3522	aesd	v26.16b,v20.16b
3523	aesimc	v26.16b,v26.16b
3524
3525	aesd	v0.16b,v21.16b
3526	aesimc	v0.16b,v0.16b
3527	aesd	v1.16b,v21.16b
3528	aesimc	v1.16b,v1.16b
3529	aesd	v24.16b,v21.16b
3530	aesimc	v24.16b,v24.16b
3531	aesd	v25.16b,v21.16b
3532	aesimc	v25.16b,v25.16b
3533	aesd	v26.16b,v21.16b
3534	aesimc	v26.16b,v26.16b
3535
3536	aesd	v0.16b,v22.16b
3537	aesimc	v0.16b,v0.16b
3538	aesd	v1.16b,v22.16b
3539	aesimc	v1.16b,v1.16b
3540	aesd	v24.16b,v22.16b
3541	aesimc	v24.16b,v24.16b
3542	aesd	v25.16b,v22.16b
3543	aesimc	v25.16b,v25.16b
3544	aesd	v26.16b,v22.16b
3545	aesimc	v26.16b,v26.16b
3546
3547	eor	v4.16b,v7.16b,v6.16b
3548	aesd	v0.16b,v23.16b
3549	// The iv for first block of next iteration.
3550	extr	x22,x10,x10,#32
3551	extr	x10,x10,x9,#63
3552	and	w11,w19,w22,asr #31
3553	eor	x9,x11,x9,lsl #1
3554	fmov	d6,x9
3555	fmov	v6.d[1],x10
3556	eor	v5.16b,v7.16b,v8.16b
3557	ld1	{v2.16b},[x0],#16
3558	aesd	v1.16b,v23.16b
3559	// The iv for second block
3560	extr	x22,x10,x10,#32
3561	extr	x10,x10,x9,#63
3562	and	w11,w19,w22,asr #31
3563	eor	x9,x11,x9,lsl #1
3564	fmov	d8,x9
3565	fmov	v8.d[1],x10
3566	eor	v17.16b,v7.16b,v9.16b
3567	ld1	{v3.16b},[x0],#16
3568	aesd	v24.16b,v23.16b
3569	// The iv for third block
3570	extr	x22,x10,x10,#32
3571	extr	x10,x10,x9,#63
3572	and	w11,w19,w22,asr #31
3573	eor	x9,x11,x9,lsl #1
3574	fmov	d9,x9
3575	fmov	v9.d[1],x10
3576	eor	v30.16b,v7.16b,v10.16b
3577	ld1	{v27.16b},[x0],#16
3578	aesd	v25.16b,v23.16b
3579	// The iv for fourth block
3580	extr	x22,x10,x10,#32
3581	extr	x10,x10,x9,#63
3582	and	w11,w19,w22,asr #31
3583	eor	x9,x11,x9,lsl #1
3584	fmov	d10,x9
3585	fmov	v10.d[1],x10
3586	eor	v31.16b,v7.16b,v11.16b
3587	ld1	{v28.16b},[x0],#16
3588	aesd	v26.16b,v23.16b
3589
3590	// The iv for fifth block
3591	extr	x22,x10,x10,#32
3592	extr	x10,x10,x9,#63
3593	and	w11,w19,w22,asr #31
3594	eor	x9,x11,x9,lsl #1
3595	fmov	d11,x9
3596	fmov	v11.d[1],x10
3597
3598	ld1	{v29.16b},[x0],#16
3599	cbz	x6,.Lxts_dec_tail4x
3600	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
3601	eor	v4.16b,v4.16b,v0.16b
3602	eor	v0.16b,v2.16b,v6.16b
3603	eor	v5.16b,v5.16b,v1.16b
3604	eor	v1.16b,v3.16b,v8.16b
3605	eor	v17.16b,v17.16b,v24.16b
3606	eor	v24.16b,v27.16b,v9.16b
3607	eor	v30.16b,v30.16b,v25.16b
3608	eor	v25.16b,v28.16b,v10.16b
3609	eor	v31.16b,v31.16b,v26.16b
3610	st1	{v4.16b},[x1],#16
3611	eor	v26.16b,v29.16b,v11.16b
3612	st1	{v5.16b},[x1],#16
3613	mov	w6,w5
3614	st1	{v17.16b},[x1],#16
3615	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
3616	st1	{v30.16b},[x1],#16
3617	st1	{v31.16b},[x1],#16
3618	b.hs	.Loop5x_xts_dec
3619
3620	cmn	x2,#0x10
3621	b.ne	.Loop5x_dec_after
3622	// If x2(x2) equal to -0x10, the left blocks is 4.
3623	// After specially processing, utilize the five blocks processing again.
3624	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
3625	orr	v11.16b,v10.16b,v10.16b
3626	orr	v10.16b,v9.16b,v9.16b
3627	orr	v9.16b,v8.16b,v8.16b
3628	orr	v8.16b,v6.16b,v6.16b
3629	fmov	x9,d11
3630	fmov	x10,v11.d[1]
3631	eor	v0.16b,v6.16b,v2.16b
3632	eor	v1.16b,v8.16b,v3.16b
3633	eor	v24.16b,v27.16b,v9.16b
3634	eor	v25.16b,v28.16b,v10.16b
3635	eor	v26.16b,v29.16b,v11.16b
3636	b.eq	.Loop5x_xts_dec
3637
3638.Loop5x_dec_after:
3639	add	x2,x2,#0x50
3640	cbz	x2,.Lxts_done
3641
3642	add	w6,w5,#2
3643	subs	x2,x2,#0x30
3644	b.lo	.Lxts_inner_dec_tail
3645
3646	eor	v0.16b,v6.16b,v27.16b
3647	eor	v1.16b,v8.16b,v28.16b
3648	eor	v24.16b,v29.16b,v9.16b
3649	b	.Lxts_outer_dec_tail
3650
3651.align	4
3652.Lxts_dec_tail4x:
3653	add	x0,x0,#16
3654	tst	x21,#0xf
3655	eor	v5.16b,v1.16b,v4.16b
3656	st1	{v5.16b},[x1],#16
3657	eor	v17.16b,v24.16b,v17.16b
3658	st1	{v17.16b},[x1],#16
3659	eor	v30.16b,v25.16b,v30.16b
3660	eor	v31.16b,v26.16b,v31.16b
3661	st1	{v30.16b,v31.16b},[x1],#32
3662
3663	b.eq	.Lxts_dec_abort
3664	ld1	{v0.16b},[x0],#16
3665	b	.Lxts_done
3666.align	4
3667.Lxts_outer_dec_tail:
3668	aesd	v0.16b,v16.16b
3669	aesimc	v0.16b,v0.16b
3670	aesd	v1.16b,v16.16b
3671	aesimc	v1.16b,v1.16b
3672	aesd	v24.16b,v16.16b
3673	aesimc	v24.16b,v24.16b
3674	ld1	{v16.4s},[x7],#16
3675	subs	w6,w6,#2
3676	aesd	v0.16b,v17.16b
3677	aesimc	v0.16b,v0.16b
3678	aesd	v1.16b,v17.16b
3679	aesimc	v1.16b,v1.16b
3680	aesd	v24.16b,v17.16b
3681	aesimc	v24.16b,v24.16b
3682	ld1	{v17.4s},[x7],#16
3683	b.gt	.Lxts_outer_dec_tail
3684
3685	aesd	v0.16b,v16.16b
3686	aesimc	v0.16b,v0.16b
3687	aesd	v1.16b,v16.16b
3688	aesimc	v1.16b,v1.16b
3689	aesd	v24.16b,v16.16b
3690	aesimc	v24.16b,v24.16b
3691	eor	v4.16b,v6.16b,v7.16b
3692	subs	x2,x2,#0x30
3693	// The iv for first block
3694	fmov	x9,d9
3695	fmov	x10,v9.d[1]
3696	mov	w19,#0x87
3697	extr	x22,x10,x10,#32
3698	extr	x10,x10,x9,#63
3699	and	w11,w19,w22,asr #31
3700	eor	x9,x11,x9,lsl #1
3701	fmov	d6,x9
3702	fmov	v6.d[1],x10
3703	eor	v5.16b,v8.16b,v7.16b
3704	csel	x6,x2,x6,lo	// x6, w6, is zero at this point
3705	aesd	v0.16b,v17.16b
3706	aesimc	v0.16b,v0.16b
3707	aesd	v1.16b,v17.16b
3708	aesimc	v1.16b,v1.16b
3709	aesd	v24.16b,v17.16b
3710	aesimc	v24.16b,v24.16b
3711	eor	v17.16b,v9.16b,v7.16b
3712	// The iv for second block
3713	extr	x22,x10,x10,#32
3714	extr	x10,x10,x9,#63
3715	and	w11,w19,w22,asr #31
3716	eor	x9,x11,x9,lsl #1
3717	fmov	d8,x9
3718	fmov	v8.d[1],x10
3719
3720	add	x6,x6,#0x20
3721	add	x0,x0,x6		// x0 is adjusted to the last data
3722
3723	mov	x7,x3
3724
3725	// The iv for third block
3726	extr	x22,x10,x10,#32
3727	extr	x10,x10,x9,#63
3728	and	w11,w19,w22,asr #31
3729	eor	x9,x11,x9,lsl #1
3730	fmov	d9,x9
3731	fmov	v9.d[1],x10
3732
3733	aesd	v0.16b,v20.16b
3734	aesimc	v0.16b,v0.16b
3735	aesd	v1.16b,v20.16b
3736	aesimc	v1.16b,v1.16b
3737	aesd	v24.16b,v20.16b
3738	aesimc	v24.16b,v24.16b
3739	aesd	v0.16b,v21.16b
3740	aesimc	v0.16b,v0.16b
3741	aesd	v1.16b,v21.16b
3742	aesimc	v1.16b,v1.16b
3743	aesd	v24.16b,v21.16b
3744	aesimc	v24.16b,v24.16b
3745	aesd	v0.16b,v22.16b
3746	aesimc	v0.16b,v0.16b
3747	aesd	v1.16b,v22.16b
3748	aesimc	v1.16b,v1.16b
3749	aesd	v24.16b,v22.16b
3750	aesimc	v24.16b,v24.16b
3751	ld1	{v27.16b},[x0],#16
3752	aesd	v0.16b,v23.16b
3753	aesd	v1.16b,v23.16b
3754	aesd	v24.16b,v23.16b
3755	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
3756	add	w6,w5,#2
3757	eor	v4.16b,v4.16b,v0.16b
3758	eor	v5.16b,v5.16b,v1.16b
3759	eor	v24.16b,v24.16b,v17.16b
3760	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
3761	st1	{v4.16b},[x1],#16
3762	st1	{v5.16b},[x1],#16
3763	st1	{v24.16b},[x1],#16
3764
3765	cmn	x2,#0x30
3766	add	x2,x2,#0x30
3767	b.eq	.Lxts_done
3768	sub	x2,x2,#0x30
3769	orr	v28.16b,v3.16b,v3.16b
3770	orr	v29.16b,v27.16b,v27.16b
3771	nop
3772
3773.Lxts_inner_dec_tail:
3774	// x2 == -0x10 means two blocks left.
3775	cmn	x2,#0x10
3776	eor	v1.16b,v28.16b,v6.16b
3777	eor	v24.16b,v29.16b,v8.16b
3778	b.eq	.Lxts_dec_tail_loop
3779	eor	v24.16b,v29.16b,v6.16b
3780.Lxts_dec_tail_loop:
3781	aesd	v1.16b,v16.16b
3782	aesimc	v1.16b,v1.16b
3783	aesd	v24.16b,v16.16b
3784	aesimc	v24.16b,v24.16b
3785	ld1	{v16.4s},[x7],#16
3786	subs	w6,w6,#2
3787	aesd	v1.16b,v17.16b
3788	aesimc	v1.16b,v1.16b
3789	aesd	v24.16b,v17.16b
3790	aesimc	v24.16b,v24.16b
3791	ld1	{v17.4s},[x7],#16
3792	b.gt	.Lxts_dec_tail_loop
3793
3794	aesd	v1.16b,v16.16b
3795	aesimc	v1.16b,v1.16b
3796	aesd	v24.16b,v16.16b
3797	aesimc	v24.16b,v24.16b
3798	aesd	v1.16b,v17.16b
3799	aesimc	v1.16b,v1.16b
3800	aesd	v24.16b,v17.16b
3801	aesimc	v24.16b,v24.16b
3802	aesd	v1.16b,v20.16b
3803	aesimc	v1.16b,v1.16b
3804	aesd	v24.16b,v20.16b
3805	aesimc	v24.16b,v24.16b
3806	cmn	x2,#0x20
3807	aesd	v1.16b,v21.16b
3808	aesimc	v1.16b,v1.16b
3809	aesd	v24.16b,v21.16b
3810	aesimc	v24.16b,v24.16b
3811	eor	v5.16b,v6.16b,v7.16b
3812	aesd	v1.16b,v22.16b
3813	aesimc	v1.16b,v1.16b
3814	aesd	v24.16b,v22.16b
3815	aesimc	v24.16b,v24.16b
3816	eor	v17.16b,v8.16b,v7.16b
3817	aesd	v1.16b,v23.16b
3818	aesd	v24.16b,v23.16b
3819	b.eq	.Lxts_dec_one
3820	eor	v5.16b,v5.16b,v1.16b
3821	eor	v17.16b,v17.16b,v24.16b
3822	orr	v6.16b,v9.16b,v9.16b
3823	orr	v8.16b,v10.16b,v10.16b
3824	st1	{v5.16b},[x1],#16
3825	st1	{v17.16b},[x1],#16
3826	add	x2,x2,#16
3827	b	.Lxts_done
3828
3829.Lxts_dec_one:
3830	eor	v5.16b,v5.16b,v24.16b
3831	orr	v6.16b,v8.16b,v8.16b
3832	orr	v8.16b,v9.16b,v9.16b
3833	st1	{v5.16b},[x1],#16
3834	add	x2,x2,#32
3835
3836.Lxts_done:
3837	tst	x21,#0xf
3838	b.eq	.Lxts_dec_abort
3839	// Processing the last two blocks with cipher stealing.
3840	mov	x7,x3
3841	cbnz	x2,.Lxts_dec_1st_done
3842	ld1	{v0.16b},[x0],#16
3843
3844	// Decrypt the second-to-last block to get the last plain text block
3845.Lxts_dec_1st_done:
3846	eor	v26.16b,v0.16b,v8.16b
3847	ldr	w6,[x3,#240]
3848	ld1	{v0.4s},[x3],#16
3849	sub	w6,w6,#2
3850	ld1	{v1.4s},[x3],#16
3851.Loop_final_2nd_dec:
3852	aesd	v26.16b,v0.16b
3853	aesimc	v26.16b,v26.16b
3854	ld1	{v0.4s},[x3],#16		// load key schedule...
3855	subs	w6,w6,#2
3856	aesd	v26.16b,v1.16b
3857	aesimc	v26.16b,v26.16b
3858	ld1	{v1.4s},[x3],#16		// load key schedule...
3859	b.gt	.Loop_final_2nd_dec
3860
3861	aesd	v26.16b,v0.16b
3862	aesimc	v26.16b,v26.16b
3863	ld1	{v0.4s},[x3]
3864	aesd	v26.16b,v1.16b
3865	eor	v26.16b,v26.16b,v0.16b
3866	eor	v26.16b,v26.16b,v8.16b
3867	st1	{v26.16b},[x1]
3868
3869	mov	x20,x0
3870	add	x13,x1,#16
3871
3872	// Merge the tailcnt bytes of the trailing block that is not 16-byte aligned into the
3873	// second-to-last plain text block to get the last encrypted block.
3874.composite_dec_loop:
3875	subs	x21,x21,#1
3876	ldrb	w15,[x1,x21]
3877	ldrb	w14,[x20,x21]
3878	strb	w15,[x13,x21]
3879	strb	w14,[x1,x21]
3880	b.gt	.composite_dec_loop
3881.Lxts_dec_load_done:
3882	ld1	{v26.16b},[x1]
3883	eor	v26.16b,v26.16b,v6.16b
3884
3885	// Decrypt the composite block to get the second-to-last plain text block
3886	ldr	w6,[x7,#240]
3887	ld1	{v0.4s},[x7],#16
3888	sub	w6,w6,#2
3889	ld1	{v1.4s},[x7],#16
3890.Loop_final_dec:
3891	aesd	v26.16b,v0.16b
3892	aesimc	v26.16b,v26.16b
3893	ld1	{v0.4s},[x7],#16		// load key schedule...
3894	subs	w6,w6,#2
3895	aesd	v26.16b,v1.16b
3896	aesimc	v26.16b,v26.16b
3897	ld1	{v1.4s},[x7],#16		// load key schedule...
3898	b.gt	.Loop_final_dec
3899
3900	aesd	v26.16b,v0.16b
3901	aesimc	v26.16b,v26.16b
3902	ld1	{v0.4s},[x7]
3903	aesd	v26.16b,v1.16b
3904	eor	v26.16b,v26.16b,v0.16b
3905	eor	v26.16b,v26.16b,v6.16b
3906	st1	{v26.16b},[x1]
3907
3908.Lxts_dec_abort:
3909	ldp	x21,x22,[sp,#48]
3910	ldp	d8,d9,[sp,#32]
3911	ldp	d10,d11,[sp,#16]
3912	ldp	x19,x20,[sp],#64
3913
3914.Lxts_dec_final_abort:
3915	ret
3916.size	aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
3917#endif
3918