xref: /freebsd/sys/crypto/openssl/aarch64/aesv8-armx.S (revision e37e8677fe55ddd0685c764e17ac58707787758a)
1/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=7
5.arch	armv8-a+crypto
6.text
7.align	5
// Constant table for the key-expansion code below (three 16-byte rows).
// aes_v8_set_encrypt_key loads row 0 into v1 and row 1 into v2 with a
// single ld1, and reloads v1 from row 2 once the 128-bit loop has finished.
8.Lrcon:
// Row 0: initial round constant; the expansion loops shift it left by one
// bit (shl) after every use.
9.long	0x01,0x01,0x01,0x01
// Row 1: byte-index vector consumed by tbl in the expansion loops.
10.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
// Row 2: round constant for the two expansion steps that follow .Loop128.
11.long	0x1b,0x1b,0x1b,0x1b
12
// aes_v8_set_encrypt_key: expand a user key into an encryption key schedule.
//
// In:  x0 = pointer to the user key bytes
//      w1 = key length in bits
//      x2 = pointer to the output key schedule
// Out: x0 =  0 on success
//      x0 = -1 if x0 or x2 is zero
//      x0 = -2 if w1 is below 128, above 256, or not a multiple of 64
// On success the round keys are stored upward from x2 and the round count
// (10, 12 or 14) is stored as a 32-bit word 240 bytes past the original x2.
// x2 is left pointing at that word and w12 holds the round count;
// aes_v8_set_decrypt_key relies on both after calling .Lenc_key.
// Scratch: w1, x3, w12, v0-v6 and the condition flags.
13.globl	aes_v8_set_encrypt_key
14.type	aes_v8_set_encrypt_key,%function
15.align	5
16aes_v8_set_encrypt_key:
// .Lenc_key is the file-local entry used by bl in aes_v8_set_decrypt_key.
17.Lenc_key:
18	AARCH64_VALID_CALL_TARGET
19	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
20	stp	x29,x30,[sp,#-16]!
21	add	x29,sp,#0
// x3 carries the return code until the epilogue copies it to x0.
22	mov	x3,#-1
23	cmp	x0,#0
24	b.eq	.Lenc_key_abort
25	cmp	x2,#0
26	b.eq	.Lenc_key_abort
27	mov	x3,#-2
28	cmp	w1,#128
29	b.lt	.Lenc_key_abort
30	cmp	w1,#256
31	b.gt	.Lenc_key_abort
32	tst	w1,#0x3f
33	b.ne	.Lenc_key_abort
34
35	adr	x3,.Lrcon
// The flags from this compare select the key-size path three branches
// further down; none of the instructions in between alter the flags.
36	cmp	w1,#192
37
// v0 = all zeroes (zero round key for aese and zero fill for ext),
// v3 = first 16 key bytes, v1 = .Lrcon row 0, v2 = .Lrcon row 1.
38	eor	v0.16b,v0.16b,v0.16b
39	ld1	{v3.16b},[x0],#16
40	mov	w1,#8		// reuse w1
41	ld1	{v1.4s,v2.4s},[x3],#32
42
43	b.lt	.Loop128
44	b.eq	.L192
45	b	.L256
46
47.align	4
// 128-bit keys: eight iterations, each storing the current round key in v3
// and deriving the next one.
48.Loop128:
49	tbl	v6.16b,{v3.16b},v2.16b
50	ext	v5.16b,v0.16b,v3.16b,#12
51	st1	{v3.4s},[x2],#16
52	aese	v6.16b,v0.16b
53	subs	w1,w1,#1
54
55	eor	v3.16b,v3.16b,v5.16b
56	ext	v5.16b,v0.16b,v5.16b,#12
57	eor	v3.16b,v3.16b,v5.16b
58	ext	v5.16b,v0.16b,v5.16b,#12
59	eor	v6.16b,v6.16b,v1.16b
60	eor	v3.16b,v3.16b,v5.16b
61	shl	v1.16b,v1.16b,#1
62	eor	v3.16b,v3.16b,v6.16b
63	b.ne	.Loop128
64
// Switch the round constant to .Lrcon row 2 (0x1b) for the last two steps.
65	ld1	{v1.4s},[x3]
66
67	tbl	v6.16b,{v3.16b},v2.16b
68	ext	v5.16b,v0.16b,v3.16b,#12
69	st1	{v3.4s},[x2],#16
70	aese	v6.16b,v0.16b
71
72	eor	v3.16b,v3.16b,v5.16b
73	ext	v5.16b,v0.16b,v5.16b,#12
74	eor	v3.16b,v3.16b,v5.16b
75	ext	v5.16b,v0.16b,v5.16b,#12
76	eor	v6.16b,v6.16b,v1.16b
77	eor	v3.16b,v3.16b,v5.16b
78	shl	v1.16b,v1.16b,#1
79	eor	v3.16b,v3.16b,v6.16b
80
81	tbl	v6.16b,{v3.16b},v2.16b
82	ext	v5.16b,v0.16b,v3.16b,#12
83	st1	{v3.4s},[x2],#16
84	aese	v6.16b,v0.16b
85
86	eor	v3.16b,v3.16b,v5.16b
87	ext	v5.16b,v0.16b,v5.16b,#12
88	eor	v3.16b,v3.16b,v5.16b
89	ext	v5.16b,v0.16b,v5.16b,#12
90	eor	v6.16b,v6.16b,v1.16b
91	eor	v3.16b,v3.16b,v5.16b
92	eor	v3.16b,v3.16b,v6.16b
// Last round key is stored without post-increment: x2 is 160 bytes past
// its original value here, so adding 0x50 yields offset 240.
93	st1	{v3.4s},[x2]
94	add	x2,x2,#0x50
95
96	mov	w12,#10
97	b	.Ldone
98
99.align	4
// 192-bit keys: v4 receives the remaining 8 key bytes and the tbl mask in
// v2 is lowered by 8 in every byte lane.
100.L192:
101	ld1	{v4.8b},[x0],#8
102	movi	v6.16b,#8			// borrow v6.16b
103	st1	{v3.4s},[x2],#16
104	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
105
// Eight iterations; each one advances x2 by 24 bytes in total.
106.Loop192:
107	tbl	v6.16b,{v4.16b},v2.16b
108	ext	v5.16b,v0.16b,v3.16b,#12
// Both variants advance x2 by a net 8 bytes.
109#ifdef __AARCH64EB__
110	st1	{v4.4s},[x2],#16
111	sub	x2,x2,#8
112#else
113	st1	{v4.8b},[x2],#8
114#endif
115	aese	v6.16b,v0.16b
116	subs	w1,w1,#1
117
118	eor	v3.16b,v3.16b,v5.16b
119	ext	v5.16b,v0.16b,v5.16b,#12
120	eor	v3.16b,v3.16b,v5.16b
121	ext	v5.16b,v0.16b,v5.16b,#12
122	eor	v3.16b,v3.16b,v5.16b
123
124	dup	v5.4s,v3.s[3]
125	eor	v5.16b,v5.16b,v4.16b
126	eor	v6.16b,v6.16b,v1.16b
127	ext	v4.16b,v0.16b,v4.16b,#12
128	shl	v1.16b,v1.16b,#1
129	eor	v4.16b,v4.16b,v5.16b
130	eor	v3.16b,v3.16b,v6.16b
131	eor	v4.16b,v4.16b,v6.16b
132	st1	{v3.4s},[x2],#16
133	b.ne	.Loop192
134
// x2 is 208 bytes past its original value here; 0x20 more gives 240.
135	mov	w12,#12
136	add	x2,x2,#0x20
137	b	.Ldone
138
139.align	4
// 256-bit keys: v4 receives the second 16 key bytes; the loop counter is
// reset to 7 passes.
140.L256:
141	ld1	{v4.16b},[x0]
142	mov	w1,#7
143	mov	w12,#14
144	st1	{v3.4s},[x2],#16
145
146.Loop256:
147	tbl	v6.16b,{v4.16b},v2.16b
148	ext	v5.16b,v0.16b,v3.16b,#12
149	st1	{v4.4s},[x2],#16
150	aese	v6.16b,v0.16b
151	subs	w1,w1,#1
152
153	eor	v3.16b,v3.16b,v5.16b
154	ext	v5.16b,v0.16b,v5.16b,#12
155	eor	v3.16b,v3.16b,v5.16b
156	ext	v5.16b,v0.16b,v5.16b,#12
157	eor	v6.16b,v6.16b,v1.16b
158	eor	v3.16b,v3.16b,v5.16b
159	shl	v1.16b,v1.16b,#1
160	eor	v3.16b,v3.16b,v6.16b
161	st1	{v3.4s},[x2],#16
// Exit after the seventh pass; x2 is then 240 bytes past its original
// value. The flags tested here come from the subs above.
162	b.eq	.Ldone
163
// Second half of the pass: derive the next v4 from the new v3 without the
// tbl rotation and without a round constant.
164	dup	v6.4s,v3.s[3]		// just splat
165	ext	v5.16b,v0.16b,v4.16b,#12
166	aese	v6.16b,v0.16b
167
168	eor	v4.16b,v4.16b,v5.16b
169	ext	v5.16b,v0.16b,v5.16b,#12
170	eor	v4.16b,v4.16b,v5.16b
171	ext	v5.16b,v0.16b,v5.16b,#12
172	eor	v4.16b,v4.16b,v5.16b
173
174	eor	v4.16b,v4.16b,v6.16b
175	b	.Loop256
176
// Common success exit: store the round count at schedule offset 240.
177.Ldone:
178	str	w12,[x2]
179	mov	x3,#0
180
181.Lenc_key_abort:
182	mov	x0,x3			// return value
// Only x29 is reloaded; x30 is not written anywhere in this routine.
183	ldr	x29,[sp],#16
184	ret
185.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
186
// aes_v8_set_decrypt_key: build a decryption key schedule.
//
// In:  x0, w1, x2 are passed unchanged to .Lenc_key (user key pointer,
//      key length in bits, output key schedule pointer).
// Out: x0 = 0 on success, otherwise the non-zero code from .Lenc_key.
// After a successful expansion the round keys are reversed in place, and
// every round key except the first and the last is passed through aesimc.
// Scratch: x4, v0, v1, plus everything .Lenc_key uses.
187.globl	aes_v8_set_decrypt_key
188.type	aes_v8_set_decrypt_key,%function
189.align	5
190aes_v8_set_decrypt_key:
// NOTE(review): the AARCH64_* macros are not defined in this file;
// presumably they come from arm_arch.h - confirm.
191	AARCH64_SIGN_LINK_REGISTER
192	stp	x29,x30,[sp,#-16]!
193	add	x29,sp,#0
194	bl	.Lenc_key
195
196	cmp	x0,#0
197	b.ne	.Ldec_key_abort
198
// .Lenc_key leaves x2 240 bytes past the schedule start and the round
// count in x12.
199	sub	x2,x2,#240		// restore original x2
200	mov	x4,#-16
201	add	x0,x2,x12,lsl#4	// end of key schedule
202
// Swap the first and last round keys as they are (no aesimc).
203	ld1	{v0.4s},[x2]
204	ld1	{v1.4s},[x0]
205	st1	{v0.4s},[x0],x4
206	st1	{v1.4s},[x2],#16
207
// Walk inward from both ends: x2 moves up, x0 moves down (x4 = -16),
// swapping one pair of round keys per iteration after aesimc.
208.Loop_imc:
209	ld1	{v0.4s},[x2]
210	ld1	{v1.4s},[x0]
211	aesimc	v0.16b,v0.16b
212	aesimc	v1.16b,v1.16b
213	st1	{v0.4s},[x0],x4
214	st1	{v1.4s},[x2],#16
215	cmp	x0,x2
216	b.hi	.Loop_imc
217
// Middle round key: transformed and written back to the same slot.
218	ld1	{v0.4s},[x2]
219	aesimc	v0.16b,v0.16b
220	st1	{v0.4s},[x0]
221
222	eor	x0,x0,x0		// return value
223.Ldec_key_abort:
224	ldp	x29,x30,[sp],#16
225	AARCH64_VALIDATE_LINK_REGISTER
226	ret
227.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
// aes_v8_encrypt: encrypt one 16-byte block.
//
// In:  x0 = pointer to the input block
//      x1 = pointer to the output block
//      x2 = pointer to the key schedule (round count read from [x2,#240])
// Leaf routine: no stack frame. Scratch: x2 (advanced), w3, v0-v2, flags.
228.globl	aes_v8_encrypt
229.type	aes_v8_encrypt,%function
230.align	5
231aes_v8_encrypt:
232	AARCH64_VALID_CALL_TARGET
233	ldr	w3,[x2,#240]
234	ld1	{v0.4s},[x2],#16
235	ld1	{v2.16b},[x0]
236	sub	w3,w3,#2
237	ld1	{v1.4s},[x2],#16
238
// Two rounds per iteration; v0 and v1 alternate as the current round key
// and are refilled from the schedule as they are consumed.
239.Loop_enc:
240	aese	v2.16b,v0.16b
241	aesmc	v2.16b,v2.16b
242	ld1	{v0.4s},[x2],#16
243	subs	w3,w3,#2
244	aese	v2.16b,v1.16b
245	aesmc	v2.16b,v2.16b
246	ld1	{v1.4s},[x2],#16
247	b.gt	.Loop_enc
248
// Last two rounds: the final aese is not followed by aesmc, and the last
// round key is applied with a plain eor.
249	aese	v2.16b,v0.16b
250	aesmc	v2.16b,v2.16b
251	ld1	{v0.4s},[x2]
252	aese	v2.16b,v1.16b
253	eor	v2.16b,v2.16b,v0.16b
254
255	st1	{v2.16b},[x1]
256	ret
257.size	aes_v8_encrypt,.-aes_v8_encrypt
// aes_v8_decrypt: decrypt one 16-byte block.
//
// In:  x0 = pointer to the input block
//      x1 = pointer to the output block
//      x2 = pointer to the key schedule (round count read from [x2,#240])
// Same structure as aes_v8_encrypt with aesd/aesimc in place of aese/aesmc.
// Leaf routine: no stack frame. Scratch: x2 (advanced), w3, v0-v2, flags.
258.globl	aes_v8_decrypt
259.type	aes_v8_decrypt,%function
260.align	5
261aes_v8_decrypt:
262	AARCH64_VALID_CALL_TARGET
263	ldr	w3,[x2,#240]
264	ld1	{v0.4s},[x2],#16
265	ld1	{v2.16b},[x0]
266	sub	w3,w3,#2
267	ld1	{v1.4s},[x2],#16
268
// Two rounds per iteration; v0 and v1 alternate as the current round key.
269.Loop_dec:
270	aesd	v2.16b,v0.16b
271	aesimc	v2.16b,v2.16b
272	ld1	{v0.4s},[x2],#16
273	subs	w3,w3,#2
274	aesd	v2.16b,v1.16b
275	aesimc	v2.16b,v2.16b
276	ld1	{v1.4s},[x2],#16
277	b.gt	.Loop_dec
278
// Last two rounds: the final aesd is not followed by aesimc, and the last
// round key is applied with a plain eor.
279	aesd	v2.16b,v0.16b
280	aesimc	v2.16b,v2.16b
281	ld1	{v0.4s},[x2]
282	aesd	v2.16b,v1.16b
283	eor	v2.16b,v2.16b,v0.16b
284
285	st1	{v2.16b},[x1]
286	ret
287.size	aes_v8_decrypt,.-aes_v8_decrypt
// aes_v8_ecb_encrypt: ECB-mode encryption or decryption of a buffer.
//
// In:  x0 = input pointer
//      x1 = output pointer
//      x2 = length in bytes
//      x3 = key schedule pointer (round count read from [x3,#240])
//      w4 = direction: non-zero encrypts, zero decrypts
// A length of exactly 16 takes a frameless single-block path. Lengths
// below 16 return without writing any output. Larger lengths are rounded
// down to a multiple of 16 and processed five or three blocks at a time,
// with a one- or two-block tail.
// Clobbers: x0-x3 (advanced), x5-x8, v0-v7, v16-v31 and the flags.
288.globl	aes_v8_ecb_encrypt
289.type	aes_v8_ecb_encrypt,%function
290.align	5
291aes_v8_ecb_encrypt:
292	AARCH64_VALID_CALL_TARGET
293	subs	x2,x2,#16
294	// Original input data size bigger than 16, jump to big size processing.
295	b.ne	.Lecb_big_size
// Single-block path: v0 = data, v5/v6 = round keys 0 and 1.
296	ld1	{v0.16b},[x0]
297	cmp	w4,#0					// en- or decrypting?
298	ldr	w5,[x3,#240]
299	ld1	{v5.4s,v6.4s},[x3],#32			// load key schedule...
300
// Flags still come from the cmp w4,#0 above.
301	b.eq	.Lecb_small_dec
302	aese	v0.16b,v5.16b
303	aesmc	v0.16b,v0.16b
304	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
305	aese	v0.16b,v6.16b
306	aesmc	v0.16b,v0.16b
307	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb processing
308	b.eq	.Lecb_128_enc
// Extra rounds for round counts above 10, two per iteration.
309.Lecb_round_loop:
310	aese	v0.16b,v16.16b
311	aesmc	v0.16b,v0.16b
312	ld1	{v16.4s},[x3],#16				// load key schedule...
313	aese	v0.16b,v17.16b
314	aesmc	v0.16b,v0.16b
315	ld1	{v17.4s},[x3],#16				// load key schedule...
316	subs	w5,w5,#2			// bias
317	b.gt	.Lecb_round_loop
// Remaining eight rounds use v16-v23; the last round key (v7) is xored in.
318.Lecb_128_enc:
319	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
320	aese	v0.16b,v16.16b
321	aesmc	v0.16b,v0.16b
322	aese	v0.16b,v17.16b
323	aesmc	v0.16b,v0.16b
324	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
325	aese	v0.16b,v18.16b
326	aesmc	v0.16b,v0.16b
327	aese	v0.16b,v19.16b
328	aesmc	v0.16b,v0.16b
329	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
330	aese	v0.16b,v20.16b
331	aesmc	v0.16b,v0.16b
332	aese	v0.16b,v21.16b
333	aesmc	v0.16b,v0.16b
334	ld1	{v7.4s},[x3]
335	aese	v0.16b,v22.16b
336	aesmc	v0.16b,v0.16b
337	aese	v0.16b,v23.16b
338	eor	v0.16b,v0.16b,v7.16b
339	st1	{v0.16b},[x1]
// No frame was pushed on this path, so return through the bare ret.
340	b	.Lecb_Final_abort
// Single-block decryption: mirror of the encrypt path above.
341.Lecb_small_dec:
342	aesd	v0.16b,v5.16b
343	aesimc	v0.16b,v0.16b
344	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
345	aesd	v0.16b,v6.16b
346	aesimc	v0.16b,v0.16b
347	subs	w5,w5,#10			// if rounds==10, skip the extra-round loop
348	b.eq	.Lecb_128_dec
349.Lecb_dec_round_loop:
350	aesd	v0.16b,v16.16b
351	aesimc	v0.16b,v0.16b
352	ld1	{v16.4s},[x3],#16				// load key schedule...
353	aesd	v0.16b,v17.16b
354	aesimc	v0.16b,v0.16b
355	ld1	{v17.4s},[x3],#16				// load key schedule...
356	subs	w5,w5,#2			// bias
357	b.gt	.Lecb_dec_round_loop
358.Lecb_128_dec:
359	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
360	aesd	v0.16b,v16.16b
361	aesimc	v0.16b,v0.16b
362	aesd	v0.16b,v17.16b
363	aesimc	v0.16b,v0.16b
364	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
365	aesd	v0.16b,v18.16b
366	aesimc	v0.16b,v0.16b
367	aesd	v0.16b,v19.16b
368	aesimc	v0.16b,v0.16b
369	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
370	aesd	v0.16b,v20.16b
371	aesimc	v0.16b,v0.16b
372	aesd	v0.16b,v21.16b
373	aesimc	v0.16b,v0.16b
374	ld1	{v7.4s},[x3]
375	aesd	v0.16b,v22.16b
376	aesimc	v0.16b,v0.16b
377	aesd	v0.16b,v23.16b
378	eor	v0.16b,v0.16b,v7.16b
379	st1	{v0.16b},[x1]
380	b	.Lecb_Final_abort
// Multi-block path. The flags tested by b.lo and csel below are still the
// ones set by the subs at function entry: stp, add and mov leave them
// untouched.
381.Lecb_big_size:
382	stp	x29,x30,[sp,#-16]!
383	add	x29,sp,#0
384	mov	x8,#16
385	b.lo	.Lecb_done
386	csel	x8,xzr,x8,eq
387
388	cmp	w4,#0					// en- or decrypting?
389	ldr	w5,[x3,#240]
390	and	x2,x2,#-16
391	ld1	{v0.16b},[x0],x8
392
// v16/v17 = round keys 0 and 1; v18-v23 and v7 = the last seven round keys.
// w5 ends up as rounds-8 and x7 points at round key 2.
393	ld1	{v16.4s,v17.4s},[x3]				// load key schedule...
394	sub	w5,w5,#6
395	add	x7,x3,x5,lsl#4				// pointer to last 7 round keys
396	sub	w5,w5,#2
397	ld1	{v18.4s,v19.4s},[x7],#32
398	ld1	{v20.4s,v21.4s},[x7],#32
399	ld1	{v22.4s,v23.4s},[x7],#32
400	ld1	{v7.4s},[x7]
401
402	add	x7,x3,#32
403	mov	w6,w5
// Flags still come from the cmp w4,#0 above.
404	b.eq	.Lecb_dec
405
406	ld1	{v1.16b},[x0],#16
407	subs	x2,x2,#32				// bias
408	add	w6,w5,#2
409	orr	v3.16b,v1.16b,v1.16b
410	orr	v24.16b,v1.16b,v1.16b
411	orr	v1.16b,v0.16b,v0.16b
412	b.lo	.Lecb_enc_tail
413
414	orr	v1.16b,v3.16b,v3.16b
415	ld1	{v24.16b},[x0],#16
416	cmp	x2,#32
417	b.lo	.Loop3x_ecb_enc
418
419	ld1	{v25.16b},[x0],#16
420	ld1	{v26.16b},[x0],#16
421	sub	x2,x2,#32				// bias
422	mov	w6,w5
423
// Five blocks in flight: v0, v1, v24, v25, v26. w6 counts the rounds run
// from the reloadable keys v16/v17; x7 walks the key schedule.
424.Loop5x_ecb_enc:
425	aese	v0.16b,v16.16b
426	aesmc	v0.16b,v0.16b
427	aese	v1.16b,v16.16b
428	aesmc	v1.16b,v1.16b
429	aese	v24.16b,v16.16b
430	aesmc	v24.16b,v24.16b
431	aese	v25.16b,v16.16b
432	aesmc	v25.16b,v25.16b
433	aese	v26.16b,v16.16b
434	aesmc	v26.16b,v26.16b
435	ld1	{v16.4s},[x7],#16
436	subs	w6,w6,#2
437	aese	v0.16b,v17.16b
438	aesmc	v0.16b,v0.16b
439	aese	v1.16b,v17.16b
440	aesmc	v1.16b,v1.16b
441	aese	v24.16b,v17.16b
442	aesmc	v24.16b,v24.16b
443	aese	v25.16b,v17.16b
444	aesmc	v25.16b,v25.16b
445	aese	v26.16b,v17.16b
446	aesmc	v26.16b,v26.16b
447	ld1	{v17.4s},[x7],#16
448	b.gt	.Loop5x_ecb_enc
449
450	aese	v0.16b,v16.16b
451	aesmc	v0.16b,v0.16b
452	aese	v1.16b,v16.16b
453	aesmc	v1.16b,v1.16b
454	aese	v24.16b,v16.16b
455	aesmc	v24.16b,v24.16b
456	aese	v25.16b,v16.16b
457	aesmc	v25.16b,v25.16b
458	aese	v26.16b,v16.16b
459	aesmc	v26.16b,v26.16b
// The flags from this cmp feed the csel below and the b.hs that closes the
// loop; nothing in between writes the flags.
460	cmp	x2,#0x40					// because .Lecb_enc_tail4x
461	sub	x2,x2,#0x50
462
463	aese	v0.16b,v17.16b
464	aesmc	v0.16b,v0.16b
465	aese	v1.16b,v17.16b
466	aesmc	v1.16b,v1.16b
467	aese	v24.16b,v17.16b
468	aesmc	v24.16b,v24.16b
469	aese	v25.16b,v17.16b
470	aesmc	v25.16b,v25.16b
471	aese	v26.16b,v17.16b
472	aesmc	v26.16b,v26.16b
473	csel	x6,xzr,x2,gt			// borrow x6, w6, "gt" is not typo
474	mov	x7,x3
475
476	aese	v0.16b,v18.16b
477	aesmc	v0.16b,v0.16b
478	aese	v1.16b,v18.16b
479	aesmc	v1.16b,v1.16b
480	aese	v24.16b,v18.16b
481	aesmc	v24.16b,v24.16b
482	aese	v25.16b,v18.16b
483	aesmc	v25.16b,v25.16b
484	aese	v26.16b,v18.16b
485	aesmc	v26.16b,v26.16b
486	add	x0,x0,x6				// x0 is adjusted in such way that
487							// at exit from the loop v1.16b-v26.16b
488							// are loaded with last "words"
489	add	x6,x2,#0x60		    // because .Lecb_enc_tail4x
490
491	aese	v0.16b,v19.16b
492	aesmc	v0.16b,v0.16b
493	aese	v1.16b,v19.16b
494	aesmc	v1.16b,v1.16b
495	aese	v24.16b,v19.16b
496	aesmc	v24.16b,v24.16b
497	aese	v25.16b,v19.16b
498	aesmc	v25.16b,v25.16b
499	aese	v26.16b,v19.16b
500	aesmc	v26.16b,v26.16b
501
502	aese	v0.16b,v20.16b
503	aesmc	v0.16b,v0.16b
504	aese	v1.16b,v20.16b
505	aesmc	v1.16b,v1.16b
506	aese	v24.16b,v20.16b
507	aesmc	v24.16b,v24.16b
508	aese	v25.16b,v20.16b
509	aesmc	v25.16b,v25.16b
510	aese	v26.16b,v20.16b
511	aesmc	v26.16b,v26.16b
512
513	aese	v0.16b,v21.16b
514	aesmc	v0.16b,v0.16b
515	aese	v1.16b,v21.16b
516	aesmc	v1.16b,v1.16b
517	aese	v24.16b,v21.16b
518	aesmc	v24.16b,v24.16b
519	aese	v25.16b,v21.16b
520	aesmc	v25.16b,v25.16b
521	aese	v26.16b,v21.16b
522	aesmc	v26.16b,v26.16b
523
524	aese	v0.16b,v22.16b
525	aesmc	v0.16b,v0.16b
526	aese	v1.16b,v22.16b
527	aesmc	v1.16b,v1.16b
528	aese	v24.16b,v22.16b
529	aesmc	v24.16b,v24.16b
530	aese	v25.16b,v22.16b
531	aesmc	v25.16b,v25.16b
532	aese	v26.16b,v22.16b
533	aesmc	v26.16b,v26.16b
534
// Final round (no aesmc), interleaved with loading the next five input
// blocks into v2, v3, v27, v28, v29.
535	aese	v0.16b,v23.16b
536	ld1	{v2.16b},[x0],#16
537	aese	v1.16b,v23.16b
538	ld1	{v3.16b},[x0],#16
539	aese	v24.16b,v23.16b
540	ld1	{v27.16b},[x0],#16
541	aese	v25.16b,v23.16b
542	ld1	{v28.16b},[x0],#16
543	aese	v26.16b,v23.16b
544	ld1	{v29.16b},[x0],#16
545	cbz	x6,.Lecb_enc_tail4x
546	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
// Apply the last round key (v7), store five results and move the freshly
// loaded blocks into the working registers.
547	eor	v4.16b,v7.16b,v0.16b
548	orr	v0.16b,v2.16b,v2.16b
549	eor	v5.16b,v7.16b,v1.16b
550	orr	v1.16b,v3.16b,v3.16b
551	eor	v17.16b,v7.16b,v24.16b
552	orr	v24.16b,v27.16b,v27.16b
553	eor	v30.16b,v7.16b,v25.16b
554	orr	v25.16b,v28.16b,v28.16b
555	eor	v31.16b,v7.16b,v26.16b
556	st1	{v4.16b},[x1],#16
557	orr	v26.16b,v29.16b,v29.16b
558	st1	{v5.16b},[x1],#16
559	mov	w6,w5
560	st1	{v17.16b},[x1],#16
561	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
562	st1	{v30.16b},[x1],#16
563	st1	{v31.16b},[x1],#16
564	b.hs	.Loop5x_ecb_enc
565
566	add	x2,x2,#0x50
567	cbz	x2,.Lecb_done
568
569	add	w6,w5,#2
570	subs	x2,x2,#0x30
571	orr	v0.16b,v27.16b,v27.16b
572	orr	v1.16b,v28.16b,v28.16b
573	orr	v24.16b,v29.16b,v29.16b
574	b.lo	.Lecb_enc_tail
575
576	b	.Loop3x_ecb_enc
577
578.align	4
// Reached from the 5x loop via cbz x6: only the results held in v1, v24,
// v25 and v26 are finished and stored.
579.Lecb_enc_tail4x:
580	eor	v5.16b,v7.16b,v1.16b
581	eor	v17.16b,v7.16b,v24.16b
582	eor	v30.16b,v7.16b,v25.16b
583	eor	v31.16b,v7.16b,v26.16b
584	st1	{v5.16b},[x1],#16
585	st1	{v17.16b},[x1],#16
586	st1	{v30.16b},[x1],#16
587	st1	{v31.16b},[x1],#16
588
589	b	.Lecb_done
590.align	4
// Three blocks in flight: v0, v1, v24. This loop is entered with
// w6 = w5+2, so it runs one more v16/v17 round pair than the 5x loop and
// the v18/v19 registers are not used here.
591.Loop3x_ecb_enc:
592	aese	v0.16b,v16.16b
593	aesmc	v0.16b,v0.16b
594	aese	v1.16b,v16.16b
595	aesmc	v1.16b,v1.16b
596	aese	v24.16b,v16.16b
597	aesmc	v24.16b,v24.16b
598	ld1	{v16.4s},[x7],#16
599	subs	w6,w6,#2
600	aese	v0.16b,v17.16b
601	aesmc	v0.16b,v0.16b
602	aese	v1.16b,v17.16b
603	aesmc	v1.16b,v1.16b
604	aese	v24.16b,v17.16b
605	aesmc	v24.16b,v24.16b
606	ld1	{v17.4s},[x7],#16
607	b.gt	.Loop3x_ecb_enc
608
609	aese	v0.16b,v16.16b
610	aesmc	v0.16b,v0.16b
611	aese	v1.16b,v16.16b
612	aesmc	v1.16b,v1.16b
613	aese	v24.16b,v16.16b
614	aesmc	v24.16b,v24.16b
// The flags from this subs also decide the b.hs that closes the loop.
615	subs	x2,x2,#0x30
616	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
617	aese	v0.16b,v17.16b
618	aesmc	v0.16b,v0.16b
619	aese	v1.16b,v17.16b
620	aesmc	v1.16b,v1.16b
621	aese	v24.16b,v17.16b
622	aesmc	v24.16b,v24.16b
623	add	x0,x0,x6			// x0 is adjusted in such way that
624						// at exit from the loop v1.16b-v24.16b
625						// are loaded with last "words"
626	mov	x7,x3
627	aese	v0.16b,v20.16b
628	aesmc	v0.16b,v0.16b
629	aese	v1.16b,v20.16b
630	aesmc	v1.16b,v1.16b
631	aese	v24.16b,v20.16b
632	aesmc	v24.16b,v24.16b
633	ld1	{v2.16b},[x0],#16
634	aese	v0.16b,v21.16b
635	aesmc	v0.16b,v0.16b
636	aese	v1.16b,v21.16b
637	aesmc	v1.16b,v1.16b
638	aese	v24.16b,v21.16b
639	aesmc	v24.16b,v24.16b
640	ld1	{v3.16b},[x0],#16
641	aese	v0.16b,v22.16b
642	aesmc	v0.16b,v0.16b
643	aese	v1.16b,v22.16b
644	aesmc	v1.16b,v1.16b
645	aese	v24.16b,v22.16b
646	aesmc	v24.16b,v24.16b
647	ld1	{v27.16b},[x0],#16
648	aese	v0.16b,v23.16b
649	aese	v1.16b,v23.16b
650	aese	v24.16b,v23.16b
651	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
652	add	w6,w5,#2
653	eor	v4.16b,v7.16b,v0.16b
654	eor	v5.16b,v7.16b,v1.16b
655	eor	v24.16b,v24.16b,v7.16b
656	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
657	st1	{v4.16b},[x1],#16
658	orr	v0.16b,v2.16b,v2.16b
659	st1	{v5.16b},[x1],#16
660	orr	v1.16b,v3.16b,v3.16b
661	st1	{v24.16b},[x1],#16
662	orr	v24.16b,v27.16b,v27.16b
663	b.hs	.Loop3x_ecb_enc
664
// x2 == -0x30 means nothing is left after the last group of three.
665	cmn	x2,#0x30
666	b.eq	.Lecb_done
667	nop
668
// Tail: one or two blocks remain, held in v1 and v24.
669.Lecb_enc_tail:
670	aese	v1.16b,v16.16b
671	aesmc	v1.16b,v1.16b
672	aese	v24.16b,v16.16b
673	aesmc	v24.16b,v24.16b
674	ld1	{v16.4s},[x7],#16
675	subs	w6,w6,#2
676	aese	v1.16b,v17.16b
677	aesmc	v1.16b,v1.16b
678	aese	v24.16b,v17.16b
679	aesmc	v24.16b,v24.16b
680	ld1	{v17.4s},[x7],#16
681	b.gt	.Lecb_enc_tail
682
683	aese	v1.16b,v16.16b
684	aesmc	v1.16b,v1.16b
685	aese	v24.16b,v16.16b
686	aesmc	v24.16b,v24.16b
687	aese	v1.16b,v17.16b
688	aesmc	v1.16b,v1.16b
689	aese	v24.16b,v17.16b
690	aesmc	v24.16b,v24.16b
691	aese	v1.16b,v20.16b
692	aesmc	v1.16b,v1.16b
693	aese	v24.16b,v20.16b
694	aesmc	v24.16b,v24.16b
// x2 == -0x20 selects the single-block exit (.Lecb_enc_one) below.
695	cmn	x2,#0x20
696	aese	v1.16b,v21.16b
697	aesmc	v1.16b,v1.16b
698	aese	v24.16b,v21.16b
699	aesmc	v24.16b,v24.16b
700	aese	v1.16b,v22.16b
701	aesmc	v1.16b,v1.16b
702	aese	v24.16b,v22.16b
703	aesmc	v24.16b,v24.16b
704	aese	v1.16b,v23.16b
705	aese	v24.16b,v23.16b
706	b.eq	.Lecb_enc_one
707	eor	v5.16b,v7.16b,v1.16b
708	eor	v17.16b,v7.16b,v24.16b
709	st1	{v5.16b},[x1],#16
710	st1	{v17.16b},[x1],#16
711	b	.Lecb_done
712
// Only the result in v24 is stored on this exit.
713.Lecb_enc_one:
714	eor	v5.16b,v7.16b,v24.16b
715	st1	{v5.16b},[x1],#16
716	b	.Lecb_done
717.align	5
// Decryption: same structure as the encryption side above, with
// aesd/aesimc in place of aese/aesmc.
718.Lecb_dec:
719	ld1	{v1.16b},[x0],#16
720	subs	x2,x2,#32			// bias
721	add	w6,w5,#2
722	orr	v3.16b,v1.16b,v1.16b
723	orr	v24.16b,v1.16b,v1.16b
724	orr	v1.16b,v0.16b,v0.16b
725	b.lo	.Lecb_dec_tail
726
727	orr	v1.16b,v3.16b,v3.16b
728	ld1	{v24.16b},[x0],#16
729	cmp	x2,#32
730	b.lo	.Loop3x_ecb_dec
731
732	ld1	{v25.16b},[x0],#16
733	ld1	{v26.16b},[x0],#16
734	sub	x2,x2,#32				// bias
735	mov	w6,w5
736
// Five blocks in flight: v0, v1, v24, v25, v26.
737.Loop5x_ecb_dec:
738	aesd	v0.16b,v16.16b
739	aesimc	v0.16b,v0.16b
740	aesd	v1.16b,v16.16b
741	aesimc	v1.16b,v1.16b
742	aesd	v24.16b,v16.16b
743	aesimc	v24.16b,v24.16b
744	aesd	v25.16b,v16.16b
745	aesimc	v25.16b,v25.16b
746	aesd	v26.16b,v16.16b
747	aesimc	v26.16b,v26.16b
748	ld1	{v16.4s},[x7],#16
749	subs	w6,w6,#2
750	aesd	v0.16b,v17.16b
751	aesimc	v0.16b,v0.16b
752	aesd	v1.16b,v17.16b
753	aesimc	v1.16b,v1.16b
754	aesd	v24.16b,v17.16b
755	aesimc	v24.16b,v24.16b
756	aesd	v25.16b,v17.16b
757	aesimc	v25.16b,v25.16b
758	aesd	v26.16b,v17.16b
759	aesimc	v26.16b,v26.16b
760	ld1	{v17.4s},[x7],#16
761	b.gt	.Loop5x_ecb_dec
762
763	aesd	v0.16b,v16.16b
764	aesimc	v0.16b,v0.16b
765	aesd	v1.16b,v16.16b
766	aesimc	v1.16b,v1.16b
767	aesd	v24.16b,v16.16b
768	aesimc	v24.16b,v24.16b
769	aesd	v25.16b,v16.16b
770	aesimc	v25.16b,v25.16b
771	aesd	v26.16b,v16.16b
772	aesimc	v26.16b,v26.16b
// The flags from this cmp feed the csel below and the b.hs that closes the
// loop; nothing in between writes the flags.
773	cmp	x2,#0x40				// because .Lecb_tail4x
774	sub	x2,x2,#0x50
775
776	aesd	v0.16b,v17.16b
777	aesimc	v0.16b,v0.16b
778	aesd	v1.16b,v17.16b
779	aesimc	v1.16b,v1.16b
780	aesd	v24.16b,v17.16b
781	aesimc	v24.16b,v24.16b
782	aesd	v25.16b,v17.16b
783	aesimc	v25.16b,v25.16b
784	aesd	v26.16b,v17.16b
785	aesimc	v26.16b,v26.16b
786	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
787	mov	x7,x3
788
789	aesd	v0.16b,v18.16b
790	aesimc	v0.16b,v0.16b
791	aesd	v1.16b,v18.16b
792	aesimc	v1.16b,v1.16b
793	aesd	v24.16b,v18.16b
794	aesimc	v24.16b,v24.16b
795	aesd	v25.16b,v18.16b
796	aesimc	v25.16b,v25.16b
797	aesd	v26.16b,v18.16b
798	aesimc	v26.16b,v26.16b
799	add	x0,x0,x6				// x0 is adjusted in such way that
800							// at exit from the loop v1.16b-v26.16b
801							// are loaded with last "words"
802	add	x6,x2,#0x60			// because .Lecb_tail4x
803
804	aesd	v0.16b,v19.16b
805	aesimc	v0.16b,v0.16b
806	aesd	v1.16b,v19.16b
807	aesimc	v1.16b,v1.16b
808	aesd	v24.16b,v19.16b
809	aesimc	v24.16b,v24.16b
810	aesd	v25.16b,v19.16b
811	aesimc	v25.16b,v25.16b
812	aesd	v26.16b,v19.16b
813	aesimc	v26.16b,v26.16b
814
815	aesd	v0.16b,v20.16b
816	aesimc	v0.16b,v0.16b
817	aesd	v1.16b,v20.16b
818	aesimc	v1.16b,v1.16b
819	aesd	v24.16b,v20.16b
820	aesimc	v24.16b,v24.16b
821	aesd	v25.16b,v20.16b
822	aesimc	v25.16b,v25.16b
823	aesd	v26.16b,v20.16b
824	aesimc	v26.16b,v26.16b
825
826	aesd	v0.16b,v21.16b
827	aesimc	v0.16b,v0.16b
828	aesd	v1.16b,v21.16b
829	aesimc	v1.16b,v1.16b
830	aesd	v24.16b,v21.16b
831	aesimc	v24.16b,v24.16b
832	aesd	v25.16b,v21.16b
833	aesimc	v25.16b,v25.16b
834	aesd	v26.16b,v21.16b
835	aesimc	v26.16b,v26.16b
836
837	aesd	v0.16b,v22.16b
838	aesimc	v0.16b,v0.16b
839	aesd	v1.16b,v22.16b
840	aesimc	v1.16b,v1.16b
841	aesd	v24.16b,v22.16b
842	aesimc	v24.16b,v24.16b
843	aesd	v25.16b,v22.16b
844	aesimc	v25.16b,v25.16b
845	aesd	v26.16b,v22.16b
846	aesimc	v26.16b,v26.16b
847
// Final round (no aesimc), interleaved with loading the next five input
// blocks into v2, v3, v27, v28, v29.
848	aesd	v0.16b,v23.16b
849	ld1	{v2.16b},[x0],#16
850	aesd	v1.16b,v23.16b
851	ld1	{v3.16b},[x0],#16
852	aesd	v24.16b,v23.16b
853	ld1	{v27.16b},[x0],#16
854	aesd	v25.16b,v23.16b
855	ld1	{v28.16b},[x0],#16
856	aesd	v26.16b,v23.16b
857	ld1	{v29.16b},[x0],#16
858	cbz	x6,.Lecb_tail4x
859	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
860	eor	v4.16b,v7.16b,v0.16b
861	orr	v0.16b,v2.16b,v2.16b
862	eor	v5.16b,v7.16b,v1.16b
863	orr	v1.16b,v3.16b,v3.16b
864	eor	v17.16b,v7.16b,v24.16b
865	orr	v24.16b,v27.16b,v27.16b
866	eor	v30.16b,v7.16b,v25.16b
867	orr	v25.16b,v28.16b,v28.16b
868	eor	v31.16b,v7.16b,v26.16b
869	st1	{v4.16b},[x1],#16
870	orr	v26.16b,v29.16b,v29.16b
871	st1	{v5.16b},[x1],#16
872	mov	w6,w5
873	st1	{v17.16b},[x1],#16
874	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
875	st1	{v30.16b},[x1],#16
876	st1	{v31.16b},[x1],#16
877	b.hs	.Loop5x_ecb_dec
878
879	add	x2,x2,#0x50
880	cbz	x2,.Lecb_done
881
882	add	w6,w5,#2
883	subs	x2,x2,#0x30
884	orr	v0.16b,v27.16b,v27.16b
885	orr	v1.16b,v28.16b,v28.16b
886	orr	v24.16b,v29.16b,v29.16b
887	b.lo	.Lecb_dec_tail
888
889	b	.Loop3x_ecb_dec
890
891.align	4
// Reached from the 5x loop via cbz x6: only the results held in v1, v24,
// v25 and v26 are finished and stored.
892.Lecb_tail4x:
893	eor	v5.16b,v7.16b,v1.16b
894	eor	v17.16b,v7.16b,v24.16b
895	eor	v30.16b,v7.16b,v25.16b
896	eor	v31.16b,v7.16b,v26.16b
897	st1	{v5.16b},[x1],#16
898	st1	{v17.16b},[x1],#16
899	st1	{v30.16b},[x1],#16
900	st1	{v31.16b},[x1],#16
901
902	b	.Lecb_done
903.align	4
// Three blocks in flight: v0, v1, v24 (entered with w6 = w5+2).
904.Loop3x_ecb_dec:
905	aesd	v0.16b,v16.16b
906	aesimc	v0.16b,v0.16b
907	aesd	v1.16b,v16.16b
908	aesimc	v1.16b,v1.16b
909	aesd	v24.16b,v16.16b
910	aesimc	v24.16b,v24.16b
911	ld1	{v16.4s},[x7],#16
912	subs	w6,w6,#2
913	aesd	v0.16b,v17.16b
914	aesimc	v0.16b,v0.16b
915	aesd	v1.16b,v17.16b
916	aesimc	v1.16b,v1.16b
917	aesd	v24.16b,v17.16b
918	aesimc	v24.16b,v24.16b
919	ld1	{v17.4s},[x7],#16
920	b.gt	.Loop3x_ecb_dec
921
922	aesd	v0.16b,v16.16b
923	aesimc	v0.16b,v0.16b
924	aesd	v1.16b,v16.16b
925	aesimc	v1.16b,v1.16b
926	aesd	v24.16b,v16.16b
927	aesimc	v24.16b,v24.16b
// The flags from this subs also decide the b.hs that closes the loop.
928	subs	x2,x2,#0x30
929	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
930	aesd	v0.16b,v17.16b
931	aesimc	v0.16b,v0.16b
932	aesd	v1.16b,v17.16b
933	aesimc	v1.16b,v1.16b
934	aesd	v24.16b,v17.16b
935	aesimc	v24.16b,v24.16b
936	add	x0,x0,x6 			// x0 is adjusted in such way that
937						// at exit from the loop v1.16b-v24.16b
938						// are loaded with last "words"
939	mov	x7,x3
940	aesd	v0.16b,v20.16b
941	aesimc	v0.16b,v0.16b
942	aesd	v1.16b,v20.16b
943	aesimc	v1.16b,v1.16b
944	aesd	v24.16b,v20.16b
945	aesimc	v24.16b,v24.16b
946	ld1	{v2.16b},[x0],#16
947	aesd	v0.16b,v21.16b
948	aesimc	v0.16b,v0.16b
949	aesd	v1.16b,v21.16b
950	aesimc	v1.16b,v1.16b
951	aesd	v24.16b,v21.16b
952	aesimc	v24.16b,v24.16b
953	ld1	{v3.16b},[x0],#16
954	aesd	v0.16b,v22.16b
955	aesimc	v0.16b,v0.16b
956	aesd	v1.16b,v22.16b
957	aesimc	v1.16b,v1.16b
958	aesd	v24.16b,v22.16b
959	aesimc	v24.16b,v24.16b
960	ld1	{v27.16b},[x0],#16
961	aesd	v0.16b,v23.16b
962	aesd	v1.16b,v23.16b
963	aesd	v24.16b,v23.16b
964	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
965	add	w6,w5,#2
966	eor	v4.16b,v7.16b,v0.16b
967	eor	v5.16b,v7.16b,v1.16b
968	eor	v24.16b,v24.16b,v7.16b
969	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
970	st1	{v4.16b},[x1],#16
971	orr	v0.16b,v2.16b,v2.16b
972	st1	{v5.16b},[x1],#16
973	orr	v1.16b,v3.16b,v3.16b
974	st1	{v24.16b},[x1],#16
975	orr	v24.16b,v27.16b,v27.16b
976	b.hs	.Loop3x_ecb_dec
977
// x2 == -0x30 means nothing is left after the last group of three.
978	cmn	x2,#0x30
979	b.eq	.Lecb_done
980	nop
981
// Tail: one or two blocks remain, held in v1 and v24.
982.Lecb_dec_tail:
983	aesd	v1.16b,v16.16b
984	aesimc	v1.16b,v1.16b
985	aesd	v24.16b,v16.16b
986	aesimc	v24.16b,v24.16b
987	ld1	{v16.4s},[x7],#16
988	subs	w6,w6,#2
989	aesd	v1.16b,v17.16b
990	aesimc	v1.16b,v1.16b
991	aesd	v24.16b,v17.16b
992	aesimc	v24.16b,v24.16b
993	ld1	{v17.4s},[x7],#16
994	b.gt	.Lecb_dec_tail
995
996	aesd	v1.16b,v16.16b
997	aesimc	v1.16b,v1.16b
998	aesd	v24.16b,v16.16b
999	aesimc	v24.16b,v24.16b
1000	aesd	v1.16b,v17.16b
1001	aesimc	v1.16b,v1.16b
1002	aesd	v24.16b,v17.16b
1003	aesimc	v24.16b,v24.16b
1004	aesd	v1.16b,v20.16b
1005	aesimc	v1.16b,v1.16b
1006	aesd	v24.16b,v20.16b
1007	aesimc	v24.16b,v24.16b
// x2 == -0x20 selects the single-block exit (.Lecb_dec_one) below.
1008	cmn	x2,#0x20
1009	aesd	v1.16b,v21.16b
1010	aesimc	v1.16b,v1.16b
1011	aesd	v24.16b,v21.16b
1012	aesimc	v24.16b,v24.16b
1013	aesd	v1.16b,v22.16b
1014	aesimc	v1.16b,v1.16b
1015	aesd	v24.16b,v22.16b
1016	aesimc	v24.16b,v24.16b
1017	aesd	v1.16b,v23.16b
1018	aesd	v24.16b,v23.16b
1019	b.eq	.Lecb_dec_one
1020	eor	v5.16b,v7.16b,v1.16b
1021	eor	v17.16b,v7.16b,v24.16b
1022	st1	{v5.16b},[x1],#16
1023	st1	{v17.16b},[x1],#16
1024	b	.Lecb_done
1025
// Only the result in v24 is stored on this exit.
1026.Lecb_dec_one:
1027	eor	v5.16b,v7.16b,v24.16b
1028	st1	{v5.16b},[x1],#16
1029
// Exit for the framed multi-block path: only x29 is reloaded, since x30 is
// not written anywhere in this routine.
1030.Lecb_done:
1031	ldr	x29,[sp],#16
// Exit for the single-block path, which never pushed a frame.
1032.Lecb_Final_abort:
1033	ret
1034.size	aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
1035.globl	aes_v8_cbc_encrypt
1036.type	aes_v8_cbc_encrypt,%function
1037.align	5
1038aes_v8_cbc_encrypt:
1039	AARCH64_VALID_CALL_TARGET
1040	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1041	stp	x29,x30,[sp,#-16]!
1042	add	x29,sp,#0
1043	subs	x2,x2,#16
1044	mov	x8,#16
1045	b.lo	.Lcbc_abort
1046	csel	x8,xzr,x8,eq
1047
1048	cmp	w5,#0			// en- or decrypting?
1049	ldr	w5,[x3,#240]
1050	and	x2,x2,#-16
1051	ld1	{v6.16b},[x4]
1052	ld1	{v0.16b},[x0],x8
1053
1054	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1055	sub	w5,w5,#6
1056	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
1057	sub	w5,w5,#2
1058	ld1	{v18.4s,v19.4s},[x7],#32
1059	ld1	{v20.4s,v21.4s},[x7],#32
1060	ld1	{v22.4s,v23.4s},[x7],#32
1061	ld1	{v7.4s},[x7]
1062
1063	add	x7,x3,#32
1064	mov	w6,w5
1065	b.eq	.Lcbc_dec
1066
1067	cmp	w5,#2
1068	eor	v0.16b,v0.16b,v6.16b
1069	eor	v5.16b,v16.16b,v7.16b
1070	b.eq	.Lcbc_enc128
1071
1072	ld1	{v2.4s,v3.4s},[x7]
1073	add	x7,x3,#16
1074	add	x6,x3,#16*4
1075	add	x12,x3,#16*5
1076	aese	v0.16b,v16.16b
1077	aesmc	v0.16b,v0.16b
1078	add	x14,x3,#16*6
1079	add	x3,x3,#16*7
1080	b	.Lenter_cbc_enc
1081
1082.align	4
1083.Loop_cbc_enc:
1084	aese	v0.16b,v16.16b
1085	aesmc	v0.16b,v0.16b
1086	st1	{v6.16b},[x1],#16
1087.Lenter_cbc_enc:
1088	aese	v0.16b,v17.16b
1089	aesmc	v0.16b,v0.16b
1090	aese	v0.16b,v2.16b
1091	aesmc	v0.16b,v0.16b
1092	ld1	{v16.4s},[x6]
1093	cmp	w5,#4
1094	aese	v0.16b,v3.16b
1095	aesmc	v0.16b,v0.16b
1096	ld1	{v17.4s},[x12]
1097	b.eq	.Lcbc_enc192
1098
1099	aese	v0.16b,v16.16b
1100	aesmc	v0.16b,v0.16b
1101	ld1	{v16.4s},[x14]
1102	aese	v0.16b,v17.16b
1103	aesmc	v0.16b,v0.16b
1104	ld1	{v17.4s},[x3]
1105	nop
1106
1107.Lcbc_enc192:
1108	aese	v0.16b,v16.16b
1109	aesmc	v0.16b,v0.16b
1110	subs	x2,x2,#16
1111	aese	v0.16b,v17.16b
1112	aesmc	v0.16b,v0.16b
1113	csel	x8,xzr,x8,eq
1114	aese	v0.16b,v18.16b
1115	aesmc	v0.16b,v0.16b
1116	aese	v0.16b,v19.16b
1117	aesmc	v0.16b,v0.16b
1118	ld1	{v16.16b},[x0],x8
1119	aese	v0.16b,v20.16b
1120	aesmc	v0.16b,v0.16b
1121	eor	v16.16b,v16.16b,v5.16b
1122	aese	v0.16b,v21.16b
1123	aesmc	v0.16b,v0.16b
1124	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
1125	aese	v0.16b,v22.16b
1126	aesmc	v0.16b,v0.16b
1127	aese	v0.16b,v23.16b
1128	eor	v6.16b,v0.16b,v7.16b
1129	b.hs	.Loop_cbc_enc
1130
1131	st1	{v6.16b},[x1],#16
1132	b	.Lcbc_done
1133
1134.align	5
1135.Lcbc_enc128:
1136	ld1	{v2.4s,v3.4s},[x7]
1137	aese	v0.16b,v16.16b
1138	aesmc	v0.16b,v0.16b
1139	b	.Lenter_cbc_enc128
1140.Loop_cbc_enc128:
1141	aese	v0.16b,v16.16b
1142	aesmc	v0.16b,v0.16b
1143	st1	{v6.16b},[x1],#16
1144.Lenter_cbc_enc128:
1145	aese	v0.16b,v17.16b
1146	aesmc	v0.16b,v0.16b
1147	subs	x2,x2,#16
1148	aese	v0.16b,v2.16b
1149	aesmc	v0.16b,v0.16b
1150	csel	x8,xzr,x8,eq
1151	aese	v0.16b,v3.16b
1152	aesmc	v0.16b,v0.16b
1153	aese	v0.16b,v18.16b
1154	aesmc	v0.16b,v0.16b
1155	aese	v0.16b,v19.16b
1156	aesmc	v0.16b,v0.16b
1157	ld1	{v16.16b},[x0],x8
1158	aese	v0.16b,v20.16b
1159	aesmc	v0.16b,v0.16b
1160	aese	v0.16b,v21.16b
1161	aesmc	v0.16b,v0.16b
1162	aese	v0.16b,v22.16b
1163	aesmc	v0.16b,v0.16b
1164	eor	v16.16b,v16.16b,v5.16b
1165	aese	v0.16b,v23.16b
1166	eor	v6.16b,v0.16b,v7.16b
1167	b.hs	.Loop_cbc_enc128
1168
1169	st1	{v6.16b},[x1],#16
1170	b	.Lcbc_done
1171.align	5
1172.Lcbc_dec:
1173	ld1	{v24.16b},[x0],#16
1174	subs	x2,x2,#32		// bias
1175	add	w6,w5,#2
1176	orr	v3.16b,v0.16b,v0.16b
1177	orr	v1.16b,v0.16b,v0.16b
1178	orr	v27.16b,v24.16b,v24.16b
1179	b.lo	.Lcbc_dec_tail
1180
1181	orr	v1.16b,v24.16b,v24.16b
1182	ld1	{v24.16b},[x0],#16
1183	orr	v2.16b,v0.16b,v0.16b
1184	orr	v3.16b,v1.16b,v1.16b
1185	orr	v27.16b,v24.16b,v24.16b
1186	cmp	x2,#32
1187	b.lo	.Loop3x_cbc_dec
1188
1189	ld1	{v25.16b},[x0],#16
1190	ld1	{v26.16b},[x0],#16
1191	sub	x2,x2,#32		// bias
1192	mov	w6,w5
1193	orr	v28.16b,v25.16b,v25.16b
1194	orr	v29.16b,v26.16b,v26.16b
1195
1196.Loop5x_cbc_dec:
1197	aesd	v0.16b,v16.16b
1198	aesimc	v0.16b,v0.16b
1199	aesd	v1.16b,v16.16b
1200	aesimc	v1.16b,v1.16b
1201	aesd	v24.16b,v16.16b
1202	aesimc	v24.16b,v24.16b
1203	aesd	v25.16b,v16.16b
1204	aesimc	v25.16b,v25.16b
1205	aesd	v26.16b,v16.16b
1206	aesimc	v26.16b,v26.16b
1207	ld1	{v16.4s},[x7],#16
1208	subs	w6,w6,#2
1209	aesd	v0.16b,v17.16b
1210	aesimc	v0.16b,v0.16b
1211	aesd	v1.16b,v17.16b
1212	aesimc	v1.16b,v1.16b
1213	aesd	v24.16b,v17.16b
1214	aesimc	v24.16b,v24.16b
1215	aesd	v25.16b,v17.16b
1216	aesimc	v25.16b,v25.16b
1217	aesd	v26.16b,v17.16b
1218	aesimc	v26.16b,v26.16b
1219	ld1	{v17.4s},[x7],#16
1220	b.gt	.Loop5x_cbc_dec
1221
1222	aesd	v0.16b,v16.16b
1223	aesimc	v0.16b,v0.16b
1224	aesd	v1.16b,v16.16b
1225	aesimc	v1.16b,v1.16b
1226	aesd	v24.16b,v16.16b
1227	aesimc	v24.16b,v24.16b
1228	aesd	v25.16b,v16.16b
1229	aesimc	v25.16b,v25.16b
1230	aesd	v26.16b,v16.16b
1231	aesimc	v26.16b,v26.16b
1232	cmp	x2,#0x40		// because .Lcbc_tail4x
1233	sub	x2,x2,#0x50
1234
1235	aesd	v0.16b,v17.16b
1236	aesimc	v0.16b,v0.16b
1237	aesd	v1.16b,v17.16b
1238	aesimc	v1.16b,v1.16b
1239	aesd	v24.16b,v17.16b
1240	aesimc	v24.16b,v24.16b
1241	aesd	v25.16b,v17.16b
1242	aesimc	v25.16b,v25.16b
1243	aesd	v26.16b,v17.16b
1244	aesimc	v26.16b,v26.16b
1245	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
1246	mov	x7,x3
1247
1248	aesd	v0.16b,v18.16b
1249	aesimc	v0.16b,v0.16b
1250	aesd	v1.16b,v18.16b
1251	aesimc	v1.16b,v1.16b
1252	aesd	v24.16b,v18.16b
1253	aesimc	v24.16b,v24.16b
1254	aesd	v25.16b,v18.16b
1255	aesimc	v25.16b,v25.16b
1256	aesd	v26.16b,v18.16b
1257	aesimc	v26.16b,v26.16b
1258	add	x0,x0,x6		// x0 is adjusted in such way that
1259					// at exit from the loop v1.16b-v26.16b
1260					// are loaded with last "words"
1261	add	x6,x2,#0x60		// because .Lcbc_tail4x
1262
1263	aesd	v0.16b,v19.16b
1264	aesimc	v0.16b,v0.16b
1265	aesd	v1.16b,v19.16b
1266	aesimc	v1.16b,v1.16b
1267	aesd	v24.16b,v19.16b
1268	aesimc	v24.16b,v24.16b
1269	aesd	v25.16b,v19.16b
1270	aesimc	v25.16b,v25.16b
1271	aesd	v26.16b,v19.16b
1272	aesimc	v26.16b,v26.16b
1273
1274	aesd	v0.16b,v20.16b
1275	aesimc	v0.16b,v0.16b
1276	aesd	v1.16b,v20.16b
1277	aesimc	v1.16b,v1.16b
1278	aesd	v24.16b,v20.16b
1279	aesimc	v24.16b,v24.16b
1280	aesd	v25.16b,v20.16b
1281	aesimc	v25.16b,v25.16b
1282	aesd	v26.16b,v20.16b
1283	aesimc	v26.16b,v26.16b
1284
1285	aesd	v0.16b,v21.16b
1286	aesimc	v0.16b,v0.16b
1287	aesd	v1.16b,v21.16b
1288	aesimc	v1.16b,v1.16b
1289	aesd	v24.16b,v21.16b
1290	aesimc	v24.16b,v24.16b
1291	aesd	v25.16b,v21.16b
1292	aesimc	v25.16b,v25.16b
1293	aesd	v26.16b,v21.16b
1294	aesimc	v26.16b,v26.16b
1295
1296	aesd	v0.16b,v22.16b
1297	aesimc	v0.16b,v0.16b
1298	aesd	v1.16b,v22.16b
1299	aesimc	v1.16b,v1.16b
1300	aesd	v24.16b,v22.16b
1301	aesimc	v24.16b,v24.16b
1302	aesd	v25.16b,v22.16b
1303	aesimc	v25.16b,v25.16b
1304	aesd	v26.16b,v22.16b
1305	aesimc	v26.16b,v26.16b
1306
1307	eor	v4.16b,v6.16b,v7.16b
1308	aesd	v0.16b,v23.16b
1309	eor	v5.16b,v2.16b,v7.16b
1310	ld1	{v2.16b},[x0],#16
1311	aesd	v1.16b,v23.16b
1312	eor	v17.16b,v3.16b,v7.16b
1313	ld1	{v3.16b},[x0],#16
1314	aesd	v24.16b,v23.16b
1315	eor	v30.16b,v27.16b,v7.16b
1316	ld1	{v27.16b},[x0],#16
1317	aesd	v25.16b,v23.16b
1318	eor	v31.16b,v28.16b,v7.16b
1319	ld1	{v28.16b},[x0],#16
1320	aesd	v26.16b,v23.16b
1321	orr	v6.16b,v29.16b,v29.16b
1322	ld1	{v29.16b},[x0],#16
1323	cbz	x6,.Lcbc_tail4x
1324	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1325	eor	v4.16b,v4.16b,v0.16b
1326	orr	v0.16b,v2.16b,v2.16b
1327	eor	v5.16b,v5.16b,v1.16b
1328	orr	v1.16b,v3.16b,v3.16b
1329	eor	v17.16b,v17.16b,v24.16b
1330	orr	v24.16b,v27.16b,v27.16b
1331	eor	v30.16b,v30.16b,v25.16b
1332	orr	v25.16b,v28.16b,v28.16b
1333	eor	v31.16b,v31.16b,v26.16b
1334	st1	{v4.16b},[x1],#16
1335	orr	v26.16b,v29.16b,v29.16b
1336	st1	{v5.16b},[x1],#16
1337	mov	w6,w5
1338	st1	{v17.16b},[x1],#16
1339	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1340	st1	{v30.16b},[x1],#16
1341	st1	{v31.16b},[x1],#16
1342	b.hs	.Loop5x_cbc_dec
1343
1344	add	x2,x2,#0x50
1345	cbz	x2,.Lcbc_done
1346
1347	add	w6,w5,#2
1348	subs	x2,x2,#0x30
1349	orr	v0.16b,v27.16b,v27.16b
1350	orr	v2.16b,v27.16b,v27.16b
1351	orr	v1.16b,v28.16b,v28.16b
1352	orr	v3.16b,v28.16b,v28.16b
1353	orr	v24.16b,v29.16b,v29.16b
1354	orr	v27.16b,v29.16b,v29.16b
1355	b.lo	.Lcbc_dec_tail
1356
1357	b	.Loop3x_cbc_dec
1358
1359.align	4
1360.Lcbc_tail4x:
1361	eor	v5.16b,v4.16b,v1.16b
1362	eor	v17.16b,v17.16b,v24.16b
1363	eor	v30.16b,v30.16b,v25.16b
1364	eor	v31.16b,v31.16b,v26.16b
1365	st1	{v5.16b},[x1],#16
1366	st1	{v17.16b},[x1],#16
1367	st1	{v30.16b},[x1],#16
1368	st1	{v31.16b},[x1],#16
1369
1370	b	.Lcbc_done
1371.align	4
1372.Loop3x_cbc_dec:
1373	aesd	v0.16b,v16.16b
1374	aesimc	v0.16b,v0.16b
1375	aesd	v1.16b,v16.16b
1376	aesimc	v1.16b,v1.16b
1377	aesd	v24.16b,v16.16b
1378	aesimc	v24.16b,v24.16b
1379	ld1	{v16.4s},[x7],#16
1380	subs	w6,w6,#2
1381	aesd	v0.16b,v17.16b
1382	aesimc	v0.16b,v0.16b
1383	aesd	v1.16b,v17.16b
1384	aesimc	v1.16b,v1.16b
1385	aesd	v24.16b,v17.16b
1386	aesimc	v24.16b,v24.16b
1387	ld1	{v17.4s},[x7],#16
1388	b.gt	.Loop3x_cbc_dec
1389
1390	aesd	v0.16b,v16.16b
1391	aesimc	v0.16b,v0.16b
1392	aesd	v1.16b,v16.16b
1393	aesimc	v1.16b,v1.16b
1394	aesd	v24.16b,v16.16b
1395	aesimc	v24.16b,v24.16b
1396	eor	v4.16b,v6.16b,v7.16b
1397	subs	x2,x2,#0x30
1398	eor	v5.16b,v2.16b,v7.16b
1399	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
1400	aesd	v0.16b,v17.16b
1401	aesimc	v0.16b,v0.16b
1402	aesd	v1.16b,v17.16b
1403	aesimc	v1.16b,v1.16b
1404	aesd	v24.16b,v17.16b
1405	aesimc	v24.16b,v24.16b
1406	eor	v17.16b,v3.16b,v7.16b
1407	add	x0,x0,x6		// x0 is adjusted in such way that
1408					// at exit from the loop v1.16b-v24.16b
1409					// are loaded with last "words"
1410	orr	v6.16b,v27.16b,v27.16b
1411	mov	x7,x3
1412	aesd	v0.16b,v20.16b
1413	aesimc	v0.16b,v0.16b
1414	aesd	v1.16b,v20.16b
1415	aesimc	v1.16b,v1.16b
1416	aesd	v24.16b,v20.16b
1417	aesimc	v24.16b,v24.16b
1418	ld1	{v2.16b},[x0],#16
1419	aesd	v0.16b,v21.16b
1420	aesimc	v0.16b,v0.16b
1421	aesd	v1.16b,v21.16b
1422	aesimc	v1.16b,v1.16b
1423	aesd	v24.16b,v21.16b
1424	aesimc	v24.16b,v24.16b
1425	ld1	{v3.16b},[x0],#16
1426	aesd	v0.16b,v22.16b
1427	aesimc	v0.16b,v0.16b
1428	aesd	v1.16b,v22.16b
1429	aesimc	v1.16b,v1.16b
1430	aesd	v24.16b,v22.16b
1431	aesimc	v24.16b,v24.16b
1432	ld1	{v27.16b},[x0],#16
1433	aesd	v0.16b,v23.16b
1434	aesd	v1.16b,v23.16b
1435	aesd	v24.16b,v23.16b
1436	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1437	add	w6,w5,#2
1438	eor	v4.16b,v4.16b,v0.16b
1439	eor	v5.16b,v5.16b,v1.16b
1440	eor	v24.16b,v24.16b,v17.16b
1441	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1442	st1	{v4.16b},[x1],#16
1443	orr	v0.16b,v2.16b,v2.16b
1444	st1	{v5.16b},[x1],#16
1445	orr	v1.16b,v3.16b,v3.16b
1446	st1	{v24.16b},[x1],#16
1447	orr	v24.16b,v27.16b,v27.16b
1448	b.hs	.Loop3x_cbc_dec
1449
1450	cmn	x2,#0x30
1451	b.eq	.Lcbc_done
1452	nop
1453
1454.Lcbc_dec_tail:
1455	aesd	v1.16b,v16.16b
1456	aesimc	v1.16b,v1.16b
1457	aesd	v24.16b,v16.16b
1458	aesimc	v24.16b,v24.16b
1459	ld1	{v16.4s},[x7],#16
1460	subs	w6,w6,#2
1461	aesd	v1.16b,v17.16b
1462	aesimc	v1.16b,v1.16b
1463	aesd	v24.16b,v17.16b
1464	aesimc	v24.16b,v24.16b
1465	ld1	{v17.4s},[x7],#16
1466	b.gt	.Lcbc_dec_tail
1467
1468	aesd	v1.16b,v16.16b
1469	aesimc	v1.16b,v1.16b
1470	aesd	v24.16b,v16.16b
1471	aesimc	v24.16b,v24.16b
1472	aesd	v1.16b,v17.16b
1473	aesimc	v1.16b,v1.16b
1474	aesd	v24.16b,v17.16b
1475	aesimc	v24.16b,v24.16b
1476	aesd	v1.16b,v20.16b
1477	aesimc	v1.16b,v1.16b
1478	aesd	v24.16b,v20.16b
1479	aesimc	v24.16b,v24.16b
1480	cmn	x2,#0x20
1481	aesd	v1.16b,v21.16b
1482	aesimc	v1.16b,v1.16b
1483	aesd	v24.16b,v21.16b
1484	aesimc	v24.16b,v24.16b
1485	eor	v5.16b,v6.16b,v7.16b
1486	aesd	v1.16b,v22.16b
1487	aesimc	v1.16b,v1.16b
1488	aesd	v24.16b,v22.16b
1489	aesimc	v24.16b,v24.16b
1490	eor	v17.16b,v3.16b,v7.16b
1491	aesd	v1.16b,v23.16b
1492	aesd	v24.16b,v23.16b
1493	b.eq	.Lcbc_dec_one
1494	eor	v5.16b,v5.16b,v1.16b
1495	eor	v17.16b,v17.16b,v24.16b
1496	orr	v6.16b,v27.16b,v27.16b
1497	st1	{v5.16b},[x1],#16
1498	st1	{v17.16b},[x1],#16
1499	b	.Lcbc_done
1500
1501.Lcbc_dec_one:
1502	eor	v5.16b,v5.16b,v24.16b
1503	orr	v6.16b,v27.16b,v27.16b
1504	st1	{v5.16b},[x1],#16
1505
1506.Lcbc_done:
1507	st1	{v6.16b},[x4]
1508.Lcbc_abort:
1509	ldr	x29,[sp],#16
1510	ret
1511.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
// aes_v8_ctr32_encrypt_blocks: AES counter-mode processing of whole 16-byte
// blocks; only the last 32-bit word of the counter block is incremented.
//
// Register use on entry, as read by the code below:
//   x0 - source pointer (16-byte loads, post-incremented)
//   x1 - destination pointer (16-byte stores, post-incremented)
//   x2 - number of 16-byte blocks
//   x3 - key schedule; the 32-bit word at offset 240 is used as round count
//   x4 - 16-byte counter block (never written back by this routine)
// NOTE(review): presumably the C prototype is (in, out, blocks, key, ivec) -
// confirm against the declaration.
// NOTE(review): there is no early exit for x2 == 0; that value falls into
// .Lctr32_tail, which still loads and stores.  Callers presumably guarantee
// x2 >= 1 - confirm.
//
// Internal roles: w8 = running counter as a native integer, v6 = copy of the
// initial counter block used as a template, v7 = last round key, v20-v23 =
// the four round keys before it, v16/v17 = earlier round keys streamed via
// x7, w5 = reload value for the per-pass round counter w6.
1512.globl	aes_v8_ctr32_encrypt_blocks
1513.type	aes_v8_ctr32_encrypt_blocks,%function
1514.align	5
1515aes_v8_ctr32_encrypt_blocks:
1516	AARCH64_VALID_CALL_TARGET
1517	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1518	stp	x29,x30,[sp,#-16]!
1519	add	x29,sp,#0
1520	ldr	w5,[x3,#240]		// w5 = round count stored at key+240
1521
1522	ldr	w8, [x4, #12]		// w8 = last 32-bit word of the counter block
1523#ifdef __AARCH64EB__
1524	ld1	{v0.16b},[x4]
1525#else
1526	ld1	{v0.4s},[x4]
1527#endif
1528	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1529	sub	w5,w5,#4
1530	mov	x12,#16
1531	cmp	x2,#2			// flags are consumed by the csel and b.ls below
1532	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
1533	sub	w5,w5,#2
1534	ld1	{v20.4s,v21.4s},[x7],#32
1535	ld1	{v22.4s,v23.4s},[x7],#32
1536	ld1	{v7.4s},[x7]		// v7 = last round key
1537	add	x7,x3,#32		// x7 -> third round key (first two are in v16/v17)
1538	mov	w6,w5
1539	csel	x12,xzr,x12,lo		// x12 = 0 if fewer than 2 blocks, else 16 (tail load stride)
1540#ifndef __AARCH64EB__
1541	rev	w8, w8			// counter word to native integer order
1542#endif
1543	orr	v1.16b,v0.16b,v0.16b
1544	add	w10, w8, #1
1545	orr	v18.16b,v0.16b,v0.16b
1546	add	w8, w8, #2
1547	orr	v6.16b,v0.16b,v0.16b	// v6 = template copy of the counter block
1548	rev	w10, w10
1549	mov	v1.s[3],w10		// v1 = counter block + 1
1550	b.ls	.Lctr32_tail		// x2 <= 2 (flags still from cmp x2,#2)
1551	rev	w12, w8
1552	sub	x2,x2,#3		// bias
1553	mov	v18.s[3],w12		// v18 = counter block + 2
1554	cmp	x2,#32
1555	b.lo	.Loop3x_ctr32		// biased count below 32: use the 3-block loop only
1556
1557	add	w13,w8,#1
1558	add	w14,w8,#2
1559	orr	v24.16b,v0.16b,v0.16b
1560	rev	w13,w13
1561	orr	v25.16b,v0.16b,v0.16b
1562	rev	w14,w14
1563	mov	v24.s[3],w13		// v24 = counter block + 3
1564	sub	x2,x2,#2		// bias
1565	mov	v25.s[3],w14		// v25 = counter block + 4
1566	add	w8,w8,#2
1567	b	.Loop5x_ctr32
1568
1569.align	4
1570.Loop5x_ctr32:
	// Two rounds per pass on five counter blocks (v0,v1,v18,v24,v25).
1571	aese	v0.16b,v16.16b
1572	aesmc	v0.16b,v0.16b
1573	aese	v1.16b,v16.16b
1574	aesmc	v1.16b,v1.16b
1575	aese	v18.16b,v16.16b
1576	aesmc	v18.16b,v18.16b
1577	aese	v24.16b,v16.16b
1578	aesmc	v24.16b,v24.16b
1579	aese	v25.16b,v16.16b
1580	aesmc	v25.16b,v25.16b
1581	ld1	{v16.4s},[x7],#16
1582	subs	w6,w6,#2
1583	aese	v0.16b,v17.16b
1584	aesmc	v0.16b,v0.16b
1585	aese	v1.16b,v17.16b
1586	aesmc	v1.16b,v1.16b
1587	aese	v18.16b,v17.16b
1588	aesmc	v18.16b,v18.16b
1589	aese	v24.16b,v17.16b
1590	aesmc	v24.16b,v24.16b
1591	aese	v25.16b,v17.16b
1592	aesmc	v25.16b,v25.16b
1593	ld1	{v17.4s},[x7],#16
1594	b.gt	.Loop5x_ctr32
1595
1596	mov	x7,x3			// rewind the key pointer for the next batch
1597	aese	v0.16b,v16.16b
1598	aesmc	v0.16b,v0.16b
1599	aese	v1.16b,v16.16b
1600	aesmc	v1.16b,v1.16b
1601	aese	v18.16b,v16.16b
1602	aesmc	v18.16b,v18.16b
1603	aese	v24.16b,v16.16b
1604	aesmc	v24.16b,v24.16b
1605	aese	v25.16b,v16.16b
1606	aesmc	v25.16b,v25.16b
1607	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1608
1609	aese	v0.16b,v17.16b
1610	aesmc	v0.16b,v0.16b
1611	aese	v1.16b,v17.16b
1612	aesmc	v1.16b,v1.16b
1613	aese	v18.16b,v17.16b
1614	aesmc	v18.16b,v18.16b
1615	aese	v24.16b,v17.16b
1616	aesmc	v24.16b,v24.16b
1617	aese	v25.16b,v17.16b
1618	aesmc	v25.16b,v25.16b
1619	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1620
	// w9,w10,w12,w13,w14 = w8+1..w8+5, byte-reversed: counter words for
	// the next batch, computed in between the cipher rounds.
1621	aese	v0.16b,v20.16b
1622	aesmc	v0.16b,v0.16b
1623	add	w9,w8,#1
1624	add	w10,w8,#2
1625	aese	v1.16b,v20.16b
1626	aesmc	v1.16b,v1.16b
1627	add	w12,w8,#3
1628	add	w13,w8,#4
1629	aese	v18.16b,v20.16b
1630	aesmc	v18.16b,v18.16b
1631	add	w14,w8,#5
1632	rev	w9,w9
1633	aese	v24.16b,v20.16b
1634	aesmc	v24.16b,v24.16b
1635	rev	w10,w10
1636	rev	w12,w12
1637	aese	v25.16b,v20.16b
1638	aesmc	v25.16b,v25.16b
1639	rev	w13,w13
1640	rev	w14,w14
1641
1642	aese	v0.16b,v21.16b
1643	aesmc	v0.16b,v0.16b
1644	aese	v1.16b,v21.16b
1645	aesmc	v1.16b,v1.16b
1646	aese	v18.16b,v21.16b
1647	aesmc	v18.16b,v18.16b
1648	aese	v24.16b,v21.16b
1649	aesmc	v24.16b,v24.16b
1650	aese	v25.16b,v21.16b
1651	aesmc	v25.16b,v25.16b
1652
1653	aese	v0.16b,v22.16b
1654	aesmc	v0.16b,v0.16b
1655	ld1	{v2.16b},[x0],#16	// five input blocks -> v2,v3,v19,v26,v27
1656	aese	v1.16b,v22.16b
1657	aesmc	v1.16b,v1.16b
1658	ld1	{v3.16b},[x0],#16
1659	aese	v18.16b,v22.16b
1660	aesmc	v18.16b,v18.16b
1661	ld1	{v19.16b},[x0],#16
1662	aese	v24.16b,v22.16b
1663	aesmc	v24.16b,v24.16b
1664	ld1	{v26.16b},[x0],#16
1665	aese	v25.16b,v22.16b
1666	aesmc	v25.16b,v25.16b
1667	ld1	{v27.16b},[x0],#16
1668
	// Last aese has no aesmc; the last round key v7 is XORed into the
	// input blocks first and the cipher state is XORed in afterwards.
1669	aese	v0.16b,v23.16b
1670	eor	v2.16b,v2.16b,v7.16b
1671	aese	v1.16b,v23.16b
1672	eor	v3.16b,v3.16b,v7.16b
1673	aese	v18.16b,v23.16b
1674	eor	v19.16b,v19.16b,v7.16b
1675	aese	v24.16b,v23.16b
1676	eor	v26.16b,v26.16b,v7.16b
1677	aese	v25.16b,v23.16b
1678	eor	v27.16b,v27.16b,v7.16b
1679
1680	eor	v2.16b,v2.16b,v0.16b
1681	orr	v0.16b,v6.16b,v6.16b	// reset each state register to the counter template
1682	eor	v3.16b,v3.16b,v1.16b
1683	orr	v1.16b,v6.16b,v6.16b
1684	eor	v19.16b,v19.16b,v18.16b
1685	orr	v18.16b,v6.16b,v6.16b
1686	eor	v26.16b,v26.16b,v24.16b
1687	orr	v24.16b,v6.16b,v6.16b
1688	eor	v27.16b,v27.16b,v25.16b
1689	orr	v25.16b,v6.16b,v6.16b
1690
1691	st1	{v2.16b},[x1],#16
1692	mov	v0.s[3],w9		// insert the next counter words
1693	st1	{v3.16b},[x1],#16
1694	mov	v1.s[3],w10
1695	st1	{v19.16b},[x1],#16
1696	mov	v18.s[3],w12
1697	st1	{v26.16b},[x1],#16
1698	mov	v24.s[3],w13
1699	st1	{v27.16b},[x1],#16
1700	mov	v25.s[3],w14
1701
1702	mov	w6,w5			// reload the round counter
1703	cbz	x2,.Lctr32_done		// biased count hit zero: nothing left
1704
1705	add	w8,w8,#5
1706	subs	x2,x2,#5
1707	b.hs	.Loop5x_ctr32		// at least five more blocks
1708
1709	add	x2,x2,#5		// undo the subtraction: x2 = blocks left (1..4)
1710	sub	w8,w8,#5		// and undo the counter advance
1711
1712	cmp	x2,#2
1713	mov	x12,#16
1714	csel	x12,xzr,x12,lo		// tail load stride: 0 for one block, 16 otherwise
1715	b.ls	.Lctr32_tail
1716
1717	sub	x2,x2,#3		// bias
1718	add	w8,w8,#3
1719	b	.Loop3x_ctr32
1720
1721.align	4
1722.Loop3x_ctr32:
	// Two rounds per pass on three counter blocks (v0,v1,v18).
1723	aese	v0.16b,v16.16b
1724	aesmc	v0.16b,v0.16b
1725	aese	v1.16b,v16.16b
1726	aesmc	v1.16b,v1.16b
1727	aese	v18.16b,v16.16b
1728	aesmc	v18.16b,v18.16b
1729	ld1	{v16.4s},[x7],#16
1730	subs	w6,w6,#2
1731	aese	v0.16b,v17.16b
1732	aesmc	v0.16b,v0.16b
1733	aese	v1.16b,v17.16b
1734	aesmc	v1.16b,v1.16b
1735	aese	v18.16b,v17.16b
1736	aesmc	v18.16b,v18.16b
1737	ld1	{v17.4s},[x7],#16
1738	b.gt	.Loop3x_ctr32
1739
	// The three states move to v4, v5 and v17 so that v0, v1 and v18 can
	// be re-seeded from the template v6 while the final rounds finish.
1740	aese	v0.16b,v16.16b
1741	aesmc	v4.16b,v0.16b
1742	aese	v1.16b,v16.16b
1743	aesmc	v5.16b,v1.16b
1744	ld1	{v2.16b},[x0],#16
1745	orr	v0.16b,v6.16b,v6.16b
1746	aese	v18.16b,v16.16b
1747	aesmc	v18.16b,v18.16b
1748	ld1	{v3.16b},[x0],#16
1749	orr	v1.16b,v6.16b,v6.16b
1750	aese	v4.16b,v17.16b
1751	aesmc	v4.16b,v4.16b
1752	aese	v5.16b,v17.16b
1753	aesmc	v5.16b,v5.16b
1754	ld1	{v19.16b},[x0],#16
1755	mov	x7,x3			// rewind the key pointer
1756	aese	v18.16b,v17.16b
1757	aesmc	v17.16b,v18.16b
1758	orr	v18.16b,v6.16b,v6.16b
1759	add	w9,w8,#1
1760	aese	v4.16b,v20.16b
1761	aesmc	v4.16b,v4.16b
1762	aese	v5.16b,v20.16b
1763	aesmc	v5.16b,v5.16b
1764	eor	v2.16b,v2.16b,v7.16b	// pre-XOR the last round key into the input
1765	add	w10,w8,#2
1766	aese	v17.16b,v20.16b
1767	aesmc	v17.16b,v17.16b
1768	eor	v3.16b,v3.16b,v7.16b
1769	add	w8,w8,#3
1770	aese	v4.16b,v21.16b
1771	aesmc	v4.16b,v4.16b
1772	aese	v5.16b,v21.16b
1773	aesmc	v5.16b,v5.16b
1774	eor	v19.16b,v19.16b,v7.16b
1775	rev	w9,w9
1776	aese	v17.16b,v21.16b
1777	aesmc	v17.16b,v17.16b
1778	mov	v0.s[3], w9
1779	rev	w10,w10
1780	aese	v4.16b,v22.16b
1781	aesmc	v4.16b,v4.16b
1782	aese	v5.16b,v22.16b
1783	aesmc	v5.16b,v5.16b
1784	mov	v1.s[3], w10
1785	rev	w12,w8
1786	aese	v17.16b,v22.16b
1787	aesmc	v17.16b,v17.16b
1788	mov	v18.s[3], w12
1789	subs	x2,x2,#3		// flags are consumed by the b.hs at the end of the loop
1790	aese	v4.16b,v23.16b
1791	aese	v5.16b,v23.16b
1792	aese	v17.16b,v23.16b
1793
1794	eor	v2.16b,v2.16b,v4.16b
1795	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1796	st1	{v2.16b},[x1],#16
1797	eor	v3.16b,v3.16b,v5.16b
1798	mov	w6,w5
1799	st1	{v3.16b},[x1],#16
1800	eor	v19.16b,v19.16b,v17.16b
1801	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1802	st1	{v19.16b},[x1],#16
1803	b.hs	.Loop3x_ctr32
1804
1805	adds	x2,x2,#3		// remove the bias: x2 = blocks left (0..2)
1806	b.eq	.Lctr32_done
1807	cmp	x2,#1
1808	mov	x12,#16
1809	csel	x12,xzr,x12,eq		// tail load stride: 0 for one block, 16 for two
1810
1811.Lctr32_tail:
	// Two counter blocks (v0,v1) are always run through the cipher here;
	// the second result is stored only when x2 != 1.
1812	aese	v0.16b,v16.16b
1813	aesmc	v0.16b,v0.16b
1814	aese	v1.16b,v16.16b
1815	aesmc	v1.16b,v1.16b
1816	ld1	{v16.4s},[x7],#16
1817	subs	w6,w6,#2
1818	aese	v0.16b,v17.16b
1819	aesmc	v0.16b,v0.16b
1820	aese	v1.16b,v17.16b
1821	aesmc	v1.16b,v1.16b
1822	ld1	{v17.4s},[x7],#16
1823	b.gt	.Lctr32_tail
1824
1825	aese	v0.16b,v16.16b
1826	aesmc	v0.16b,v0.16b
1827	aese	v1.16b,v16.16b
1828	aesmc	v1.16b,v1.16b
1829	aese	v0.16b,v17.16b
1830	aesmc	v0.16b,v0.16b
1831	aese	v1.16b,v17.16b
1832	aesmc	v1.16b,v1.16b
1833	ld1	{v2.16b},[x0],x12	// with x12 == 0 the next load re-reads this same block
1834	aese	v0.16b,v20.16b
1835	aesmc	v0.16b,v0.16b
1836	aese	v1.16b,v20.16b
1837	aesmc	v1.16b,v1.16b
1838	ld1	{v3.16b},[x0]
1839	aese	v0.16b,v21.16b
1840	aesmc	v0.16b,v0.16b
1841	aese	v1.16b,v21.16b
1842	aesmc	v1.16b,v1.16b
1843	eor	v2.16b,v2.16b,v7.16b
1844	aese	v0.16b,v22.16b
1845	aesmc	v0.16b,v0.16b
1846	aese	v1.16b,v22.16b
1847	aesmc	v1.16b,v1.16b
1848	eor	v3.16b,v3.16b,v7.16b
1849	aese	v0.16b,v23.16b
1850	aese	v1.16b,v23.16b
1851
1852	cmp	x2,#1
1853	eor	v2.16b,v2.16b,v0.16b
1854	eor	v3.16b,v3.16b,v1.16b
1855	st1	{v2.16b},[x1],#16
1856	b.eq	.Lctr32_done		// single block: skip the second store
1857	st1	{v3.16b},[x1]
1858
1859.Lctr32_done:
1860	ldr	x29,[sp],#16		// restore x29 and release the 16-byte frame (x30 is not reloaded)
1861	ret
1862.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
// aes_v8_xts_encrypt: AES-XTS encryption, with ciphertext stealing for a
// trailing partial block.
//
// Register use on entry, as read by the code below:
//   x0 - source pointer
//   x1 - destination pointer
//   x2 - length in bytes
//   x3 - key schedule applied to the data blocks (round count at offset 240)
//   x4 - key schedule used only to encrypt the initial tweak (round count
//        at offset 240)
//   x5 - pointer to the 16-byte initial tweak value
// NOTE(review): presumably the C prototype is (in, out, length, key1, key2,
// iv) - confirm against the declaration.
//
// A length of exactly 16 takes a short single-block path that saves nothing
// on the stack.  Every other length goes through .Lxts_enc_big_size, which
// saves x19-x22 and d8-d11 in a 64-byte frame, returns without storing any
// output when fewer than 16 bytes were supplied, and otherwise handles five,
// three, or one/two blocks per pass.
//
// Tweaks are kept in v6, v8, v9, v10, v11.  x10:x9 mirrors the most recently
// generated tweak (x9 = low half, x10 = high half) so the next one can be
// derived with integer shifts; w19 holds the constant 0x87 used by that step.
// x3 and x4 are advanced by post-increment loads on some paths.
1863.globl	aes_v8_xts_encrypt
1864.type	aes_v8_xts_encrypt,%function
1865.align	5
1866aes_v8_xts_encrypt:
1867	AARCH64_VALID_CALL_TARGET
1868	cmp	x2,#16
1869	// Length is not exactly 16 bytes: take the general path (which also rejects lengths below 16).
1870	b.ne	.Lxts_enc_big_size
1871	// Encrypt the iv with key2, as the first XEX iv.
1872	ldr	w6,[x4,#240]		// w6 = round count of the tweak key
1873	ld1	{v0.4s},[x4],#16
1874	ld1	{v6.16b},[x5]		// v6 = initial tweak value
1875	sub	w6,w6,#2
1876	ld1	{v1.4s},[x4],#16
1877
1878.Loop_enc_iv_enc:
1879	aese	v6.16b,v0.16b
1880	aesmc	v6.16b,v6.16b
1881	ld1	{v0.4s},[x4],#16
1882	subs	w6,w6,#2
1883	aese	v6.16b,v1.16b
1884	aesmc	v6.16b,v6.16b
1885	ld1	{v1.4s},[x4],#16
1886	b.gt	.Loop_enc_iv_enc
1887
1888	aese	v6.16b,v0.16b
1889	aesmc	v6.16b,v6.16b
1890	ld1	{v0.4s},[x4]
1891	aese	v6.16b,v1.16b
1892	eor	v6.16b,v6.16b,v0.16b	// v6 = encrypted tweak
1893
1894	ld1	{v0.16b},[x0]
1895	eor	v0.16b,v6.16b,v0.16b	// input block XOR tweak
1896
1897	ldr	w6,[x3,#240]		// w6 = round count of the data key
1898	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...
1899
1900	aese	v0.16b,v28.16b
1901	aesmc	v0.16b,v0.16b
1902	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
1903	aese	v0.16b,v29.16b
1904	aesmc	v0.16b,v0.16b
1905	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
1906	b.eq	.Lxts_128_enc
1907.Lxts_enc_round_loop:
1908	aese	v0.16b,v16.16b
1909	aesmc	v0.16b,v0.16b
1910	ld1	{v16.4s},[x3],#16		// load key schedule...
1911	aese	v0.16b,v17.16b
1912	aesmc	v0.16b,v0.16b
1913	ld1	{v17.4s},[x3],#16		// load key schedule...
1914	subs	w6,w6,#2		// bias
1915	b.gt	.Lxts_enc_round_loop
1916.Lxts_128_enc:
1917	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
1918	aese	v0.16b,v16.16b
1919	aesmc	v0.16b,v0.16b
1920	aese	v0.16b,v17.16b
1921	aesmc	v0.16b,v0.16b
1922	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
1923	aese	v0.16b,v18.16b
1924	aesmc	v0.16b,v0.16b
1925	aese	v0.16b,v19.16b
1926	aesmc	v0.16b,v0.16b
1927	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
1928	aese	v0.16b,v20.16b
1929	aesmc	v0.16b,v0.16b
1930	aese	v0.16b,v21.16b
1931	aesmc	v0.16b,v0.16b
1932	ld1	{v7.4s},[x3]		// v7 = last round key
1933	aese	v0.16b,v22.16b
1934	aesmc	v0.16b,v0.16b
1935	aese	v0.16b,v23.16b
1936	eor	v0.16b,v0.16b,v7.16b
1937	eor	v0.16b,v0.16b,v6.16b	// XOR the tweak back in
1938	st1	{v0.16b},[x1]
1939	b	.Lxts_enc_final_abort	// nothing was pushed on this path: plain ret
1940
1941.align	4
1942.Lxts_enc_big_size:
1943	stp	x19,x20,[sp,#-64]!
1944	stp	x21,x22,[sp,#48]
1945	stp	d8,d9,[sp,#32]
1946	stp	d10,d11,[sp,#16]
1947
1948	// x21 keeps the tail byte count (length % 16); x2 becomes the
	// full-block byte count minus 16.
1949	and	x21,x2,#0xf
1950	and	x2,x2,#-16
1951	subs	x2,x2,#16
1952	mov	x8,#16
1953	b.lo	.Lxts_abort		// fewer than 16 bytes: restore and return
1954	csel	x8,xzr,x8,eq		// x8 = 0 if exactly one full block, else 16 (first load stride)
1955
1956	// Firstly, encrypt the iv with key2, as the first iv of XEX.
1957	ldr	w6,[x4,#240]
1958	ld1	{v0.4s},[x4],#16
1959	ld1	{v6.16b},[x5]
1960	sub	w6,w6,#2
1961	ld1	{v1.4s},[x4],#16
1962
1963.Loop_iv_enc:
1964	aese	v6.16b,v0.16b
1965	aesmc	v6.16b,v6.16b
1966	ld1	{v0.4s},[x4],#16
1967	subs	w6,w6,#2
1968	aese	v6.16b,v1.16b
1969	aesmc	v6.16b,v6.16b
1970	ld1	{v1.4s},[x4],#16
1971	b.gt	.Loop_iv_enc
1972
1973	aese	v6.16b,v0.16b
1974	aesmc	v6.16b,v6.16b
1975	ld1	{v0.4s},[x4]
1976	aese	v6.16b,v1.16b
1977	eor	v6.16b,v6.16b,v0.16b
1978
1979	// The iv for second block
1980	// x9- iv(low), x10 - iv(high)
1981	// The five ivs are kept in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b.
	// Tweak update used throughout: the 128-bit value x10:x9 is shifted
	// left by one bit and, when the bit shifted out of the top was set,
	// 0x87 (w19) is XORed into the low half.
1982	fmov	x9,d6
1983	fmov	x10,v6.d[1]
1984	mov	w19,#0x87
1985	extr	x22,x10,x10,#32		// w22 = upper 32 bits of the old high half
1986	extr	x10,x10,x9,#63		// x10 = (x10 << 1) | (x9 >> 63)
1987	and	w11,w19,w22,asr#31	// w11 = 0x87 if the old top bit was set, else 0
1988	eor	x9,x11,x9,lsl#1		// x9 = (x9 << 1) ^ w11
1989	fmov	d8,x9
1990	fmov	v8.d[1],x10
1991
1992	ldr	w5,[x3,#240]		// w5 = round count of the data key
1993	ld1	{v0.16b},[x0],x8	// first block; with x8 == 0 the next load re-reads it
1994
1995	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
1996	sub	w5,w5,#6
1997	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
1998	sub	w5,w5,#2
1999	ld1	{v18.4s,v19.4s},[x7],#32
2000	ld1	{v20.4s,v21.4s},[x7],#32
2001	ld1	{v22.4s,v23.4s},[x7],#32
2002	ld1	{v7.4s},[x7]		// v7 = last round key
2003
2004	add	x7,x3,#32		// x7 -> third round key
2005	mov	w6,w5
2006
2007	// Encryption
2008.Lxts_enc:
2009	ld1	{v24.16b},[x0],#16
2010	subs	x2,x2,#32			// bias
2011	add	w6,w5,#2
2012	orr	v3.16b,v0.16b,v0.16b
2013	orr	v1.16b,v0.16b,v0.16b
2014	orr	v28.16b,v0.16b,v0.16b
2015	orr	v27.16b,v24.16b,v24.16b
2016	orr	v29.16b,v24.16b,v24.16b
2017	b.lo	.Lxts_inner_enc_tail	// at most two full blocks (flags from subs x2 above)
2018	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
2019	eor	v24.16b,v24.16b,v8.16b
2020
2021	// The iv for third block
2022	extr	x22,x10,x10,#32
2023	extr	x10,x10,x9,#63
2024	and	w11,w19,w22,asr#31
2025	eor	x9,x11,x9,lsl#1
2026	fmov	d9,x9
2027	fmov	v9.d[1],x10
2028
2029
2030	orr	v1.16b,v24.16b,v24.16b
2031	ld1	{v24.16b},[x0],#16
2032	orr	v2.16b,v0.16b,v0.16b
2033	orr	v3.16b,v1.16b,v1.16b
2034	eor	v27.16b,v24.16b,v9.16b 		// the third block
2035	eor	v24.16b,v24.16b,v9.16b
2036	cmp	x2,#32
2037	b.lo	.Lxts_outer_enc_tail	// too few blocks for the five-block loop
2038
2039	// The iv for fourth block
2040	extr	x22,x10,x10,#32
2041	extr	x10,x10,x9,#63
2042	and	w11,w19,w22,asr#31
2043	eor	x9,x11,x9,lsl#1
2044	fmov	d10,x9
2045	fmov	v10.d[1],x10
2046
2047	ld1	{v25.16b},[x0],#16
2048	// The iv for fifth block
2049	extr	x22,x10,x10,#32
2050	extr	x10,x10,x9,#63
2051	and	w11,w19,w22,asr#31
2052	eor	x9,x11,x9,lsl#1
2053	fmov	d11,x9
2054	fmov	v11.d[1],x10
2055
2056	ld1	{v26.16b},[x0],#16
2057	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2058	eor	v26.16b,v26.16b,v11.16b		// the fifth block
2059	sub	x2,x2,#32			// bias
2060	mov	w6,w5
2061	b	.Loop5x_xts_enc
2062
2063.align	4
2064.Loop5x_xts_enc:
	// Two rounds per pass on five blocks (v0,v1,v24,v25,v26).
2065	aese	v0.16b,v16.16b
2066	aesmc	v0.16b,v0.16b
2067	aese	v1.16b,v16.16b
2068	aesmc	v1.16b,v1.16b
2069	aese	v24.16b,v16.16b
2070	aesmc	v24.16b,v24.16b
2071	aese	v25.16b,v16.16b
2072	aesmc	v25.16b,v25.16b
2073	aese	v26.16b,v16.16b
2074	aesmc	v26.16b,v26.16b
2075	ld1	{v16.4s},[x7],#16
2076	subs	w6,w6,#2
2077	aese	v0.16b,v17.16b
2078	aesmc	v0.16b,v0.16b
2079	aese	v1.16b,v17.16b
2080	aesmc	v1.16b,v1.16b
2081	aese	v24.16b,v17.16b
2082	aesmc	v24.16b,v24.16b
2083	aese	v25.16b,v17.16b
2084	aesmc	v25.16b,v25.16b
2085	aese	v26.16b,v17.16b
2086	aesmc	v26.16b,v26.16b
2087	ld1	{v17.4s},[x7],#16
2088	b.gt	.Loop5x_xts_enc
2089
2090	aese	v0.16b,v16.16b
2091	aesmc	v0.16b,v0.16b
2092	aese	v1.16b,v16.16b
2093	aesmc	v1.16b,v1.16b
2094	aese	v24.16b,v16.16b
2095	aesmc	v24.16b,v24.16b
2096	aese	v25.16b,v16.16b
2097	aesmc	v25.16b,v25.16b
2098	aese	v26.16b,v16.16b
2099	aesmc	v26.16b,v26.16b
2100	subs	x2,x2,#0x50			// because .Lxts_enc_tail4x; flags also feed the csel and b.hs below
2101
2102	aese	v0.16b,v17.16b
2103	aesmc	v0.16b,v0.16b
2104	aese	v1.16b,v17.16b
2105	aesmc	v1.16b,v1.16b
2106	aese	v24.16b,v17.16b
2107	aesmc	v24.16b,v24.16b
2108	aese	v25.16b,v17.16b
2109	aesmc	v25.16b,v25.16b
2110	aese	v26.16b,v17.16b
2111	aesmc	v26.16b,v26.16b
2112	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2113	mov	x7,x3			// rewind the key pointer
2114
2115	aese	v0.16b,v18.16b
2116	aesmc	v0.16b,v0.16b
2117	aese	v1.16b,v18.16b
2118	aesmc	v1.16b,v1.16b
2119	aese	v24.16b,v18.16b
2120	aesmc	v24.16b,v24.16b
2121	aese	v25.16b,v18.16b
2122	aesmc	v25.16b,v25.16b
2123	aese	v26.16b,v18.16b
2124	aesmc	v26.16b,v26.16b
2125	add	x0,x0,x6		// x0 is adjusted in such way that
2126						// at exit from the loop v1.16b-v26.16b
2127						// are loaded with last "words"
2128	add	x6,x2,#0x60		// because .Lxts_enc_tail4x
2129
2130	aese	v0.16b,v19.16b
2131	aesmc	v0.16b,v0.16b
2132	aese	v1.16b,v19.16b
2133	aesmc	v1.16b,v1.16b
2134	aese	v24.16b,v19.16b
2135	aesmc	v24.16b,v24.16b
2136	aese	v25.16b,v19.16b
2137	aesmc	v25.16b,v25.16b
2138	aese	v26.16b,v19.16b
2139	aesmc	v26.16b,v26.16b
2140
2141	aese	v0.16b,v20.16b
2142	aesmc	v0.16b,v0.16b
2143	aese	v1.16b,v20.16b
2144	aesmc	v1.16b,v1.16b
2145	aese	v24.16b,v20.16b
2146	aesmc	v24.16b,v24.16b
2147	aese	v25.16b,v20.16b
2148	aesmc	v25.16b,v25.16b
2149	aese	v26.16b,v20.16b
2150	aesmc	v26.16b,v26.16b
2151
2152	aese	v0.16b,v21.16b
2153	aesmc	v0.16b,v0.16b
2154	aese	v1.16b,v21.16b
2155	aesmc	v1.16b,v1.16b
2156	aese	v24.16b,v21.16b
2157	aesmc	v24.16b,v24.16b
2158	aese	v25.16b,v21.16b
2159	aesmc	v25.16b,v25.16b
2160	aese	v26.16b,v21.16b
2161	aesmc	v26.16b,v26.16b
2162
2163	aese	v0.16b,v22.16b
2164	aesmc	v0.16b,v0.16b
2165	aese	v1.16b,v22.16b
2166	aesmc	v1.16b,v1.16b
2167	aese	v24.16b,v22.16b
2168	aesmc	v24.16b,v24.16b
2169	aese	v25.16b,v22.16b
2170	aesmc	v25.16b,v25.16b
2171	aese	v26.16b,v22.16b
2172	aesmc	v26.16b,v26.16b
2173
	// Each output mask (last round key XOR that block's tweak) is formed
	// just before the tweak register is overwritten with the value for
	// the next batch.
2174	eor	v4.16b,v7.16b,v6.16b
2175	aese	v0.16b,v23.16b
2176	// The iv for first block of one iteration
2177	extr	x22,x10,x10,#32
2178	extr	x10,x10,x9,#63
2179	and	w11,w19,w22,asr#31
2180	eor	x9,x11,x9,lsl#1
2181	fmov	d6,x9
2182	fmov	v6.d[1],x10
2183	eor	v5.16b,v7.16b,v8.16b
2184	ld1	{v2.16b},[x0],#16
2185	aese	v1.16b,v23.16b
2186	// The iv for second block
2187	extr	x22,x10,x10,#32
2188	extr	x10,x10,x9,#63
2189	and	w11,w19,w22,asr#31
2190	eor	x9,x11,x9,lsl#1
2191	fmov	d8,x9
2192	fmov	v8.d[1],x10
2193	eor	v17.16b,v7.16b,v9.16b
2194	ld1	{v3.16b},[x0],#16
2195	aese	v24.16b,v23.16b
2196	// The iv for third block
2197	extr	x22,x10,x10,#32
2198	extr	x10,x10,x9,#63
2199	and	w11,w19,w22,asr#31
2200	eor	x9,x11,x9,lsl#1
2201	fmov	d9,x9
2202	fmov	v9.d[1],x10
2203	eor	v30.16b,v7.16b,v10.16b
2204	ld1	{v27.16b},[x0],#16
2205	aese	v25.16b,v23.16b
2206	// The iv for fourth block
2207	extr	x22,x10,x10,#32
2208	extr	x10,x10,x9,#63
2209	and	w11,w19,w22,asr#31
2210	eor	x9,x11,x9,lsl#1
2211	fmov	d10,x9
2212	fmov	v10.d[1],x10
2213	eor	v31.16b,v7.16b,v11.16b
2214	ld1	{v28.16b},[x0],#16
2215	aese	v26.16b,v23.16b
2216
2217	// The iv for fifth block
2218	extr	x22,x10,x10,#32
2219	extr	x10,x10,x9,#63
2220	and	w11,w19,w22,asr #31
2221	eor	x9,x11,x9,lsl #1
2222	fmov	d11,x9
2223	fmov	v11.d[1],x10
2224
2225	ld1	{v29.16b},[x0],#16
2226	cbz	x6,.Lxts_enc_tail4x	// x6 == 0 only when x2 == -0x60 (the four-block pass below)
2227	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2228	eor	v4.16b,v4.16b,v0.16b
2229	eor	v0.16b,v2.16b,v6.16b
2230	eor	v5.16b,v5.16b,v1.16b
2231	eor	v1.16b,v3.16b,v8.16b
2232	eor	v17.16b,v17.16b,v24.16b
2233	eor	v24.16b,v27.16b,v9.16b
2234	eor	v30.16b,v30.16b,v25.16b
2235	eor	v25.16b,v28.16b,v10.16b
2236	eor	v31.16b,v31.16b,v26.16b
2237	st1	{v4.16b},[x1],#16
2238	eor	v26.16b,v29.16b,v11.16b
2239	st1	{v5.16b},[x1],#16
2240	mov	w6,w5
2241	st1	{v17.16b},[x1],#16
2242	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2243	st1	{v30.16b},[x1],#16
2244	st1	{v31.16b},[x1],#16
2245	b.hs	.Loop5x_xts_enc		// flags still from subs x2,x2,#0x50
2246
2247
2248	// Exactly four blocks left (x2 == -0x10): shift the tweaks down by
	// one register and reuse the five-block loop body.
2249	cmn	x2,#0x10
2250	b.ne	.Loop5x_enc_after
2251	orr	v11.16b,v10.16b,v10.16b
2252	orr	v10.16b,v9.16b,v9.16b
2253	orr	v9.16b,v8.16b,v8.16b
2254	orr	v8.16b,v6.16b,v6.16b
2255	fmov	x9,d11			// resync the integer tweak mirror with v11
2256	fmov	x10,v11.d[1]
2257	eor	v0.16b,v6.16b,v2.16b
2258	eor	v1.16b,v8.16b,v3.16b
2259	eor	v24.16b,v27.16b,v9.16b
2260	eor	v25.16b,v28.16b,v10.16b
2261	eor	v26.16b,v29.16b,v11.16b
2262	b.eq	.Loop5x_xts_enc		// always taken here: flags still from the cmn above
2263
2264.Loop5x_enc_after:
2265	add	x2,x2,#0x50		// undo the 0x50 subtraction
2266	cbz	x2,.Lxts_enc_done
2267
2268	add	w6,w5,#2
2269	subs	x2,x2,#0x30
2270	b.lo	.Lxts_inner_enc_tail
2271
2272	eor	v0.16b,v6.16b,v27.16b
2273	eor	v1.16b,v8.16b,v28.16b
2274	eor	v24.16b,v29.16b,v9.16b
2275	b	.Lxts_outer_enc_tail
2276
2277.align	4
2278.Lxts_enc_tail4x:
	// Stores four results (v1,v24,v25,v26 with their masks); the v0 lane
	// is not stored on this path.
2279	add	x0,x0,#16
2280	eor	v5.16b,v1.16b,v5.16b
2281	st1	{v5.16b},[x1],#16
2282	eor	v17.16b,v24.16b,v17.16b
2283	st1	{v17.16b},[x1],#16
2284	eor	v30.16b,v25.16b,v30.16b
2285	eor	v31.16b,v26.16b,v31.16b
2286	st1	{v30.16b,v31.16b},[x1],#32
2287
2288	b	.Lxts_enc_done
2289.align	4
2290.Lxts_outer_enc_tail:
	// Three blocks (v0,v1,v24), two rounds per pass.
2291	aese	v0.16b,v16.16b
2292	aesmc	v0.16b,v0.16b
2293	aese	v1.16b,v16.16b
2294	aesmc	v1.16b,v1.16b
2295	aese	v24.16b,v16.16b
2296	aesmc	v24.16b,v24.16b
2297	ld1	{v16.4s},[x7],#16
2298	subs	w6,w6,#2
2299	aese	v0.16b,v17.16b
2300	aesmc	v0.16b,v0.16b
2301	aese	v1.16b,v17.16b
2302	aesmc	v1.16b,v1.16b
2303	aese	v24.16b,v17.16b
2304	aesmc	v24.16b,v24.16b
2305	ld1	{v17.4s},[x7],#16
2306	b.gt	.Lxts_outer_enc_tail
2307
2308	aese	v0.16b,v16.16b
2309	aesmc	v0.16b,v0.16b
2310	aese	v1.16b,v16.16b
2311	aesmc	v1.16b,v1.16b
2312	aese	v24.16b,v16.16b
2313	aesmc	v24.16b,v24.16b
2314	eor	v4.16b,v6.16b,v7.16b
2315	subs	x2,x2,#0x30
2316	// The iv for first block
2317	fmov	x9,d9
2318	fmov	x10,v9.d[1]
2319	//mov	w19,#0x87
2320	extr	x22,x10,x10,#32
2321	extr	x10,x10,x9,#63
2322	and	w11,w19,w22,asr#31
2323	eor	x9,x11,x9,lsl#1
2324	fmov	d6,x9
2325	fmov	v6.d[1],x10
2326	eor	v5.16b,v8.16b,v7.16b
2327	csel	x6,x2,x6,lo       // x6, w6, is zero at this point
2328	aese	v0.16b,v17.16b
2329	aesmc	v0.16b,v0.16b
2330	aese	v1.16b,v17.16b
2331	aesmc	v1.16b,v1.16b
2332	aese	v24.16b,v17.16b
2333	aesmc	v24.16b,v24.16b
2334	eor	v17.16b,v9.16b,v7.16b
2335
2336	add	x6,x6,#0x20
2337	add	x0,x0,x6
2338	mov	x7,x3
2339
2340	aese	v0.16b,v20.16b
2341	aesmc	v0.16b,v0.16b
2342	aese	v1.16b,v20.16b
2343	aesmc	v1.16b,v1.16b
2344	aese	v24.16b,v20.16b
2345	aesmc	v24.16b,v24.16b
2346	aese	v0.16b,v21.16b
2347	aesmc	v0.16b,v0.16b
2348	aese	v1.16b,v21.16b
2349	aesmc	v1.16b,v1.16b
2350	aese	v24.16b,v21.16b
2351	aesmc	v24.16b,v24.16b
2352	aese	v0.16b,v22.16b
2353	aesmc	v0.16b,v0.16b
2354	aese	v1.16b,v22.16b
2355	aesmc	v1.16b,v1.16b
2356	aese	v24.16b,v22.16b
2357	aesmc	v24.16b,v24.16b
2358	aese	v0.16b,v23.16b
2359	aese	v1.16b,v23.16b
2360	aese	v24.16b,v23.16b
2361	ld1	{v27.16b},[x0],#16
2362	add	w6,w5,#2
2363	ld1	{v16.4s},[x7],#16                // re-pre-load rndkey[0]
2364	eor	v4.16b,v4.16b,v0.16b
2365	eor	v5.16b,v5.16b,v1.16b
2366	eor	v24.16b,v24.16b,v17.16b
2367	ld1	{v17.4s},[x7],#16                // re-pre-load rndkey[1]
2368	st1	{v4.16b},[x1],#16
2369	st1	{v5.16b},[x1],#16
2370	st1	{v24.16b},[x1],#16
2371	cmn	x2,#0x30
2372	b.eq	.Lxts_enc_done		// x2 == -0x30: no full block left
2373.Lxts_encxor_one:
2374	orr	v28.16b,v3.16b,v3.16b
2375	orr	v29.16b,v27.16b,v27.16b
2376	nop
2377
2378.Lxts_inner_enc_tail:
	// One or two blocks.  x2 == -0x10 selects two blocks (v28 with tweak
	// v6, v29 with tweak v8); otherwise v24 is replaced by v29 XOR v6 and
	// only that lane is stored.
2379	cmn	x2,#0x10
2380	eor	v1.16b,v28.16b,v6.16b
2381	eor	v24.16b,v29.16b,v8.16b
2382	b.eq	.Lxts_enc_tail_loop
2383	eor	v24.16b,v29.16b,v6.16b
2384.Lxts_enc_tail_loop:
2385	aese	v1.16b,v16.16b
2386	aesmc	v1.16b,v1.16b
2387	aese	v24.16b,v16.16b
2388	aesmc	v24.16b,v24.16b
2389	ld1	{v16.4s},[x7],#16
2390	subs	w6,w6,#2
2391	aese	v1.16b,v17.16b
2392	aesmc	v1.16b,v1.16b
2393	aese	v24.16b,v17.16b
2394	aesmc	v24.16b,v24.16b
2395	ld1	{v17.4s},[x7],#16
2396	b.gt	.Lxts_enc_tail_loop
2397
2398	aese	v1.16b,v16.16b
2399	aesmc	v1.16b,v1.16b
2400	aese	v24.16b,v16.16b
2401	aesmc	v24.16b,v24.16b
2402	aese	v1.16b,v17.16b
2403	aesmc	v1.16b,v1.16b
2404	aese	v24.16b,v17.16b
2405	aesmc	v24.16b,v24.16b
2406	aese	v1.16b,v20.16b
2407	aesmc	v1.16b,v1.16b
2408	aese	v24.16b,v20.16b
2409	aesmc	v24.16b,v24.16b
2410	cmn	x2,#0x20		// flags are consumed by b.eq .Lxts_enc_one below
2411	aese	v1.16b,v21.16b
2412	aesmc	v1.16b,v1.16b
2413	aese	v24.16b,v21.16b
2414	aesmc	v24.16b,v24.16b
2415	eor	v5.16b,v6.16b,v7.16b
2416	aese	v1.16b,v22.16b
2417	aesmc	v1.16b,v1.16b
2418	aese	v24.16b,v22.16b
2419	aesmc	v24.16b,v24.16b
2420	eor	v17.16b,v8.16b,v7.16b
2421	aese	v1.16b,v23.16b
2422	aese	v24.16b,v23.16b
2423	b.eq	.Lxts_enc_one		// x2 == -0x20: single-block finish
2424	eor	v5.16b,v5.16b,v1.16b
2425	st1	{v5.16b},[x1],#16
2426	eor	v17.16b,v17.16b,v24.16b
2427	orr	v6.16b,v8.16b,v8.16b
2428	st1	{v17.16b},[x1],#16
	// v6 = tweak following v8; .Lxts_enc_done uses it for the stolen block.
2429	fmov	x9,d8
2430	fmov	x10,v8.d[1]
2431	mov	w19,#0x87
2432	extr	x22,x10,x10,#32
2433	extr	x10,x10,x9,#63
2434	and	w11,w19,w22,asr #31
2435	eor	x9,x11,x9,lsl #1
2436	fmov	d6,x9
2437	fmov	v6.d[1],x10
2438	b	.Lxts_enc_done
2439
2440.Lxts_enc_one:
2441	eor	v5.16b,v5.16b,v24.16b
2442	orr	v6.16b,v6.16b,v6.16b
2443	st1	{v5.16b},[x1],#16
	// v6 = tweak following the one just used; .Lxts_enc_done uses it.
2444	fmov	x9,d6
2445	fmov	x10,v6.d[1]
2446	mov	w19,#0x87
2447	extr	x22,x10,x10,#32
2448	extr	x10,x10,x9,#63
2449	and	w11,w19,w22,asr #31
2450	eor	x9,x11,x9,lsl #1
2451	fmov	d6,x9
2452	fmov	v6.d[1],x10
2453	b	.Lxts_enc_done
2454.align	5
2455.Lxts_enc_done:
2456	// Process the tail block with ciphertext stealing.
2457	tst	x21,#0xf
2458	b.eq	.Lxts_abort		// no partial block: restore and return
2459
2460	mov	x20,x0			// x20 = source pointer as left by the block code (tail bytes are read from here)
2461	mov	x13,x1			// x13 = destination of the partial output block
2462	sub	x1,x1,#16		// x1 = last full ciphertext block already written
2463.composite_enc_loop:
	// For i = x21-1 down to 0: move ciphertext byte i of the last full
	// block into the partial output block and put tail input byte i in
	// its place.
2464	subs	x21,x21,#1
2465	ldrb	w15,[x1,x21]
2466	ldrb	w14,[x20,x21]
2467	strb	w15,[x13,x21]
2468	strb	w14,[x1,x21]
2469	b.gt	.composite_enc_loop
2470.Lxts_enc_load_done:
2471	ld1	{v26.16b},[x1]
2472	eor	v26.16b,v26.16b,v6.16b
2473
2474	// Encrypt the composite block to get the second-to-last ciphertext block
2475	ldr	w6,[x3,#240]		// w6 = round count of the data key
2476	ld1	{v0.4s},[x3],#16		// load key schedule...
2477	sub	w6,w6,#2
2478	ld1	{v1.4s},[x3],#16		// load key schedule...
2479.Loop_final_enc:
2480	aese	v26.16b,v0.16b
2481	aesmc	v26.16b,v26.16b
2482	ld1	{v0.4s},[x3],#16
2483	subs	w6,w6,#2
2484	aese	v26.16b,v1.16b
2485	aesmc	v26.16b,v26.16b
2486	ld1	{v1.4s},[x3],#16
2487	b.gt	.Loop_final_enc
2488
2489	aese	v26.16b,v0.16b
2490	aesmc	v26.16b,v26.16b
2491	ld1	{v0.4s},[x3]
2492	aese	v26.16b,v1.16b
2493	eor	v26.16b,v26.16b,v0.16b
2494	eor	v26.16b,v26.16b,v6.16b
2495	st1	{v26.16b},[x1]
2496
2497.Lxts_abort:
	// Restore the registers saved at .Lxts_enc_big_size and release the
	// 64-byte frame.
2498	ldp	x21,x22,[sp,#48]
2499	ldp	d8,d9,[sp,#32]
2500	ldp	d10,d11,[sp,#16]
2501	ldp	x19,x20,[sp],#64
2502.Lxts_enc_final_abort:
2503	ret
2504.size	aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
2505.globl	aes_v8_xts_decrypt
// aes_v8_xts_decrypt: AES-XTS decryption built on the ARMv8 aese/aesd
// instructions.
// Registers as they are consumed by the code:
//   x0 = input pointer, x1 = output pointer, x2 = length in bytes,
//   x3 = key schedule used with aesd on the data, round count at offset 240,
//   x4 = key schedule used with aese on the iv, round count at offset 240,
//   x5 = pointer to the 16-byte iv.
// NOTE(review): the C prototype is not visible here; confirm the argument
// order against the caller.
// A length of exactly 16 bytes is handled right below without a stack frame;
// every other length branches to .Lxts_dec_big_size.
2506.type	aes_v8_xts_decrypt,%function
2507.align	5
2508aes_v8_xts_decrypt:
2509	AARCH64_VALID_CALL_TARGET
2510	cmp	x2,#16
2511	// Any input size other than exactly 16 bytes goes to the general path.
2512	b.ne	.Lxts_dec_big_size
2513	// Encrypt the iv with key2, as the first XEX iv.
2514	ldr	w6,[x4,#240]
2515	ld1	{v0.4s},[x4],#16
2516	ld1	{v6.16b},[x5]
2517	sub	w6,w6,#2
2518	ld1	{v1.4s},[x4],#16
2519
	// Two rounds per pass; v0/v1 are refilled with the next two round keys
	// and w6 counts the rounds still to be done inside the loop.
2520.Loop_dec_small_iv_enc:
2521	aese	v6.16b,v0.16b
2522	aesmc	v6.16b,v6.16b
2523	ld1	{v0.4s},[x4],#16
2524	subs	w6,w6,#2
2525	aese	v6.16b,v1.16b
2526	aesmc	v6.16b,v6.16b
2527	ld1	{v1.4s},[x4],#16
2528	b.gt	.Loop_dec_small_iv_enc
2529
	// Last two rounds: the final aese has no aesmc and is followed by an xor
	// with the last round key. v6 then holds the encrypted iv (the tweak).
2530	aese	v6.16b,v0.16b
2531	aesmc	v6.16b,v6.16b
2532	ld1	{v0.4s},[x4]
2533	aese	v6.16b,v1.16b
2534	eor	v6.16b,v6.16b,v0.16b
2535
	// Load the single input block and xor it with the tweak.
2536	ld1	{v0.16b},[x0]
2537	eor	v0.16b,v6.16b,v0.16b
2538
	// w6 = round count of the data key schedule.
2539	ldr	w6,[x3,#240]
2540	ld1	{v28.4s,v29.4s},[x3],#32			// load key schedule...
2541
2542	aesd	v0.16b,v28.16b
2543	aesimc	v0.16b,v0.16b
2544	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
2545	aesd	v0.16b,v29.16b
2546	aesimc	v0.16b,v0.16b
2547	subs	w6,w6,#10			// bias
	// w6 == 0 here means a 10-round schedule: no extra rounds are needed.
2548	b.eq	.Lxts_128_dec
	// Extra rounds for longer schedules, two per pass.
2549.Lxts_dec_round_loop:
2550	aesd	v0.16b,v16.16b
2551	aesimc	v0.16b,v0.16b
2552	ld1	{v16.4s},[x3],#16			// load key schedule...
2553	aesd	v0.16b,v17.16b
2554	aesimc	v0.16b,v0.16b
2555	ld1	{v17.4s},[x3],#16			// load key schedule...
2556	subs	w6,w6,#2			// bias
2557	b.gt	.Lxts_dec_round_loop
	// Remaining eight aesd rounds (v16..v23); v7 receives the last round key.
2558.Lxts_128_dec:
2559	ld1	{v18.4s,v19.4s},[x3],#32			// load key schedule...
2560	aesd	v0.16b,v16.16b
2561	aesimc	v0.16b,v0.16b
2562	aesd	v0.16b,v17.16b
2563	aesimc	v0.16b,v0.16b
2564	ld1	{v20.4s,v21.4s},[x3],#32			// load key schedule...
2565	aesd	v0.16b,v18.16b
2566	aesimc	v0.16b,v0.16b
2567	aesd	v0.16b,v19.16b
2568	aesimc	v0.16b,v0.16b
2569	ld1	{v22.4s,v23.4s},[x3],#32			// load key schedule...
2570	aesd	v0.16b,v20.16b
2571	aesimc	v0.16b,v0.16b
2572	aesd	v0.16b,v21.16b
2573	aesimc	v0.16b,v0.16b
2574	ld1	{v7.4s},[x3]
2575	aesd	v0.16b,v22.16b
2576	aesimc	v0.16b,v0.16b
2577	aesd	v0.16b,v23.16b
2578	eor	v0.16b,v0.16b,v7.16b
	// Xor with the tweak once more and store the result block.
2579	eor	v0.16b,v6.16b,v0.16b
2580	st1	{v0.16b},[x1]
	// Nothing was pushed on this path, so the register restore is skipped.
2581	b	.Lxts_dec_final_abort
// General path: build a 64-byte frame that saves x19-x22 and d8-d11, then
// compute the first tweaks and preload the round keys.
2582.Lxts_dec_big_size:
2583	stp	x19,x20,[sp,#-64]!
2584	stp	x21,x22,[sp,#48]
2585	stp	d8,d9,[sp,#32]
2586	stp	d10,d11,[sp,#16]
2587
	// x21 = length mod 16 (tail bytes); x2 = length rounded down to a
	// multiple of 16, minus 16; x8 = 16.
	// A borrow from the subs means there is not even one full block.
2588	and	x21,x2,#0xf
2589	and	x2,x2,#-16
2590	subs	x2,x2,#16
2591	mov	x8,#16
2592	b.lo	.Lxts_dec_abort
2593
2594	// Encrypt the iv with key2, as the first XEX iv
2595	ldr	w6,[x4,#240]
2596	ld1	{v0.4s},[x4],#16
2597	ld1	{v6.16b},[x5]
2598	sub	w6,w6,#2
2599	ld1	{v1.4s},[x4],#16
2600
	// Two rounds per pass; w6 counts the rounds left inside the loop.
2601.Loop_dec_iv_enc:
2602	aese	v6.16b,v0.16b
2603	aesmc	v6.16b,v6.16b
2604	ld1	{v0.4s},[x4],#16
2605	subs	w6,w6,#2
2606	aese	v6.16b,v1.16b
2607	aesmc	v6.16b,v6.16b
2608	ld1	{v1.4s},[x4],#16
2609	b.gt	.Loop_dec_iv_enc
2610
	// Last two rounds; the final aese is followed by the last round key xor.
2611	aese	v6.16b,v0.16b
2612	aesmc	v6.16b,v6.16b
2613	ld1	{v0.4s},[x4]
2614	aese	v6.16b,v1.16b
2615	eor	v6.16b,v6.16b,v0.16b
2616
2617	// The iv for second block
2618	// x9 - iv(low), x10 - iv(high)
2619	// the five ivs are kept in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	// Each extr/extr/and/eor group doubles the 128-bit value x10:x9: it is
	// shifted left by one bit and, when the bit shifted out of x10 was set,
	// 0x87 (w19) is xored into x9.
2620	fmov	x9,d6
2621	fmov	x10,v6.d[1]
2622	mov	w19,#0x87
2623	extr	x22,x10,x10,#32
2624	extr	x10,x10,x9,#63
2625	and	w11,w19,w22,asr #31
2626	eor	x9,x11,x9,lsl #1
2627	fmov	d8,x9
2628	fmov	v8.d[1],x10
2629
	// The iv has already been loaded through x5, so w5 is reused from here on.
2630	ldr	w5,[x3,#240]		// load rounds number
2631
2632	// The iv for third block
2633	extr	x22,x10,x10,#32
2634	extr	x10,x10,x9,#63
2635	and	w11,w19,w22,asr #31
2636	eor	x9,x11,x9,lsl #1
2637	fmov	d9,x9
2638	fmov	v9.d[1],x10
2639
	// v16/v17 = first two round keys; v18-v23 and v7 = last seven round keys.
	// w5 ends up as the round count minus 8.
2640	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
2641	sub	w5,w5,#6
2642	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
2643	sub	w5,w5,#2
2644	ld1	{v18.4s,v19.4s},[x7],#32		// load key schedule...
2645	ld1	{v20.4s,v21.4s},[x7],#32
2646	ld1	{v22.4s,v23.4s},[x7],#32
2647	ld1	{v7.4s},[x7]
2648
2649	// The iv for fourth block
2650	extr	x22,x10,x10,#32
2651	extr	x10,x10,x9,#63
2652	and	w11,w19,w22,asr #31
2653	eor	x9,x11,x9,lsl #1
2654	fmov	d10,x9
2655	fmov	v10.d[1],x10
2656
	// x7 = address of the third round key (the first two are in v16/v17).
2657	add	x7,x3,#32
2658	mov	w6,w5
2659	b	.Lxts_dec
2660
2661	// Decryption
	// When tail bytes exist (x21 != 0), x2 is reduced by a further 16 and the
	// next block is loaded into v0. If that subtraction borrows, control goes
	// to .Lxts_done with v0 loaded; if it reaches exactly zero, x8 becomes 0
	// so the first load in .Lxts_dec_begin does not advance x0.
2662.align	5
2663.Lxts_dec:
2664	tst	x21,#0xf
2665	b.eq	.Lxts_dec_begin
2666	subs	x2,x2,#16
2667	csel	x8,xzr,x8,eq
2668	ld1	{v0.16b},[x0],#16
2669	b.lo	.Lxts_done
	// Undo the post-increment of the load above.
2670	sub	x0,x0,#16
2671.Lxts_dec_begin:
2672	ld1	{v0.16b},[x0],x8
2673	subs	x2,x2,#32			// bias
2674	add	w6,w5,#2
	// Keep copies of the raw input blocks for the tail code.
2675	orr	v3.16b,v0.16b,v0.16b
2676	orr	v1.16b,v0.16b,v0.16b
2677	orr	v28.16b,v0.16b,v0.16b
2678	ld1	{v24.16b},[x0],#16
2679	orr	v27.16b,v24.16b,v24.16b
2680	orr	v29.16b,v24.16b,v24.16b
	// Flags are still those of "subs x2,x2,#32": taken when x2 was below 32.
2681	b.lo	.Lxts_inner_dec_tail
2682	eor	v0.16b,v0.16b,v6.16b			// before decrypt, xor with iv
2683	eor	v24.16b,v24.16b,v8.16b
2684
2685	orr	v1.16b,v24.16b,v24.16b
2686	ld1	{v24.16b},[x0],#16
2687	orr	v2.16b,v0.16b,v0.16b
2688	orr	v3.16b,v1.16b,v1.16b
2689	eor	v27.16b,v24.16b,v9.16b			// third block xor with third iv
2690	eor	v24.16b,v24.16b,v9.16b
	// Fewer than 32 bytes beyond these three blocks: use the three-block tail.
2691	cmp	x2,#32
2692	b.lo	.Lxts_outer_dec_tail
2693
2694	ld1	{v25.16b},[x0],#16
2695
2696	// The iv for fifth block
	// (x10:x9 is doubled: shift left by one, xor 0x87 into x9 on carry out)
2697	extr	x22,x10,x10,#32
2698	extr	x10,x10,x9,#63
2699	and	w11,w19,w22,asr #31
2700	eor	x9,x11,x9,lsl #1
2701	fmov	d11,x9
2702	fmov	v11.d[1],x10
2703
2704	ld1	{v26.16b},[x0],#16
2705	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2706	eor	v26.16b,v26.16b,v11.16b
2707	sub	x2,x2,#32			// bias
2708	mov	w6,w5
2709	b	.Loop5x_xts_dec
2710
// Main loop: five blocks (v0, v1, v24, v25, v26) are decrypted in parallel.
// Inside the round loop v16/v17 are refilled through x7 and w6 counts the
// rounds left; the last rounds use the preloaded v18-v23 and v7.
2711.align	4
2712.Loop5x_xts_dec:
2713	aesd	v0.16b,v16.16b
2714	aesimc	v0.16b,v0.16b
2715	aesd	v1.16b,v16.16b
2716	aesimc	v1.16b,v1.16b
2717	aesd	v24.16b,v16.16b
2718	aesimc	v24.16b,v24.16b
2719	aesd	v25.16b,v16.16b
2720	aesimc	v25.16b,v25.16b
2721	aesd	v26.16b,v16.16b
2722	aesimc	v26.16b,v26.16b
2723	ld1	{v16.4s},[x7],#16		// load key schedule...
2724	subs	w6,w6,#2
2725	aesd	v0.16b,v17.16b
2726	aesimc	v0.16b,v0.16b
2727	aesd	v1.16b,v17.16b
2728	aesimc	v1.16b,v1.16b
2729	aesd	v24.16b,v17.16b
2730	aesimc	v24.16b,v24.16b
2731	aesd	v25.16b,v17.16b
2732	aesimc	v25.16b,v25.16b
2733	aesd	v26.16b,v17.16b
2734	aesimc	v26.16b,v26.16b
2735	ld1	{v17.4s},[x7],#16		// load key schedule...
2736	b.gt	.Loop5x_xts_dec
2737
2738	aesd	v0.16b,v16.16b
2739	aesimc	v0.16b,v0.16b
2740	aesd	v1.16b,v16.16b
2741	aesimc	v1.16b,v1.16b
2742	aesd	v24.16b,v16.16b
2743	aesimc	v24.16b,v24.16b
2744	aesd	v25.16b,v16.16b
2745	aesimc	v25.16b,v25.16b
2746	aesd	v26.16b,v16.16b
2747	aesimc	v26.16b,v26.16b
	// The flags set here are consumed by the csel below and by the b.hs
	// after the five stores; nothing in between modifies them.
2748	subs	x2,x2,#0x50			// because .Lxts_dec_tail4x
2749
2750	aesd	v0.16b,v17.16b
2751	aesimc	v0.16b,v0.16b
2752	aesd	v1.16b,v17.16b
2753	aesimc	v1.16b,v1.16b
2754	aesd	v24.16b,v17.16b
2755	aesimc	v24.16b,v24.16b
2756	aesd	v25.16b,v17.16b
2757	aesimc	v25.16b,v25.16b
2758	aesd	v26.16b,v17.16b
2759	aesimc	v26.16b,v26.16b
2760	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	// Rewind the round key pointer to the start of the schedule.
2761	mov	x7,x3
2762
2763	aesd	v0.16b,v18.16b
2764	aesimc	v0.16b,v0.16b
2765	aesd	v1.16b,v18.16b
2766	aesimc	v1.16b,v1.16b
2767	aesd	v24.16b,v18.16b
2768	aesimc	v24.16b,v24.16b
2769	aesd	v25.16b,v18.16b
2770	aesimc	v25.16b,v25.16b
2771	aesd	v26.16b,v18.16b
2772	aesimc	v26.16b,v26.16b
2773	add	x0,x0,x6		// x0 is adjusted in such way that
2774						// at exit from the loop v1.16b-v26.16b
2775						// are loaded with last "words"
2776	add	x6,x2,#0x60		// because .Lxts_dec_tail4x
2777
2778	aesd	v0.16b,v19.16b
2779	aesimc	v0.16b,v0.16b
2780	aesd	v1.16b,v19.16b
2781	aesimc	v1.16b,v1.16b
2782	aesd	v24.16b,v19.16b
2783	aesimc	v24.16b,v24.16b
2784	aesd	v25.16b,v19.16b
2785	aesimc	v25.16b,v25.16b
2786	aesd	v26.16b,v19.16b
2787	aesimc	v26.16b,v26.16b
2788
2789	aesd	v0.16b,v20.16b
2790	aesimc	v0.16b,v0.16b
2791	aesd	v1.16b,v20.16b
2792	aesimc	v1.16b,v1.16b
2793	aesd	v24.16b,v20.16b
2794	aesimc	v24.16b,v24.16b
2795	aesd	v25.16b,v20.16b
2796	aesimc	v25.16b,v25.16b
2797	aesd	v26.16b,v20.16b
2798	aesimc	v26.16b,v26.16b
2799
2800	aesd	v0.16b,v21.16b
2801	aesimc	v0.16b,v0.16b
2802	aesd	v1.16b,v21.16b
2803	aesimc	v1.16b,v1.16b
2804	aesd	v24.16b,v21.16b
2805	aesimc	v24.16b,v24.16b
2806	aesd	v25.16b,v21.16b
2807	aesimc	v25.16b,v25.16b
2808	aesd	v26.16b,v21.16b
2809	aesimc	v26.16b,v26.16b
2810
2811	aesd	v0.16b,v22.16b
2812	aesimc	v0.16b,v0.16b
2813	aesd	v1.16b,v22.16b
2814	aesimc	v1.16b,v1.16b
2815	aesd	v24.16b,v22.16b
2816	aesimc	v24.16b,v24.16b
2817	aesd	v25.16b,v22.16b
2818	aesimc	v25.16b,v25.16b
2819	aesd	v26.16b,v22.16b
2820	aesimc	v26.16b,v26.16b
2821
	// v4, v5, v17, v30, v31 = last round key (v7) xor the current tweak of
	// each block; they are xored into the aesd results before the stores.
	// Each tweak register is read before it is replaced by the next value.
	// In parallel the next five input blocks go to v2, v3, v27, v28, v29.
2822	eor	v4.16b,v7.16b,v6.16b
2823	aesd	v0.16b,v23.16b
2824	// The iv for first block of next iteration.
	// (x10:x9 is doubled: shift left by one, xor 0x87 into x9 on carry out)
2825	extr	x22,x10,x10,#32
2826	extr	x10,x10,x9,#63
2827	and	w11,w19,w22,asr #31
2828	eor	x9,x11,x9,lsl #1
2829	fmov	d6,x9
2830	fmov	v6.d[1],x10
2831	eor	v5.16b,v7.16b,v8.16b
2832	ld1	{v2.16b},[x0],#16
2833	aesd	v1.16b,v23.16b
2834	// The iv for second block
2835	extr	x22,x10,x10,#32
2836	extr	x10,x10,x9,#63
2837	and	w11,w19,w22,asr #31
2838	eor	x9,x11,x9,lsl #1
2839	fmov	d8,x9
2840	fmov	v8.d[1],x10
2841	eor	v17.16b,v7.16b,v9.16b
2842	ld1	{v3.16b},[x0],#16
2843	aesd	v24.16b,v23.16b
2844	// The iv for third block
2845	extr	x22,x10,x10,#32
2846	extr	x10,x10,x9,#63
2847	and	w11,w19,w22,asr #31
2848	eor	x9,x11,x9,lsl #1
2849	fmov	d9,x9
2850	fmov	v9.d[1],x10
2851	eor	v30.16b,v7.16b,v10.16b
2852	ld1	{v27.16b},[x0],#16
2853	aesd	v25.16b,v23.16b
2854	// The iv for fourth block
2855	extr	x22,x10,x10,#32
2856	extr	x10,x10,x9,#63
2857	and	w11,w19,w22,asr #31
2858	eor	x9,x11,x9,lsl #1
2859	fmov	d10,x9
2860	fmov	v10.d[1],x10
2861	eor	v31.16b,v7.16b,v11.16b
2862	ld1	{v28.16b},[x0],#16
2863	aesd	v26.16b,v23.16b
2864
2865	// The iv for fifth block
2866	extr	x22,x10,x10,#32
2867	extr	x10,x10,x9,#63
2868	and	w11,w19,w22,asr #31
2869	eor	x9,x11,x9,lsl #1
2870	fmov	d11,x9
2871	fmov	v11.d[1],x10
2872
2873	ld1	{v29.16b},[x0],#16
	// x6 = x2 + 0x60 (set above); zero selects the four-block store path.
2874	cbz	x6,.Lxts_dec_tail4x
2875	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	// Finish the five results, store them, and set up the next five inputs
	// (input block xor its tweak) in v0, v1, v24, v25, v26.
2876	eor	v4.16b,v4.16b,v0.16b
2877	eor	v0.16b,v2.16b,v6.16b
2878	eor	v5.16b,v5.16b,v1.16b
2879	eor	v1.16b,v3.16b,v8.16b
2880	eor	v17.16b,v17.16b,v24.16b
2881	eor	v24.16b,v27.16b,v9.16b
2882	eor	v30.16b,v30.16b,v25.16b
2883	eor	v25.16b,v28.16b,v10.16b
2884	eor	v31.16b,v31.16b,v26.16b
2885	st1	{v4.16b},[x1],#16
2886	eor	v26.16b,v29.16b,v11.16b
2887	st1	{v5.16b},[x1],#16
2888	mov	w6,w5
2889	st1	{v17.16b},[x1],#16
2890	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2891	st1	{v30.16b},[x1],#16
2892	st1	{v31.16b},[x1],#16
	// Flags from "subs x2,x2,#0x50": loop again while no borrow occurred.
2893	b.hs	.Loop5x_xts_dec
2894
2895	cmn	x2,#0x10
2896	b.ne	.Loop5x_dec_after
2897	// If x2 equals -0x10, four blocks are left.
2898	// After special preparation, the five-block loop is used once more.
2899	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
2900	orr	v11.16b,v10.16b,v10.16b
2901	orr	v10.16b,v9.16b,v9.16b
2902	orr	v9.16b,v8.16b,v8.16b
2903	orr	v8.16b,v6.16b,v6.16b
	// Restart the scalar tweak state from the value now held in v11.
2904	fmov	x9,d11
2905	fmov	x10,v11.d[1]
2906	eor	v0.16b,v6.16b,v2.16b
2907	eor	v1.16b,v8.16b,v3.16b
2908	eor	v24.16b,v27.16b,v9.16b
2909	eor	v25.16b,v28.16b,v10.16b
2910	eor	v26.16b,v29.16b,v11.16b
	// Always taken: the flags are still those of the cmn above (equal).
2911	b.eq	.Loop5x_xts_dec
2912
	// Remove the 0x50 bias; zero means no further full blocks remain here.
2913.Loop5x_dec_after:
2914	add	x2,x2,#0x50
2915	cbz	x2,.Lxts_done
2916
2917	add	w6,w5,#2
2918	subs	x2,x2,#0x30
	// Borrow: fewer than 0x30 bytes remain, use the one/two-block tail.
2919	b.lo	.Lxts_inner_dec_tail
2920
	// Otherwise three already-loaded blocks go through the three-block tail.
2921	eor	v0.16b,v6.16b,v27.16b
2922	eor	v1.16b,v8.16b,v28.16b
2923	eor	v24.16b,v29.16b,v9.16b
2924	b	.Lxts_outer_dec_tail
2925
// Four-block exit of the five-block loop: only the results held in v1, v24,
// v25 and v26 are finished and stored; v0 is not stored here.
2926.align	4
2927.Lxts_dec_tail4x:
2928	add	x0,x0,#16
	// The flags set by this tst are consumed by the b.eq after the stores.
2929	tst	x21,#0xf
2930	eor	v5.16b,v1.16b,v4.16b
2931	st1	{v5.16b},[x1],#16
2932	eor	v17.16b,v24.16b,v17.16b
2933	st1	{v17.16b},[x1],#16
2934	eor	v30.16b,v25.16b,v30.16b
2935	eor	v31.16b,v26.16b,v31.16b
2936	st1	{v30.16b,v31.16b},[x1],#32
2937
	// No tail bytes: restore registers and return.
2938	b.eq	.Lxts_dec_abort
	// Tail bytes present: load the next block into v0 for .Lxts_done.
2939	ld1	{v0.16b},[x0],#16
2940	b	.Lxts_done
// Three-block tail: v0, v1 and v24 are decrypted in parallel. The round loop
// refills v16/v17 through x7 while w6 counts the rounds left in the loop.
2941.align	4
2942.Lxts_outer_dec_tail:
2943	aesd	v0.16b,v16.16b
2944	aesimc	v0.16b,v0.16b
2945	aesd	v1.16b,v16.16b
2946	aesimc	v1.16b,v1.16b
2947	aesd	v24.16b,v16.16b
2948	aesimc	v24.16b,v24.16b
2949	ld1	{v16.4s},[x7],#16
2950	subs	w6,w6,#2
2951	aesd	v0.16b,v17.16b
2952	aesimc	v0.16b,v0.16b
2953	aesd	v1.16b,v17.16b
2954	aesimc	v1.16b,v1.16b
2955	aesd	v24.16b,v17.16b
2956	aesimc	v24.16b,v24.16b
2957	ld1	{v17.4s},[x7],#16
2958	b.gt	.Lxts_outer_dec_tail
2959
2960	aesd	v0.16b,v16.16b
2961	aesimc	v0.16b,v0.16b
2962	aesd	v1.16b,v16.16b
2963	aesimc	v1.16b,v1.16b
2964	aesd	v24.16b,v16.16b
2965	aesimc	v24.16b,v24.16b
	// v4, v5, v17 = last round key (v7) xor the tweak of each block. They
	// are formed before v6, v8 and v9 are replaced by the next three tweaks.
2966	eor	v4.16b,v6.16b,v7.16b
	// The flags set here are consumed by the csel a few lines below.
2967	subs	x2,x2,#0x30
2968	// The iv for first block
	// (x10:x9 is doubled: shift left by one, xor 0x87 into x9 on carry out)
2969	fmov	x9,d9
2970	fmov	x10,v9.d[1]
2971	mov	w19,#0x87
2972	extr	x22,x10,x10,#32
2973	extr	x10,x10,x9,#63
2974	and	w11,w19,w22,asr #31
2975	eor	x9,x11,x9,lsl #1
2976	fmov	d6,x9
2977	fmov	v6.d[1],x10
2978	eor	v5.16b,v8.16b,v7.16b
2979	csel	x6,x2,x6,lo	// x6, w6, is zero at this point
2980	aesd	v0.16b,v17.16b
2981	aesimc	v0.16b,v0.16b
2982	aesd	v1.16b,v17.16b
2983	aesimc	v1.16b,v1.16b
2984	aesd	v24.16b,v17.16b
2985	aesimc	v24.16b,v24.16b
2986	eor	v17.16b,v9.16b,v7.16b
2987	// The iv for second block
2988	extr	x22,x10,x10,#32
2989	extr	x10,x10,x9,#63
2990	and	w11,w19,w22,asr #31
2991	eor	x9,x11,x9,lsl #1
2992	fmov	d8,x9
2993	fmov	v8.d[1],x10
2994
2995	add	x6,x6,#0x20
2996	add	x0,x0,x6		// x0 is adjusted to the last data
2997
	// Rewind the round key pointer to the start of the schedule.
2998	mov	x7,x3
2999
3000	// The iv for third block
3001	extr	x22,x10,x10,#32
3002	extr	x10,x10,x9,#63
3003	and	w11,w19,w22,asr #31
3004	eor	x9,x11,x9,lsl #1
3005	fmov	d9,x9
3006	fmov	v9.d[1],x10
3007
3008	aesd	v0.16b,v20.16b
3009	aesimc	v0.16b,v0.16b
3010	aesd	v1.16b,v20.16b
3011	aesimc	v1.16b,v1.16b
3012	aesd	v24.16b,v20.16b
3013	aesimc	v24.16b,v24.16b
3014	aesd	v0.16b,v21.16b
3015	aesimc	v0.16b,v0.16b
3016	aesd	v1.16b,v21.16b
3017	aesimc	v1.16b,v1.16b
3018	aesd	v24.16b,v21.16b
3019	aesimc	v24.16b,v24.16b
3020	aesd	v0.16b,v22.16b
3021	aesimc	v0.16b,v0.16b
3022	aesd	v1.16b,v22.16b
3023	aesimc	v1.16b,v1.16b
3024	aesd	v24.16b,v22.16b
3025	aesimc	v24.16b,v24.16b
3026	ld1	{v27.16b},[x0],#16
3027	aesd	v0.16b,v23.16b
3028	aesd	v1.16b,v23.16b
3029	aesd	v24.16b,v23.16b
3030	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
3031	add	w6,w5,#2
	// Apply (last round key xor tweak) and store the three results.
3032	eor	v4.16b,v4.16b,v0.16b
3033	eor	v5.16b,v5.16b,v1.16b
3034	eor	v24.16b,v24.16b,v17.16b
3035	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
3036	st1	{v4.16b},[x1],#16
3037	st1	{v5.16b},[x1],#16
3038	st1	{v24.16b},[x1],#16
3039
	// x2 == -0x30: leave with x2 = 0. The add does not change the flags,
	// so the b.eq still tests the cmn result.
3040	cmn	x2,#0x30
3041	add	x2,x2,#0x30
3042	b.eq	.Lxts_done
	// Otherwise restore the bias and fall through to the one/two-block tail
	// with its inputs in v28 and v29.
3043	sub	x2,x2,#0x30
3044	orr	v28.16b,v3.16b,v3.16b
3045	orr	v29.16b,v27.16b,v27.16b
3046	nop
3047
// One- or two-block tail. Inputs are v28 and v29; both lanes (v1 and v24)
// always run through the rounds, and .Lxts_dec_one stores only the v24 lane.
3048.Lxts_inner_dec_tail:
3049	// x2 == -0x10 means two blocks left.
3050	cmn	x2,#0x10
3051	eor	v1.16b,v28.16b,v6.16b
3052	eor	v24.16b,v29.16b,v8.16b
3053	b.eq	.Lxts_dec_tail_loop
	// Single block: v29 is paired with the first tweak instead.
3054	eor	v24.16b,v29.16b,v6.16b
	// Round loop: v16/v17 are refilled through x7, w6 counts the rounds left.
3055.Lxts_dec_tail_loop:
3056	aesd	v1.16b,v16.16b
3057	aesimc	v1.16b,v1.16b
3058	aesd	v24.16b,v16.16b
3059	aesimc	v24.16b,v24.16b
3060	ld1	{v16.4s},[x7],#16
3061	subs	w6,w6,#2
3062	aesd	v1.16b,v17.16b
3063	aesimc	v1.16b,v1.16b
3064	aesd	v24.16b,v17.16b
3065	aesimc	v24.16b,v24.16b
3066	ld1	{v17.4s},[x7],#16
3067	b.gt	.Lxts_dec_tail_loop
3068
3069	aesd	v1.16b,v16.16b
3070	aesimc	v1.16b,v1.16b
3071	aesd	v24.16b,v16.16b
3072	aesimc	v24.16b,v24.16b
3073	aesd	v1.16b,v17.16b
3074	aesimc	v1.16b,v1.16b
3075	aesd	v24.16b,v17.16b
3076	aesimc	v24.16b,v24.16b
3077	aesd	v1.16b,v20.16b
3078	aesimc	v1.16b,v1.16b
3079	aesd	v24.16b,v20.16b
3080	aesimc	v24.16b,v24.16b
	// x2 == -0x20 selects the single-block exit; the flags set here are
	// consumed by the b.eq after the last aesd.
3081	cmn	x2,#0x20
3082	aesd	v1.16b,v21.16b
3083	aesimc	v1.16b,v1.16b
3084	aesd	v24.16b,v21.16b
3085	aesimc	v24.16b,v24.16b
	// v5, v17 = last round key (v7) xor the first / second tweak.
3086	eor	v5.16b,v6.16b,v7.16b
3087	aesd	v1.16b,v22.16b
3088	aesimc	v1.16b,v1.16b
3089	aesd	v24.16b,v22.16b
3090	aesimc	v24.16b,v24.16b
3091	eor	v17.16b,v8.16b,v7.16b
3092	aesd	v1.16b,v23.16b
3093	aesd	v24.16b,v23.16b
3094	b.eq	.Lxts_dec_one
	// Two blocks: finish both, move the next two tweaks into v6/v8, store.
3095	eor	v5.16b,v5.16b,v1.16b
3096	eor	v17.16b,v17.16b,v24.16b
3097	orr	v6.16b,v9.16b,v9.16b
3098	orr	v8.16b,v10.16b,v10.16b
3099	st1	{v5.16b},[x1],#16
3100	st1	{v17.16b},[x1],#16
	// x2 was -0x10 on this path, so it becomes zero.
3101	add	x2,x2,#16
3102	b	.Lxts_done
3103
	// One block: only the v24 lane is stored; the next two tweaks move into
	// v6/v8. x2 was -0x20 on this path, so it becomes zero.
3104.Lxts_dec_one:
3105	eor	v5.16b,v5.16b,v24.16b
3106	orr	v6.16b,v8.16b,v8.16b
3107	orr	v8.16b,v9.16b,v9.16b
3108	st1	{v5.16b},[x1],#16
3109	add	x2,x2,#32
3110
// End of the full-block processing. With no tail bytes (x21 == 0) the
// function is finished; otherwise the last full block and the partial block
// are handled with ciphertext stealing, using the tweaks in v8 and then v6.
3111.Lxts_done:
3112	tst	x21,#0xf
3113	b.eq	.Lxts_dec_abort
3114	// Processing the last two blocks with cipher stealing.
	// x7 keeps an unmodified copy of the key schedule pointer; x3 itself is
	// advanced by the loads of the first decryption below.
3115	mov	x7,x3
	// When x2 is non-zero the block is already in v0 and the load is skipped.
3116	cbnz	x2,.Lxts_dec_1st_done
3117	ld1	{v0.16b},[x0],#16
3118
3119	// Decrypt the last full block, using the tweak in v8.
3120.Lxts_dec_1st_done:
3121	eor	v26.16b,v0.16b,v8.16b
3122	ldr	w6,[x3,#240]
3123	ld1	{v0.4s},[x3],#16
3124	sub	w6,w6,#2
3125	ld1	{v1.4s},[x3],#16
	// Two rounds per pass; w6 counts the rounds left inside the loop.
3126.Loop_final_2nd_dec:
3127	aesd	v26.16b,v0.16b
3128	aesimc	v26.16b,v26.16b
3129	ld1	{v0.4s},[x3],#16		// load key schedule...
3130	subs	w6,w6,#2
3131	aesd	v26.16b,v1.16b
3132	aesimc	v26.16b,v26.16b
3133	ld1	{v1.4s},[x3],#16		// load key schedule...
3134	b.gt	.Loop_final_2nd_dec
3135
3136	aesd	v26.16b,v0.16b
3137	aesimc	v26.16b,v26.16b
3138	ld1	{v0.4s},[x3]
3139	aesd	v26.16b,v1.16b
3140	eor	v26.16b,v26.16b,v0.16b
3141	eor	v26.16b,v26.16b,v8.16b
3142	st1	{v26.16b},[x1]
3143
	// x20 = address of the remaining input tail bytes, x13 = output + 16.
3144	mov	x20,x0
3145	add	x13,x1,#16
3146
3147	// Merge the x21 tail bytes into the block just written at [x1]:
3148	// for each index below x21 the byte at [x1] is copied to [x13] (the
	// partial output block) and replaced by the input tail byte from [x20].
	// The index runs from x21-1 down to 0.
3149.composite_dec_loop:
3150	subs	x21,x21,#1
3151	ldrb	w15,[x1,x21]
3152	ldrb	w14,[x20,x21]
3153	strb	w15,[x13,x21]
3154	strb	w14,[x1,x21]
3155	b.gt	.composite_dec_loop
3156.Lxts_dec_load_done:
3157	ld1	{v26.16b},[x1]
3158	eor	v26.16b,v26.16b,v6.16b
3159
3160	// Decrypt the composite block with the tweak in v6; the result is stored
	// over it at [x1].
3161	ldr	w6,[x7,#240]
3162	ld1	{v0.4s},[x7],#16
3163	sub	w6,w6,#2
3164	ld1	{v1.4s},[x7],#16
3165.Loop_final_dec:
3166	aesd	v26.16b,v0.16b
3167	aesimc	v26.16b,v26.16b
3168	ld1	{v0.4s},[x7],#16		// load key schedule...
3169	subs	w6,w6,#2
3170	aesd	v26.16b,v1.16b
3171	aesimc	v26.16b,v26.16b
3172	ld1	{v1.4s},[x7],#16		// load key schedule...
3173	b.gt	.Loop_final_dec
3174
3175	aesd	v26.16b,v0.16b
3176	aesimc	v26.16b,v26.16b
3177	ld1	{v0.4s},[x7]
3178	aesd	v26.16b,v1.16b
3179	eor	v26.16b,v26.16b,v0.16b
3180	eor	v26.16b,v26.16b,v6.16b
3181	st1	{v26.16b},[x1]
3182
// Common exit of the general path: reload x19-x22 and d8-d11 from the
// 64-byte frame created at .Lxts_dec_big_size and release it.
3183.Lxts_dec_abort:
3184	ldp	x21,x22,[sp,#48]
3185	ldp	d8,d9,[sp,#32]
3186	ldp	d10,d11,[sp,#16]
3187	ldp	x19,x20,[sp],#64
3188
	// Exit used directly by the 16-byte path, which never built a frame.
3189.Lxts_dec_final_abort:
3190	ret
3191.size	aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
3192#endif
3193