xref: /freebsd/sys/crypto/openssl/aarch64/aesv8-armx.S (revision 4fbb9c43aa44d9145151bb5f77d302ba01fb7551)
1/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=7
5.arch	armv8-a+crypto
6.text
7.align	5
// Constant table read by aes_v8_set_encrypt_key: three rows of four 32-bit
// words each.
//   row 0 (0x01 splat): loaded into v1 at the start of key expansion, xored
//                       into the substituted word and then shifted left by
//                       one bit per derivation step
//   row 1 (0x0c0f0e0d): loaded into v2 and used as the tbl index vector
//   row 2 (0x1b splat): reloaded into v1 by the 128-bit path once its
//                       eight-iteration loop has finished
8.Lrcon:
9.long	0x01,0x01,0x01,0x01
10.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
11.long	0x1b,0x1b,0x1b,0x1b
12
// aes_v8_set_encrypt_key: expand a user key into an encryption key schedule.
//
// In:   x0 = pointer to the user key bytes
//       w1 = key length in bits; only 128, 192 and 256 pass the checks below
//       x2 = pointer to the output key schedule
// Out:  x0 = 0 on success, -1 if x0 or x2 is zero, -2 if w1 is rejected.
//       On success the round keys are stored upwards from the original x2
//       and the round count (10, 12 or 14) is stored as a 32-bit word at
//       byte offset 240 from it; x2 is left pointing at that word and w12
//       holds the round count.
// Clobbers: x0, w1, x2, x3, w12, v0-v6, flags.
//
// .Lenc_key is a second entry label; aes_v8_set_decrypt_key reaches it with
// bl. x30 is stored in the frame but only x29 is reloaded on exit, so ret
// relies on x30 being unchanged (nothing in this routine writes it).
13.globl	aes_v8_set_encrypt_key
14.type	aes_v8_set_encrypt_key,%function
15.align	5
16aes_v8_set_encrypt_key:
17.Lenc_key:
18	stp	x29,x30,[sp,#-16]!
19	add	x29,sp,#0
// x3 carries the return value until the exit path copies it to x0.
20	mov	x3,#-1
21	cmp	x0,#0
22	b.eq	.Lenc_key_abort
23	cmp	x2,#0
24	b.eq	.Lenc_key_abort
25	mov	x3,#-2
26	cmp	w1,#128
27	b.lt	.Lenc_key_abort
28	cmp	w1,#256
29	b.gt	.Lenc_key_abort
// Reject any length that is not a multiple of 64.
30	tst	w1,#0x3f
31	b.ne	.Lenc_key_abort
32
// From here x3 is reused as the pointer into .Lrcon. The flags from the
// compare with 192 are consumed by the three branches below; the eor, ld1
// and mov in between do not write flags.
33	adr	x3,.Lrcon
34	cmp	w1,#192
35
// v0 = all zero, v3 = first 16 key bytes, v1 = .Lrcon row 0, v2 = row 1.
36	eor	v0.16b,v0.16b,v0.16b
37	ld1	{v3.16b},[x0],#16
38	mov	w1,#8		// reuse w1
39	ld1	{v1.4s,v2.4s},[x3],#32
40
41	b.lt	.Loop128
42	b.eq	.L192
43	b	.L256
44
// 128-bit key: eight loop iterations (w1 counts down from 8). Each one
// stores the current round key from v3 and derives the next one:
//   tbl  picks the bytes selected by v2 out of v3 into v6
//   aese with the zero key v0 applies the AES byte substitution to v6
//   the ext/eor chain xors v3 with copies of itself shifted up by one, two
//   and three 32-bit words (zero filled from v0)
//   v1 is xored into v6, then v6 into v3; v1 is shifted left by one bit
45.align	4
46.Loop128:
47	tbl	v6.16b,{v3.16b},v2.16b
48	ext	v5.16b,v0.16b,v3.16b,#12
49	st1	{v3.4s},[x2],#16
50	aese	v6.16b,v0.16b
51	subs	w1,w1,#1
52
53	eor	v3.16b,v3.16b,v5.16b
54	ext	v5.16b,v0.16b,v5.16b,#12
55	eor	v3.16b,v3.16b,v5.16b
56	ext	v5.16b,v0.16b,v5.16b,#12
57	eor	v6.16b,v6.16b,v1.16b
58	eor	v3.16b,v3.16b,v5.16b
59	shl	v1.16b,v1.16b,#1
60	eor	v3.16b,v3.16b,v6.16b
61	b.ne	.Loop128
62
// Two more derivations, unrolled, with v1 reloaded from .Lrcon row 2.
63	ld1	{v1.4s},[x3]
64
65	tbl	v6.16b,{v3.16b},v2.16b
66	ext	v5.16b,v0.16b,v3.16b,#12
67	st1	{v3.4s},[x2],#16
68	aese	v6.16b,v0.16b
69
70	eor	v3.16b,v3.16b,v5.16b
71	ext	v5.16b,v0.16b,v5.16b,#12
72	eor	v3.16b,v3.16b,v5.16b
73	ext	v5.16b,v0.16b,v5.16b,#12
74	eor	v6.16b,v6.16b,v1.16b
75	eor	v3.16b,v3.16b,v5.16b
76	shl	v1.16b,v1.16b,#1
77	eor	v3.16b,v3.16b,v6.16b
78
79	tbl	v6.16b,{v3.16b},v2.16b
80	ext	v5.16b,v0.16b,v3.16b,#12
81	st1	{v3.4s},[x2],#16
82	aese	v6.16b,v0.16b
83
84	eor	v3.16b,v3.16b,v5.16b
85	ext	v5.16b,v0.16b,v5.16b,#12
86	eor	v3.16b,v3.16b,v5.16b
87	ext	v5.16b,v0.16b,v5.16b,#12
88	eor	v6.16b,v6.16b,v1.16b
89	eor	v3.16b,v3.16b,v5.16b
90	eor	v3.16b,v3.16b,v6.16b
// The last round key is stored without post-increment: x2 is 160 bytes past
// the start here, and adding 0x50 moves it to byte offset 240.
91	st1	{v3.4s},[x2]
92	add	x2,x2,#0x50
93
94	mov	w12,#10
95	b	.Ldone
96
// 192-bit key: the remaining 8 key bytes go into the low half of v4, and
// every byte of the tbl mask in v2 is lowered by 8.
97.align	4
98.L192:
99	ld1	{v4.8b},[x0],#8
100	movi	v6.16b,#8			// borrow v6.16b
101	st1	{v3.4s},[x2],#16
102	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
103
// Eight iterations (w1 is still 8). Each advances x2 by 24 bytes in total:
// 8 for the low half of v4 and 16 for the updated v3.
104.Loop192:
105	tbl	v6.16b,{v4.16b},v2.16b
106	ext	v5.16b,v0.16b,v3.16b,#12
107#ifdef __ARMEB__
108	st1	{v4.4s},[x2],#16
109	sub	x2,x2,#8
110#else
111	st1	{v4.8b},[x2],#8
112#endif
113	aese	v6.16b,v0.16b
114	subs	w1,w1,#1
115
116	eor	v3.16b,v3.16b,v5.16b
117	ext	v5.16b,v0.16b,v5.16b,#12
118	eor	v3.16b,v3.16b,v5.16b
119	ext	v5.16b,v0.16b,v5.16b,#12
120	eor	v3.16b,v3.16b,v5.16b
121
122	dup	v5.4s,v3.s[3]
123	eor	v5.16b,v5.16b,v4.16b
124	eor	v6.16b,v6.16b,v1.16b
125	ext	v4.16b,v0.16b,v4.16b,#12
126	shl	v1.16b,v1.16b,#1
127	eor	v4.16b,v4.16b,v5.16b
128	eor	v3.16b,v3.16b,v6.16b
129	eor	v4.16b,v4.16b,v6.16b
130	st1	{v3.4s},[x2],#16
131	b.ne	.Loop192
132
// x2 is 208 bytes past the start here; adding 0x20 moves it to offset 240.
133	mov	w12,#12
134	add	x2,x2,#0x20
135	b	.Ldone
136
// 256-bit key: the second 16 key bytes go into v4; seven iterations.
137.align	4
138.L256:
139	ld1	{v4.16b},[x0]
140	mov	w1,#7
141	mov	w12,#14
142	st1	{v3.4s},[x2],#16
143
// Each iteration stores v4 and then the updated v3 (32 bytes). The loop is
// left through b.eq straight after the store of v3 on the seventh pass, at
// which point x2 is at byte offset 240.
144.Loop256:
145	tbl	v6.16b,{v4.16b},v2.16b
146	ext	v5.16b,v0.16b,v3.16b,#12
147	st1	{v4.4s},[x2],#16
148	aese	v6.16b,v0.16b
149	subs	w1,w1,#1
150
151	eor	v3.16b,v3.16b,v5.16b
152	ext	v5.16b,v0.16b,v5.16b,#12
153	eor	v3.16b,v3.16b,v5.16b
154	ext	v5.16b,v0.16b,v5.16b,#12
155	eor	v6.16b,v6.16b,v1.16b
156	eor	v3.16b,v3.16b,v5.16b
157	shl	v1.16b,v1.16b,#1
158	eor	v3.16b,v3.16b,v6.16b
159	st1	{v3.4s},[x2],#16
160	b.eq	.Ldone
161
// Update of v4: same shape as above, but the word is splatted with dup
// instead of tbl and v1 is not mixed in.
162	dup	v6.4s,v3.s[3]		// just splat
163	ext	v5.16b,v0.16b,v4.16b,#12
164	aese	v6.16b,v0.16b
165
166	eor	v4.16b,v4.16b,v5.16b
167	ext	v5.16b,v0.16b,v5.16b,#12
168	eor	v4.16b,v4.16b,v5.16b
169	ext	v5.16b,v0.16b,v5.16b,#12
170	eor	v4.16b,v4.16b,v5.16b
171
172	eor	v4.16b,v4.16b,v6.16b
173	b	.Loop256
174
// Success: store the round count at byte offset 240 and return 0.
175.Ldone:
176	str	w12,[x2]
177	mov	x3,#0
178
// Shared exit: only x29 is reloaded; the 16-byte frame is released.
179.Lenc_key_abort:
180	mov	x0,x3			// return value
181	ldr	x29,[sp],#16
182	ret
183.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
184
// aes_v8_set_decrypt_key: build a decryption key schedule in place.
//
// Takes the same x0/w1/x2 arguments as aes_v8_set_encrypt_key and first runs
// that expansion through the shared .Lenc_key entry. A non-zero result is
// returned unchanged in x0. Otherwise the round keys are reversed in order
// inside the schedule, every key except the first and the last is passed
// through aesimc, and 0 is returned in x0.
// Besides what the expansion clobbers, this routine writes x4, v0 and v1.
185.globl	aes_v8_set_decrypt_key
186.type	aes_v8_set_decrypt_key,%function
187.align	5
188aes_v8_set_decrypt_key:
189.inst	0xd503233f		// paciasp
190	stp	x29,x30,[sp,#-16]!
191	add	x29,sp,#0
192	bl	.Lenc_key
193
194	cmp	x0,#0
195	b.ne	.Ldec_key_abort
196
// The expansion leaves x2 at byte offset 240 and the round count in w12.
// x2 walks upwards from the first round key, x0 walks downwards from the
// last one (x4 = -16 is the post-index step for x0).
197	sub	x2,x2,#240		// restore original x2
198	mov	x4,#-16
199	add	x0,x2,x12,lsl#4	// end of key schedule
200
// Swap the two outermost round keys without transforming them.
201	ld1	{v0.4s},[x2]
202	ld1	{v1.4s},[x0]
203	st1	{v0.4s},[x0],x4
204	st1	{v1.4s},[x2],#16
205
// Swap the remaining pairs, applying aesimc to both, until the two pointers
// meet.
206.Loop_imc:
207	ld1	{v0.4s},[x2]
208	ld1	{v1.4s},[x0]
209	aesimc	v0.16b,v0.16b
210	aesimc	v1.16b,v1.16b
211	st1	{v0.4s},[x0],x4
212	st1	{v1.4s},[x2],#16
213	cmp	x0,x2
214	b.hi	.Loop_imc
215
// Middle round key: transformed and written back.
216	ld1	{v0.4s},[x2]
217	aesimc	v0.16b,v0.16b
218	st1	{v0.4s},[x0]
219
220	eor	x0,x0,x0		// return value
221.Ldec_key_abort:
222	ldp	x29,x30,[sp],#16
223.inst	0xd50323bf		// autiasp
224	ret
225.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
// aes_v8_encrypt: encrypt a single 16-byte block.
//
// In:   x0 = pointer to the input block
//       x1 = pointer to the output block
//       x2 = key schedule; the round count is read from byte offset 240
// Leaf routine with no stack use. Clobbers w3, x2, v0-v2 and flags.
226.globl	aes_v8_encrypt
227.type	aes_v8_encrypt,%function
228.align	5
229aes_v8_encrypt:
// w3 = rounds - 2 (loop counter), v0/v1 = first two round keys, v2 = block.
230	ldr	w3,[x2,#240]
231	ld1	{v0.4s},[x2],#16
232	ld1	{v2.16b},[x0]
233	sub	w3,w3,#2
234	ld1	{v1.4s},[x2],#16
235
// Two rounds per iteration; the next two round keys are loaded while the
// current ones are being used.
236.Loop_enc:
237	aese	v2.16b,v0.16b
238	aesmc	v2.16b,v2.16b
239	ld1	{v0.4s},[x2],#16
240	subs	w3,w3,#2
241	aese	v2.16b,v1.16b
242	aesmc	v2.16b,v2.16b
243	ld1	{v1.4s},[x2],#16
244	b.gt	.Loop_enc
245
// Last two rounds: the final aese is not followed by aesmc, and the last
// round key is applied with a plain eor.
246	aese	v2.16b,v0.16b
247	aesmc	v2.16b,v2.16b
248	ld1	{v0.4s},[x2]
249	aese	v2.16b,v1.16b
250	eor	v2.16b,v2.16b,v0.16b
251
252	st1	{v2.16b},[x1]
253	ret
254.size	aes_v8_encrypt,.-aes_v8_encrypt
// aes_v8_decrypt: decrypt a single 16-byte block.
//
// In:   x0 = pointer to the input block
//       x1 = pointer to the output block
//       x2 = key schedule; the round count is read from byte offset 240
//            (presumably one prepared by aes_v8_set_decrypt_key - confirm
//            against callers)
// Leaf routine with no stack use. Clobbers w3, x2, v0-v2 and flags.
255.globl	aes_v8_decrypt
256.type	aes_v8_decrypt,%function
257.align	5
258aes_v8_decrypt:
// w3 = rounds - 2 (loop counter), v0/v1 = first two round keys, v2 = block.
259	ldr	w3,[x2,#240]
260	ld1	{v0.4s},[x2],#16
261	ld1	{v2.16b},[x0]
262	sub	w3,w3,#2
263	ld1	{v1.4s},[x2],#16
264
// Two rounds per iteration; the next two round keys are loaded while the
// current ones are being used.
265.Loop_dec:
266	aesd	v2.16b,v0.16b
267	aesimc	v2.16b,v2.16b
268	ld1	{v0.4s},[x2],#16
269	subs	w3,w3,#2
270	aesd	v2.16b,v1.16b
271	aesimc	v2.16b,v2.16b
272	ld1	{v1.4s},[x2],#16
273	b.gt	.Loop_dec
274
// Last two rounds: the final aesd is not followed by aesimc, and the last
// round key is applied with a plain eor.
275	aesd	v2.16b,v0.16b
276	aesimc	v2.16b,v2.16b
277	ld1	{v0.4s},[x2]
278	aesd	v2.16b,v1.16b
279	eor	v2.16b,v2.16b,v0.16b
280
281	st1	{v2.16b},[x1]
282	ret
283.size	aes_v8_decrypt,.-aes_v8_decrypt
// aes_v8_ecb_encrypt: process a buffer block by block with one key schedule,
// each 16-byte block independently of the others.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = length in bytes
//       x3 = key schedule; the round count is read from byte offset 240
//       w4 = direction: non-zero takes the aese/aesmc paths, zero takes the
//            aesd/aesimc paths
// Behaviour by length:
//       exactly 16 -> single-block path, no stack frame
//       below 16   -> returns without loading or storing any data
//       otherwise  -> the length is rounded down to a multiple of 16 and the
//                     blocks are processed five at a time, then three at a
//                     time, then as a tail of one or two blocks
// Scratch: w5/x5, x6, x7, x8, v0-v7 and v16-v31; x0-x3 are modified.
// On the general path x29/x30 are pushed, and only x29 is reloaded on exit.
284.globl	aes_v8_ecb_encrypt
285.type	aes_v8_ecb_encrypt,%function
286.align	5
287aes_v8_ecb_encrypt:
288	subs	x2,x2,#16
289	// Length other than exactly 16 bytes (larger or smaller): take the general path.
290	b.ne	.Lecb_big_size
// ---- exactly one block ----
// v0 = block, w5 = round count, v5/v6 = first two round keys. The flags of
// the cmp are consumed by the b.eq below; ldr and ld1 do not write flags.
291	ld1	{v0.16b},[x0]
292	cmp	w4,#0					// en- or decrypting?
293	ldr	w5,[x3,#240]
294	ld1	{v5.4s,v6.4s},[x3],#32			// load key schedule...
295
296	b.eq	.Lecb_small_dec
297	aese	v0.16b,v5.16b
298	aesmc	v0.16b,v0.16b
299	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
300	aese	v0.16b,v6.16b
301	aesmc	v0.16b,v0.16b
302	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb processing
303	b.eq	.Lecb_128_enc
// Extra rounds for round counts above 10: two per iteration, w5 counts down
// by 2 from (rounds - 10).
304.Lecb_round_loop:
305	aese	v0.16b,v16.16b
306	aesmc	v0.16b,v0.16b
307	ld1	{v16.4s},[x3],#16				// load key schedule...
308	aese	v0.16b,v17.16b
309	aesmc	v0.16b,v0.16b
310	ld1	{v17.4s},[x3],#16				// load key schedule...
311	subs	w5,w5,#2			// bias
312	b.gt	.Lecb_round_loop
// Remaining eight rounds, common to every round count. v16/v17 hold the next
// two round keys; the following seven are loaded here into v18-v23 and v7.
// The last aese is not followed by aesmc, and v7 is applied with eor.
313.Lecb_128_enc:
314	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
315	aese	v0.16b,v16.16b
316	aesmc	v0.16b,v0.16b
317	aese	v0.16b,v17.16b
318	aesmc	v0.16b,v0.16b
319	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
320	aese	v0.16b,v18.16b
321	aesmc	v0.16b,v0.16b
322	aese	v0.16b,v19.16b
323	aesmc	v0.16b,v0.16b
324	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
325	aese	v0.16b,v20.16b
326	aesmc	v0.16b,v0.16b
327	aese	v0.16b,v21.16b
328	aesmc	v0.16b,v0.16b
329	ld1	{v7.4s},[x3]
330	aese	v0.16b,v22.16b
331	aesmc	v0.16b,v0.16b
332	aese	v0.16b,v23.16b
333	eor	v0.16b,v0.16b,v7.16b
334	st1	{v0.16b},[x1]
335	b	.Lecb_Final_abort
// Single-block path for w4 == 0: same structure with aesd/aesimc.
336.Lecb_small_dec:
337	aesd	v0.16b,v5.16b
338	aesimc	v0.16b,v0.16b
339	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
340	aesd	v0.16b,v6.16b
341	aesimc	v0.16b,v0.16b
342	subs	w5,w5,#10			// bias
343	b.eq	.Lecb_128_dec
344.Lecb_dec_round_loop:
345	aesd	v0.16b,v16.16b
346	aesimc	v0.16b,v0.16b
347	ld1	{v16.4s},[x3],#16				// load key schedule...
348	aesd	v0.16b,v17.16b
349	aesimc	v0.16b,v0.16b
350	ld1	{v17.4s},[x3],#16				// load key schedule...
351	subs	w5,w5,#2			// bias
352	b.gt	.Lecb_dec_round_loop
353.Lecb_128_dec:
354	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
355	aesd	v0.16b,v16.16b
356	aesimc	v0.16b,v0.16b
357	aesd	v0.16b,v17.16b
358	aesimc	v0.16b,v0.16b
359	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
360	aesd	v0.16b,v18.16b
361	aesimc	v0.16b,v0.16b
362	aesd	v0.16b,v19.16b
363	aesimc	v0.16b,v0.16b
364	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
365	aesd	v0.16b,v20.16b
366	aesimc	v0.16b,v0.16b
367	aesd	v0.16b,v21.16b
368	aesimc	v0.16b,v0.16b
369	ld1	{v7.4s},[x3]
370	aesd	v0.16b,v22.16b
371	aesimc	v0.16b,v0.16b
372	aesd	v0.16b,v23.16b
373	eor	v0.16b,v0.16b,v7.16b
374	st1	{v0.16b},[x1]
375	b	.Lecb_Final_abort
// ---- general path ----
// The flags still hold the result of the subs at entry: stp, add and mov do
// not write flags. b.lo leaves for lengths below 16. The eq condition of the
// csel cannot hold here, because this label is only reached through the b.ne
// at entry, so x8 stays 16.
376.Lecb_big_size:
377	stp	x29,x30,[sp,#-16]!
378	add	x29,sp,#0
379	mov	x8,#16
380	b.lo	.Lecb_done
381	csel	x8,xzr,x8,eq
382
// x2 = (length - 16) rounded down to a multiple of 16; v0 = first block.
// The flags of this cmp are consumed by the b.eq .Lecb_dec further down;
// nothing in between writes flags.
383	cmp	w4,#0					// en- or decrypting?
384	ldr	w5,[x3,#240]
385	and	x2,x2,#-16
386	ld1	{v0.16b},[x0],x8
387
// v16/v17 = first two round keys; v18-v23 and v7 = last seven round keys
// (x7 = x3 + 16 * (rounds - 6)); w5 = rounds - 8; x7 is then reset to the
// third round key and w6 becomes the per-pass round counter.
388	ld1	{v16.4s,v17.4s},[x3]				// load key schedule...
389	sub	w5,w5,#6
390	add	x7,x3,x5,lsl#4				// pointer to last 7 round keys
391	sub	w5,w5,#2
392	ld1	{v18.4s,v19.4s},[x7],#32
393	ld1	{v20.4s,v21.4s},[x7],#32
394	ld1	{v22.4s,v23.4s},[x7],#32
395	ld1	{v7.4s},[x7]
396
397	add	x7,x3,#32
398	mov	w6,w5
399	b.eq	.Lecb_dec
400
// ---- w4 != 0: aese/aesmc ----
// Second block is loaded; with only two blocks in total (subs borrows) they
// are placed in v1 and v24 and handled by the tail.
401	ld1	{v1.16b},[x0],#16
402	subs	x2,x2,#32				// bias
403	add	w6,w5,#2
404	orr	v3.16b,v1.16b,v1.16b
405	orr	v24.16b,v1.16b,v1.16b
406	orr	v1.16b,v0.16b,v0.16b
407	b.lo	.Lecb_enc_tail
408
// At least three blocks: v0, v1, v24. Fewer than five go to the 3x loop.
409	orr	v1.16b,v3.16b,v3.16b
410	ld1	{v24.16b},[x0],#16
411	cmp	x2,#32
412	b.lo	.Loop3x_ecb_enc
413
// Five blocks in flight: v0, v1, v24, v25, v26.
414	ld1	{v25.16b},[x0],#16
415	ld1	{v26.16b},[x0],#16
416	sub	x2,x2,#32				// bias
417	mov	w6,w5
418
// Inner loop: two rounds per iteration on all five blocks, round keys
// streamed through v16/v17 from x7, w6 counting down by 2.
419.Loop5x_ecb_enc:
420	aese	v0.16b,v16.16b
421	aesmc	v0.16b,v0.16b
422	aese	v1.16b,v16.16b
423	aesmc	v1.16b,v1.16b
424	aese	v24.16b,v16.16b
425	aesmc	v24.16b,v24.16b
426	aese	v25.16b,v16.16b
427	aesmc	v25.16b,v25.16b
428	aese	v26.16b,v16.16b
429	aesmc	v26.16b,v26.16b
430	ld1	{v16.4s},[x7],#16
431	subs	w6,w6,#2
432	aese	v0.16b,v17.16b
433	aesmc	v0.16b,v0.16b
434	aese	v1.16b,v17.16b
435	aesmc	v1.16b,v1.16b
436	aese	v24.16b,v17.16b
437	aesmc	v24.16b,v24.16b
438	aese	v25.16b,v17.16b
439	aesmc	v25.16b,v25.16b
440	aese	v26.16b,v17.16b
441	aesmc	v26.16b,v26.16b
442	ld1	{v17.4s},[x7],#16
443	b.gt	.Loop5x_ecb_enc
444
445	aese	v0.16b,v16.16b
446	aesmc	v0.16b,v0.16b
447	aese	v1.16b,v16.16b
448	aesmc	v1.16b,v1.16b
449	aese	v24.16b,v16.16b
450	aesmc	v24.16b,v24.16b
451	aese	v25.16b,v16.16b
452	aesmc	v25.16b,v25.16b
453	aese	v26.16b,v16.16b
454	aesmc	v26.16b,v26.16b
// The flags of this cmp stay live for the csel below and for the b.hs that
// closes the 5x loop; nothing in between writes flags (the sub has no s).
455	cmp	x2,#0x40					// because .Lecb_enc_tail4x
456	sub	x2,x2,#0x50
457
458	aese	v0.16b,v17.16b
459	aesmc	v0.16b,v0.16b
460	aese	v1.16b,v17.16b
461	aesmc	v1.16b,v1.16b
462	aese	v24.16b,v17.16b
463	aesmc	v24.16b,v24.16b
464	aese	v25.16b,v17.16b
465	aesmc	v25.16b,v25.16b
466	aese	v26.16b,v17.16b
467	aesmc	v26.16b,v26.16b
468	csel	x6,xzr,x2,gt			// borrow x6, w6, "gt" is not typo
469	mov	x7,x3
470
// Remaining rounds use the preloaded last seven round keys v18-v23 and v7.
471	aese	v0.16b,v18.16b
472	aesmc	v0.16b,v0.16b
473	aese	v1.16b,v18.16b
474	aesmc	v1.16b,v1.16b
475	aese	v24.16b,v18.16b
476	aesmc	v24.16b,v24.16b
477	aese	v25.16b,v18.16b
478	aesmc	v25.16b,v25.16b
479	aese	v26.16b,v18.16b
480	aesmc	v26.16b,v26.16b
481	add	x0,x0,x6				// x0 is adjusted in such way that
482							// at exit from the loop v1.16b-v26.16b
483							// are loaded with last "words"
484	add	x6,x2,#0x60		    // because .Lecb_enc_tail4x
485
486	aese	v0.16b,v19.16b
487	aesmc	v0.16b,v0.16b
488	aese	v1.16b,v19.16b
489	aesmc	v1.16b,v1.16b
490	aese	v24.16b,v19.16b
491	aesmc	v24.16b,v24.16b
492	aese	v25.16b,v19.16b
493	aesmc	v25.16b,v25.16b
494	aese	v26.16b,v19.16b
495	aesmc	v26.16b,v26.16b
496
497	aese	v0.16b,v20.16b
498	aesmc	v0.16b,v0.16b
499	aese	v1.16b,v20.16b
500	aesmc	v1.16b,v1.16b
501	aese	v24.16b,v20.16b
502	aesmc	v24.16b,v24.16b
503	aese	v25.16b,v20.16b
504	aesmc	v25.16b,v25.16b
505	aese	v26.16b,v20.16b
506	aesmc	v26.16b,v26.16b
507
508	aese	v0.16b,v21.16b
509	aesmc	v0.16b,v0.16b
510	aese	v1.16b,v21.16b
511	aesmc	v1.16b,v1.16b
512	aese	v24.16b,v21.16b
513	aesmc	v24.16b,v24.16b
514	aese	v25.16b,v21.16b
515	aesmc	v25.16b,v25.16b
516	aese	v26.16b,v21.16b
517	aesmc	v26.16b,v26.16b
518
519	aese	v0.16b,v22.16b
520	aesmc	v0.16b,v0.16b
521	aese	v1.16b,v22.16b
522	aesmc	v1.16b,v1.16b
523	aese	v24.16b,v22.16b
524	aesmc	v24.16b,v24.16b
525	aese	v25.16b,v22.16b
526	aesmc	v25.16b,v25.16b
527	aese	v26.16b,v22.16b
528	aesmc	v26.16b,v26.16b
529
// Final round interleaved with the loads of the next five input blocks into
// v2, v3, v27, v28, v29.
530	aese	v0.16b,v23.16b
531	ld1	{v2.16b},[x0],#16
532	aese	v1.16b,v23.16b
533	ld1	{v3.16b},[x0],#16
534	aese	v24.16b,v23.16b
535	ld1	{v27.16b},[x0],#16
536	aese	v25.16b,v23.16b
537	ld1	{v28.16b},[x0],#16
538	aese	v26.16b,v23.16b
539	ld1	{v29.16b},[x0],#16
540	cbz	x6,.Lecb_enc_tail4x
// Apply the last round key v7, store five results, and move the freshly
// loaded blocks into the working registers for the next pass.
541	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
542	eor	v4.16b,v7.16b,v0.16b
543	orr	v0.16b,v2.16b,v2.16b
544	eor	v5.16b,v7.16b,v1.16b
545	orr	v1.16b,v3.16b,v3.16b
546	eor	v17.16b,v7.16b,v24.16b
547	orr	v24.16b,v27.16b,v27.16b
548	eor	v30.16b,v7.16b,v25.16b
549	orr	v25.16b,v28.16b,v28.16b
550	eor	v31.16b,v7.16b,v26.16b
551	st1	{v4.16b},[x1],#16
552	orr	v26.16b,v29.16b,v29.16b
553	st1	{v5.16b},[x1],#16
554	mov	w6,w5
555	st1	{v17.16b},[x1],#16
556	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
557	st1	{v30.16b},[x1],#16
558	st1	{v31.16b},[x1],#16
559	b.hs	.Loop5x_ecb_enc
560
// Fewer than five blocks remain: undo the 0x50 bias, finish if nothing is
// left, otherwise hand v27-v29 to the 3x loop or to the tail.
561	add	x2,x2,#0x50
562	cbz	x2,.Lecb_done
563
564	add	w6,w5,#2
565	subs	x2,x2,#0x30
566	orr	v0.16b,v27.16b,v27.16b
567	orr	v1.16b,v28.16b,v28.16b
568	orr	v24.16b,v29.16b,v29.16b
569	b.lo	.Lecb_enc_tail
570
571	b	.Loop3x_ecb_enc
572
// Reached from the 5x loop when x6 is zero: only the results held in v1,
// v24, v25 and v26 are stored.
573.align	4
574.Lecb_enc_tail4x:
575	eor	v5.16b,v7.16b,v1.16b
576	eor	v17.16b,v7.16b,v24.16b
577	eor	v30.16b,v7.16b,v25.16b
578	eor	v31.16b,v7.16b,v26.16b
579	st1	{v5.16b},[x1],#16
580	st1	{v17.16b},[x1],#16
581	st1	{v30.16b},[x1],#16
582	st1	{v31.16b},[x1],#16
583
584	b	.Lecb_done
// Three blocks in flight: v0, v1, v24. w6 starts at w5 + 2 on this path.
585.align	4
586.Loop3x_ecb_enc:
587	aese	v0.16b,v16.16b
588	aesmc	v0.16b,v0.16b
589	aese	v1.16b,v16.16b
590	aesmc	v1.16b,v1.16b
591	aese	v24.16b,v16.16b
592	aesmc	v24.16b,v24.16b
593	ld1	{v16.4s},[x7],#16
594	subs	w6,w6,#2
595	aese	v0.16b,v17.16b
596	aesmc	v0.16b,v0.16b
597	aese	v1.16b,v17.16b
598	aesmc	v1.16b,v1.16b
599	aese	v24.16b,v17.16b
600	aesmc	v24.16b,v24.16b
601	ld1	{v17.4s},[x7],#16
602	b.gt	.Loop3x_ecb_enc
603
604	aese	v0.16b,v16.16b
605	aesmc	v0.16b,v0.16b
606	aese	v1.16b,v16.16b
607	aesmc	v1.16b,v1.16b
608	aese	v24.16b,v16.16b
609	aesmc	v24.16b,v24.16b
// The flags of this subs stay live for the csel and for the b.hs that
// closes the 3x loop; nothing in between writes flags.
610	subs	x2,x2,#0x30
611	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
612	aese	v0.16b,v17.16b
613	aesmc	v0.16b,v0.16b
614	aese	v1.16b,v17.16b
615	aesmc	v1.16b,v1.16b
616	aese	v24.16b,v17.16b
617	aesmc	v24.16b,v24.16b
618	add	x0,x0,x6			// x0 is adjusted in such way that
619						// at exit from the loop v1.16b-v24.16b
620						// are loaded with last "words"
621	mov	x7,x3
622	aese	v0.16b,v20.16b
623	aesmc	v0.16b,v0.16b
624	aese	v1.16b,v20.16b
625	aesmc	v1.16b,v1.16b
626	aese	v24.16b,v20.16b
627	aesmc	v24.16b,v24.16b
628	ld1	{v2.16b},[x0],#16
629	aese	v0.16b,v21.16b
630	aesmc	v0.16b,v0.16b
631	aese	v1.16b,v21.16b
632	aesmc	v1.16b,v1.16b
633	aese	v24.16b,v21.16b
634	aesmc	v24.16b,v24.16b
635	ld1	{v3.16b},[x0],#16
636	aese	v0.16b,v22.16b
637	aesmc	v0.16b,v0.16b
638	aese	v1.16b,v22.16b
639	aesmc	v1.16b,v1.16b
640	aese	v24.16b,v22.16b
641	aesmc	v24.16b,v24.16b
642	ld1	{v27.16b},[x0],#16
643	aese	v0.16b,v23.16b
644	aese	v1.16b,v23.16b
645	aese	v24.16b,v23.16b
646	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
647	add	w6,w5,#2
648	eor	v4.16b,v7.16b,v0.16b
649	eor	v5.16b,v7.16b,v1.16b
650	eor	v24.16b,v24.16b,v7.16b
651	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
652	st1	{v4.16b},[x1],#16
653	orr	v0.16b,v2.16b,v2.16b
654	st1	{v5.16b},[x1],#16
655	orr	v1.16b,v3.16b,v3.16b
656	st1	{v24.16b},[x1],#16
657	orr	v24.16b,v27.16b,v27.16b
658	b.hs	.Loop3x_ecb_enc
659
// x2 == -0x30 means nothing is left after the last group of three.
660	cmn	x2,#0x30
661	b.eq	.Lecb_done
662	nop
663
// Tail: one or two blocks, carried in v1 and v24.
664.Lecb_enc_tail:
665	aese	v1.16b,v16.16b
666	aesmc	v1.16b,v1.16b
667	aese	v24.16b,v16.16b
668	aesmc	v24.16b,v24.16b
669	ld1	{v16.4s},[x7],#16
670	subs	w6,w6,#2
671	aese	v1.16b,v17.16b
672	aesmc	v1.16b,v1.16b
673	aese	v24.16b,v17.16b
674	aesmc	v24.16b,v24.16b
675	ld1	{v17.4s},[x7],#16
676	b.gt	.Lecb_enc_tail
677
678	aese	v1.16b,v16.16b
679	aesmc	v1.16b,v1.16b
680	aese	v24.16b,v16.16b
681	aesmc	v24.16b,v24.16b
682	aese	v1.16b,v17.16b
683	aesmc	v1.16b,v1.16b
684	aese	v24.16b,v17.16b
685	aesmc	v24.16b,v24.16b
686	aese	v1.16b,v20.16b
687	aesmc	v1.16b,v1.16b
688	aese	v24.16b,v20.16b
689	aesmc	v24.16b,v24.16b
// The flags of this cmn are consumed by the b.eq .Lecb_enc_one below:
// x2 == -0x20 selects the one-block case.
690	cmn	x2,#0x20
691	aese	v1.16b,v21.16b
692	aesmc	v1.16b,v1.16b
693	aese	v24.16b,v21.16b
694	aesmc	v24.16b,v24.16b
695	aese	v1.16b,v22.16b
696	aesmc	v1.16b,v1.16b
697	aese	v24.16b,v22.16b
698	aesmc	v24.16b,v24.16b
699	aese	v1.16b,v23.16b
700	aese	v24.16b,v23.16b
701	b.eq	.Lecb_enc_one
702	eor	v5.16b,v7.16b,v1.16b
703	eor	v17.16b,v7.16b,v24.16b
704	st1	{v5.16b},[x1],#16
705	st1	{v17.16b},[x1],#16
706	b	.Lecb_done
707
// One block left: only the result in v24 is stored.
708.Lecb_enc_one:
709	eor	v5.16b,v7.16b,v24.16b
710	st1	{v5.16b},[x1],#16
711	b	.Lecb_done
// ---- w4 == 0: aesd/aesimc ----
// Same structure as the path above, instruction for instruction.
712.align	5
713.Lecb_dec:
714	ld1	{v1.16b},[x0],#16
715	subs	x2,x2,#32			// bias
716	add	w6,w5,#2
717	orr	v3.16b,v1.16b,v1.16b
718	orr	v24.16b,v1.16b,v1.16b
719	orr	v1.16b,v0.16b,v0.16b
720	b.lo	.Lecb_dec_tail
721
722	orr	v1.16b,v3.16b,v3.16b
723	ld1	{v24.16b},[x0],#16
724	cmp	x2,#32
725	b.lo	.Loop3x_ecb_dec
726
727	ld1	{v25.16b},[x0],#16
728	ld1	{v26.16b},[x0],#16
729	sub	x2,x2,#32				// bias
730	mov	w6,w5
731
// Five blocks in flight: v0, v1, v24, v25, v26.
732.Loop5x_ecb_dec:
733	aesd	v0.16b,v16.16b
734	aesimc	v0.16b,v0.16b
735	aesd	v1.16b,v16.16b
736	aesimc	v1.16b,v1.16b
737	aesd	v24.16b,v16.16b
738	aesimc	v24.16b,v24.16b
739	aesd	v25.16b,v16.16b
740	aesimc	v25.16b,v25.16b
741	aesd	v26.16b,v16.16b
742	aesimc	v26.16b,v26.16b
743	ld1	{v16.4s},[x7],#16
744	subs	w6,w6,#2
745	aesd	v0.16b,v17.16b
746	aesimc	v0.16b,v0.16b
747	aesd	v1.16b,v17.16b
748	aesimc	v1.16b,v1.16b
749	aesd	v24.16b,v17.16b
750	aesimc	v24.16b,v24.16b
751	aesd	v25.16b,v17.16b
752	aesimc	v25.16b,v25.16b
753	aesd	v26.16b,v17.16b
754	aesimc	v26.16b,v26.16b
755	ld1	{v17.4s},[x7],#16
756	b.gt	.Loop5x_ecb_dec
757
758	aesd	v0.16b,v16.16b
759	aesimc	v0.16b,v0.16b
760	aesd	v1.16b,v16.16b
761	aesimc	v1.16b,v1.16b
762	aesd	v24.16b,v16.16b
763	aesimc	v24.16b,v24.16b
764	aesd	v25.16b,v16.16b
765	aesimc	v25.16b,v25.16b
766	aesd	v26.16b,v16.16b
767	aesimc	v26.16b,v26.16b
// The flags of this cmp stay live for the csel below and for the b.hs that
// closes the 5x loop; nothing in between writes flags.
768	cmp	x2,#0x40				// because .Lecb_tail4x
769	sub	x2,x2,#0x50
770
771	aesd	v0.16b,v17.16b
772	aesimc	v0.16b,v0.16b
773	aesd	v1.16b,v17.16b
774	aesimc	v1.16b,v1.16b
775	aesd	v24.16b,v17.16b
776	aesimc	v24.16b,v24.16b
777	aesd	v25.16b,v17.16b
778	aesimc	v25.16b,v25.16b
779	aesd	v26.16b,v17.16b
780	aesimc	v26.16b,v26.16b
781	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
782	mov	x7,x3
783
784	aesd	v0.16b,v18.16b
785	aesimc	v0.16b,v0.16b
786	aesd	v1.16b,v18.16b
787	aesimc	v1.16b,v1.16b
788	aesd	v24.16b,v18.16b
789	aesimc	v24.16b,v24.16b
790	aesd	v25.16b,v18.16b
791	aesimc	v25.16b,v25.16b
792	aesd	v26.16b,v18.16b
793	aesimc	v26.16b,v26.16b
794	add	x0,x0,x6				// x0 is adjusted in such way that
795							// at exit from the loop v1.16b-v26.16b
796							// are loaded with last "words"
797	add	x6,x2,#0x60			// because .Lecb_tail4x
798
799	aesd	v0.16b,v19.16b
800	aesimc	v0.16b,v0.16b
801	aesd	v1.16b,v19.16b
802	aesimc	v1.16b,v1.16b
803	aesd	v24.16b,v19.16b
804	aesimc	v24.16b,v24.16b
805	aesd	v25.16b,v19.16b
806	aesimc	v25.16b,v25.16b
807	aesd	v26.16b,v19.16b
808	aesimc	v26.16b,v26.16b
809
810	aesd	v0.16b,v20.16b
811	aesimc	v0.16b,v0.16b
812	aesd	v1.16b,v20.16b
813	aesimc	v1.16b,v1.16b
814	aesd	v24.16b,v20.16b
815	aesimc	v24.16b,v24.16b
816	aesd	v25.16b,v20.16b
817	aesimc	v25.16b,v25.16b
818	aesd	v26.16b,v20.16b
819	aesimc	v26.16b,v26.16b
820
821	aesd	v0.16b,v21.16b
822	aesimc	v0.16b,v0.16b
823	aesd	v1.16b,v21.16b
824	aesimc	v1.16b,v1.16b
825	aesd	v24.16b,v21.16b
826	aesimc	v24.16b,v24.16b
827	aesd	v25.16b,v21.16b
828	aesimc	v25.16b,v25.16b
829	aesd	v26.16b,v21.16b
830	aesimc	v26.16b,v26.16b
831
832	aesd	v0.16b,v22.16b
833	aesimc	v0.16b,v0.16b
834	aesd	v1.16b,v22.16b
835	aesimc	v1.16b,v1.16b
836	aesd	v24.16b,v22.16b
837	aesimc	v24.16b,v24.16b
838	aesd	v25.16b,v22.16b
839	aesimc	v25.16b,v25.16b
840	aesd	v26.16b,v22.16b
841	aesimc	v26.16b,v26.16b
842
// Final round interleaved with the loads of the next five input blocks.
843	aesd	v0.16b,v23.16b
844	ld1	{v2.16b},[x0],#16
845	aesd	v1.16b,v23.16b
846	ld1	{v3.16b},[x0],#16
847	aesd	v24.16b,v23.16b
848	ld1	{v27.16b},[x0],#16
849	aesd	v25.16b,v23.16b
850	ld1	{v28.16b},[x0],#16
851	aesd	v26.16b,v23.16b
852	ld1	{v29.16b},[x0],#16
853	cbz	x6,.Lecb_tail4x
854	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
855	eor	v4.16b,v7.16b,v0.16b
856	orr	v0.16b,v2.16b,v2.16b
857	eor	v5.16b,v7.16b,v1.16b
858	orr	v1.16b,v3.16b,v3.16b
859	eor	v17.16b,v7.16b,v24.16b
860	orr	v24.16b,v27.16b,v27.16b
861	eor	v30.16b,v7.16b,v25.16b
862	orr	v25.16b,v28.16b,v28.16b
863	eor	v31.16b,v7.16b,v26.16b
864	st1	{v4.16b},[x1],#16
865	orr	v26.16b,v29.16b,v29.16b
866	st1	{v5.16b},[x1],#16
867	mov	w6,w5
868	st1	{v17.16b},[x1],#16
869	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
870	st1	{v30.16b},[x1],#16
871	st1	{v31.16b},[x1],#16
872	b.hs	.Loop5x_ecb_dec
873
// Fewer than five blocks remain: undo the 0x50 bias, finish if nothing is
// left, otherwise hand v27-v29 to the 3x loop or to the tail.
874	add	x2,x2,#0x50
875	cbz	x2,.Lecb_done
876
877	add	w6,w5,#2
878	subs	x2,x2,#0x30
879	orr	v0.16b,v27.16b,v27.16b
880	orr	v1.16b,v28.16b,v28.16b
881	orr	v24.16b,v29.16b,v29.16b
882	b.lo	.Lecb_dec_tail
883
884	b	.Loop3x_ecb_dec
885
// Reached from the 5x loop when x6 is zero: only the results held in v1,
// v24, v25 and v26 are stored.
886.align	4
887.Lecb_tail4x:
888	eor	v5.16b,v7.16b,v1.16b
889	eor	v17.16b,v7.16b,v24.16b
890	eor	v30.16b,v7.16b,v25.16b
891	eor	v31.16b,v7.16b,v26.16b
892	st1	{v5.16b},[x1],#16
893	st1	{v17.16b},[x1],#16
894	st1	{v30.16b},[x1],#16
895	st1	{v31.16b},[x1],#16
896
897	b	.Lecb_done
// Three blocks in flight: v0, v1, v24. w6 starts at w5 + 2 on this path.
898.align	4
899.Loop3x_ecb_dec:
900	aesd	v0.16b,v16.16b
901	aesimc	v0.16b,v0.16b
902	aesd	v1.16b,v16.16b
903	aesimc	v1.16b,v1.16b
904	aesd	v24.16b,v16.16b
905	aesimc	v24.16b,v24.16b
906	ld1	{v16.4s},[x7],#16
907	subs	w6,w6,#2
908	aesd	v0.16b,v17.16b
909	aesimc	v0.16b,v0.16b
910	aesd	v1.16b,v17.16b
911	aesimc	v1.16b,v1.16b
912	aesd	v24.16b,v17.16b
913	aesimc	v24.16b,v24.16b
914	ld1	{v17.4s},[x7],#16
915	b.gt	.Loop3x_ecb_dec
916
917	aesd	v0.16b,v16.16b
918	aesimc	v0.16b,v0.16b
919	aesd	v1.16b,v16.16b
920	aesimc	v1.16b,v1.16b
921	aesd	v24.16b,v16.16b
922	aesimc	v24.16b,v24.16b
// The flags of this subs stay live for the csel and for the b.hs that
// closes the 3x loop; nothing in between writes flags.
923	subs	x2,x2,#0x30
924	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
925	aesd	v0.16b,v17.16b
926	aesimc	v0.16b,v0.16b
927	aesd	v1.16b,v17.16b
928	aesimc	v1.16b,v1.16b
929	aesd	v24.16b,v17.16b
930	aesimc	v24.16b,v24.16b
931	add	x0,x0,x6 			// x0 is adjusted in such way that
932						// at exit from the loop v1.16b-v24.16b
933						// are loaded with last "words"
934	mov	x7,x3
935	aesd	v0.16b,v20.16b
936	aesimc	v0.16b,v0.16b
937	aesd	v1.16b,v20.16b
938	aesimc	v1.16b,v1.16b
939	aesd	v24.16b,v20.16b
940	aesimc	v24.16b,v24.16b
941	ld1	{v2.16b},[x0],#16
942	aesd	v0.16b,v21.16b
943	aesimc	v0.16b,v0.16b
944	aesd	v1.16b,v21.16b
945	aesimc	v1.16b,v1.16b
946	aesd	v24.16b,v21.16b
947	aesimc	v24.16b,v24.16b
948	ld1	{v3.16b},[x0],#16
949	aesd	v0.16b,v22.16b
950	aesimc	v0.16b,v0.16b
951	aesd	v1.16b,v22.16b
952	aesimc	v1.16b,v1.16b
953	aesd	v24.16b,v22.16b
954	aesimc	v24.16b,v24.16b
955	ld1	{v27.16b},[x0],#16
956	aesd	v0.16b,v23.16b
957	aesd	v1.16b,v23.16b
958	aesd	v24.16b,v23.16b
959	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
960	add	w6,w5,#2
961	eor	v4.16b,v7.16b,v0.16b
962	eor	v5.16b,v7.16b,v1.16b
963	eor	v24.16b,v24.16b,v7.16b
964	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
965	st1	{v4.16b},[x1],#16
966	orr	v0.16b,v2.16b,v2.16b
967	st1	{v5.16b},[x1],#16
968	orr	v1.16b,v3.16b,v3.16b
969	st1	{v24.16b},[x1],#16
970	orr	v24.16b,v27.16b,v27.16b
971	b.hs	.Loop3x_ecb_dec
972
// x2 == -0x30 means nothing is left after the last group of three.
973	cmn	x2,#0x30
974	b.eq	.Lecb_done
975	nop
976
// Tail: one or two blocks, carried in v1 and v24.
977.Lecb_dec_tail:
978	aesd	v1.16b,v16.16b
979	aesimc	v1.16b,v1.16b
980	aesd	v24.16b,v16.16b
981	aesimc	v24.16b,v24.16b
982	ld1	{v16.4s},[x7],#16
983	subs	w6,w6,#2
984	aesd	v1.16b,v17.16b
985	aesimc	v1.16b,v1.16b
986	aesd	v24.16b,v17.16b
987	aesimc	v24.16b,v24.16b
988	ld1	{v17.4s},[x7],#16
989	b.gt	.Lecb_dec_tail
990
991	aesd	v1.16b,v16.16b
992	aesimc	v1.16b,v1.16b
993	aesd	v24.16b,v16.16b
994	aesimc	v24.16b,v24.16b
995	aesd	v1.16b,v17.16b
996	aesimc	v1.16b,v1.16b
997	aesd	v24.16b,v17.16b
998	aesimc	v24.16b,v24.16b
999	aesd	v1.16b,v20.16b
1000	aesimc	v1.16b,v1.16b
1001	aesd	v24.16b,v20.16b
1002	aesimc	v24.16b,v24.16b
// The flags of this cmn are consumed by the b.eq .Lecb_dec_one below:
// x2 == -0x20 selects the one-block case.
1003	cmn	x2,#0x20
1004	aesd	v1.16b,v21.16b
1005	aesimc	v1.16b,v1.16b
1006	aesd	v24.16b,v21.16b
1007	aesimc	v24.16b,v24.16b
1008	aesd	v1.16b,v22.16b
1009	aesimc	v1.16b,v1.16b
1010	aesd	v24.16b,v22.16b
1011	aesimc	v24.16b,v24.16b
1012	aesd	v1.16b,v23.16b
1013	aesd	v24.16b,v23.16b
1014	b.eq	.Lecb_dec_one
1015	eor	v5.16b,v7.16b,v1.16b
1016	eor	v17.16b,v7.16b,v24.16b
1017	st1	{v5.16b},[x1],#16
1018	st1	{v17.16b},[x1],#16
1019	b	.Lecb_done
1020
// One block left: only the result in v24 is stored; falls through to the
// exit.
1021.Lecb_dec_one:
1022	eor	v5.16b,v7.16b,v24.16b
1023	st1	{v5.16b},[x1],#16
1024
// Exit of the general path: release the frame, reloading only x29.
// .Lecb_Final_abort is the exit of the single-block path, which has no frame.
1025.Lecb_done:
1026	ldr	x29,[sp],#16
1027.Lecb_Final_abort:
1028	ret
1029.size	aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
1030.globl	aes_v8_cbc_encrypt
1031.type	aes_v8_cbc_encrypt,%function
1032.align	5
1033aes_v8_cbc_encrypt:
1034	stp	x29,x30,[sp,#-16]!
1035	add	x29,sp,#0
1036	subs	x2,x2,#16
1037	mov	x8,#16
1038	b.lo	.Lcbc_abort
1039	csel	x8,xzr,x8,eq
1040
1041	cmp	w5,#0			// en- or decrypting?
1042	ldr	w5,[x3,#240]
1043	and	x2,x2,#-16
1044	ld1	{v6.16b},[x4]
1045	ld1	{v0.16b},[x0],x8
1046
1047	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1048	sub	w5,w5,#6
1049	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
1050	sub	w5,w5,#2
1051	ld1	{v18.4s,v19.4s},[x7],#32
1052	ld1	{v20.4s,v21.4s},[x7],#32
1053	ld1	{v22.4s,v23.4s},[x7],#32
1054	ld1	{v7.4s},[x7]
1055
1056	add	x7,x3,#32
1057	mov	w6,w5
1058	b.eq	.Lcbc_dec
1059
1060	cmp	w5,#2
1061	eor	v0.16b,v0.16b,v6.16b
1062	eor	v5.16b,v16.16b,v7.16b
1063	b.eq	.Lcbc_enc128
1064
1065	ld1	{v2.4s,v3.4s},[x7]
1066	add	x7,x3,#16
1067	add	x6,x3,#16*4
1068	add	x12,x3,#16*5
1069	aese	v0.16b,v16.16b
1070	aesmc	v0.16b,v0.16b
1071	add	x14,x3,#16*6
1072	add	x3,x3,#16*7
1073	b	.Lenter_cbc_enc
1074
1075.align	4
1076.Loop_cbc_enc:
1077	aese	v0.16b,v16.16b
1078	aesmc	v0.16b,v0.16b
1079	st1	{v6.16b},[x1],#16
1080.Lenter_cbc_enc:
1081	aese	v0.16b,v17.16b
1082	aesmc	v0.16b,v0.16b
1083	aese	v0.16b,v2.16b
1084	aesmc	v0.16b,v0.16b
1085	ld1	{v16.4s},[x6]
1086	cmp	w5,#4
1087	aese	v0.16b,v3.16b
1088	aesmc	v0.16b,v0.16b
1089	ld1	{v17.4s},[x12]
1090	b.eq	.Lcbc_enc192
1091
1092	aese	v0.16b,v16.16b
1093	aesmc	v0.16b,v0.16b
1094	ld1	{v16.4s},[x14]
1095	aese	v0.16b,v17.16b
1096	aesmc	v0.16b,v0.16b
1097	ld1	{v17.4s},[x3]
1098	nop
1099
1100.Lcbc_enc192:
1101	aese	v0.16b,v16.16b
1102	aesmc	v0.16b,v0.16b
1103	subs	x2,x2,#16
1104	aese	v0.16b,v17.16b
1105	aesmc	v0.16b,v0.16b
1106	csel	x8,xzr,x8,eq
1107	aese	v0.16b,v18.16b
1108	aesmc	v0.16b,v0.16b
1109	aese	v0.16b,v19.16b
1110	aesmc	v0.16b,v0.16b
1111	ld1	{v16.16b},[x0],x8
1112	aese	v0.16b,v20.16b
1113	aesmc	v0.16b,v0.16b
1114	eor	v16.16b,v16.16b,v5.16b
1115	aese	v0.16b,v21.16b
1116	aesmc	v0.16b,v0.16b
1117	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
1118	aese	v0.16b,v22.16b
1119	aesmc	v0.16b,v0.16b
1120	aese	v0.16b,v23.16b
1121	eor	v6.16b,v0.16b,v7.16b
1122	b.hs	.Loop_cbc_enc
1123
1124	st1	{v6.16b},[x1],#16
1125	b	.Lcbc_done
1126
1127.align	5
1128.Lcbc_enc128:
1129	ld1	{v2.4s,v3.4s},[x7]
1130	aese	v0.16b,v16.16b
1131	aesmc	v0.16b,v0.16b
1132	b	.Lenter_cbc_enc128
1133.Loop_cbc_enc128:
1134	aese	v0.16b,v16.16b
1135	aesmc	v0.16b,v0.16b
1136	st1	{v6.16b},[x1],#16
1137.Lenter_cbc_enc128:
1138	aese	v0.16b,v17.16b
1139	aesmc	v0.16b,v0.16b
1140	subs	x2,x2,#16
1141	aese	v0.16b,v2.16b
1142	aesmc	v0.16b,v0.16b
1143	csel	x8,xzr,x8,eq
1144	aese	v0.16b,v3.16b
1145	aesmc	v0.16b,v0.16b
1146	aese	v0.16b,v18.16b
1147	aesmc	v0.16b,v0.16b
1148	aese	v0.16b,v19.16b
1149	aesmc	v0.16b,v0.16b
1150	ld1	{v16.16b},[x0],x8
1151	aese	v0.16b,v20.16b
1152	aesmc	v0.16b,v0.16b
1153	aese	v0.16b,v21.16b
1154	aesmc	v0.16b,v0.16b
1155	aese	v0.16b,v22.16b
1156	aesmc	v0.16b,v0.16b
1157	eor	v16.16b,v16.16b,v5.16b
1158	aese	v0.16b,v23.16b
1159	eor	v6.16b,v0.16b,v7.16b
1160	b.hs	.Loop_cbc_enc128
1161
1162	st1	{v6.16b},[x1],#16
1163	b	.Lcbc_done
1164.align	5
1165.Lcbc_dec:
1166	ld1	{v24.16b},[x0],#16
1167	subs	x2,x2,#32		// bias
1168	add	w6,w5,#2
1169	orr	v3.16b,v0.16b,v0.16b
1170	orr	v1.16b,v0.16b,v0.16b
1171	orr	v27.16b,v24.16b,v24.16b
1172	b.lo	.Lcbc_dec_tail
1173
1174	orr	v1.16b,v24.16b,v24.16b
1175	ld1	{v24.16b},[x0],#16
1176	orr	v2.16b,v0.16b,v0.16b
1177	orr	v3.16b,v1.16b,v1.16b
1178	orr	v27.16b,v24.16b,v24.16b
1179	cmp	x2,#32
1180	b.lo	.Loop3x_cbc_dec
1181
1182	ld1	{v25.16b},[x0],#16
1183	ld1	{v26.16b},[x0],#16
1184	sub	x2,x2,#32		// bias
1185	mov	w6,w5
1186	orr	v28.16b,v25.16b,v25.16b
1187	orr	v29.16b,v26.16b,v26.16b
1188
1189.Loop5x_cbc_dec:
1190	aesd	v0.16b,v16.16b
1191	aesimc	v0.16b,v0.16b
1192	aesd	v1.16b,v16.16b
1193	aesimc	v1.16b,v1.16b
1194	aesd	v24.16b,v16.16b
1195	aesimc	v24.16b,v24.16b
1196	aesd	v25.16b,v16.16b
1197	aesimc	v25.16b,v25.16b
1198	aesd	v26.16b,v16.16b
1199	aesimc	v26.16b,v26.16b
1200	ld1	{v16.4s},[x7],#16
1201	subs	w6,w6,#2
1202	aesd	v0.16b,v17.16b
1203	aesimc	v0.16b,v0.16b
1204	aesd	v1.16b,v17.16b
1205	aesimc	v1.16b,v1.16b
1206	aesd	v24.16b,v17.16b
1207	aesimc	v24.16b,v24.16b
1208	aesd	v25.16b,v17.16b
1209	aesimc	v25.16b,v25.16b
1210	aesd	v26.16b,v17.16b
1211	aesimc	v26.16b,v26.16b
1212	ld1	{v17.4s},[x7],#16
1213	b.gt	.Loop5x_cbc_dec
1214
1215	aesd	v0.16b,v16.16b
1216	aesimc	v0.16b,v0.16b
1217	aesd	v1.16b,v16.16b
1218	aesimc	v1.16b,v1.16b
1219	aesd	v24.16b,v16.16b
1220	aesimc	v24.16b,v24.16b
1221	aesd	v25.16b,v16.16b
1222	aesimc	v25.16b,v25.16b
1223	aesd	v26.16b,v16.16b
1224	aesimc	v26.16b,v26.16b
1225	cmp	x2,#0x40		// because .Lcbc_tail4x
1226	sub	x2,x2,#0x50
1227
1228	aesd	v0.16b,v17.16b
1229	aesimc	v0.16b,v0.16b
1230	aesd	v1.16b,v17.16b
1231	aesimc	v1.16b,v1.16b
1232	aesd	v24.16b,v17.16b
1233	aesimc	v24.16b,v24.16b
1234	aesd	v25.16b,v17.16b
1235	aesimc	v25.16b,v25.16b
1236	aesd	v26.16b,v17.16b
1237	aesimc	v26.16b,v26.16b
1238	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
1239	mov	x7,x3
1240
1241	aesd	v0.16b,v18.16b
1242	aesimc	v0.16b,v0.16b
1243	aesd	v1.16b,v18.16b
1244	aesimc	v1.16b,v1.16b
1245	aesd	v24.16b,v18.16b
1246	aesimc	v24.16b,v24.16b
1247	aesd	v25.16b,v18.16b
1248	aesimc	v25.16b,v25.16b
1249	aesd	v26.16b,v18.16b
1250	aesimc	v26.16b,v26.16b
1251	add	x0,x0,x6		// x0 is adjusted in such way that
1252					// at exit from the loop v1.16b-v26.16b
1253					// are loaded with last "words"
1254	add	x6,x2,#0x60		// because .Lcbc_tail4x
1255
1256	aesd	v0.16b,v19.16b
1257	aesimc	v0.16b,v0.16b
1258	aesd	v1.16b,v19.16b
1259	aesimc	v1.16b,v1.16b
1260	aesd	v24.16b,v19.16b
1261	aesimc	v24.16b,v24.16b
1262	aesd	v25.16b,v19.16b
1263	aesimc	v25.16b,v25.16b
1264	aesd	v26.16b,v19.16b
1265	aesimc	v26.16b,v26.16b
1266
1267	aesd	v0.16b,v20.16b
1268	aesimc	v0.16b,v0.16b
1269	aesd	v1.16b,v20.16b
1270	aesimc	v1.16b,v1.16b
1271	aesd	v24.16b,v20.16b
1272	aesimc	v24.16b,v24.16b
1273	aesd	v25.16b,v20.16b
1274	aesimc	v25.16b,v25.16b
1275	aesd	v26.16b,v20.16b
1276	aesimc	v26.16b,v26.16b
1277
1278	aesd	v0.16b,v21.16b
1279	aesimc	v0.16b,v0.16b
1280	aesd	v1.16b,v21.16b
1281	aesimc	v1.16b,v1.16b
1282	aesd	v24.16b,v21.16b
1283	aesimc	v24.16b,v24.16b
1284	aesd	v25.16b,v21.16b
1285	aesimc	v25.16b,v25.16b
1286	aesd	v26.16b,v21.16b
1287	aesimc	v26.16b,v26.16b
1288
1289	aesd	v0.16b,v22.16b
1290	aesimc	v0.16b,v0.16b
1291	aesd	v1.16b,v22.16b
1292	aesimc	v1.16b,v1.16b
1293	aesd	v24.16b,v22.16b
1294	aesimc	v24.16b,v24.16b
1295	aesd	v25.16b,v22.16b
1296	aesimc	v25.16b,v25.16b
1297	aesd	v26.16b,v22.16b
1298	aesimc	v26.16b,v26.16b
1299
1300	eor	v4.16b,v6.16b,v7.16b
1301	aesd	v0.16b,v23.16b
1302	eor	v5.16b,v2.16b,v7.16b
1303	ld1	{v2.16b},[x0],#16
1304	aesd	v1.16b,v23.16b
1305	eor	v17.16b,v3.16b,v7.16b
1306	ld1	{v3.16b},[x0],#16
1307	aesd	v24.16b,v23.16b
1308	eor	v30.16b,v27.16b,v7.16b
1309	ld1	{v27.16b},[x0],#16
1310	aesd	v25.16b,v23.16b
1311	eor	v31.16b,v28.16b,v7.16b
1312	ld1	{v28.16b},[x0],#16
1313	aesd	v26.16b,v23.16b
1314	orr	v6.16b,v29.16b,v29.16b
1315	ld1	{v29.16b},[x0],#16
1316	cbz	x6,.Lcbc_tail4x
1317	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1318	eor	v4.16b,v4.16b,v0.16b
1319	orr	v0.16b,v2.16b,v2.16b
1320	eor	v5.16b,v5.16b,v1.16b
1321	orr	v1.16b,v3.16b,v3.16b
1322	eor	v17.16b,v17.16b,v24.16b
1323	orr	v24.16b,v27.16b,v27.16b
1324	eor	v30.16b,v30.16b,v25.16b
1325	orr	v25.16b,v28.16b,v28.16b
1326	eor	v31.16b,v31.16b,v26.16b
1327	st1	{v4.16b},[x1],#16
1328	orr	v26.16b,v29.16b,v29.16b
1329	st1	{v5.16b},[x1],#16
1330	mov	w6,w5
1331	st1	{v17.16b},[x1],#16
1332	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1333	st1	{v30.16b},[x1],#16
1334	st1	{v31.16b},[x1],#16
1335	b.hs	.Loop5x_cbc_dec
1336
1337	add	x2,x2,#0x50
1338	cbz	x2,.Lcbc_done
1339
1340	add	w6,w5,#2
1341	subs	x2,x2,#0x30
1342	orr	v0.16b,v27.16b,v27.16b
1343	orr	v2.16b,v27.16b,v27.16b
1344	orr	v1.16b,v28.16b,v28.16b
1345	orr	v3.16b,v28.16b,v28.16b
1346	orr	v24.16b,v29.16b,v29.16b
1347	orr	v27.16b,v29.16b,v29.16b
1348	b.lo	.Lcbc_dec_tail
1349
1350	b	.Loop3x_cbc_dec
1351
1352.align	4
1353.Lcbc_tail4x:
1354	eor	v5.16b,v4.16b,v1.16b
1355	eor	v17.16b,v17.16b,v24.16b
1356	eor	v30.16b,v30.16b,v25.16b
1357	eor	v31.16b,v31.16b,v26.16b
1358	st1	{v5.16b},[x1],#16
1359	st1	{v17.16b},[x1],#16
1360	st1	{v30.16b},[x1],#16
1361	st1	{v31.16b},[x1],#16
1362
1363	b	.Lcbc_done
1364.align	4
1365.Loop3x_cbc_dec:
1366	aesd	v0.16b,v16.16b
1367	aesimc	v0.16b,v0.16b
1368	aesd	v1.16b,v16.16b
1369	aesimc	v1.16b,v1.16b
1370	aesd	v24.16b,v16.16b
1371	aesimc	v24.16b,v24.16b
1372	ld1	{v16.4s},[x7],#16
1373	subs	w6,w6,#2
1374	aesd	v0.16b,v17.16b
1375	aesimc	v0.16b,v0.16b
1376	aesd	v1.16b,v17.16b
1377	aesimc	v1.16b,v1.16b
1378	aesd	v24.16b,v17.16b
1379	aesimc	v24.16b,v24.16b
1380	ld1	{v17.4s},[x7],#16
1381	b.gt	.Loop3x_cbc_dec
1382
1383	aesd	v0.16b,v16.16b
1384	aesimc	v0.16b,v0.16b
1385	aesd	v1.16b,v16.16b
1386	aesimc	v1.16b,v1.16b
1387	aesd	v24.16b,v16.16b
1388	aesimc	v24.16b,v24.16b
1389	eor	v4.16b,v6.16b,v7.16b
1390	subs	x2,x2,#0x30
1391	eor	v5.16b,v2.16b,v7.16b
1392	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
1393	aesd	v0.16b,v17.16b
1394	aesimc	v0.16b,v0.16b
1395	aesd	v1.16b,v17.16b
1396	aesimc	v1.16b,v1.16b
1397	aesd	v24.16b,v17.16b
1398	aesimc	v24.16b,v24.16b
1399	eor	v17.16b,v3.16b,v7.16b
1400	add	x0,x0,x6		// x0 is adjusted in such way that
1401					// at exit from the loop v1.16b-v24.16b
1402					// are loaded with last "words"
1403	orr	v6.16b,v27.16b,v27.16b
1404	mov	x7,x3
1405	aesd	v0.16b,v20.16b
1406	aesimc	v0.16b,v0.16b
1407	aesd	v1.16b,v20.16b
1408	aesimc	v1.16b,v1.16b
1409	aesd	v24.16b,v20.16b
1410	aesimc	v24.16b,v24.16b
1411	ld1	{v2.16b},[x0],#16
1412	aesd	v0.16b,v21.16b
1413	aesimc	v0.16b,v0.16b
1414	aesd	v1.16b,v21.16b
1415	aesimc	v1.16b,v1.16b
1416	aesd	v24.16b,v21.16b
1417	aesimc	v24.16b,v24.16b
1418	ld1	{v3.16b},[x0],#16
1419	aesd	v0.16b,v22.16b
1420	aesimc	v0.16b,v0.16b
1421	aesd	v1.16b,v22.16b
1422	aesimc	v1.16b,v1.16b
1423	aesd	v24.16b,v22.16b
1424	aesimc	v24.16b,v24.16b
1425	ld1	{v27.16b},[x0],#16
1426	aesd	v0.16b,v23.16b
1427	aesd	v1.16b,v23.16b
1428	aesd	v24.16b,v23.16b
1429	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1430	add	w6,w5,#2
1431	eor	v4.16b,v4.16b,v0.16b
1432	eor	v5.16b,v5.16b,v1.16b
1433	eor	v24.16b,v24.16b,v17.16b
1434	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1435	st1	{v4.16b},[x1],#16
1436	orr	v0.16b,v2.16b,v2.16b
1437	st1	{v5.16b},[x1],#16
1438	orr	v1.16b,v3.16b,v3.16b
1439	st1	{v24.16b},[x1],#16
1440	orr	v24.16b,v27.16b,v27.16b
1441	b.hs	.Loop3x_cbc_dec
1442
1443	cmn	x2,#0x30
1444	b.eq	.Lcbc_done
1445	nop
1446
1447.Lcbc_dec_tail:
1448	aesd	v1.16b,v16.16b
1449	aesimc	v1.16b,v1.16b
1450	aesd	v24.16b,v16.16b
1451	aesimc	v24.16b,v24.16b
1452	ld1	{v16.4s},[x7],#16
1453	subs	w6,w6,#2
1454	aesd	v1.16b,v17.16b
1455	aesimc	v1.16b,v1.16b
1456	aesd	v24.16b,v17.16b
1457	aesimc	v24.16b,v24.16b
1458	ld1	{v17.4s},[x7],#16
1459	b.gt	.Lcbc_dec_tail
1460
1461	aesd	v1.16b,v16.16b
1462	aesimc	v1.16b,v1.16b
1463	aesd	v24.16b,v16.16b
1464	aesimc	v24.16b,v24.16b
1465	aesd	v1.16b,v17.16b
1466	aesimc	v1.16b,v1.16b
1467	aesd	v24.16b,v17.16b
1468	aesimc	v24.16b,v24.16b
1469	aesd	v1.16b,v20.16b
1470	aesimc	v1.16b,v1.16b
1471	aesd	v24.16b,v20.16b
1472	aesimc	v24.16b,v24.16b
1473	cmn	x2,#0x20
1474	aesd	v1.16b,v21.16b
1475	aesimc	v1.16b,v1.16b
1476	aesd	v24.16b,v21.16b
1477	aesimc	v24.16b,v24.16b
1478	eor	v5.16b,v6.16b,v7.16b
1479	aesd	v1.16b,v22.16b
1480	aesimc	v1.16b,v1.16b
1481	aesd	v24.16b,v22.16b
1482	aesimc	v24.16b,v24.16b
1483	eor	v17.16b,v3.16b,v7.16b
1484	aesd	v1.16b,v23.16b
1485	aesd	v24.16b,v23.16b
1486	b.eq	.Lcbc_dec_one
1487	eor	v5.16b,v5.16b,v1.16b
1488	eor	v17.16b,v17.16b,v24.16b
1489	orr	v6.16b,v27.16b,v27.16b
1490	st1	{v5.16b},[x1],#16
1491	st1	{v17.16b},[x1],#16
1492	b	.Lcbc_done
1493
1494.Lcbc_dec_one:
1495	eor	v5.16b,v5.16b,v24.16b
1496	orr	v6.16b,v27.16b,v27.16b
1497	st1	{v5.16b},[x1],#16
1498
1499.Lcbc_done:
1500	st1	{v6.16b},[x4]
1501.Lcbc_abort:
1502	ldr	x29,[sp],#16
1503	ret
1504.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
1505.globl	aes_v8_ctr32_encrypt_blocks
1506.type	aes_v8_ctr32_encrypt_blocks,%function
1507.align	5
// aes_v8_ctr32_encrypt_blocks: AES counter-mode keystream generation and
// XOR, with a 32-bit counter.
//
// Register roles on entry, as used by the code below:
//   x0 = input pointer		(read 16 bytes at a time)
//   x1 = output pointer	(written 16 bytes at a time)
//   x2 = number of 16-byte blocks
//   x3 = key schedule; the round count is the 32-bit word at offset 240
//   x4 = pointer to the 16-byte counter block
// NOTE(review): presumably this matches OpenSSL's C prototype
// (in, out, blocks, key, ivec) -- confirm against the C declaration.
//
// Only the last 32-bit word of the counter block is incremented (32-bit
// register arithmetic, so it wraps modulo 2^32); the other 12 bytes are
// copied unchanged from the template kept in v6.  Nothing is stored back
// through x4.
//
// Three code paths share the same round structure:
//   .Loop5x_ctr32  five blocks per iteration (entered when x2 >= 35)
//   .Loop3x_ctr32  three blocks per iteration
//   .Lctr32_tail   always encrypts two counter blocks, stores one or two
// NOTE(review): the tail stores one block unconditionally, so x2 == 0 is
// not handled here; callers presumably never pass zero -- confirm.
//
// Long-lived registers:
//   v16,v17	current pair of "middle" round keys (reloaded through x7)
//   v20-v23	four round keys preceding the last one
//   v7		last round key, XORed into the input ahead of time
//   v6		template counter block
//   w8		counter value in native byte order
//   w5		rounds-6, reload value for the round-loop counter w6
1508aes_v8_ctr32_encrypt_blocks:
1509	stp	x29,x30,[sp,#-16]!
1510	add	x29,sp,#0
1511	ldr	w5,[x3,#240]		// w5 = round count
1512
1513	ldr	w8, [x4, #12]		// w8 = last 32-bit word of the counter block
1514#ifdef __ARMEB__
1515	ld1	{v0.16b},[x4]
1516#else
1517	ld1	{v0.4s},[x4]
1518#endif
1519	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1520	sub	w5,w5,#4
1521	mov	x12,#16
1522	cmp	x2,#2			// flags are consumed by the csel and b.ls below
1523	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
1524	sub	w5,w5,#2		// w5 = rounds-6
1525	ld1	{v20.4s,v21.4s},[x7],#32
1526	ld1	{v22.4s,v23.4s},[x7],#32
1527	ld1	{v7.4s},[x7]		// v7 = last round key
1528	add	x7,x3,#32		// x7 -> third round key
1529	mov	w6,w5
1530	csel	x12,xzr,x12,lo	// tail load stride: 0 if x2 < 2, else 16
1531#ifndef __ARMEB__
1532	rev	w8, w8			// byte-swap so the counter can be incremented natively
1533#endif
1534	orr	v1.16b,v0.16b,v0.16b
1535	add	w10, w8, #1
1536	orr	v18.16b,v0.16b,v0.16b
1537	add	w8, w8, #2
1538	orr	v6.16b,v0.16b,v0.16b	// v6 = template counter block
1539	rev	w10, w10
1540	mov	v1.s[3],w10		// v1 = counter+1
1541	b.ls	.Lctr32_tail		// x2 <= 2 (flags from "cmp x2,#2" above)
1542	rev	w12, w8
1543	sub	x2,x2,#3		// bias
1544	mov	v18.s[3],w12		// v18 = counter+2
1545	cmp	x2,#32
1546	b.lo	.Loop3x_ctr32		// fewer than 35 blocks: use the 3-way loop
1547
	// Prepare the fourth and fifth counter blocks for the 5-way loop.
1548	add	w13,w8,#1
1549	add	w14,w8,#2
1550	orr	v24.16b,v0.16b,v0.16b
1551	rev	w13,w13
1552	orr	v25.16b,v0.16b,v0.16b
1553	rev	w14,w14
1554	mov	v24.s[3],w13
1555	sub	x2,x2,#2		// bias
1556	mov	v25.s[3],w14
1557	add	w8,w8,#2		// w8 = counter value held in v25
1558	b	.Loop5x_ctr32
1559
1560.align	4
1561.Loop5x_ctr32:
	// Two rounds per iteration on five blocks; w6 counts down from w5.
1562	aese	v0.16b,v16.16b
1563	aesmc	v0.16b,v0.16b
1564	aese	v1.16b,v16.16b
1565	aesmc	v1.16b,v1.16b
1566	aese	v18.16b,v16.16b
1567	aesmc	v18.16b,v18.16b
1568	aese	v24.16b,v16.16b
1569	aesmc	v24.16b,v24.16b
1570	aese	v25.16b,v16.16b
1571	aesmc	v25.16b,v25.16b
1572	ld1	{v16.4s},[x7],#16
1573	subs	w6,w6,#2
1574	aese	v0.16b,v17.16b
1575	aesmc	v0.16b,v0.16b
1576	aese	v1.16b,v17.16b
1577	aesmc	v1.16b,v1.16b
1578	aese	v18.16b,v17.16b
1579	aesmc	v18.16b,v18.16b
1580	aese	v24.16b,v17.16b
1581	aesmc	v24.16b,v24.16b
1582	aese	v25.16b,v17.16b
1583	aesmc	v25.16b,v25.16b
1584	ld1	{v17.4s},[x7],#16
1585	b.gt	.Loop5x_ctr32
1586
1587	mov	x7,x3			// rewind the round-key pointer
1588	aese	v0.16b,v16.16b
1589	aesmc	v0.16b,v0.16b
1590	aese	v1.16b,v16.16b
1591	aesmc	v1.16b,v1.16b
1592	aese	v18.16b,v16.16b
1593	aesmc	v18.16b,v18.16b
1594	aese	v24.16b,v16.16b
1595	aesmc	v24.16b,v24.16b
1596	aese	v25.16b,v16.16b
1597	aesmc	v25.16b,v25.16b
1598	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1599
1600	aese	v0.16b,v17.16b
1601	aesmc	v0.16b,v0.16b
1602	aese	v1.16b,v17.16b
1603	aesmc	v1.16b,v1.16b
1604	aese	v18.16b,v17.16b
1605	aesmc	v18.16b,v18.16b
1606	aese	v24.16b,v17.16b
1607	aesmc	v24.16b,v24.16b
1608	aese	v25.16b,v17.16b
1609	aesmc	v25.16b,v25.16b
1610	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1611
	// While the remaining rounds run, compute the next five counter
	// values (w8+1 .. w8+5) and byte-swap them for insertion into lane 3.
1612	aese	v0.16b,v20.16b
1613	aesmc	v0.16b,v0.16b
1614	add	w9,w8,#1
1615	add	w10,w8,#2
1616	aese	v1.16b,v20.16b
1617	aesmc	v1.16b,v1.16b
1618	add	w12,w8,#3
1619	add	w13,w8,#4
1620	aese	v18.16b,v20.16b
1621	aesmc	v18.16b,v18.16b
1622	add	w14,w8,#5
1623	rev	w9,w9
1624	aese	v24.16b,v20.16b
1625	aesmc	v24.16b,v24.16b
1626	rev	w10,w10
1627	rev	w12,w12
1628	aese	v25.16b,v20.16b
1629	aesmc	v25.16b,v25.16b
1630	rev	w13,w13
1631	rev	w14,w14
1632
1633	aese	v0.16b,v21.16b
1634	aesmc	v0.16b,v0.16b
1635	aese	v1.16b,v21.16b
1636	aesmc	v1.16b,v1.16b
1637	aese	v18.16b,v21.16b
1638	aesmc	v18.16b,v18.16b
1639	aese	v24.16b,v21.16b
1640	aesmc	v24.16b,v24.16b
1641	aese	v25.16b,v21.16b
1642	aesmc	v25.16b,v25.16b
1643
1644	aese	v0.16b,v22.16b
1645	aesmc	v0.16b,v0.16b
1646	ld1	{v2.16b},[x0],#16
1647	aese	v1.16b,v22.16b
1648	aesmc	v1.16b,v1.16b
1649	ld1	{v3.16b},[x0],#16
1650	aese	v18.16b,v22.16b
1651	aesmc	v18.16b,v18.16b
1652	ld1	{v19.16b},[x0],#16
1653	aese	v24.16b,v22.16b
1654	aesmc	v24.16b,v24.16b
1655	ld1	{v26.16b},[x0],#16
1656	aese	v25.16b,v22.16b
1657	aesmc	v25.16b,v25.16b
1658	ld1	{v27.16b},[x0],#16
1659
	// Final aese (no aesmc); the last round key v7 is XORed into the
	// input blocks instead, and combined with the keystream just below.
1660	aese	v0.16b,v23.16b
1661	eor	v2.16b,v2.16b,v7.16b
1662	aese	v1.16b,v23.16b
1663	eor	v3.16b,v3.16b,v7.16b
1664	aese	v18.16b,v23.16b
1665	eor	v19.16b,v19.16b,v7.16b
1666	aese	v24.16b,v23.16b
1667	eor	v26.16b,v26.16b,v7.16b
1668	aese	v25.16b,v23.16b
1669	eor	v27.16b,v27.16b,v7.16b
1670
	// Produce the output blocks and reset each state register to the
	// counter template in v6.
1671	eor	v2.16b,v2.16b,v0.16b
1672	orr	v0.16b,v6.16b,v6.16b
1673	eor	v3.16b,v3.16b,v1.16b
1674	orr	v1.16b,v6.16b,v6.16b
1675	eor	v19.16b,v19.16b,v18.16b
1676	orr	v18.16b,v6.16b,v6.16b
1677	eor	v26.16b,v26.16b,v24.16b
1678	orr	v24.16b,v6.16b,v6.16b
1679	eor	v27.16b,v27.16b,v25.16b
1680	orr	v25.16b,v6.16b,v6.16b
1681
	// Store five output blocks and insert the next counter words.
1682	st1	{v2.16b},[x1],#16
1683	mov	v0.s[3],w9
1684	st1	{v3.16b},[x1],#16
1685	mov	v1.s[3],w10
1686	st1	{v19.16b},[x1],#16
1687	mov	v18.s[3],w12
1688	st1	{v26.16b},[x1],#16
1689	mov	v24.s[3],w13
1690	st1	{v27.16b},[x1],#16
1691	mov	v25.s[3],w14
1692
1693	mov	w6,w5
1694	cbz	x2,.Lctr32_done		// biased count reached zero: nothing left
1695
1696	add	w8,w8,#5
1697	subs	x2,x2,#5
1698	b.hs	.Loop5x_ctr32		// at least five more blocks
1699
	// Fewer than five blocks left: undo the last subtraction and counter
	// advance, then pick the 3-way loop or the tail.
1700	add	x2,x2,#5
1701	sub	w8,w8,#5
1702
1703	cmp	x2,#2
1704	mov	x12,#16
1705	csel	x12,xzr,x12,lo	// tail load stride: 0 if x2 < 2, else 16
1706	b.ls	.Lctr32_tail
1707
1708	sub	x2,x2,#3		// bias
1709	add	w8,w8,#3		// w8 = counter value held in v18
1710	b	.Loop3x_ctr32
1711
1712.align	4
1713.Loop3x_ctr32:
	// Two rounds per iteration on three blocks (v0, v1, v18).
1714	aese	v0.16b,v16.16b
1715	aesmc	v0.16b,v0.16b
1716	aese	v1.16b,v16.16b
1717	aesmc	v1.16b,v1.16b
1718	aese	v18.16b,v16.16b
1719	aesmc	v18.16b,v18.16b
1720	ld1	{v16.4s},[x7],#16
1721	subs	w6,w6,#2
1722	aese	v0.16b,v17.16b
1723	aesmc	v0.16b,v0.16b
1724	aese	v1.16b,v17.16b
1725	aesmc	v1.16b,v1.16b
1726	aese	v18.16b,v17.16b
1727	aesmc	v18.16b,v18.16b
1728	ld1	{v17.4s},[x7],#16
1729	b.gt	.Loop3x_ctr32
1730
	// The in-flight states move to v4, v5 and v17 so that v0, v1 and v18
	// can be rebuilt from the template v6 with the next counter values
	// while the remaining rounds complete.
1731	aese	v0.16b,v16.16b
1732	aesmc	v4.16b,v0.16b
1733	aese	v1.16b,v16.16b
1734	aesmc	v5.16b,v1.16b
1735	ld1	{v2.16b},[x0],#16
1736	orr	v0.16b,v6.16b,v6.16b
1737	aese	v18.16b,v16.16b
1738	aesmc	v18.16b,v18.16b
1739	ld1	{v3.16b},[x0],#16
1740	orr	v1.16b,v6.16b,v6.16b
1741	aese	v4.16b,v17.16b
1742	aesmc	v4.16b,v4.16b
1743	aese	v5.16b,v17.16b
1744	aesmc	v5.16b,v5.16b
1745	ld1	{v19.16b},[x0],#16
1746	mov	x7,x3			// rewind the round-key pointer
1747	aese	v18.16b,v17.16b
1748	aesmc	v17.16b,v18.16b	// third state now lives in v17 (reloaded as a key below)
1749	orr	v18.16b,v6.16b,v6.16b
1750	add	w9,w8,#1
1751	aese	v4.16b,v20.16b
1752	aesmc	v4.16b,v4.16b
1753	aese	v5.16b,v20.16b
1754	aesmc	v5.16b,v5.16b
1755	eor	v2.16b,v2.16b,v7.16b
1756	add	w10,w8,#2
1757	aese	v17.16b,v20.16b
1758	aesmc	v17.16b,v17.16b
1759	eor	v3.16b,v3.16b,v7.16b
1760	add	w8,w8,#3
1761	aese	v4.16b,v21.16b
1762	aesmc	v4.16b,v4.16b
1763	aese	v5.16b,v21.16b
1764	aesmc	v5.16b,v5.16b
1765	eor	v19.16b,v19.16b,v7.16b
1766	rev	w9,w9
1767	aese	v17.16b,v21.16b
1768	aesmc	v17.16b,v17.16b
1769	mov	v0.s[3], w9
1770	rev	w10,w10
1771	aese	v4.16b,v22.16b
1772	aesmc	v4.16b,v4.16b
1773	aese	v5.16b,v22.16b
1774	aesmc	v5.16b,v5.16b
1775	mov	v1.s[3], w10
1776	rev	w12,w8
1777	aese	v17.16b,v22.16b
1778	aesmc	v17.16b,v17.16b
1779	mov	v18.s[3], w12
1780	subs	x2,x2,#3		// flags are consumed by the b.hs at the end of the loop body
1781	aese	v4.16b,v23.16b
1782	aese	v5.16b,v23.16b
1783	aese	v17.16b,v23.16b
1784
1785	eor	v2.16b,v2.16b,v4.16b
1786	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1787	st1	{v2.16b},[x1],#16
1788	eor	v3.16b,v3.16b,v5.16b
1789	mov	w6,w5
1790	st1	{v3.16b},[x1],#16
1791	eor	v19.16b,v19.16b,v17.16b
1792	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1793	st1	{v19.16b},[x1],#16
1794	b.hs	.Loop3x_ctr32		// flags from "subs x2,x2,#3" above
1795
1796	adds	x2,x2,#3		// remove the bias: x2 = blocks still to do
1797	b.eq	.Lctr32_done
1798	cmp	x2,#1
1799	mov	x12,#16
1800	csel	x12,xzr,x12,eq	// tail load stride: 0 if exactly one block, else 16
1801
1802.Lctr32_tail:
	// Encrypts the two counter blocks in v0 and v1.  The first input load
	// advances x0 by x12, so with a stride of 0 the second load re-reads
	// the same 16 bytes instead of the 16 bytes that follow.
1803	aese	v0.16b,v16.16b
1804	aesmc	v0.16b,v0.16b
1805	aese	v1.16b,v16.16b
1806	aesmc	v1.16b,v1.16b
1807	ld1	{v16.4s},[x7],#16
1808	subs	w6,w6,#2
1809	aese	v0.16b,v17.16b
1810	aesmc	v0.16b,v0.16b
1811	aese	v1.16b,v17.16b
1812	aesmc	v1.16b,v1.16b
1813	ld1	{v17.4s},[x7],#16
1814	b.gt	.Lctr32_tail
1815
1816	aese	v0.16b,v16.16b
1817	aesmc	v0.16b,v0.16b
1818	aese	v1.16b,v16.16b
1819	aesmc	v1.16b,v1.16b
1820	aese	v0.16b,v17.16b
1821	aesmc	v0.16b,v0.16b
1822	aese	v1.16b,v17.16b
1823	aesmc	v1.16b,v1.16b
1824	ld1	{v2.16b},[x0],x12
1825	aese	v0.16b,v20.16b
1826	aesmc	v0.16b,v0.16b
1827	aese	v1.16b,v20.16b
1828	aesmc	v1.16b,v1.16b
1829	ld1	{v3.16b},[x0]
1830	aese	v0.16b,v21.16b
1831	aesmc	v0.16b,v0.16b
1832	aese	v1.16b,v21.16b
1833	aesmc	v1.16b,v1.16b
1834	eor	v2.16b,v2.16b,v7.16b
1835	aese	v0.16b,v22.16b
1836	aesmc	v0.16b,v0.16b
1837	aese	v1.16b,v22.16b
1838	aesmc	v1.16b,v1.16b
1839	eor	v3.16b,v3.16b,v7.16b
1840	aese	v0.16b,v23.16b
1841	aese	v1.16b,v23.16b
1842
1843	cmp	x2,#1
1844	eor	v2.16b,v2.16b,v0.16b
1845	eor	v3.16b,v3.16b,v1.16b
1846	st1	{v2.16b},[x1],#16
1847	b.eq	.Lctr32_done		// x2 == 1: the second result is discarded
1848	st1	{v3.16b},[x1]
1849
1850.Lctr32_done:
1851	ldr	x29,[sp],#16		// only x29 is reloaded; x30 is never written in this function
1852	ret
1853.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
1854.globl	aes_v8_xts_encrypt
1855.type	aes_v8_xts_encrypt,%function
1856.align	5
// aes_v8_xts_encrypt: AES-XTS encryption, including ciphertext stealing
// for a trailing partial block.
//
// Register roles on entry, as used by the code below:
//   x0 = input pointer
//   x1 = output pointer
//   x2 = length in bytes
//   x3 = key schedule used on the data blocks ("key1")
//   x4 = key schedule used to encrypt the iv into the first tweak ("key2")
//   x5 = pointer to the 16-byte iv
// Both key schedules keep their round count as the 32-bit word at
// offset 240.
// NOTE(review): presumably (in, out, length, key1, key2, iv) in the C
// prototype -- confirm against the C declaration.
//
// Structure:
//   * length == 16: single-block path that uses only x0-x6 and
//     v0,v1,v6,v7,v16-v23,v28,v29, so nothing is saved on the stack.
//   * any other length: .Lxts_enc_big_size saves x19-x22 and d8-d11,
//     processes whole blocks five, three, two or one at a time, then
//     performs ciphertext stealing if length % 16 != 0.  Lengths below 16
//     return without producing output.
//
// Tweak handling in the general path: v6,v8,v9,v10,v11 hold up to five
// consecutive tweaks; x9 (low half) and x10 (high half) hold the most
// recently generated tweak in general registers; w19 = 0x87.  The
// recurring four-instruction group
//	extr x22,x10,x10,#32 / extr x10,x10,x9,#63 /
//	and w11,w19,w22,asr#31 / eor x9,x11,x9,lsl#1
// shifts the 128-bit value x10:x9 left by one bit and XORs 0x87 into the
// low byte when the bit shifted out was set, i.e. it doubles the tweak in
// GF(2^128).
1857aes_v8_xts_encrypt:
1858	cmp	x2,#16
1859	// Any length other than exactly 16 bytes takes the general path.
1860	b.ne	.Lxts_enc_big_size
1861	// Encrypt the iv with key2, as the first XEX iv.
1862	ldr	w6,[x4,#240]		// w6 = key2 round count
1863	ld1	{v0.4s},[x4],#16
1864	ld1	{v6.16b},[x5]
1865	sub	w6,w6,#2
1866	ld1	{v1.4s},[x4],#16
1867
1868.Loop_enc_iv_enc:
1869	aese	v6.16b,v0.16b
1870	aesmc	v6.16b,v6.16b
1871	ld1	{v0.4s},[x4],#16
1872	subs	w6,w6,#2
1873	aese	v6.16b,v1.16b
1874	aesmc	v6.16b,v6.16b
1875	ld1	{v1.4s},[x4],#16
1876	b.gt	.Loop_enc_iv_enc
1877
1878	aese	v6.16b,v0.16b
1879	aesmc	v6.16b,v6.16b
1880	ld1	{v0.4s},[x4]
1881	aese	v6.16b,v1.16b
1882	eor	v6.16b,v6.16b,v0.16b	// v6 = tweak for the only block
1883
1884	ld1	{v0.16b},[x0]
1885	eor	v0.16b,v6.16b,v0.16b	// xor the input block with the tweak
1886
1887	ldr	w6,[x3,#240]		// w6 = key1 round count
1888	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...
1889
1890	aese	v0.16b,v28.16b
1891	aesmc	v0.16b,v0.16b
1892	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
1893	aese	v0.16b,v29.16b
1894	aesmc	v0.16b,v0.16b
1895	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
1896	b.eq	.Lxts_128_enc
1897.Lxts_enc_round_loop:
	// Extra rounds for key schedules with more than 10 rounds.
1898	aese	v0.16b,v16.16b
1899	aesmc	v0.16b,v0.16b
1900	ld1	{v16.4s},[x3],#16		// load key schedule...
1901	aese	v0.16b,v17.16b
1902	aesmc	v0.16b,v0.16b
1903	ld1	{v17.4s},[x3],#16		// load key schedule...
1904	subs	w6,w6,#2		// two more rounds consumed
1905	b.gt	.Lxts_enc_round_loop
1906.Lxts_128_enc:
1907	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
1908	aese	v0.16b,v16.16b
1909	aesmc	v0.16b,v0.16b
1910	aese	v0.16b,v17.16b
1911	aesmc	v0.16b,v0.16b
1912	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
1913	aese	v0.16b,v18.16b
1914	aesmc	v0.16b,v0.16b
1915	aese	v0.16b,v19.16b
1916	aesmc	v0.16b,v0.16b
1917	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
1918	aese	v0.16b,v20.16b
1919	aesmc	v0.16b,v0.16b
1920	aese	v0.16b,v21.16b
1921	aesmc	v0.16b,v0.16b
1922	ld1	{v7.4s},[x3]		// v7 = last round key
1923	aese	v0.16b,v22.16b
1924	aesmc	v0.16b,v0.16b
1925	aese	v0.16b,v23.16b
1926	eor	v0.16b,v0.16b,v7.16b
1927	eor	v0.16b,v0.16b,v6.16b	// xor with the tweak again after encryption
1928	st1	{v0.16b},[x1]
1929	b	.Lxts_enc_final_abort	// nothing was pushed on this path
1930
1931.align	4
1932.Lxts_enc_big_size:
1933	stp	x19,x20,[sp,#-64]!
1934	stp	x21,x22,[sp,#48]
1935	stp	d8,d9,[sp,#32]
1936	stp	d10,d11,[sp,#16]
1937
1938	// x21 (tailcnt) = length % 16, the size of the trailing partial block.
1939	and	x21,x2,#0xf
1940	and	x2,x2,#-16
1941	subs	x2,x2,#16
1942	mov	x8,#16
1943	b.lo	.Lxts_abort		// less than one full block: nothing to do
1944	csel	x8,xzr,x8,eq		// first-load stride: 0 if exactly one full block, else 16
1945
1946	// Firstly, encrypt the iv with key2, as the first iv of XEX.
1947	ldr	w6,[x4,#240]		// w6 = key2 round count
1948	ld1	{v0.4s},[x4],#16
1949	ld1	{v6.16b},[x5]
1950	sub	w6,w6,#2
1951	ld1	{v1.4s},[x4],#16
1952
1953.Loop_iv_enc:
1954	aese	v6.16b,v0.16b
1955	aesmc	v6.16b,v6.16b
1956	ld1	{v0.4s},[x4],#16
1957	subs	w6,w6,#2
1958	aese	v6.16b,v1.16b
1959	aesmc	v6.16b,v6.16b
1960	ld1	{v1.4s},[x4],#16
1961	b.gt	.Loop_iv_enc
1962
1963	aese	v6.16b,v0.16b
1964	aesmc	v6.16b,v6.16b
1965	ld1	{v0.4s},[x4]
1966	aese	v6.16b,v1.16b
1967	eor	v6.16b,v6.16b,v0.16b	// v6 = first tweak
1968
1969	// The iv for second block
1970	// x9- iv(low), x10 - iv(high)
1971	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
1972	fmov	x9,d6
1973	fmov	x10,v6.d[1]
1974	mov	w19,#0x87
1975	extr	x22,x10,x10,#32
1976	extr	x10,x10,x9,#63
1977	and	w11,w19,w22,asr#31
1978	eor	x9,x11,x9,lsl#1
1979	fmov	d8,x9
1980	fmov	v8.d[1],x10
1981
1982	ldr	w5,[x3,#240]		// w5 = key1 round count
1983	ld1	{v0.16b},[x0],x8
1984
1985	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
1986	sub	w5,w5,#6
1987	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
1988	sub	w5,w5,#2		// w5 = rounds-8
1989	ld1	{v18.4s,v19.4s},[x7],#32
1990	ld1	{v20.4s,v21.4s},[x7],#32
1991	ld1	{v22.4s,v23.4s},[x7],#32
1992	ld1	{v7.4s},[x7]		// v7 = last round key
1993
1994	add	x7,x3,#32		// x7 -> third round key
1995	mov	w6,w5
1996
1997	// Encryption
1998.Lxts_enc:
1999	ld1	{v24.16b},[x0],#16
2000	subs	x2,x2,#32			// bias
2001	add	w6,w5,#2
2002	orr	v3.16b,v0.16b,v0.16b
2003	orr	v1.16b,v0.16b,v0.16b
2004	orr	v28.16b,v0.16b,v0.16b
2005	orr	v27.16b,v24.16b,v24.16b
2006	orr	v29.16b,v24.16b,v24.16b
2007	b.lo	.Lxts_inner_enc_tail	// fewer than three full blocks (flags from subs above)
2008	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
2009	eor	v24.16b,v24.16b,v8.16b
2010
2011	// The iv for third block
2012	extr	x22,x10,x10,#32
2013	extr	x10,x10,x9,#63
2014	and	w11,w19,w22,asr#31
2015	eor	x9,x11,x9,lsl#1
2016	fmov	d9,x9
2017	fmov	v9.d[1],x10
2018
2019
2020	orr	v1.16b,v24.16b,v24.16b
2021	ld1	{v24.16b},[x0],#16
2022	orr	v2.16b,v0.16b,v0.16b
2023	orr	v3.16b,v1.16b,v1.16b
2024	eor	v27.16b,v24.16b,v9.16b 		// the third block
2025	eor	v24.16b,v24.16b,v9.16b
2026	cmp	x2,#32
2027	b.lo	.Lxts_outer_enc_tail	// fewer than five full blocks: 3-block path
2028
2029	// The iv for fourth block
2030	extr	x22,x10,x10,#32
2031	extr	x10,x10,x9,#63
2032	and	w11,w19,w22,asr#31
2033	eor	x9,x11,x9,lsl#1
2034	fmov	d10,x9
2035	fmov	v10.d[1],x10
2036
2037	ld1	{v25.16b},[x0],#16
2038	// The iv for fifth block
2039	extr	x22,x10,x10,#32
2040	extr	x10,x10,x9,#63
2041	and	w11,w19,w22,asr#31
2042	eor	x9,x11,x9,lsl#1
2043	fmov	d11,x9
2044	fmov	v11.d[1],x10
2045
2046	ld1	{v26.16b},[x0],#16
2047	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2048	eor	v26.16b,v26.16b,v11.16b
2049	sub	x2,x2,#32			// bias
2050	mov	w6,w5
2051	b	.Loop5x_xts_enc
2052
2053.align	4
2054.Loop5x_xts_enc:
	// Two rounds per iteration on five blocks; w6 counts down from w5.
2055	aese	v0.16b,v16.16b
2056	aesmc	v0.16b,v0.16b
2057	aese	v1.16b,v16.16b
2058	aesmc	v1.16b,v1.16b
2059	aese	v24.16b,v16.16b
2060	aesmc	v24.16b,v24.16b
2061	aese	v25.16b,v16.16b
2062	aesmc	v25.16b,v25.16b
2063	aese	v26.16b,v16.16b
2064	aesmc	v26.16b,v26.16b
2065	ld1	{v16.4s},[x7],#16
2066	subs	w6,w6,#2
2067	aese	v0.16b,v17.16b
2068	aesmc	v0.16b,v0.16b
2069	aese	v1.16b,v17.16b
2070	aesmc	v1.16b,v1.16b
2071	aese	v24.16b,v17.16b
2072	aesmc	v24.16b,v24.16b
2073	aese	v25.16b,v17.16b
2074	aesmc	v25.16b,v25.16b
2075	aese	v26.16b,v17.16b
2076	aesmc	v26.16b,v26.16b
2077	ld1	{v17.4s},[x7],#16
2078	b.gt	.Loop5x_xts_enc
2079
2080	aese	v0.16b,v16.16b
2081	aesmc	v0.16b,v0.16b
2082	aese	v1.16b,v16.16b
2083	aesmc	v1.16b,v1.16b
2084	aese	v24.16b,v16.16b
2085	aesmc	v24.16b,v24.16b
2086	aese	v25.16b,v16.16b
2087	aesmc	v25.16b,v25.16b
2088	aese	v26.16b,v16.16b
2089	aesmc	v26.16b,v26.16b
2090	subs	x2,x2,#0x50			// because .Lxts_enc_tail4x
	// The flags set here are consumed by the csel below and by the
	// "b.hs .Loop5x_xts_enc" at the end of the loop body; no instruction
	// in between sets flags.
2091
2092	aese	v0.16b,v17.16b
2093	aesmc	v0.16b,v0.16b
2094	aese	v1.16b,v17.16b
2095	aesmc	v1.16b,v1.16b
2096	aese	v24.16b,v17.16b
2097	aesmc	v24.16b,v24.16b
2098	aese	v25.16b,v17.16b
2099	aesmc	v25.16b,v25.16b
2100	aese	v26.16b,v17.16b
2101	aesmc	v26.16b,v26.16b
2102	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2103	mov	x7,x3			// rewind the round-key pointer
2104
2105	aese	v0.16b,v18.16b
2106	aesmc	v0.16b,v0.16b
2107	aese	v1.16b,v18.16b
2108	aesmc	v1.16b,v1.16b
2109	aese	v24.16b,v18.16b
2110	aesmc	v24.16b,v24.16b
2111	aese	v25.16b,v18.16b
2112	aesmc	v25.16b,v25.16b
2113	aese	v26.16b,v18.16b
2114	aesmc	v26.16b,v26.16b
2115	add	x0,x0,x6		// x0 is adjusted in such way that
2116						// at exit from the loop v1.16b-v26.16b
2117						// are loaded with last "words"
2118	add	x6,x2,#0x60		// because .Lxts_enc_tail4x
2119
2120	aese	v0.16b,v19.16b
2121	aesmc	v0.16b,v0.16b
2122	aese	v1.16b,v19.16b
2123	aesmc	v1.16b,v1.16b
2124	aese	v24.16b,v19.16b
2125	aesmc	v24.16b,v24.16b
2126	aese	v25.16b,v19.16b
2127	aesmc	v25.16b,v25.16b
2128	aese	v26.16b,v19.16b
2129	aesmc	v26.16b,v26.16b
2130
2131	aese	v0.16b,v20.16b
2132	aesmc	v0.16b,v0.16b
2133	aese	v1.16b,v20.16b
2134	aesmc	v1.16b,v1.16b
2135	aese	v24.16b,v20.16b
2136	aesmc	v24.16b,v24.16b
2137	aese	v25.16b,v20.16b
2138	aesmc	v25.16b,v25.16b
2139	aese	v26.16b,v20.16b
2140	aesmc	v26.16b,v26.16b
2141
2142	aese	v0.16b,v21.16b
2143	aesmc	v0.16b,v0.16b
2144	aese	v1.16b,v21.16b
2145	aesmc	v1.16b,v1.16b
2146	aese	v24.16b,v21.16b
2147	aesmc	v24.16b,v24.16b
2148	aese	v25.16b,v21.16b
2149	aesmc	v25.16b,v25.16b
2150	aese	v26.16b,v21.16b
2151	aesmc	v26.16b,v26.16b
2152
2153	aese	v0.16b,v22.16b
2154	aesmc	v0.16b,v0.16b
2155	aese	v1.16b,v22.16b
2156	aesmc	v1.16b,v1.16b
2157	aese	v24.16b,v22.16b
2158	aesmc	v24.16b,v24.16b
2159	aese	v25.16b,v22.16b
2160	aesmc	v25.16b,v25.16b
2161	aese	v26.16b,v22.16b
2162	aesmc	v26.16b,v26.16b
2163
	// Last round, interleaved with: folding the last round key v7 into each
	// current tweak (v4,v5,v17,v30,v31), generating the next five tweaks,
	// and loading the next five input blocks.
2164	eor	v4.16b,v7.16b,v6.16b
2165	aese	v0.16b,v23.16b
2166	// The iv for first block of one iteration
2167	extr	x22,x10,x10,#32
2168	extr	x10,x10,x9,#63
2169	and	w11,w19,w22,asr#31
2170	eor	x9,x11,x9,lsl#1
2171	fmov	d6,x9
2172	fmov	v6.d[1],x10
2173	eor	v5.16b,v7.16b,v8.16b
2174	ld1	{v2.16b},[x0],#16
2175	aese	v1.16b,v23.16b
2176	// The iv for second block
2177	extr	x22,x10,x10,#32
2178	extr	x10,x10,x9,#63
2179	and	w11,w19,w22,asr#31
2180	eor	x9,x11,x9,lsl#1
2181	fmov	d8,x9
2182	fmov	v8.d[1],x10
2183	eor	v17.16b,v7.16b,v9.16b
2184	ld1	{v3.16b},[x0],#16
2185	aese	v24.16b,v23.16b
2186	// The iv for third block
2187	extr	x22,x10,x10,#32
2188	extr	x10,x10,x9,#63
2189	and	w11,w19,w22,asr#31
2190	eor	x9,x11,x9,lsl#1
2191	fmov	d9,x9
2192	fmov	v9.d[1],x10
2193	eor	v30.16b,v7.16b,v10.16b
2194	ld1	{v27.16b},[x0],#16
2195	aese	v25.16b,v23.16b
2196	// The iv for fourth block
2197	extr	x22,x10,x10,#32
2198	extr	x10,x10,x9,#63
2199	and	w11,w19,w22,asr#31
2200	eor	x9,x11,x9,lsl#1
2201	fmov	d10,x9
2202	fmov	v10.d[1],x10
2203	eor	v31.16b,v7.16b,v11.16b
2204	ld1	{v28.16b},[x0],#16
2205	aese	v26.16b,v23.16b
2206
2207	// The iv for fifth block
2208	extr	x22,x10,x10,#32
2209	extr	x10,x10,x9,#63
2210	and	w11,w19,w22,asr #31
2211	eor	x9,x11,x9,lsl #1
2212	fmov	d11,x9
2213	fmov	v11.d[1],x10
2214
2215	ld1	{v29.16b},[x0],#16
2216	cbz	x6,.Lxts_enc_tail4x
2217	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2218	eor	v4.16b,v4.16b,v0.16b
2219	eor	v0.16b,v2.16b,v6.16b
2220	eor	v5.16b,v5.16b,v1.16b
2221	eor	v1.16b,v3.16b,v8.16b
2222	eor	v17.16b,v17.16b,v24.16b
2223	eor	v24.16b,v27.16b,v9.16b
2224	eor	v30.16b,v30.16b,v25.16b
2225	eor	v25.16b,v28.16b,v10.16b
2226	eor	v31.16b,v31.16b,v26.16b
2227	st1	{v4.16b},[x1],#16
2228	eor	v26.16b,v29.16b,v11.16b
2229	st1	{v5.16b},[x1],#16
2230	mov	w6,w5
2231	st1	{v17.16b},[x1],#16
2232	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2233	st1	{v30.16b},[x1],#16
2234	st1	{v31.16b},[x1],#16
2235	b.hs	.Loop5x_xts_enc		// flags from "subs x2,x2,#0x50" above
2236
2237
2238	// If left 4 blocks, borrow the five block's processing.
2239	cmn	x2,#0x10
2240	b.ne	.Loop5x_enc_after
	// Shift the tweak set down by one slot (v11<-v10<-v9<-v8<-v6) and
	// resynchronise x10:x9 with v11 before re-entering the 5-way loop.
2241	orr	v11.16b,v10.16b,v10.16b
2242	orr	v10.16b,v9.16b,v9.16b
2243	orr	v9.16b,v8.16b,v8.16b
2244	orr	v8.16b,v6.16b,v6.16b
2245	fmov	x9,d11
2246	fmov	x10,v11.d[1]
2247	eor	v0.16b,v6.16b,v2.16b
2248	eor	v1.16b,v8.16b,v3.16b
2249	eor	v24.16b,v27.16b,v9.16b
2250	eor	v25.16b,v28.16b,v10.16b
2251	eor	v26.16b,v29.16b,v11.16b
2252	b.eq	.Loop5x_xts_enc		// always taken: flags still come from the cmn above
2253
2254.Loop5x_enc_after:
2255	add	x2,x2,#0x50
2256	cbz	x2,.Lxts_enc_done
2257
2258	add	w6,w5,#2
2259	subs	x2,x2,#0x30
2260	b.lo	.Lxts_inner_enc_tail
2261
2262	eor	v0.16b,v6.16b,v27.16b
2263	eor	v1.16b,v8.16b,v28.16b
2264	eor	v24.16b,v29.16b,v9.16b
2265	b	.Lxts_outer_enc_tail
2266
2267.align	4
2268.Lxts_enc_tail4x:
	// Stores the results of v1, v24, v25 and v26 only; the result in v0 is
	// not stored on this path.
2269	add	x0,x0,#16
2270	eor	v5.16b,v1.16b,v5.16b
2271	st1	{v5.16b},[x1],#16
2272	eor	v17.16b,v24.16b,v17.16b
2273	st1	{v17.16b},[x1],#16
2274	eor	v30.16b,v25.16b,v30.16b
2275	eor	v31.16b,v26.16b,v31.16b
2276	st1	{v30.16b,v31.16b},[x1],#32
2277
2278	b	.Lxts_enc_done
2279.align	4
2280.Lxts_outer_enc_tail:
	// Three-block path: v0, v1 and v24 with tweaks v6, v8 and v9.
2281	aese	v0.16b,v16.16b
2282	aesmc	v0.16b,v0.16b
2283	aese	v1.16b,v16.16b
2284	aesmc	v1.16b,v1.16b
2285	aese	v24.16b,v16.16b
2286	aesmc	v24.16b,v24.16b
2287	ld1	{v16.4s},[x7],#16
2288	subs	w6,w6,#2
2289	aese	v0.16b,v17.16b
2290	aesmc	v0.16b,v0.16b
2291	aese	v1.16b,v17.16b
2292	aesmc	v1.16b,v1.16b
2293	aese	v24.16b,v17.16b
2294	aesmc	v24.16b,v24.16b
2295	ld1	{v17.4s},[x7],#16
2296	b.gt	.Lxts_outer_enc_tail
2297
2298	aese	v0.16b,v16.16b
2299	aesmc	v0.16b,v0.16b
2300	aese	v1.16b,v16.16b
2301	aesmc	v1.16b,v1.16b
2302	aese	v24.16b,v16.16b
2303	aesmc	v24.16b,v24.16b
2304	eor	v4.16b,v6.16b,v7.16b
2305	subs	x2,x2,#0x30
2306	// The iv for first block
2307	fmov	x9,d9
2308	fmov	x10,v9.d[1]
2309	//mov	w19,#0x87
2310	extr	x22,x10,x10,#32
2311	extr	x10,x10,x9,#63
2312	and	w11,w19,w22,asr#31
2313	eor	x9,x11,x9,lsl#1
2314	fmov	d6,x9
2315	fmov	v6.d[1],x10
2316	eor	v5.16b,v8.16b,v7.16b
2317	csel	x6,x2,x6,lo       // x6, w6, is zero at this point
2318	aese	v0.16b,v17.16b
2319	aesmc	v0.16b,v0.16b
2320	aese	v1.16b,v17.16b
2321	aesmc	v1.16b,v1.16b
2322	aese	v24.16b,v17.16b
2323	aesmc	v24.16b,v24.16b
2324	eor	v17.16b,v9.16b,v7.16b	// v17 reused as scratch: third tweak xor last round key
2325
2326	add	x6,x6,#0x20
2327	add	x0,x0,x6
2328	mov	x7,x3			// rewind the round-key pointer
2329
2330	aese	v0.16b,v20.16b
2331	aesmc	v0.16b,v0.16b
2332	aese	v1.16b,v20.16b
2333	aesmc	v1.16b,v1.16b
2334	aese	v24.16b,v20.16b
2335	aesmc	v24.16b,v24.16b
2336	aese	v0.16b,v21.16b
2337	aesmc	v0.16b,v0.16b
2338	aese	v1.16b,v21.16b
2339	aesmc	v1.16b,v1.16b
2340	aese	v24.16b,v21.16b
2341	aesmc	v24.16b,v24.16b
2342	aese	v0.16b,v22.16b
2343	aesmc	v0.16b,v0.16b
2344	aese	v1.16b,v22.16b
2345	aesmc	v1.16b,v1.16b
2346	aese	v24.16b,v22.16b
2347	aesmc	v24.16b,v24.16b
2348	aese	v0.16b,v23.16b
2349	aese	v1.16b,v23.16b
2350	aese	v24.16b,v23.16b
2351	ld1	{v27.16b},[x0],#16
2352	add	w6,w5,#2
2353	ld1	{v16.4s},[x7],#16                // re-pre-load rndkey[0]
2354	eor	v4.16b,v4.16b,v0.16b
2355	eor	v5.16b,v5.16b,v1.16b
2356	eor	v24.16b,v24.16b,v17.16b
2357	ld1	{v17.4s},[x7],#16                // re-pre-load rndkey[1]
2358	st1	{v4.16b},[x1],#16
2359	st1	{v5.16b},[x1],#16
2360	st1	{v24.16b},[x1],#16
2361	cmn	x2,#0x30
2362	b.eq	.Lxts_enc_done
2363.Lxts_encxor_one:
2364	orr	v28.16b,v3.16b,v3.16b
2365	orr	v29.16b,v27.16b,v27.16b
2366	nop
2367
2368.Lxts_inner_enc_tail:
	// One- or two-block path.  Two states (v1, v24) are always run through
	// the cipher; the "cmn x2,#0x20" further down selects whether both
	// results or only the one in v24 are stored.
2369	cmn	x2,#0x10
2370	eor	v1.16b,v28.16b,v6.16b
2371	eor	v24.16b,v29.16b,v8.16b
2372	b.eq	.Lxts_enc_tail_loop
2373	eor	v24.16b,v29.16b,v6.16b
2374.Lxts_enc_tail_loop:
2375	aese	v1.16b,v16.16b
2376	aesmc	v1.16b,v1.16b
2377	aese	v24.16b,v16.16b
2378	aesmc	v24.16b,v24.16b
2379	ld1	{v16.4s},[x7],#16
2380	subs	w6,w6,#2
2381	aese	v1.16b,v17.16b
2382	aesmc	v1.16b,v1.16b
2383	aese	v24.16b,v17.16b
2384	aesmc	v24.16b,v24.16b
2385	ld1	{v17.4s},[x7],#16
2386	b.gt	.Lxts_enc_tail_loop
2387
2388	aese	v1.16b,v16.16b
2389	aesmc	v1.16b,v1.16b
2390	aese	v24.16b,v16.16b
2391	aesmc	v24.16b,v24.16b
2392	aese	v1.16b,v17.16b
2393	aesmc	v1.16b,v1.16b
2394	aese	v24.16b,v17.16b
2395	aesmc	v24.16b,v24.16b
2396	aese	v1.16b,v20.16b
2397	aesmc	v1.16b,v1.16b
2398	aese	v24.16b,v20.16b
2399	aesmc	v24.16b,v24.16b
2400	cmn	x2,#0x20		// flags are consumed by "b.eq .Lxts_enc_one" below
2401	aese	v1.16b,v21.16b
2402	aesmc	v1.16b,v1.16b
2403	aese	v24.16b,v21.16b
2404	aesmc	v24.16b,v24.16b
2405	eor	v5.16b,v6.16b,v7.16b
2406	aese	v1.16b,v22.16b
2407	aesmc	v1.16b,v1.16b
2408	aese	v24.16b,v22.16b
2409	aesmc	v24.16b,v24.16b
2410	eor	v17.16b,v8.16b,v7.16b
2411	aese	v1.16b,v23.16b
2412	aese	v24.16b,v23.16b
2413	b.eq	.Lxts_enc_one
	// Two blocks: store both, then derive the following tweak from v8
	// into v6 for a possible stolen tail.
2414	eor	v5.16b,v5.16b,v1.16b
2415	st1	{v5.16b},[x1],#16
2416	eor	v17.16b,v17.16b,v24.16b
2417	orr	v6.16b,v8.16b,v8.16b
2418	st1	{v17.16b},[x1],#16
2419	fmov	x9,d8
2420	fmov	x10,v8.d[1]
2421	mov	w19,#0x87
2422	extr	x22,x10,x10,#32
2423	extr	x10,x10,x9,#63
2424	and	w11,w19,w22,asr #31
2425	eor	x9,x11,x9,lsl #1
2426	fmov	d6,x9
2427	fmov	v6.d[1],x10
2428	b	.Lxts_enc_done
2429
2430.Lxts_enc_one:
	// One block: store the result from v24, then advance the tweak in v6.
2431	eor	v5.16b,v5.16b,v24.16b
2432	orr	v6.16b,v6.16b,v6.16b
2433	st1	{v5.16b},[x1],#16
2434	fmov	x9,d6
2435	fmov	x10,v6.d[1]
2436	mov	w19,#0x87
2437	extr	x22,x10,x10,#32
2438	extr	x10,x10,x9,#63
2439	and	w11,w19,w22,asr #31
2440	eor	x9,x11,x9,lsl #1
2441	fmov	d6,x9
2442	fmov	v6.d[1],x10
2443	b	.Lxts_enc_done
2444.align	5
2445.Lxts_enc_done:
2446	// Process the tail block with cipher stealing.
2447	tst	x21,#0xf
2448	b.eq	.Lxts_abort		// length was a multiple of 16: no stealing needed
2449
	// x20 = current input position, x13 = current output position,
	// x1  = start of the last full output block already written.
	// For i = x21-1 down to 0: the output byte x1[i] is copied to x13[i]
	// and replaced by the input byte x20[i].
2450	mov	x20,x0
2451	mov	x13,x1
2452	sub	x1,x1,#16
2453.composite_enc_loop:
2454	subs	x21,x21,#1
2455	ldrb	w15,[x1,x21]
2456	ldrb	w14,[x20,x21]
2457	strb	w15,[x13,x21]
2458	strb	w14,[x1,x21]
2459	b.gt	.composite_enc_loop	// flags from the subs at the top of the loop
2460.Lxts_enc_load_done:
2461	ld1	{v26.16b},[x1]
2462	eor	v26.16b,v26.16b,v6.16b
2463
2464	// Encrypt the composite block to get the second-to-last ciphertext block
2465	ldr	w6,[x3,#240]		// w6 = key1 round count
2466	ld1	{v0.4s},[x3],#16
2467	sub	w6,w6,#2
2468	ld1	{v1.4s},[x3],#16		// load key schedule...
2469.Loop_final_enc:
2470	aese	v26.16b,v0.16b
2471	aesmc	v26.16b,v26.16b
2472	ld1	{v0.4s},[x3],#16
2473	subs	w6,w6,#2
2474	aese	v26.16b,v1.16b
2475	aesmc	v26.16b,v26.16b
2476	ld1	{v1.4s},[x3],#16
2477	b.gt	.Loop_final_enc
2478
2479	aese	v26.16b,v0.16b
2480	aesmc	v26.16b,v26.16b
2481	ld1	{v0.4s},[x3]
2482	aese	v26.16b,v1.16b
2483	eor	v26.16b,v26.16b,v0.16b
2484	eor	v26.16b,v26.16b,v6.16b
2485	st1	{v26.16b},[x1]
2486
2487.Lxts_abort:
	// Restore the registers saved at .Lxts_enc_big_size.
2488	ldp	x21,x22,[sp,#48]
2489	ldp	d8,d9,[sp,#32]
2490	ldp	d10,d11,[sp,#16]
2491	ldp	x19,x20,[sp],#64
2492.Lxts_enc_final_abort:
2493	ret
2494.size	aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
2495.globl	aes_v8_xts_decrypt
2496.type	aes_v8_xts_decrypt,%function
2497.align	5
2498aes_v8_xts_decrypt:
2499	cmp	x2,#16
2500	// Original input data size bigger than 16, jump to big size processing.
2501	b.ne	.Lxts_dec_big_size
2502	// Encrypt the iv with key2, as the first XEX iv.
2503	ldr	w6,[x4,#240]
2504	ld1	{v0.4s},[x4],#16
2505	ld1	{v6.16b},[x5]
2506	sub	w6,w6,#2
2507	ld1	{v1.4s},[x4],#16
2508
2509.Loop_dec_small_iv_enc:
2510	aese	v6.16b,v0.16b
2511	aesmc	v6.16b,v6.16b
2512	ld1	{v0.4s},[x4],#16
2513	subs	w6,w6,#2
2514	aese	v6.16b,v1.16b
2515	aesmc	v6.16b,v6.16b
2516	ld1	{v1.4s},[x4],#16
2517	b.gt	.Loop_dec_small_iv_enc
2518
2519	aese	v6.16b,v0.16b
2520	aesmc	v6.16b,v6.16b
2521	ld1	{v0.4s},[x4]
2522	aese	v6.16b,v1.16b
2523	eor	v6.16b,v6.16b,v0.16b
2524
2525	ld1	{v0.16b},[x0]
2526	eor	v0.16b,v6.16b,v0.16b
2527
2528	ldr	w6,[x3,#240]
2529	ld1	{v28.4s,v29.4s},[x3],#32			// load key schedule...
2530
2531	aesd	v0.16b,v28.16b
2532	aesimc	v0.16b,v0.16b
2533	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
2534	aesd	v0.16b,v29.16b
2535	aesimc	v0.16b,v0.16b
2536	subs	w6,w6,#10			// bias
2537	b.eq	.Lxts_128_dec
2538.Lxts_dec_round_loop:
2539	aesd	v0.16b,v16.16b
2540	aesimc	v0.16b,v0.16b
2541	ld1	{v16.4s},[x3],#16			// load key schedule...
2542	aesd	v0.16b,v17.16b
2543	aesimc	v0.16b,v0.16b
2544	ld1	{v17.4s},[x3],#16			// load key schedule...
2545	subs	w6,w6,#2			// bias
2546	b.gt	.Lxts_dec_round_loop
2547.Lxts_128_dec:
2548	ld1	{v18.4s,v19.4s},[x3],#32			// load key schedule...
2549	aesd	v0.16b,v16.16b
2550	aesimc	v0.16b,v0.16b
2551	aesd	v0.16b,v17.16b
2552	aesimc	v0.16b,v0.16b
2553	ld1	{v20.4s,v21.4s},[x3],#32			// load key schedule...
2554	aesd	v0.16b,v18.16b
2555	aesimc	v0.16b,v0.16b
2556	aesd	v0.16b,v19.16b
2557	aesimc	v0.16b,v0.16b
2558	ld1	{v22.4s,v23.4s},[x3],#32			// load key schedule...
2559	aesd	v0.16b,v20.16b
2560	aesimc	v0.16b,v0.16b
2561	aesd	v0.16b,v21.16b
2562	aesimc	v0.16b,v0.16b
2563	ld1	{v7.4s},[x3]
2564	aesd	v0.16b,v22.16b
2565	aesimc	v0.16b,v0.16b
2566	aesd	v0.16b,v23.16b
2567	eor	v0.16b,v0.16b,v7.16b
2568	eor	v0.16b,v6.16b,v0.16b
2569	st1	{v0.16b},[x1]
2570	b	.Lxts_dec_final_abort
2571.Lxts_dec_big_size:	// multi-block path: push a 64-byte frame, derive the tweaks, preload round keys
	// Frame layout (restored in .Lxts_dec_abort):
	//   [sp,#0]  x19,x20   [sp,#16] d10,d11   [sp,#32] d8,d9   [sp,#48] x21,x22
2572	stp	x19,x20,[sp,#-64]!
2573	stp	x21,x22,[sp,#48]
2574	stp	d8,d9,[sp,#32]
2575	stp	d10,d11,[sp,#16]
2576
2577	and	x21,x2,#0xf			// x21 = x2 mod 16 (bytes beyond the last whole 16-byte block)
2578	and	x2,x2,#-16			// x2 rounded down to a multiple of 16
2579	subs	x2,x2,#16			// bias x2 by one block; borrow means no whole block at all
2580	mov	x8,#16			// x8 = post-increment step used by the first load in .Lxts_dec_begin
2581	b.lo	.Lxts_dec_abort
2582
2583	// Encrypt the iv with key2, as the first XEX iv
	// x4 = key2 schedule (round count at offset 240), x5 = iv; result ends up in v6.
2584	ldr	w6,[x4,#240]
2585	ld1	{v0.4s},[x4],#16
2586	ld1	{v6.16b},[x5]
2587	sub	w6,w6,#2
2588	ld1	{v1.4s},[x4],#16
2589
2590.Loop_dec_iv_enc:	// two encryption rounds per pass, round keys streamed through v0/v1
2591	aese	v6.16b,v0.16b
2592	aesmc	v6.16b,v6.16b
2593	ld1	{v0.4s},[x4],#16
2594	subs	w6,w6,#2
2595	aese	v6.16b,v1.16b
2596	aesmc	v6.16b,v6.16b
2597	ld1	{v1.4s},[x4],#16
2598	b.gt	.Loop_dec_iv_enc
2599
2600	aese	v6.16b,v0.16b
2601	aesmc	v6.16b,v6.16b
2602	ld1	{v0.4s},[x4]
2603	aese	v6.16b,v1.16b			// last round: no aesmc
2604	eor	v6.16b,v6.16b,v0.16b			// final round key; v6 = encrypted iv
2605
2606	// The iv for second block
2607	// x9- iv(low), x10 - iv(high)
2608	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	// Each update below shifts the 128-bit value x10:x9 left by one bit and
	// XORs 0x87 (held in w19) into the low half when the bit shifted out of
	// x10 was 1; x22/w11 are scratch for that test.
2609	fmov	x9,d6
2610	fmov	x10,v6.d[1]
2611	mov	w19,#0x87
2612	extr	x22,x10,x10,#32			// rotate x10 by 32 so w22 holds its upper word
2613	extr	x10,x10,x9,#63			// x10 = (x10<<1) | (x9>>63)
2614	and	w11,w19,w22,asr #31			// w11 = 0x87 if old bit 63 of x10 was set, else 0
2615	eor	x9,x11,x9,lsl #1			// x9 = (x9<<1) ^ w11
2616	fmov	d8,x9
2617	fmov	v8.d[1],x10
2618
2619	ldr	w5,[x3,#240]		// load rounds number
2620
2621	// The iv for third block
2622	extr	x22,x10,x10,#32
2623	extr	x10,x10,x9,#63
2624	and	w11,w19,w22,asr #31
2625	eor	x9,x11,x9,lsl #1
2626	fmov	d9,x9
2627	fmov	v9.d[1],x10
2628
2629	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
2630	sub	w5,w5,#6
2631	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
2632	sub	w5,w5,#2			// w5 = rounds-8, the loop counter seed used by the block loops
2633	ld1	{v18.4s,v19.4s},[x7],#32		// load key schedule...
2634	ld1	{v20.4s,v21.4s},[x7],#32
2635	ld1	{v22.4s,v23.4s},[x7],#32
2636	ld1	{v7.4s},[x7]			// v7 = last of those seven round keys
2637
2638	// The iv for fourth block
2639	extr	x22,x10,x10,#32
2640	extr	x10,x10,x9,#63
2641	and	w11,w19,w22,asr #31
2642	eor	x9,x11,x9,lsl #1
2643	fmov	d10,x9
2644	fmov	v10.d[1],x10
2645
2646	add	x7,x3,#32			// x7 -> third entry of the key schedule (v16/v17 hold the first two)
2647	mov	w6,w5
2648	b	.Lxts_dec
2649
2650	// Decryption: decide how many blocks to start with and pre-XOR them with their ivs
2651.align	5
2652.Lxts_dec:
2653	tst	x21,#0xf
2654	b.eq	.Lxts_dec_begin			// no partial trailing block: nothing to hold back
	// A partial trailing block exists: take one more 16-byte block out of x2
	// so that it is left for the cipher-stealing code at .Lxts_done.
2655	subs	x2,x2,#16
2656	csel	x8,xzr,x8,eq			// if x2 is now zero, the first load below must not advance x0
2657	ld1	{v0.16b},[x0],#16			// ld1 leaves NZCV alone; b.lo still sees the subs result
2658	b.lo	.Lxts_done			// x2 went negative: go to .Lxts_done with v0 already loaded
2659	sub	x0,x0,#16			// otherwise undo the post-increment of the speculative load
2660.Lxts_dec_begin:
2661	ld1	{v0.16b},[x0],x8			// first block; x0 advances by x8 (16, or 0 as selected above)
2662	subs	x2,x2,#32			// bias
2663	add	w6,w5,#2			// round counter for the tail paths (two more than the 5x loop uses)
2664	orr	v3.16b,v0.16b,v0.16b			// orr vN,vM,vM is a plain register copy
2665	orr	v1.16b,v0.16b,v0.16b
2666	orr	v28.16b,v0.16b,v0.16b
2667	ld1	{v24.16b},[x0],#16			// second block
2668	orr	v27.16b,v24.16b,v24.16b
2669	orr	v29.16b,v24.16b,v24.16b
2670	b.lo	.Lxts_inner_dec_tail			// x2 negative after the bias: handled by the 1-2 block tail
2671	eor	v0.16b,v0.16b,v6.16b			// before decrypt, xor with iv
2672	eor	v24.16b,v24.16b,v8.16b
2673
2674	orr	v1.16b,v24.16b,v24.16b
2675	ld1	{v24.16b},[x0],#16			// third block
2676	orr	v2.16b,v0.16b,v0.16b
2677	orr	v3.16b,v1.16b,v1.16b
2678	eor	v27.16b,v24.16b,v9.16b			// third block xor with third iv
2679	eor	v24.16b,v24.16b,v9.16b
2680	cmp	x2,#32
2681	b.lo	.Lxts_outer_dec_tail			// fewer than 32 bytes beyond these three: use the 3-block tail
2682
2683	ld1	{v25.16b},[x0],#16			// fourth block
2684
2685	// The iv for fifth block
2686	extr	x22,x10,x10,#32
2687	extr	x10,x10,x9,#63
2688	and	w11,w19,w22,asr #31
2689	eor	x9,x11,x9,lsl #1
2690	fmov	d11,x9
2691	fmov	v11.d[1],x10
2692
2693	ld1	{v26.16b},[x0],#16			// fifth block
2694	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2695	eor	v26.16b,v26.16b,v11.16b
2696	sub	x2,x2,#32			// bias
2697	mov	w6,w5			// round counter for the 5x loop
2698	b	.Loop5x_xts_dec
2699
2700.align	4
2701.Loop5x_xts_dec:	// five blocks in flight: v0, v1, v24, v25, v26
	// Inner loop: two rounds per pass with v16/v17, which are then refilled
	// from [x7]; w6 counts the remaining passes down by 2.
2702	aesd	v0.16b,v16.16b
2703	aesimc	v0.16b,v0.16b
2704	aesd	v1.16b,v16.16b
2705	aesimc	v1.16b,v1.16b
2706	aesd	v24.16b,v16.16b
2707	aesimc	v24.16b,v24.16b
2708	aesd	v25.16b,v16.16b
2709	aesimc	v25.16b,v25.16b
2710	aesd	v26.16b,v16.16b
2711	aesimc	v26.16b,v26.16b
2712	ld1	{v16.4s},[x7],#16		// load key schedule...
2713	subs	w6,w6,#2
2714	aesd	v0.16b,v17.16b
2715	aesimc	v0.16b,v0.16b
2716	aesd	v1.16b,v17.16b
2717	aesimc	v1.16b,v1.16b
2718	aesd	v24.16b,v17.16b
2719	aesimc	v24.16b,v24.16b
2720	aesd	v25.16b,v17.16b
2721	aesimc	v25.16b,v25.16b
2722	aesd	v26.16b,v17.16b
2723	aesimc	v26.16b,v26.16b
2724	ld1	{v17.4s},[x7],#16		// load key schedule...
2725	b.gt	.Loop5x_xts_dec
2726
	// Remaining rounds use v16, v17 and then the preloaded v18-v23, with
	// length and pointer bookkeeping interleaved between the rounds.
2727	aesd	v0.16b,v16.16b
2728	aesimc	v0.16b,v0.16b
2729	aesd	v1.16b,v16.16b
2730	aesimc	v1.16b,v1.16b
2731	aesd	v24.16b,v16.16b
2732	aesimc	v24.16b,v24.16b
2733	aesd	v25.16b,v16.16b
2734	aesimc	v25.16b,v25.16b
2735	aesd	v26.16b,v16.16b
2736	aesimc	v26.16b,v26.16b
	// The flags written here are consumed by the csel below and by the b.hs
	// at the bottom of the loop; no instruction in between writes NZCV.
2737	subs	x2,x2,#0x50			// because .Lxts_dec_tail4x
2738
2739	aesd	v0.16b,v17.16b
2740	aesimc	v0.16b,v0.16b
2741	aesd	v1.16b,v17.16b
2742	aesimc	v1.16b,v1.16b
2743	aesd	v24.16b,v17.16b
2744	aesimc	v24.16b,v24.16b
2745	aesd	v25.16b,v17.16b
2746	aesimc	v25.16b,v25.16b
2747	aesd	v26.16b,v17.16b
2748	aesimc	v26.16b,v26.16b
2749	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2750	mov	x7,x3			// rewind the round-key pointer to the start of the schedule
2751
2752	aesd	v0.16b,v18.16b
2753	aesimc	v0.16b,v0.16b
2754	aesd	v1.16b,v18.16b
2755	aesimc	v1.16b,v1.16b
2756	aesd	v24.16b,v18.16b
2757	aesimc	v24.16b,v24.16b
2758	aesd	v25.16b,v18.16b
2759	aesimc	v25.16b,v25.16b
2760	aesd	v26.16b,v18.16b
2761	aesimc	v26.16b,v26.16b
2762	add	x0,x0,x6		// x0 is adjusted in such way that
2763						// at exit from the loop v1.16b-v26.16b
2764						// are loaded with last "words"
2765	add	x6,x2,#0x60		// because .Lxts_dec_tail4x
2766
2767	aesd	v0.16b,v19.16b
2768	aesimc	v0.16b,v0.16b
2769	aesd	v1.16b,v19.16b
2770	aesimc	v1.16b,v1.16b
2771	aesd	v24.16b,v19.16b
2772	aesimc	v24.16b,v24.16b
2773	aesd	v25.16b,v19.16b
2774	aesimc	v25.16b,v25.16b
2775	aesd	v26.16b,v19.16b
2776	aesimc	v26.16b,v26.16b
2777
2778	aesd	v0.16b,v20.16b
2779	aesimc	v0.16b,v0.16b
2780	aesd	v1.16b,v20.16b
2781	aesimc	v1.16b,v1.16b
2782	aesd	v24.16b,v20.16b
2783	aesimc	v24.16b,v24.16b
2784	aesd	v25.16b,v20.16b
2785	aesimc	v25.16b,v25.16b
2786	aesd	v26.16b,v20.16b
2787	aesimc	v26.16b,v26.16b
2788
2789	aesd	v0.16b,v21.16b
2790	aesimc	v0.16b,v0.16b
2791	aesd	v1.16b,v21.16b
2792	aesimc	v1.16b,v1.16b
2793	aesd	v24.16b,v21.16b
2794	aesimc	v24.16b,v24.16b
2795	aesd	v25.16b,v21.16b
2796	aesimc	v25.16b,v25.16b
2797	aesd	v26.16b,v21.16b
2798	aesimc	v26.16b,v26.16b
2799
2800	aesd	v0.16b,v22.16b
2801	aesimc	v0.16b,v0.16b
2802	aesd	v1.16b,v22.16b
2803	aesimc	v1.16b,v1.16b
2804	aesd	v24.16b,v22.16b
2805	aesimc	v24.16b,v24.16b
2806	aesd	v25.16b,v22.16b
2807	aesimc	v25.16b,v25.16b
2808	aesd	v26.16b,v22.16b
2809	aesimc	v26.16b,v26.16b
2810
	// Last round (v23, no aesimc). For each block, the final round key v7 is
	// folded together with that block's iv into an output mask (v4, v5, v17,
	// v30, v31) before the iv register is overwritten with the next value;
	// the next five input blocks are loaded into v2, v3, v27, v28, v29.
2811	eor	v4.16b,v7.16b,v6.16b
2812	aesd	v0.16b,v23.16b
2813	// The iv for first block of next iteration.
2814	extr	x22,x10,x10,#32
2815	extr	x10,x10,x9,#63
2816	and	w11,w19,w22,asr #31
2817	eor	x9,x11,x9,lsl #1
2818	fmov	d6,x9
2819	fmov	v6.d[1],x10
2820	eor	v5.16b,v7.16b,v8.16b
2821	ld1	{v2.16b},[x0],#16
2822	aesd	v1.16b,v23.16b
2823	// The iv for second block
2824	extr	x22,x10,x10,#32
2825	extr	x10,x10,x9,#63
2826	and	w11,w19,w22,asr #31
2827	eor	x9,x11,x9,lsl #1
2828	fmov	d8,x9
2829	fmov	v8.d[1],x10
2830	eor	v17.16b,v7.16b,v9.16b
2831	ld1	{v3.16b},[x0],#16
2832	aesd	v24.16b,v23.16b
2833	// The iv for third block
2834	extr	x22,x10,x10,#32
2835	extr	x10,x10,x9,#63
2836	and	w11,w19,w22,asr #31
2837	eor	x9,x11,x9,lsl #1
2838	fmov	d9,x9
2839	fmov	v9.d[1],x10
2840	eor	v30.16b,v7.16b,v10.16b
2841	ld1	{v27.16b},[x0],#16
2842	aesd	v25.16b,v23.16b
2843	// The iv for fourth block
2844	extr	x22,x10,x10,#32
2845	extr	x10,x10,x9,#63
2846	and	w11,w19,w22,asr #31
2847	eor	x9,x11,x9,lsl #1
2848	fmov	d10,x9
2849	fmov	v10.d[1],x10
2850	eor	v31.16b,v7.16b,v11.16b
2851	ld1	{v28.16b},[x0],#16
2852	aesd	v26.16b,v23.16b
2853
2854	// The iv for fifth block
2855	extr	x22,x10,x10,#32
2856	extr	x10,x10,x9,#63
2857	and	w11,w19,w22,asr #31
2858	eor	x9,x11,x9,lsl #1
2859	fmov	d11,x9
2860	fmov	v11.d[1],x10
2861
2862	ld1	{v29.16b},[x0],#16
2863	cbz	x6,.Lxts_dec_tail4x			// x6 = x2+0x60, so this fires when x2 == -0x60
2864	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	// Finish the five outputs (mask ^ last-round state) and, interleaved,
	// XOR the freshly loaded inputs with their new ivs for the next pass.
2865	eor	v4.16b,v4.16b,v0.16b
2866	eor	v0.16b,v2.16b,v6.16b
2867	eor	v5.16b,v5.16b,v1.16b
2868	eor	v1.16b,v3.16b,v8.16b
2869	eor	v17.16b,v17.16b,v24.16b
2870	eor	v24.16b,v27.16b,v9.16b
2871	eor	v30.16b,v30.16b,v25.16b
2872	eor	v25.16b,v28.16b,v10.16b
2873	eor	v31.16b,v31.16b,v26.16b
2874	st1	{v4.16b},[x1],#16
2875	eor	v26.16b,v29.16b,v11.16b
2876	st1	{v5.16b},[x1],#16
2877	mov	w6,w5			// reset the round counter for the next pass
2878	st1	{v17.16b},[x1],#16
2879	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2880	st1	{v30.16b},[x1],#16
2881	st1	{v31.16b},[x1],#16
2882	b.hs	.Loop5x_xts_dec			// no borrow from the subs x2,x2,#0x50 above: run another 5x pass
2883
2884	cmn	x2,#0x10
2885	b.ne	.Loop5x_dec_after
2886	// If x2(x2) equal to -0x10, the left blocks is 4.
2887	// After specially processing, utilize the five blocks processing again.
2888	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
	// The iv registers are shifted up by one (v11<-v10<-v9<-v8<-v6) and x10:x9
	// is reloaded from the new v11 so later iv updates continue from it.
2889	orr	v11.16b,v10.16b,v10.16b
2890	orr	v10.16b,v9.16b,v9.16b
2891	orr	v9.16b,v8.16b,v8.16b
2892	orr	v8.16b,v6.16b,v6.16b
2893	fmov	x9,d11
2894	fmov	x10,v11.d[1]
2895	eor	v0.16b,v6.16b,v2.16b
2896	eor	v1.16b,v8.16b,v3.16b
2897	eor	v24.16b,v27.16b,v9.16b
2898	eor	v25.16b,v28.16b,v10.16b
2899	eor	v26.16b,v29.16b,v11.16b
2900	b.eq	.Loop5x_xts_dec			// still the Z flag from cmn above (orr/fmov/eor do not write NZCV)
2901
2902.Loop5x_dec_after:	// fewer than four blocks remain after the 5x loop
2903	add	x2,x2,#0x50			// remove the 0x50 bias applied inside the loop
2904	cbz	x2,.Lxts_done
2905
2906	add	w6,w5,#2			// round counter for the tail paths
2907	subs	x2,x2,#0x30
2908	b.lo	.Lxts_inner_dec_tail			// x2 was below 0x30: 1-2 block tail (reads v28/v29)
2909
	// Otherwise send the last three loaded blocks (v27, v28, v29) through
	// the 3-block tail with ivs v6, v8, v9.
2910	eor	v0.16b,v6.16b,v27.16b
2911	eor	v1.16b,v8.16b,v28.16b
2912	eor	v24.16b,v29.16b,v9.16b
2913	b	.Lxts_outer_dec_tail
2914
2915.align	4
2916.Lxts_dec_tail4x:	// reached from the 5x loop via cbz x6, i.e. when x2 == -0x60 there
	// Stores four results: the last-round states v1, v24, v25, v26 XORed with
	// the masks v4, v17, v30, v31 built in the loop. v0's result is not stored.
2917	add	x0,x0,#16
2918	tst	x21,#0xf			// flags consumed by the b.eq below (eor/st1 leave NZCV untouched)
2919	eor	v5.16b,v1.16b,v4.16b
2920	st1	{v5.16b},[x1],#16
2921	eor	v17.16b,v24.16b,v17.16b
2922	st1	{v17.16b},[x1],#16
2923	eor	v30.16b,v25.16b,v30.16b
2924	eor	v31.16b,v26.16b,v31.16b
2925	st1	{v30.16b,v31.16b},[x1],#32
2926
2927	b.eq	.Lxts_dec_abort			// no partial trailing block: finished
2928	ld1	{v0.16b},[x0],#16			// otherwise load the next block into v0 for .Lxts_done
2929	b	.Lxts_done
2930.align	4
2931.Lxts_outer_dec_tail:	// three blocks in flight: v0, v1, v24 (already XORed with v6, v8, v9)
	// Two rounds per pass with v16/v17, refilled from [x7]; w6 counts down by 2.
2932	aesd	v0.16b,v16.16b
2933	aesimc	v0.16b,v0.16b
2934	aesd	v1.16b,v16.16b
2935	aesimc	v1.16b,v1.16b
2936	aesd	v24.16b,v16.16b
2937	aesimc	v24.16b,v24.16b
2938	ld1	{v16.4s},[x7],#16
2939	subs	w6,w6,#2
2940	aesd	v0.16b,v17.16b
2941	aesimc	v0.16b,v0.16b
2942	aesd	v1.16b,v17.16b
2943	aesimc	v1.16b,v1.16b
2944	aesd	v24.16b,v17.16b
2945	aesimc	v24.16b,v24.16b
2946	ld1	{v17.4s},[x7],#16
2947	b.gt	.Lxts_outer_dec_tail
2948
	// Remaining rounds: v16, v17, then v20-v23. Interleaved with them, each
	// output mask (iv ^ final round key v7) is captured in v4/v5/v17 before
	// v6/v8/v9 are overwritten with the next three iv values.
2949	aesd	v0.16b,v16.16b
2950	aesimc	v0.16b,v0.16b
2951	aesd	v1.16b,v16.16b
2952	aesimc	v1.16b,v1.16b
2953	aesd	v24.16b,v16.16b
2954	aesimc	v24.16b,v24.16b
2955	eor	v4.16b,v6.16b,v7.16b
2956	subs	x2,x2,#0x30			// flags consumed by the csel below
2957	// The iv for first block
	// Restart the running iv in x10:x9 from v9 (the last iv used here).
2958	fmov	x9,d9
2959	fmov	x10,v9.d[1]
2960	mov	w19,#0x87
2961	extr	x22,x10,x10,#32
2962	extr	x10,x10,x9,#63
2963	and	w11,w19,w22,asr #31
2964	eor	x9,x11,x9,lsl #1
2965	fmov	d6,x9
2966	fmov	v6.d[1],x10
2967	eor	v5.16b,v8.16b,v7.16b
2968	csel	x6,x2,x6,lo	// x6, w6, is zero at this point
2969	aesd	v0.16b,v17.16b
2970	aesimc	v0.16b,v0.16b
2971	aesd	v1.16b,v17.16b
2972	aesimc	v1.16b,v1.16b
2973	aesd	v24.16b,v17.16b
2974	aesimc	v24.16b,v24.16b
2975	eor	v17.16b,v9.16b,v7.16b			// v17 is free as a round key now; reuse it as the third mask
2976	// The iv for second block
2977	extr	x22,x10,x10,#32
2978	extr	x10,x10,x9,#63
2979	and	w11,w19,w22,asr #31
2980	eor	x9,x11,x9,lsl #1
2981	fmov	d8,x9
2982	fmov	v8.d[1],x10
2983
2984	add	x6,x6,#0x20
2985	add	x0,x0,x6		// x0 is adjusted to the last data
2986
2987	mov	x7,x3			// rewind the round-key pointer to the start of the schedule
2988
2989	// The iv for third block
2990	extr	x22,x10,x10,#32
2991	extr	x10,x10,x9,#63
2992	and	w11,w19,w22,asr #31
2993	eor	x9,x11,x9,lsl #1
2994	fmov	d9,x9
2995	fmov	v9.d[1],x10
2996
2997	aesd	v0.16b,v20.16b
2998	aesimc	v0.16b,v0.16b
2999	aesd	v1.16b,v20.16b
3000	aesimc	v1.16b,v1.16b
3001	aesd	v24.16b,v20.16b
3002	aesimc	v24.16b,v24.16b
3003	aesd	v0.16b,v21.16b
3004	aesimc	v0.16b,v0.16b
3005	aesd	v1.16b,v21.16b
3006	aesimc	v1.16b,v1.16b
3007	aesd	v24.16b,v21.16b
3008	aesimc	v24.16b,v24.16b
3009	aesd	v0.16b,v22.16b
3010	aesimc	v0.16b,v0.16b
3011	aesd	v1.16b,v22.16b
3012	aesimc	v1.16b,v1.16b
3013	aesd	v24.16b,v22.16b
3014	aesimc	v24.16b,v24.16b
3015	ld1	{v27.16b},[x0],#16			// next input block, kept in v27 for a possible further tail
3016	aesd	v0.16b,v23.16b			// last round: no aesimc
3017	aesd	v1.16b,v23.16b
3018	aesd	v24.16b,v23.16b
3019	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
3020	add	w6,w5,#2			// round counter for a following tail pass
3021	eor	v4.16b,v4.16b,v0.16b
3022	eor	v5.16b,v5.16b,v1.16b
3023	eor	v24.16b,v24.16b,v17.16b
3024	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
3025	st1	{v4.16b},[x1],#16
3026	st1	{v5.16b},[x1],#16
3027	st1	{v24.16b},[x1],#16
3028
3029	cmn	x2,#0x30			// Z set when x2 == -0x30
3030	add	x2,x2,#0x30			// plain add: keeps the cmn flags for the b.eq below
3031	b.eq	.Lxts_done			// x2 is now 0: no further whole block to process here
3032	sub	x2,x2,#0x30			// otherwise restore the biased x2 expected by the inner tail
3033	orr	v28.16b,v3.16b,v3.16b			// stage v3 and v27 as the inner tail's inputs (v28, v29)
3034	orr	v29.16b,v27.16b,v27.16b
3035	nop
3036
3037.Lxts_inner_dec_tail:	// one or two blocks: inputs in v28/v29, ivs in v6/v8
3038	// x2 == -0x10 means two blocks left.
3039	cmn	x2,#0x10
3040	eor	v1.16b,v28.16b,v6.16b
3041	eor	v24.16b,v29.16b,v8.16b
3042	b.eq	.Lxts_dec_tail_loop
3043	eor	v24.16b,v29.16b,v6.16b			// single block: v29 pairs with v6 instead (v1's result is then unused)
3044.Lxts_dec_tail_loop:	// two rounds per pass on v1 and v24, round keys refilled from [x7]
3045	aesd	v1.16b,v16.16b
3046	aesimc	v1.16b,v1.16b
3047	aesd	v24.16b,v16.16b
3048	aesimc	v24.16b,v24.16b
3049	ld1	{v16.4s},[x7],#16
3050	subs	w6,w6,#2
3051	aesd	v1.16b,v17.16b
3052	aesimc	v1.16b,v1.16b
3053	aesd	v24.16b,v17.16b
3054	aesimc	v24.16b,v24.16b
3055	ld1	{v17.4s},[x7],#16
3056	b.gt	.Lxts_dec_tail_loop
3057
	// Remaining rounds: v16, v17, then v20-v23 (v23 without aesimc).
3058	aesd	v1.16b,v16.16b
3059	aesimc	v1.16b,v1.16b
3060	aesd	v24.16b,v16.16b
3061	aesimc	v24.16b,v24.16b
3062	aesd	v1.16b,v17.16b
3063	aesimc	v1.16b,v1.16b
3064	aesd	v24.16b,v17.16b
3065	aesimc	v24.16b,v24.16b
3066	aesd	v1.16b,v20.16b
3067	aesimc	v1.16b,v1.16b
3068	aesd	v24.16b,v20.16b
3069	aesimc	v24.16b,v24.16b
3070	cmn	x2,#0x20			// Z set when x2 == -0x20; consumed by the b.eq .Lxts_dec_one below
3071	aesd	v1.16b,v21.16b
3072	aesimc	v1.16b,v1.16b
3073	aesd	v24.16b,v21.16b
3074	aesimc	v24.16b,v24.16b
3075	eor	v5.16b,v6.16b,v7.16b			// output mask for the first block: iv ^ final round key
3076	aesd	v1.16b,v22.16b
3077	aesimc	v1.16b,v1.16b
3078	aesd	v24.16b,v22.16b
3079	aesimc	v24.16b,v24.16b
3080	eor	v17.16b,v8.16b,v7.16b			// output mask for the second block
3081	aesd	v1.16b,v23.16b
3082	aesd	v24.16b,v23.16b
3083	b.eq	.Lxts_dec_one
	// Two-block case: finish and store both, then move v6/v8 on to the next
	// two ivs (v9, v10) for the cipher-stealing code.
3084	eor	v5.16b,v5.16b,v1.16b
3085	eor	v17.16b,v17.16b,v24.16b
3086	orr	v6.16b,v9.16b,v9.16b
3087	orr	v8.16b,v10.16b,v10.16b
3088	st1	{v5.16b},[x1],#16
3089	st1	{v17.16b},[x1],#16
3090	add	x2,x2,#16			// x2 was -0x10 on this path, so it becomes 0
3091	b	.Lxts_done
3092
3093.Lxts_dec_one:	// single-block case: only v24 carries a result
3094	eor	v5.16b,v5.16b,v24.16b
3095	orr	v6.16b,v8.16b,v8.16b			// move v6/v8 on by one iv (v8, v9)
3096	orr	v8.16b,v9.16b,v9.16b
3097	st1	{v5.16b},[x1],#16
3098	add	x2,x2,#32			// x2 was -0x20 on this path, so it becomes 0
3099
3100.Lxts_done:	// whole blocks are finished; handle a partial trailing block if there is one
3101	tst	x21,#0xf
3102	b.eq	.Lxts_dec_abort			// x21 == 0: no partial block, just restore and return
3103	// Processing the last two blocks with cipher stealing.
3104	mov	x7,x3			// keep the key-schedule start in x7; x3 is advanced by the first pass below
3105	cbnz	x2,.Lxts_dec_1st_done			// x2 != 0: v0 was already loaded by the code that branched here
3106	ld1	{v0.16b},[x0],#16
3107
3108	// Decrypt the second-to-last block (with iv v8) to get the last plain text block
3109.Lxts_dec_1st_done:
3110	eor	v26.16b,v0.16b,v8.16b
3111	ldr	w6,[x3,#240]			// round count
3112	ld1	{v0.4s},[x3],#16
3113	sub	w6,w6,#2
3114	ld1	{v1.4s},[x3],#16
3115.Loop_final_2nd_dec:	// two rounds per pass, round keys streamed through v0/v1
3116	aesd	v26.16b,v0.16b
3117	aesimc	v26.16b,v26.16b
3118	ld1	{v0.4s},[x3],#16		// load key schedule...
3119	subs	w6,w6,#2
3120	aesd	v26.16b,v1.16b
3121	aesimc	v26.16b,v26.16b
3122	ld1	{v1.4s},[x3],#16		// load key schedule...
3123	b.gt	.Loop_final_2nd_dec
3124
3125	aesd	v26.16b,v0.16b
3126	aesimc	v26.16b,v26.16b
3127	ld1	{v0.4s},[x3]
3128	aesd	v26.16b,v1.16b			// last round: no aesimc
3129	eor	v26.16b,v26.16b,v0.16b			// final round key
3130	eor	v26.16b,v26.16b,v8.16b			// XOR with iv v8 again
3131	st1	{v26.16b},[x1]			// stored without advancing x1
3132
3133	mov	x20,x0			// x20 -> the trailing partial input bytes
3134	add	x13,x1,#16			// x13 -> output position of the trailing partial block
3135
3136	// Merge the x21 trailing input bytes into the block just written at [x1]:
3137	// its first x21 bytes move out to [x13] and are replaced by the input bytes.
	// The index runs from x21-1 down to 0; ldrb/strb leave the subs flags intact.
	// NOTE(review): this label lacks the .L prefix used by the other labels here,
	// so it is presumably emitted into the object's symbol table - confirm intent.
3138.composite_dec_loop:
3139	subs	x21,x21,#1
3140	ldrb	w15,[x1,x21]			// byte of the block decrypted above
3141	ldrb	w14,[x20,x21]			// byte of the trailing partial input
3142	strb	w15,[x13,x21]			// becomes a byte of the final partial output
3143	strb	w14,[x1,x21]			// input byte takes its place in the composite block
3144	b.gt	.composite_dec_loop
3145.Lxts_dec_load_done:
3146	ld1	{v26.16b},[x1]			// reload the composite block
3147	eor	v26.16b,v26.16b,v6.16b
3148
3149	// Decrypt the composite block (with iv v6) to get the second-to-last plain text block
3150	ldr	w6,[x7,#240]
3151	ld1	{v0.4s},[x7],#16
3152	sub	w6,w6,#2
3153	ld1	{v1.4s},[x7],#16
3154.Loop_final_dec:	// same round structure as .Loop_final_2nd_dec, reading keys via x7
3155	aesd	v26.16b,v0.16b
3156	aesimc	v26.16b,v26.16b
3157	ld1	{v0.4s},[x7],#16		// load key schedule...
3158	subs	w6,w6,#2
3159	aesd	v26.16b,v1.16b
3160	aesimc	v26.16b,v26.16b
3161	ld1	{v1.4s},[x7],#16		// load key schedule...
3162	b.gt	.Loop_final_dec
3163
3164	aesd	v26.16b,v0.16b
3165	aesimc	v26.16b,v26.16b
3166	ld1	{v0.4s},[x7]
3167	aesd	v26.16b,v1.16b
3168	eor	v26.16b,v26.16b,v0.16b
3169	eor	v26.16b,v26.16b,v6.16b
3170	st1	{v26.16b},[x1]			// overwrite the composite block with its decryption
3171
3172.Lxts_dec_abort:	// common exit for the path that pushed the 64-byte frame in .Lxts_dec_big_size
	// Reload x19-x22 and d8-d11 from the slots they were saved to, then pop the frame.
3173	ldp	x21,x22,[sp,#48]
3174	ldp	d8,d9,[sp,#32]
3175	ldp	d10,d11,[sp,#16]
3176	ldp	x19,x20,[sp],#64			// post-index releases the 64 bytes
3177
3178.Lxts_dec_final_abort:	// exit used when there is nothing to restore (see the branch from the single-block path)
3179	ret
3180.size	aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
3181#endif
3182