xref: /freebsd/sys/crypto/openssl/aarch64/aesv8-armx.S (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
3#include "arm_arch.h"
4
5#if __ARM_MAX_ARCH__>=7
/*
 * Constant table read by the key-expansion code in
 * aes_v8_set_encrypt_key (it takes the address with "adr x3,.Lrcon").
 * Three rows of four 32-bit words:
 *   row 0: 0x01 in every word.  Loaded into v1 and XORed into each new
 *          key word; the key-expansion code doubles it with "shl" after
 *          every use.
 *   row 1: 0x0c0f0e0d in every word.  Loaded into v2 and used as the
 *          byte-index operand of "tbl" (as stored on a little-endian
 *          build the indices are 13,14,15,12 in every lane).
 *   row 2: 0x1b in every word.  Reloaded into v1 by the 128-bit path
 *          once its 8-iteration loop has finished.
 */
6.text
7.align	5
8.Lrcon:
9.long	0x01,0x01,0x01,0x01
10.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
11.long	0x1b,0x1b,0x1b,0x1b
12
/*
 * aes_v8_set_encrypt_key - expand a user key into a round-key schedule.
 *
 * In:   x0 = pointer to the user key
 *       w1 = key length in bits
 *       x2 = pointer to the key-schedule buffer
 * Out:  x0 =  0 on success
 *            -1 if x0 or x2 is zero
 *            -2 if w1 is below 128, above 256, or has any of its low six
 *               bits set (so only 128, 192 and 256 get through)
 *
 * On success the round keys are stored back to back starting at the
 * original x2, and the round count (10, 12 or 14) is stored as a 32-bit
 * word at byte offset 240 from the original x2.
 *
 * .Lenc_key is a second entry label: aes_v8_set_decrypt_key reaches this
 * code with "bl .Lenc_key" and afterwards uses x2 (left pointing at the
 * round-count word) and x12 (round count).
 *
 * Registers written: x0-x3, w12, v0-v6.  x30 is never modified here,
 * which is why the epilogue reloads only x29 from the frame.
 */
13.globl	aes_v8_set_encrypt_key
14.type	aes_v8_set_encrypt_key,%function
15.align	5
16aes_v8_set_encrypt_key:
17.Lenc_key:
18	stp	x29,x30,[sp,#-16]!
19	add	x29,sp,#0
// x3 carries the return code: -1 while the two pointers are checked ...
20	mov	x3,#-1
21	cmp	x0,#0
22	b.eq	.Lenc_key_abort
23	cmp	x2,#0
24	b.eq	.Lenc_key_abort
// ... and -2 while the bit length is checked.
25	mov	x3,#-2
26	cmp	w1,#128
27	b.lt	.Lenc_key_abort
28	cmp	w1,#256
29	b.gt	.Lenc_key_abort
30	tst	w1,#0x3f
31	b.ne	.Lenc_key_abort
32
// From here x3 is reused as the pointer into the constant table.  The
// flags set by the compare against 192 are consumed by the three-way
// branch below; none of the instructions in between alter the flags.
33	adr	x3,.Lrcon
34	cmp	w1,#192
35
// v0 = all zero, v3 = first 16 key bytes, w1 = loop count (8),
// v1 = table row 0 (round constant), v2 = table row 1 (tbl indices).
36	eor	v0.16b,v0.16b,v0.16b
37	ld1	{v3.16b},[x0],#16
38	mov	w1,#8		// reuse w1
39	ld1	{v1.4s,v2.4s},[x3],#32
40
41	b.lt	.Loop128
42	b.eq	.L192
43	b	.L256
44
// ---- 128-bit key -------------------------------------------------------
// Each pass stores the current round key (v3) and derives the next one:
//   v6 = tbl-selected bytes of v3, run through aese with the zero key in
//        v0, then XORed with the round constant in v1;
//   the ext/eor chain XORs v3 with copies of itself shifted up by 4, 8
//        and 12 bytes (zero filled from v0);
//   v1 is doubled for the next pass.
45.align	4
46.Loop128:
47	tbl	v6.16b,{v3.16b},v2.16b
48	ext	v5.16b,v0.16b,v3.16b,#12
49	st1	{v3.4s},[x2],#16
50	aese	v6.16b,v0.16b
51	subs	w1,w1,#1
52
53	eor	v3.16b,v3.16b,v5.16b
54	ext	v5.16b,v0.16b,v5.16b,#12
55	eor	v3.16b,v3.16b,v5.16b
56	ext	v5.16b,v0.16b,v5.16b,#12
57	eor	v6.16b,v6.16b,v1.16b
58	eor	v3.16b,v3.16b,v5.16b
59	shl	v1.16b,v1.16b,#1
60	eor	v3.16b,v3.16b,v6.16b
61	b.ne	.Loop128
62
// x3 was advanced past rows 0 and 1 by the earlier load, so this picks
// up row 2 (0x1b) as the round constant for the last two expansions.
63	ld1	{v1.4s},[x3]
64
65	tbl	v6.16b,{v3.16b},v2.16b
66	ext	v5.16b,v0.16b,v3.16b,#12
67	st1	{v3.4s},[x2],#16
68	aese	v6.16b,v0.16b
69
70	eor	v3.16b,v3.16b,v5.16b
71	ext	v5.16b,v0.16b,v5.16b,#12
72	eor	v3.16b,v3.16b,v5.16b
73	ext	v5.16b,v0.16b,v5.16b,#12
74	eor	v6.16b,v6.16b,v1.16b
75	eor	v3.16b,v3.16b,v5.16b
76	shl	v1.16b,v1.16b,#1
77	eor	v3.16b,v3.16b,v6.16b
78
79	tbl	v6.16b,{v3.16b},v2.16b
80	ext	v5.16b,v0.16b,v3.16b,#12
81	st1	{v3.4s},[x2],#16
82	aese	v6.16b,v0.16b
83
84	eor	v3.16b,v3.16b,v5.16b
85	ext	v5.16b,v0.16b,v5.16b,#12
86	eor	v3.16b,v3.16b,v5.16b
87	ext	v5.16b,v0.16b,v5.16b,#12
88	eor	v6.16b,v6.16b,v1.16b
89	eor	v3.16b,v3.16b,v5.16b
90	eor	v3.16b,v3.16b,v6.16b
// The last round key is stored without post-increment (x2 is at +160
// here); adding 0x50 leaves x2 at +240 for the round-count store.
91	st1	{v3.4s},[x2]
92	add	x2,x2,#0x50
93
94	mov	w12,#10
95	b	.Ldone
96
// ---- 192-bit key -------------------------------------------------------
// The remaining 8 key bytes go into the low half of v4.  Subtracting 8
// from every byte of the tbl index vector makes it select from that low
// half instead of from the top word of a 16-byte vector.
97.align	4
98.L192:
99	ld1	{v4.8b},[x0],#8
100	movi	v6.16b,#8			// borrow v6.16b
101	st1	{v3.4s},[x2],#16
102	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
103
// Each of the 8 passes stores 8 bytes from v4 followed by 16 bytes from
// the updated v3, i.e. 24 bytes of schedule per pass.
104.Loop192:
105	tbl	v6.16b,{v4.16b},v2.16b
106	ext	v5.16b,v0.16b,v3.16b,#12
// Both variants leave x2 advanced by a net 8 bytes.
107#ifdef __ARMEB__
108	st1	{v4.4s},[x2],#16
109	sub	x2,x2,#8
110#else
111	st1	{v4.8b},[x2],#8
112#endif
113	aese	v6.16b,v0.16b
114	subs	w1,w1,#1
115
116	eor	v3.16b,v3.16b,v5.16b
117	ext	v5.16b,v0.16b,v5.16b,#12
118	eor	v3.16b,v3.16b,v5.16b
119	ext	v5.16b,v0.16b,v5.16b,#12
120	eor	v3.16b,v3.16b,v5.16b
121
122	dup	v5.4s,v3.s[3]
123	eor	v5.16b,v5.16b,v4.16b
124	eor	v6.16b,v6.16b,v1.16b
125	ext	v4.16b,v0.16b,v4.16b,#12
126	shl	v1.16b,v1.16b,#1
127	eor	v4.16b,v4.16b,v5.16b
128	eor	v3.16b,v3.16b,v6.16b
129	eor	v4.16b,v4.16b,v6.16b
130	st1	{v3.4s},[x2],#16
131	b.ne	.Loop192
132
// x2 is at +208 after the loop (16 + 8*24); +0x20 brings it to +240.
133	mov	w12,#12
134	add	x2,x2,#0x20
135	b	.Ldone
136
// ---- 256-bit key -------------------------------------------------------
// v4 holds the second 16 key bytes.  Seven passes; each stores v4 and
// then the freshly derived v3, so x2 is at +240 (16 + 7*32) on exit.
137.align	4
138.L256:
139	ld1	{v4.16b},[x0]
140	mov	w1,#7
141	mov	w12,#14
142	st1	{v3.4s},[x2],#16
143
144.Loop256:
145	tbl	v6.16b,{v4.16b},v2.16b
146	ext	v5.16b,v0.16b,v3.16b,#12
147	st1	{v4.4s},[x2],#16
148	aese	v6.16b,v0.16b
149	subs	w1,w1,#1
150
151	eor	v3.16b,v3.16b,v5.16b
152	ext	v5.16b,v0.16b,v5.16b,#12
153	eor	v3.16b,v3.16b,v5.16b
154	ext	v5.16b,v0.16b,v5.16b,#12
155	eor	v6.16b,v6.16b,v1.16b
156	eor	v3.16b,v3.16b,v5.16b
157	shl	v1.16b,v1.16b,#1
158	eor	v3.16b,v3.16b,v6.16b
159	st1	{v3.4s},[x2],#16
// Exit test uses the flags from the subs above (only vector ops and a
// store sit in between).
160	b.eq	.Ldone
161
// Second half of the pass: derive the next v4 from the top word of the
// new v3, without the tbl selection and without a round constant.
162	dup	v6.4s,v3.s[3]		// just splat
163	ext	v5.16b,v0.16b,v4.16b,#12
164	aese	v6.16b,v0.16b
165
166	eor	v4.16b,v4.16b,v5.16b
167	ext	v5.16b,v0.16b,v5.16b,#12
168	eor	v4.16b,v4.16b,v5.16b
169	ext	v5.16b,v0.16b,v5.16b,#12
170	eor	v4.16b,v4.16b,v5.16b
171
172	eor	v4.16b,v4.16b,v6.16b
173	b	.Loop256
174
// Common success exit: store the round count at offset 240 and set the
// return code to 0.
175.Ldone:
176	str	w12,[x2]
177	mov	x3,#0
178
// Shared exit for success and for the error codes left in x3 above.
179.Lenc_key_abort:
180	mov	x0,x3			// return value
181	ldr	x29,[sp],#16
182	ret
183.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
184
/*
 * aes_v8_set_decrypt_key - build a decryption key schedule.
 *
 * x0, w1 and x2 are handed unchanged to the key-expansion code at
 * .Lenc_key (inside aes_v8_set_encrypt_key).  If that returns non-zero
 * the value is returned as is.  Otherwise the schedule it wrote is
 * rewritten in place: the order of the round keys is reversed, and
 * aesimc is applied to every round key except the first and the last.
 *
 * Out:  x0 = 0 on success, otherwise the code returned by .Lenc_key.
 *
 * The entry and exit are bracketed by paciasp / autiasp, emitted as raw
 * .inst words.
 */
185.globl	aes_v8_set_decrypt_key
186.type	aes_v8_set_decrypt_key,%function
187.align	5
188aes_v8_set_decrypt_key:
189.inst	0xd503233f		// paciasp
190	stp	x29,x30,[sp,#-16]!
191	add	x29,sp,#0
192	bl	.Lenc_key
193
194	cmp	x0,#0
195	b.ne	.Ldec_key_abort
196
// On return from .Lenc_key, x2 points at the round-count word (schedule
// start + 240) and x12 holds the round count.
197	sub	x2,x2,#240		// restore original x2
198	mov	x4,#-16
199	add	x0,x2,x12,lsl#4	// end of key schedule
200
// Swap the first and the last round key; these two are not passed
// through aesimc.  x2 then walks upwards and x0 downwards (step in x4).
201	ld1	{v0.4s},[x2]
202	ld1	{v1.4s},[x0]
203	st1	{v0.4s},[x0],x4
204	st1	{v1.4s},[x2],#16
205
// Swap the remaining pairs from both ends, applying aesimc to each key,
// for as long as the upper pointer is still above the lower one.
206.Loop_imc:
207	ld1	{v0.4s},[x2]
208	ld1	{v1.4s},[x0]
209	aesimc	v0.16b,v0.16b
210	aesimc	v1.16b,v1.16b
211	st1	{v0.4s},[x0],x4
212	st1	{v1.4s},[x2],#16
213	cmp	x0,x2
214	b.hi	.Loop_imc
215
// Remaining middle key: transformed and written back without a swap
// partner.
216	ld1	{v0.4s},[x2]
217	aesimc	v0.16b,v0.16b
218	st1	{v0.4s},[x0]
219
220	eor	x0,x0,x0		// return value
221.Ldec_key_abort:
222	ldp	x29,x30,[sp],#16
223.inst	0xd50323bf		// autiasp
224	ret
225.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
/*
 * aes_v8_encrypt - encrypt a single 16-byte block.
 *
 * In:   x0 = pointer to the input block
 *       x1 = pointer to the output block
 *       x2 = pointer to the key schedule; the round count is read from
 *            byte offset 240
 *
 * Registers written: x2, w3, v0-v2.  No stack is used.
 */
226.globl	aes_v8_encrypt
227.type	aes_v8_encrypt,%function
228.align	5
229aes_v8_encrypt:
230	ldr	w3,[x2,#240]
231	ld1	{v0.4s},[x2],#16
232	ld1	{v2.16b},[x0]
// w3 = rounds - 2: the loop below applies two rounds per pass and the
// code after it applies the final two.
233	sub	w3,w3,#2
234	ld1	{v1.4s},[x2],#16
235
// v0 / v1 hold the two round keys for this pass; the keys for the next
// pass are loaded while the current ones are being applied.
236.Loop_enc:
237	aese	v2.16b,v0.16b
238	aesmc	v2.16b,v2.16b
239	ld1	{v0.4s},[x2],#16
240	subs	w3,w3,#2
241	aese	v2.16b,v1.16b
242	aesmc	v2.16b,v2.16b
243	ld1	{v1.4s},[x2],#16
244	b.gt	.Loop_enc
245
// Final two rounds: the last aese is not followed by aesmc; instead the
// last round key (loaded into v0 here) is XORed in.
246	aese	v2.16b,v0.16b
247	aesmc	v2.16b,v2.16b
248	ld1	{v0.4s},[x2]
249	aese	v2.16b,v1.16b
250	eor	v2.16b,v2.16b,v0.16b
251
252	st1	{v2.16b},[x1]
253	ret
254.size	aes_v8_encrypt,.-aes_v8_encrypt
/*
 * aes_v8_decrypt - decrypt a single 16-byte block.
 *
 * In:   x0 = pointer to the input block
 *       x1 = pointer to the output block
 *       x2 = pointer to the key schedule; the round count is read from
 *            byte offset 240 (presumably a schedule prepared by
 *            aes_v8_set_decrypt_key - confirm against callers)
 *
 * Same structure as aes_v8_encrypt, with aesd / aesimc in place of
 * aese / aesmc.  Registers written: x2, w3, v0-v2.  No stack is used.
 */
255.globl	aes_v8_decrypt
256.type	aes_v8_decrypt,%function
257.align	5
258aes_v8_decrypt:
259	ldr	w3,[x2,#240]
260	ld1	{v0.4s},[x2],#16
261	ld1	{v2.16b},[x0]
// w3 = rounds - 2: two rounds per loop pass, final two after the loop.
262	sub	w3,w3,#2
263	ld1	{v1.4s},[x2],#16
264
265.Loop_dec:
266	aesd	v2.16b,v0.16b
267	aesimc	v2.16b,v2.16b
268	ld1	{v0.4s},[x2],#16
269	subs	w3,w3,#2
270	aesd	v2.16b,v1.16b
271	aesimc	v2.16b,v2.16b
272	ld1	{v1.4s},[x2],#16
273	b.gt	.Loop_dec
274
// Final two rounds: no aesimc after the last aesd; the last round key
// (loaded into v0 here) is XORed in instead.
275	aesd	v2.16b,v0.16b
276	aesimc	v2.16b,v2.16b
277	ld1	{v0.4s},[x2]
278	aesd	v2.16b,v1.16b
279	eor	v2.16b,v2.16b,v0.16b
280
281	st1	{v2.16b},[x1]
282	ret
283.size	aes_v8_decrypt,.-aes_v8_decrypt
/*
 * aes_v8_cbc_encrypt - CBC-mode encryption or decryption of a buffer.
 *
 * In:   x0 = input pointer
 *       x1 = output pointer
 *       x2 = length in bytes
 *       x3 = key schedule (round count read from byte offset 240)
 *       x4 = pointer to the 16-byte chaining value; read on entry and
 *            overwritten at .Lcbc_done
 *       w5 = 0 selects the decrypt path, non-zero the encrypt path
 *
 * A length below 16 returns immediately without reading or writing any
 * buffer.  Otherwise the byte count is reduced by 16 and rounded down
 * to a multiple of 16 before being used as the loop counter.
 * NOTE(review): the encrypt paths load the next input block ahead of the
 * end-of-data branch; for a length that is not a multiple of 16 that
 * load looks like it can reach past the stated length - confirm callers
 * only pass whole blocks.
 *
 * Register roles after the common setup:
 *   v16,v17  first two round keys (reloaded inside the loops)
 *   v18-v23  the six round keys before the last one
 *   v7       last round key
 *   v6       chaining value
 *   w5       round count - 8 (2, 4 or 6 for 10, 12 or 14 rounds)
 *   x8       post-increment applied to x0 by the look-ahead loads: 16,
 *            or 0 once the remaining count reaches zero so that the
 *            load re-reads the current block instead of moving on
 *
 * Registers written: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23.  x30 is not
 * modified, so the epilogue reloads only x29.
 */
284.globl	aes_v8_cbc_encrypt
285.type	aes_v8_cbc_encrypt,%function
286.align	5
287aes_v8_cbc_encrypt:
288	stp	x29,x30,[sp,#-16]!
289	add	x29,sp,#0
290	subs	x2,x2,#16
291	mov	x8,#16
292	b.lo	.Lcbc_abort
293	csel	x8,xzr,x8,eq
294
// The flags from this compare are consumed by "b.eq .Lcbc_dec" further
// down; nothing in between alters the flags.
295	cmp	w5,#0			// en- or decrypting?
296	ldr	w5,[x3,#240]
297	and	x2,x2,#-16
298	ld1	{v6.16b},[x4]
299	ld1	{v0.16b},[x0],x8
300
301	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
302	sub	w5,w5,#6
303	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
304	sub	w5,w5,#2
305	ld1	{v18.4s,v19.4s},[x7],#32
306	ld1	{v20.4s,v21.4s},[x7],#32
307	ld1	{v22.4s,v23.4s},[x7],#32
308	ld1	{v7.4s},[x7]
309
310	add	x7,x3,#32
311	mov	w6,w5
312	b.eq	.Lcbc_dec
313
// ---- encrypt -----------------------------------------------------------
// v0 = first input block XOR chaining value.  v5 = first round key XOR
// last round key; it is XORed into each look-ahead input block.
314	cmp	w5,#2
315	eor	v0.16b,v0.16b,v6.16b
316	eor	v5.16b,v16.16b,v7.16b
317	b.eq	.Lcbc_enc128
318
// 12- and 14-round path.  v2,v3 = round keys 2 and 3.  x7, x6, x12, x14
// and x3 become fixed pointers to round keys 1, 4, 5, 6 and 7, which
// are reloaded into v16 / v17 on every pass.
319	ld1	{v2.4s,v3.4s},[x7]
320	add	x7,x3,#16
321	add	x6,x3,#16*4
322	add	x12,x3,#16*5
323	aese	v0.16b,v16.16b
324	aesmc	v0.16b,v0.16b
325	add	x14,x3,#16*6
326	add	x3,x3,#16*7
327	b	.Lenter_cbc_enc
328
// Loop top: v16 holds the look-ahead input block already XORed with v5,
// and v0 still holds the state before the final key XOR of the previous
// block.  v6 (previous output block) is stored here.
329.align	4
330.Loop_cbc_enc:
331	aese	v0.16b,v16.16b
332	aesmc	v0.16b,v0.16b
333	st1	{v6.16b},[x1],#16
334.Lenter_cbc_enc:
335	aese	v0.16b,v17.16b
336	aesmc	v0.16b,v0.16b
337	aese	v0.16b,v2.16b
338	aesmc	v0.16b,v0.16b
339	ld1	{v16.4s},[x6]
// w5 == 4 means 12 rounds: skip the two extra rounds below.
340	cmp	w5,#4
341	aese	v0.16b,v3.16b
342	aesmc	v0.16b,v0.16b
343	ld1	{v17.4s},[x12]
344	b.eq	.Lcbc_enc192
345
346	aese	v0.16b,v16.16b
347	aesmc	v0.16b,v0.16b
348	ld1	{v16.4s},[x14]
349	aese	v0.16b,v17.16b
350	aesmc	v0.16b,v0.16b
351	ld1	{v17.4s},[x3]
352	nop
353
354.Lcbc_enc192:
355	aese	v0.16b,v16.16b
356	aesmc	v0.16b,v0.16b
// The flags from this subs drive both the csel on x8 below and the
// "b.hs" at the bottom of the loop.
357	subs	x2,x2,#16
358	aese	v0.16b,v17.16b
359	aesmc	v0.16b,v0.16b
360	csel	x8,xzr,x8,eq
361	aese	v0.16b,v18.16b
362	aesmc	v0.16b,v0.16b
363	aese	v0.16b,v19.16b
364	aesmc	v0.16b,v0.16b
// Look-ahead load of the next input block.
365	ld1	{v16.16b},[x0],x8
366	aese	v0.16b,v20.16b
367	aesmc	v0.16b,v0.16b
368	eor	v16.16b,v16.16b,v5.16b
369	aese	v0.16b,v21.16b
370	aesmc	v0.16b,v0.16b
371	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
372	aese	v0.16b,v22.16b
373	aesmc	v0.16b,v0.16b
374	aese	v0.16b,v23.16b
// v6 = finished output block (also the next chaining value).
375	eor	v6.16b,v0.16b,v7.16b
376	b.hs	.Loop_cbc_enc
377
378	st1	{v6.16b},[x1],#16
379	b	.Lcbc_done
380
// 10-round path: all needed round keys stay resident (v16 is the only
// key register reused, for the look-ahead input block).
381.align	5
382.Lcbc_enc128:
383	ld1	{v2.4s,v3.4s},[x7]
384	aese	v0.16b,v16.16b
385	aesmc	v0.16b,v0.16b
386	b	.Lenter_cbc_enc128
387.Loop_cbc_enc128:
388	aese	v0.16b,v16.16b
389	aesmc	v0.16b,v0.16b
390	st1	{v6.16b},[x1],#16
391.Lenter_cbc_enc128:
392	aese	v0.16b,v17.16b
393	aesmc	v0.16b,v0.16b
394	subs	x2,x2,#16
395	aese	v0.16b,v2.16b
396	aesmc	v0.16b,v0.16b
397	csel	x8,xzr,x8,eq
398	aese	v0.16b,v3.16b
399	aesmc	v0.16b,v0.16b
400	aese	v0.16b,v18.16b
401	aesmc	v0.16b,v0.16b
402	aese	v0.16b,v19.16b
403	aesmc	v0.16b,v0.16b
404	ld1	{v16.16b},[x0],x8
405	aese	v0.16b,v20.16b
406	aesmc	v0.16b,v0.16b
407	aese	v0.16b,v21.16b
408	aesmc	v0.16b,v0.16b
409	aese	v0.16b,v22.16b
410	aesmc	v0.16b,v0.16b
411	eor	v16.16b,v16.16b,v5.16b
412	aese	v0.16b,v23.16b
413	eor	v6.16b,v0.16b,v7.16b
414	b.hs	.Loop_cbc_enc128
415
416	st1	{v6.16b},[x1],#16
417	b	.Lcbc_done
// ---- decrypt -----------------------------------------------------------
// Blocks are processed three at a time in v0, v1 and v18; untouched
// copies of the input blocks are kept in v2, v3 and v19 for the
// chaining XOR.  x2 is biased by -32 so that "lo" after the subs means
// fewer than three blocks remain.
418.align	5
419.Lcbc_dec:
420	ld1	{v18.16b},[x0],#16
421	subs	x2,x2,#32		// bias
422	add	w6,w5,#2
423	orr	v3.16b,v0.16b,v0.16b
424	orr	v1.16b,v0.16b,v0.16b
425	orr	v19.16b,v18.16b,v18.16b
426	b.lo	.Lcbc_dec_tail
427
428	orr	v1.16b,v18.16b,v18.16b
429	ld1	{v18.16b},[x0],#16
430	orr	v2.16b,v0.16b,v0.16b
431	orr	v3.16b,v1.16b,v1.16b
432	orr	v19.16b,v18.16b,v18.16b
433
// Round loop: two rounds per pass on all three blocks, with x7 walking
// through the key schedule.
434.Loop3x_cbc_dec:
435	aesd	v0.16b,v16.16b
436	aesimc	v0.16b,v0.16b
437	aesd	v1.16b,v16.16b
438	aesimc	v1.16b,v1.16b
439	aesd	v18.16b,v16.16b
440	aesimc	v18.16b,v18.16b
441	ld1	{v16.4s},[x7],#16
442	subs	w6,w6,#2
443	aesd	v0.16b,v17.16b
444	aesimc	v0.16b,v0.16b
445	aesd	v1.16b,v17.16b
446	aesimc	v1.16b,v1.16b
447	aesd	v18.16b,v17.16b
448	aesimc	v18.16b,v18.16b
449	ld1	{v17.4s},[x7],#16
450	b.gt	.Loop3x_cbc_dec
451
452	aesd	v0.16b,v16.16b
453	aesimc	v0.16b,v0.16b
454	aesd	v1.16b,v16.16b
455	aesimc	v1.16b,v1.16b
456	aesd	v18.16b,v16.16b
457	aesimc	v18.16b,v18.16b
// v4, v5 and v17 (further down) are the chaining inputs for the three
// blocks, each pre-XORed with the last round key in v7.
458	eor	v4.16b,v6.16b,v7.16b
// The flags from this subs are consumed by the csel below and by the
// "b.hs" at the bottom of the 3x loop.
459	subs	x2,x2,#0x30
460	eor	v5.16b,v2.16b,v7.16b
461	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
462	aesd	v0.16b,v17.16b
463	aesimc	v0.16b,v0.16b
464	aesd	v1.16b,v17.16b
465	aesimc	v1.16b,v1.16b
466	aesd	v18.16b,v17.16b
467	aesimc	v18.16b,v18.16b
468	eor	v17.16b,v3.16b,v7.16b
469	add	x0,x0,x6		// x0 is adjusted in such way that
470					// at exit from the loop v1.16b-v18.16b
471					// are loaded with last "words"
// v6 = last input block of this group, i.e. the next chaining value.
472	orr	v6.16b,v19.16b,v19.16b
473	mov	x7,x3
474	aesd	v0.16b,v20.16b
475	aesimc	v0.16b,v0.16b
476	aesd	v1.16b,v20.16b
477	aesimc	v1.16b,v1.16b
478	aesd	v18.16b,v20.16b
479	aesimc	v18.16b,v18.16b
480	ld1	{v2.16b},[x0],#16
481	aesd	v0.16b,v21.16b
482	aesimc	v0.16b,v0.16b
483	aesd	v1.16b,v21.16b
484	aesimc	v1.16b,v1.16b
485	aesd	v18.16b,v21.16b
486	aesimc	v18.16b,v18.16b
487	ld1	{v3.16b},[x0],#16
488	aesd	v0.16b,v22.16b
489	aesimc	v0.16b,v0.16b
490	aesd	v1.16b,v22.16b
491	aesimc	v1.16b,v1.16b
492	aesd	v18.16b,v22.16b
493	aesimc	v18.16b,v18.16b
494	ld1	{v19.16b},[x0],#16
495	aesd	v0.16b,v23.16b
496	aesd	v1.16b,v23.16b
497	aesd	v18.16b,v23.16b
498	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
499	add	w6,w5,#2
500	eor	v4.16b,v4.16b,v0.16b
501	eor	v5.16b,v5.16b,v1.16b
502	eor	v18.16b,v18.16b,v17.16b
503	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
504	st1	{v4.16b},[x1],#16
505	orr	v0.16b,v2.16b,v2.16b
506	st1	{v5.16b},[x1],#16
507	orr	v1.16b,v3.16b,v3.16b
508	st1	{v18.16b},[x1],#16
509	orr	v18.16b,v19.16b,v19.16b
510	b.hs	.Loop3x_cbc_dec
511
// x2 == -0x30 here means the last group consumed everything.
512	cmn	x2,#0x30
513	b.eq	.Lcbc_done
514	nop
515
// Tail: one or two blocks remain, held in v1 and v18 (for a single
// block only v18 is used for output).
516.Lcbc_dec_tail:
517	aesd	v1.16b,v16.16b
518	aesimc	v1.16b,v1.16b
519	aesd	v18.16b,v16.16b
520	aesimc	v18.16b,v18.16b
521	ld1	{v16.4s},[x7],#16
522	subs	w6,w6,#2
523	aesd	v1.16b,v17.16b
524	aesimc	v1.16b,v1.16b
525	aesd	v18.16b,v17.16b
526	aesimc	v18.16b,v18.16b
527	ld1	{v17.4s},[x7],#16
528	b.gt	.Lcbc_dec_tail
529
530	aesd	v1.16b,v16.16b
531	aesimc	v1.16b,v1.16b
532	aesd	v18.16b,v16.16b
533	aesimc	v18.16b,v18.16b
534	aesd	v1.16b,v17.16b
535	aesimc	v1.16b,v1.16b
536	aesd	v18.16b,v17.16b
537	aesimc	v18.16b,v18.16b
538	aesd	v1.16b,v20.16b
539	aesimc	v1.16b,v1.16b
540	aesd	v18.16b,v20.16b
541	aesimc	v18.16b,v18.16b
// x2 == -0x20 means exactly one block remains; the result is consumed by
// "b.eq .Lcbc_dec_one" below (only vector ops in between).
542	cmn	x2,#0x20
543	aesd	v1.16b,v21.16b
544	aesimc	v1.16b,v1.16b
545	aesd	v18.16b,v21.16b
546	aesimc	v18.16b,v18.16b
547	eor	v5.16b,v6.16b,v7.16b
548	aesd	v1.16b,v22.16b
549	aesimc	v1.16b,v1.16b
550	aesd	v18.16b,v22.16b
551	aesimc	v18.16b,v18.16b
552	eor	v17.16b,v3.16b,v7.16b
553	aesd	v1.16b,v23.16b
554	aesd	v18.16b,v23.16b
555	b.eq	.Lcbc_dec_one
// Two blocks: v1 chains from v6, v18 chains from the first input block
// (v3); v19 becomes the new chaining value.
556	eor	v5.16b,v5.16b,v1.16b
557	eor	v17.16b,v17.16b,v18.16b
558	orr	v6.16b,v19.16b,v19.16b
559	st1	{v5.16b},[x1],#16
560	st1	{v17.16b},[x1],#16
561	b	.Lcbc_done
562
563.Lcbc_dec_one:
564	eor	v5.16b,v5.16b,v18.16b
565	orr	v6.16b,v19.16b,v19.16b
566	st1	{v5.16b},[x1],#16
567
// Write the updated chaining value back through x4.
568.Lcbc_done:
569	st1	{v6.16b},[x4]
570.Lcbc_abort:
571	ldr	x29,[sp],#16
572	ret
573.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
/*
 * aes_v8_ctr32_encrypt_blocks - CTR-mode processing with a 32-bit counter.
 *
 * In:   x0 = input pointer
 *       x1 = output pointer
 *       x2 = number of 16-byte blocks
 *       x3 = key schedule (round count read from byte offset 240)
 *       x4 = pointer to the 16-byte counter block
 *
 * The counter is the 32-bit word at byte offset 12 of the counter block.
 * On little-endian builds it is byte-reversed after loading and reversed
 * again before being inserted into lane 3 of each counter vector.  Only
 * that word is incremented; there is no carry into the other 12 bytes.
 * Nothing is stored through x4, so the caller's counter block is left
 * unchanged.
 *
 * Register roles after setup:
 *   v16,v17  first two round keys (reloaded inside the loops)
 *   v20-v23  the four round keys before the last one
 *   v7       last round key
 *   v6       copy of the original counter block (template for new
 *            counter vectors)
 *   w8       running counter value
 *   w5       round count - 6; w6 is the per-block round-loop counter
 *
 * NOTE(review): a block count of 0 is not rejected.  It takes the same
 * branch to .Lctr32_tail as counts 1 and 2, and because the final
 * "cmp x2,#1" is then not-equal, two output blocks are stored.  Callers
 * presumably never pass 0 - confirm.
 *
 * Registers written: x0-x2, x5-x10, x12, v0-v7, v16-v23.  x30 is not
 * modified, so the epilogue reloads only x29.
 */
574.globl	aes_v8_ctr32_encrypt_blocks
575.type	aes_v8_ctr32_encrypt_blocks,%function
576.align	5
577aes_v8_ctr32_encrypt_blocks:
578	stp	x29,x30,[sp,#-16]!
579	add	x29,sp,#0
580	ldr	w5,[x3,#240]
581
582	ldr	w8, [x4, #12]
583#ifdef __ARMEB__
584	ld1	{v0.16b},[x4]
585#else
586	ld1	{v0.4s},[x4]
587#endif
588	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
589	sub	w5,w5,#4
590	mov	x12,#16
// The flags from this compare are consumed by the csel on x12 and by
// "b.ls .Lctr32_tail" below; nothing in between alters the flags.
591	cmp	x2,#2
592	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
593	sub	w5,w5,#2
594	ld1	{v20.4s,v21.4s},[x7],#32
595	ld1	{v22.4s,v23.4s},[x7],#32
596	ld1	{v7.4s},[x7]
597	add	x7,x3,#32
598	mov	w6,w5
// x12 = input post-increment used by the tail: 0 when fewer than two
// blocks were requested, else 16.
599	csel	x12,xzr,x12,lo
600#ifndef __ARMEB__
601	rev	w8, w8
602#endif
// v0 = counter block as given, v1 = counter + 1, v18 = counter + 2 (set
// only on the 3x path), v6 = template copy.  w8 is left at counter + 2.
603	orr	v1.16b,v0.16b,v0.16b
604	add	w10, w8, #1
605	orr	v18.16b,v0.16b,v0.16b
606	add	w8, w8, #2
607	orr	v6.16b,v0.16b,v0.16b
608	rev	w10, w10
609	mov	v1.s[3],w10
610	b.ls	.Lctr32_tail
611	rev	w12, w8
612	sub	x2,x2,#3		// bias
613	mov	v18.s[3],w12
614	b	.Loop3x_ctr32
615
// ---- three blocks per iteration ----------------------------------------
// Round loop: two rounds per pass on v0, v1 and v18, with x7 walking
// through the key schedule.
616.align	4
617.Loop3x_ctr32:
618	aese	v0.16b,v16.16b
619	aesmc	v0.16b,v0.16b
620	aese	v1.16b,v16.16b
621	aesmc	v1.16b,v1.16b
622	aese	v18.16b,v16.16b
623	aesmc	v18.16b,v18.16b
624	ld1	{v16.4s},[x7],#16
625	subs	w6,w6,#2
626	aese	v0.16b,v17.16b
627	aesmc	v0.16b,v0.16b
628	aese	v1.16b,v17.16b
629	aesmc	v1.16b,v1.16b
630	aese	v18.16b,v17.16b
631	aesmc	v18.16b,v18.16b
632	ld1	{v17.4s},[x7],#16
633	b.gt	.Loop3x_ctr32
634
// From here the in-flight states move to v4, v5 and v17 so that v0, v1
// and v18 can be refilled from the template in v6 and given the next
// three counter values (w8+1, w8+2, w8+3).  The three input blocks are
// loaded into v2, v3, v19 and pre-XORed with the last round key.
635	aese	v0.16b,v16.16b
636	aesmc	v4.16b,v0.16b
637	aese	v1.16b,v16.16b
638	aesmc	v5.16b,v1.16b
639	ld1	{v2.16b},[x0],#16
640	orr	v0.16b,v6.16b,v6.16b
641	aese	v18.16b,v16.16b
642	aesmc	v18.16b,v18.16b
643	ld1	{v3.16b},[x0],#16
644	orr	v1.16b,v6.16b,v6.16b
645	aese	v4.16b,v17.16b
646	aesmc	v4.16b,v4.16b
647	aese	v5.16b,v17.16b
648	aesmc	v5.16b,v5.16b
649	ld1	{v19.16b},[x0],#16
650	mov	x7,x3
651	aese	v18.16b,v17.16b
652	aesmc	v17.16b,v18.16b
653	orr	v18.16b,v6.16b,v6.16b
654	add	w9,w8,#1
655	aese	v4.16b,v20.16b
656	aesmc	v4.16b,v4.16b
657	aese	v5.16b,v20.16b
658	aesmc	v5.16b,v5.16b
659	eor	v2.16b,v2.16b,v7.16b
660	add	w10,w8,#2
661	aese	v17.16b,v20.16b
662	aesmc	v17.16b,v17.16b
663	eor	v3.16b,v3.16b,v7.16b
664	add	w8,w8,#3
665	aese	v4.16b,v21.16b
666	aesmc	v4.16b,v4.16b
667	aese	v5.16b,v21.16b
668	aesmc	v5.16b,v5.16b
669	eor	v19.16b,v19.16b,v7.16b
670	rev	w9,w9
671	aese	v17.16b,v21.16b
672	aesmc	v17.16b,v17.16b
673	mov	v0.s[3], w9
674	rev	w10,w10
675	aese	v4.16b,v22.16b
676	aesmc	v4.16b,v4.16b
677	aese	v5.16b,v22.16b
678	aesmc	v5.16b,v5.16b
679	mov	v1.s[3], w10
680	rev	w12,w8
681	aese	v17.16b,v22.16b
682	aesmc	v17.16b,v17.16b
683	mov	v18.s[3], w12
// The flags from this subs are consumed by "b.hs .Loop3x_ctr32" below
// (only vector ops, loads, stores and a mov in between).
684	subs	x2,x2,#3
685	aese	v4.16b,v23.16b
686	aese	v5.16b,v23.16b
687	aese	v17.16b,v23.16b
688
689	eor	v2.16b,v2.16b,v4.16b
690	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
691	st1	{v2.16b},[x1],#16
692	eor	v3.16b,v3.16b,v5.16b
693	mov	w6,w5
694	st1	{v3.16b},[x1],#16
695	eor	v19.16b,v19.16b,v17.16b
696	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
697	st1	{v19.16b},[x1],#16
698	b.hs	.Loop3x_ctr32
699
// Undo the bias: x2 = blocks still to do (0, 1 or 2).  x12 is set up
// again as the tail input stride: 0 for one block, 16 for two.
700	adds	x2,x2,#3
701	b.eq	.Lctr32_done
702	cmp	x2,#1
703	mov	x12,#16
704	csel	x12,xzr,x12,eq
705
// ---- tail: one or two blocks -------------------------------------------
// Two counter blocks (v0, v1) are always encrypted; the second result is
// stored only when x2 is not 1.
706.Lctr32_tail:
707	aese	v0.16b,v16.16b
708	aesmc	v0.16b,v0.16b
709	aese	v1.16b,v16.16b
710	aesmc	v1.16b,v1.16b
711	ld1	{v16.4s},[x7],#16
712	subs	w6,w6,#2
713	aese	v0.16b,v17.16b
714	aesmc	v0.16b,v0.16b
715	aese	v1.16b,v17.16b
716	aesmc	v1.16b,v1.16b
717	ld1	{v17.4s},[x7],#16
718	b.gt	.Lctr32_tail
719
720	aese	v0.16b,v16.16b
721	aesmc	v0.16b,v0.16b
722	aese	v1.16b,v16.16b
723	aesmc	v1.16b,v1.16b
724	aese	v0.16b,v17.16b
725	aesmc	v0.16b,v0.16b
726	aese	v1.16b,v17.16b
727	aesmc	v1.16b,v1.16b
// With x12 == 0 the second load below re-reads the same input block
// instead of reading past it.
728	ld1	{v2.16b},[x0],x12
729	aese	v0.16b,v20.16b
730	aesmc	v0.16b,v0.16b
731	aese	v1.16b,v20.16b
732	aesmc	v1.16b,v1.16b
733	ld1	{v3.16b},[x0]
734	aese	v0.16b,v21.16b
735	aesmc	v0.16b,v0.16b
736	aese	v1.16b,v21.16b
737	aesmc	v1.16b,v1.16b
738	eor	v2.16b,v2.16b,v7.16b
739	aese	v0.16b,v22.16b
740	aesmc	v0.16b,v0.16b
741	aese	v1.16b,v22.16b
742	aesmc	v1.16b,v1.16b
743	eor	v3.16b,v3.16b,v7.16b
744	aese	v0.16b,v23.16b
745	aese	v1.16b,v23.16b
746
747	cmp	x2,#1
748	eor	v2.16b,v2.16b,v0.16b
749	eor	v3.16b,v3.16b,v1.16b
750	st1	{v2.16b},[x1],#16
751	b.eq	.Lctr32_done
752	st1	{v3.16b},[x1]
753
754.Lctr32_done:
755	ldr	x29,[sp],#16
756	ret
757.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
758#endif
759