xref: /freebsd/sys/crypto/openssl/aarch64/aesv8-armx.S (revision f9fd7337f63698f33239c58c07bf430198235a22)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
3#include "arm_arch.h"
4
5#if __ARM_MAX_ARCH__>=7
// Constant table for the key-expansion code, kept in the text section so it
// can be addressed with adr. Three 16-byte rows:
//   row 0: 0x01 in every 32-bit lane - loaded into v1 as the starting
//          round constant
//   row 1: byte-index mask loaded into v2 and used as the tbl selector
//   row 2: 0x1b in every lane - loaded into v1 after the eight-pass
//          128-bit loop, for the two unrolled steps that follow it
6.text
7.align	5
8.Lrcon:
9.long	0x01,0x01,0x01,0x01
10.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
11.long	0x1b,0x1b,0x1b,0x1b
12
// aes_v8_set_encrypt_key: expand a raw AES key into an encryption key
// schedule.
//
// In:   x0 = address of the raw key bytes
//       w1 = key size in bits (128, 192 or 256)
//       x2 = address of the key-schedule buffer to fill
// Out:  x0 =  0  success
//            -1  x0 or x2 was zero on entry
//            -2  w1 below 128, above 256, or not a multiple of 64
// On success the round keys are stored from the original x2 upward and the
// round count (10, 12 or 14) is stored as a 32-bit word at original x2 + 240.
// Side contract relied on by aes_v8_set_decrypt_key, which enters through
// .Lenc_key: on the success path x2 is left at original x2 + 240 and w12
// holds the round count.
// Clobbers: x1, x2, x3, x12, v0-v6, condition flags.
13.globl	aes_v8_set_encrypt_key
14.type	aes_v8_set_encrypt_key,%function
15.align	5
16aes_v8_set_encrypt_key:
17.Lenc_key:
18	stp	x29,x30,[sp,#-16]!
19	add	x29,sp,#0
	// Argument checks. x3 carries the error code that becomes the return
	// value at .Lenc_key_abort.
20	mov	x3,#-1
21	cmp	x0,#0
22	b.eq	.Lenc_key_abort
23	cmp	x2,#0
24	b.eq	.Lenc_key_abort
25	mov	x3,#-2
26	cmp	w1,#128
27	b.lt	.Lenc_key_abort
28	cmp	w1,#256
29	b.gt	.Lenc_key_abort
30	tst	w1,#0x3f
31	b.ne	.Lenc_key_abort
32
	// x3 is reused as a pointer into .Lrcon. The flags from the compare
	// against 192 are consumed by the three-way branch below; the vector
	// and load instructions in between do not alter them.
	// Setup: v0 = 0, v3 = first 16 key bytes, v1 = round-constant row,
	// v2 = tbl mask row, w1 = pass counter.
33	adr	x3,.Lrcon
34	cmp	w1,#192
35
36	eor	v0.16b,v0.16b,v0.16b
37	ld1	{v3.16b},[x0],#16
38	mov	w1,#8		// reuse w1
39	ld1	{v1.4s,v2.4s},[x3],#32
40
41	b.lt	.Loop128
42	b.eq	.L192
43	b	.L256
44
// 128-bit keys: eight passes, each storing the current round key from v3 and
// deriving the next one into v3.
//  - tbl builds v6 from bytes of v3 chosen by the mask in v2; aese against
//    the all-zero v0 adds no key material, so only its substitution and
//    row-shift steps act on v6.
//  - the ext/eor chain xors v3 with copies of itself moved up by 4, 8 and
//    12 bytes (v0 supplies the zero fill).
//  - v1 is xored into v6 and then doubled with shl for the next pass.
45.align	4
46.Loop128:
47	tbl	v6.16b,{v3.16b},v2.16b
48	ext	v5.16b,v0.16b,v3.16b,#12
49	st1	{v3.4s},[x2],#16
50	aese	v6.16b,v0.16b
51	subs	w1,w1,#1
52
53	eor	v3.16b,v3.16b,v5.16b
54	ext	v5.16b,v0.16b,v5.16b,#12
55	eor	v3.16b,v3.16b,v5.16b
56	ext	v5.16b,v0.16b,v5.16b,#12
57	eor	v6.16b,v6.16b,v1.16b
58	eor	v3.16b,v3.16b,v5.16b
59	shl	v1.16b,v1.16b,#1
60	eor	v3.16b,v3.16b,v6.16b
61	b.ne	.Loop128
62
	// Switch v1 to the third .Lrcon row (0x1b); the remaining two
	// expansion steps are unrolled below.
63	ld1	{v1.4s},[x3]
64
65	tbl	v6.16b,{v3.16b},v2.16b
66	ext	v5.16b,v0.16b,v3.16b,#12
67	st1	{v3.4s},[x2],#16
68	aese	v6.16b,v0.16b
69
70	eor	v3.16b,v3.16b,v5.16b
71	ext	v5.16b,v0.16b,v5.16b,#12
72	eor	v3.16b,v3.16b,v5.16b
73	ext	v5.16b,v0.16b,v5.16b,#12
74	eor	v6.16b,v6.16b,v1.16b
75	eor	v3.16b,v3.16b,v5.16b
76	shl	v1.16b,v1.16b,#1
77	eor	v3.16b,v3.16b,v6.16b
78
79	tbl	v6.16b,{v3.16b},v2.16b
80	ext	v5.16b,v0.16b,v3.16b,#12
81	st1	{v3.4s},[x2],#16
82	aese	v6.16b,v0.16b
83
84	eor	v3.16b,v3.16b,v5.16b
85	ext	v5.16b,v0.16b,v5.16b,#12
86	eor	v3.16b,v3.16b,v5.16b
87	ext	v5.16b,v0.16b,v5.16b,#12
88	eor	v6.16b,v6.16b,v1.16b
89	eor	v3.16b,v3.16b,v5.16b
90	eor	v3.16b,v3.16b,v6.16b
	// The last round key is stored without post-increment (x2 is at
	// original + 160 here); adding 0x50 leaves x2 at original + 240.
91	st1	{v3.4s},[x2]
92	add	x2,x2,#0x50
93
94	mov	w12,#10
95	b	.Ldone
96
// 192-bit keys: the low half of v4 receives key bytes 16..23. After the
// initial 16-byte store, each of the eight passes stores 8 bytes from v4 and
// 16 bytes from v3 (24 bytes per pass).
97.align	4
98.L192:
99	ld1	{v4.8b},[x0],#8
100	movi	v6.16b,#8			// borrow v6.16b
101	st1	{v3.4s},[x2],#16
102	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
103
104.Loop192:
105	tbl	v6.16b,{v4.16b},v2.16b
106	ext	v5.16b,v0.16b,v3.16b,#12
107	st1	{v4.8b},[x2],#8
108	aese	v6.16b,v0.16b
109	subs	w1,w1,#1
110
111	eor	v3.16b,v3.16b,v5.16b
112	ext	v5.16b,v0.16b,v5.16b,#12
113	eor	v3.16b,v3.16b,v5.16b
114	ext	v5.16b,v0.16b,v5.16b,#12
115	eor	v3.16b,v3.16b,v5.16b
116
117	dup	v5.4s,v3.s[3]
118	eor	v5.16b,v5.16b,v4.16b
119	eor	v6.16b,v6.16b,v1.16b
120	ext	v4.16b,v0.16b,v4.16b,#12
121	shl	v1.16b,v1.16b,#1
122	eor	v4.16b,v4.16b,v5.16b
123	eor	v3.16b,v3.16b,v6.16b
124	eor	v4.16b,v4.16b,v6.16b
125	st1	{v3.4s},[x2],#16
126	b.ne	.Loop192
127
	// x2 is at original + 208 (16 + 8*24) here; adding 0x20 brings it to
	// original + 240.
128	mov	w12,#12
129	add	x2,x2,#0x20
130	b	.Ldone
131
// 256-bit keys: v4 = key bytes 16..31 and the pass counter is reloaded with
// 7. Each full pass stores v4 then the freshly derived v3. The loop leaves
// through the b.eq after the v3 store of the seventh pass, when x2 is at
// original + 240 (16 + 7*32).
132.align	4
133.L256:
134	ld1	{v4.16b},[x0]
135	mov	w1,#7
136	mov	w12,#14
137	st1	{v3.4s},[x2],#16
138
139.Loop256:
140	tbl	v6.16b,{v4.16b},v2.16b
141	ext	v5.16b,v0.16b,v3.16b,#12
142	st1	{v4.4s},[x2],#16
143	aese	v6.16b,v0.16b
144	subs	w1,w1,#1
145
146	eor	v3.16b,v3.16b,v5.16b
147	ext	v5.16b,v0.16b,v5.16b,#12
148	eor	v3.16b,v3.16b,v5.16b
149	ext	v5.16b,v0.16b,v5.16b,#12
150	eor	v6.16b,v6.16b,v1.16b
151	eor	v3.16b,v3.16b,v5.16b
152	shl	v1.16b,v1.16b,#1
153	eor	v3.16b,v3.16b,v6.16b
154	st1	{v3.4s},[x2],#16
155	b.eq	.Ldone
156
	// Second half of the pass: the top word of v3 is splatted and run
	// through aese (no tbl rotation and no round constant here), then
	// folded into v4 with the same shifted-xor chain.
157	dup	v6.4s,v3.s[3]		// just splat
158	ext	v5.16b,v0.16b,v4.16b,#12
159	aese	v6.16b,v0.16b
160
161	eor	v4.16b,v4.16b,v5.16b
162	ext	v5.16b,v0.16b,v5.16b,#12
163	eor	v4.16b,v4.16b,v5.16b
164	ext	v5.16b,v0.16b,v5.16b,#12
165	eor	v4.16b,v4.16b,v5.16b
166
167	eor	v4.16b,v4.16b,v6.16b
168	b	.Loop256
169
// Success exit: x2 = original + 240 on every path that reaches here. Store
// the round count there and set the return code to 0.
170.Ldone:
171	str	w12,[x2]
172	mov	x3,#0
173
// Common exit. Only x29 is reloaded: nothing in this routine writes x30, so
// the saved copy is simply dropped by the post-increment.
174.Lenc_key_abort:
175	mov	x0,x3			// return value
176	ldr	x29,[sp],#16
177	ret
178.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
179
// aes_v8_set_decrypt_key: build a decryption key schedule.
//
// Takes the same register arguments as aes_v8_set_encrypt_key (x0 = raw key,
// w1 = key bits, x2 = schedule buffer) and first runs that routine through
// its internal .Lenc_key entry. A non-zero result from it is returned
// unchanged. On success the schedule is converted in place: the round keys
// are reversed end-for-end, and every round key except the first and last
// is passed through aesimc. Returns 0 in x0 on success.
// Relies on .Lenc_key leaving x2 at schedule + 240 and the round count in
// w12. Clobbers what .Lenc_key clobbers plus x4 and x0.
180.globl	aes_v8_set_decrypt_key
181.type	aes_v8_set_decrypt_key,%function
182.align	5
183aes_v8_set_decrypt_key:
184.inst	0xd503233f		// paciasp
185	stp	x29,x30,[sp,#-16]!
186	add	x29,sp,#0
	// bl overwrites x30, which is why this routine reloads both x29 and
	// x30 in its epilogue.
187	bl	.Lenc_key
188
189	cmp	x0,#0
190	b.ne	.Ldec_key_abort
191
	// x2 -> first round key, x0 -> last round key (x2 + 16*rounds),
	// x4 = -16 is the post-index step that walks x0 downward.
192	sub	x2,x2,#240		// restore original x2
193	mov	x4,#-16
194	add	x0,x2,x12,lsl#4	// end of key schedule
195
	// Swap the first and last round keys as they are (no aesimc).
196	ld1	{v0.4s},[x2]
197	ld1	{v1.4s},[x0]
198	st1	{v0.4s},[x0],x4
199	st1	{v1.4s},[x2],#16
200
	// Swap the remaining pairs from both ends inward, applying aesimc to
	// each key, until the two pointers meet.
201.Loop_imc:
202	ld1	{v0.4s},[x2]
203	ld1	{v1.4s},[x0]
204	aesimc	v0.16b,v0.16b
205	aesimc	v1.16b,v1.16b
206	st1	{v0.4s},[x0],x4
207	st1	{v1.4s},[x2],#16
208	cmp	x0,x2
209	b.hi	.Loop_imc
210
	// Middle round key: transformed and written back through x0.
211	ld1	{v0.4s},[x2]
212	aesimc	v0.16b,v0.16b
213	st1	{v0.4s},[x0]
214
215	eor	x0,x0,x0		// return value
216.Ldec_key_abort:
217	ldp	x29,x30,[sp],#16
218.inst	0xd50323bf		// autiasp
219	ret
220.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
// aes_v8_encrypt: encrypt a single 16-byte block.
//
// In:   x0 = address of the 16-byte input block
//       x1 = address of the 16-byte output block
//       x2 = key schedule; the round count is read from [x2,#240]
// Leaf routine, no stack use. Clobbers: x2 (advanced through the schedule),
// w3, v0-v2, condition flags.
221.globl	aes_v8_encrypt
222.type	aes_v8_encrypt,%function
223.align	5
224aes_v8_encrypt:
	// v0,v1 = first two round keys, v2 = data block, w3 = rounds - 2.
225	ldr	w3,[x2,#240]
226	ld1	{v0.4s},[x2],#16
227	ld1	{v2.16b},[x0]
228	sub	w3,w3,#2
229	ld1	{v1.4s},[x2],#16
230
	// Two rounds per pass; the next two round keys are fetched while the
	// current ones are being applied. Loops while w3 stays above zero.
231.Loop_enc:
232	aese	v2.16b,v0.16b
233	aesmc	v2.16b,v2.16b
234	ld1	{v0.4s},[x2],#16
235	subs	w3,w3,#2
236	aese	v2.16b,v1.16b
237	aesmc	v2.16b,v2.16b
238	ld1	{v1.4s},[x2],#16
239	b.gt	.Loop_enc
240
	// Last two rounds: the final aese is not followed by aesmc, and the
	// last round key (loaded into v0) is applied with a plain xor.
241	aese	v2.16b,v0.16b
242	aesmc	v2.16b,v2.16b
243	ld1	{v0.4s},[x2]
244	aese	v2.16b,v1.16b
245	eor	v2.16b,v2.16b,v0.16b
246
247	st1	{v2.16b},[x1]
248	ret
249.size	aes_v8_encrypt,.-aes_v8_encrypt
// aes_v8_decrypt: decrypt a single 16-byte block.
//
// In:   x0 = address of the 16-byte input block
//       x1 = address of the 16-byte output block
//       x2 = key schedule; the round count is read from [x2,#240]
// Same structure as aes_v8_encrypt with aesd/aesimc in place of aese/aesmc.
// NOTE(review): the schedule is consumed front to back with aesd, so it
// presumably has to be one prepared by aes_v8_set_decrypt_key - confirm
// against the callers.
// Leaf routine, no stack use. Clobbers: x2 (advanced through the schedule),
// w3, v0-v2, condition flags.
250.globl	aes_v8_decrypt
251.type	aes_v8_decrypt,%function
252.align	5
253aes_v8_decrypt:
	// v0,v1 = first two round keys, v2 = data block, w3 = rounds - 2.
254	ldr	w3,[x2,#240]
255	ld1	{v0.4s},[x2],#16
256	ld1	{v2.16b},[x0]
257	sub	w3,w3,#2
258	ld1	{v1.4s},[x2],#16
259
	// Two rounds per pass with the next two round keys fetched alongside.
	// Loops while w3 stays above zero.
260.Loop_dec:
261	aesd	v2.16b,v0.16b
262	aesimc	v2.16b,v2.16b
263	ld1	{v0.4s},[x2],#16
264	subs	w3,w3,#2
265	aesd	v2.16b,v1.16b
266	aesimc	v2.16b,v2.16b
267	ld1	{v1.4s},[x2],#16
268	b.gt	.Loop_dec
269
	// Last two rounds: the final aesd is not followed by aesimc, and the
	// last round key (loaded into v0) is applied with a plain xor.
270	aesd	v2.16b,v0.16b
271	aesimc	v2.16b,v2.16b
272	ld1	{v0.4s},[x2]
273	aesd	v2.16b,v1.16b
274	eor	v2.16b,v2.16b,v0.16b
275
276	st1	{v2.16b},[x1]
277	ret
278.size	aes_v8_decrypt,.-aes_v8_decrypt
// aes_v8_cbc_encrypt: CBC-mode encryption or decryption of whole blocks.
//
// In:   x0 = input, x1 = output
//       x2 = length in bytes; it is rounded down to a multiple of 16, and a
//            length below 16 returns at once without touching [x4]
//       x3 = key schedule; the round count is read from [x3,#240]
//       x4 = 16-byte chaining value (IV); read at entry and overwritten at
//            .Lcbc_done with the last ciphertext block processed
//       w5 = zero selects the decrypt path, non-zero the encrypt path
// Clobbers: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23, condition flags.
// NOTE(review): the decrypt path applies aesd/aesimc straight from the
// schedule, so x3 presumably has to come from aes_v8_set_decrypt_key in that
// case - confirm against the callers.
279.globl	aes_v8_cbc_encrypt
280.type	aes_v8_cbc_encrypt,%function
281.align	5
282aes_v8_cbc_encrypt:
283	stp	x29,x30,[sp,#-16]!
284	add	x29,sp,#0
	// x2 = length - 16; a borrow means fewer than 16 bytes. x8 is the
	// post-increment used for input loads: 16, or 0 when exactly one block
	// exists, so the pointer is not advanced past the input.
285	subs	x2,x2,#16
286	mov	x8,#16
287	b.lo	.Lcbc_abort
288	csel	x8,xzr,x8,eq
289
	// The compare of w5 with zero is consumed by the b.eq to .Lcbc_dec
	// further down; nothing in between sets flags. w5 itself is reused
	// right away for the round count. v6 = IV, v0 = first input block.
290	cmp	w5,#0			// en- or decrypting?
291	ldr	w5,[x3,#240]
292	and	x2,x2,#-16
293	ld1	{v6.16b},[x4]
294	ld1	{v0.16b},[x0],x8
295
	// v16,v17 = round keys 0 and 1. x7 = x3 + 16*(rounds-6), from which
	// v18-v23 and v7 receive the last seven round keys (v7 is the final
	// one). w5 ends up as rounds-8 and is copied to w6.
296	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
297	sub	w5,w5,#6
298	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
299	sub	w5,w5,#2
300	ld1	{v18.4s,v19.4s},[x7],#32
301	ld1	{v20.4s,v21.4s},[x7],#32
302	ld1	{v22.4s,v23.4s},[x7],#32
303	ld1	{v7.4s},[x7]
304
305	add	x7,x3,#32
306	mov	w6,w5
307	b.eq	.Lcbc_dec
308
	// Encrypt: v0 = first plaintext block xor IV. v5 = round key 0 xor
	// last round key, used to pre-mix every following plaintext block.
	// w5 == 2 (10 rounds) takes the dedicated 128-bit loop.
309	cmp	w5,#2
310	eor	v0.16b,v0.16b,v6.16b
311	eor	v5.16b,v16.16b,v7.16b
312	b.eq	.Lcbc_enc128
313
	// 12- and 14-round path: v2,v3 = round keys 2 and 3. x7, x6, x12, x14
	// and x3 are repointed at round keys 1, 4, 5, 6 and 7; those are
	// reloaded on every pass because v16/v17 get reused.
314	ld1	{v2.4s,v3.4s},[x7]
315	add	x7,x3,#16
316	add	x6,x3,#16*4
317	add	x12,x3,#16*5
318	aese	v0.16b,v16.16b
319	aesmc	v0.16b,v0.16b
320	add	x14,x3,#16*6
321	add	x3,x3,#16*7
322	b	.Lenter_cbc_enc
323
// At the loop head v16 holds (next plaintext xor v5) and v0 holds the
// previous block just before its last-round-key xor, so the key-add inside
// the first aese yields plaintext xor previous ciphertext xor round key 0.
// The previous ciphertext block (v6) is stored here.
324.align	4
325.Loop_cbc_enc:
326	aese	v0.16b,v16.16b
327	aesmc	v0.16b,v0.16b
328	st1	{v6.16b},[x1],#16
329.Lenter_cbc_enc:
330	aese	v0.16b,v17.16b
331	aesmc	v0.16b,v0.16b
332	aese	v0.16b,v2.16b
333	aesmc	v0.16b,v0.16b
334	ld1	{v16.4s},[x6]
	// w5 == 4 (12 rounds) skips the two extra rounds below.
335	cmp	w5,#4
336	aese	v0.16b,v3.16b
337	aesmc	v0.16b,v0.16b
338	ld1	{v17.4s},[x12]
339	b.eq	.Lcbc_enc192
340
341	aese	v0.16b,v16.16b
342	aesmc	v0.16b,v0.16b
343	ld1	{v16.4s},[x14]
344	aese	v0.16b,v17.16b
345	aesmc	v0.16b,v0.16b
346	ld1	{v17.4s},[x3]
347	nop
348
// Shared remainder of the round sequence. x2 counts down one block per pass;
// when it reaches zero x8 becomes 0 so the look-ahead input load does not
// advance x0 any further. The b.hs at the bottom loops while the subtraction
// did not borrow.
349.Lcbc_enc192:
350	aese	v0.16b,v16.16b
351	aesmc	v0.16b,v0.16b
352	subs	x2,x2,#16
353	aese	v0.16b,v17.16b
354	aesmc	v0.16b,v0.16b
355	csel	x8,xzr,x8,eq
356	aese	v0.16b,v18.16b
357	aesmc	v0.16b,v0.16b
358	aese	v0.16b,v19.16b
359	aesmc	v0.16b,v0.16b
360	ld1	{v16.16b},[x0],x8
361	aese	v0.16b,v20.16b
362	aesmc	v0.16b,v0.16b
363	eor	v16.16b,v16.16b,v5.16b
364	aese	v0.16b,v21.16b
365	aesmc	v0.16b,v0.16b
366	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
367	aese	v0.16b,v22.16b
368	aesmc	v0.16b,v0.16b
369	aese	v0.16b,v23.16b
370	eor	v6.16b,v0.16b,v7.16b
371	b.hs	.Loop_cbc_enc
372
	// Store the final ciphertext block; v6 is also written back as the
	// new chaining value at .Lcbc_done.
373	st1	{v6.16b},[x1],#16
374	b	.Lcbc_done
375
// 10-round encrypt loop. Same scheme as above, but all round keys stay
// resident: v16,v17 = keys 0,1 (v16 is replaced by the pre-mixed next
// plaintext during each pass), v2,v3 = keys 2,3, v18-v23 = keys 4-9,
// v7 = key 10.
376.align	5
377.Lcbc_enc128:
378	ld1	{v2.4s,v3.4s},[x7]
379	aese	v0.16b,v16.16b
380	aesmc	v0.16b,v0.16b
381	b	.Lenter_cbc_enc128
382.Loop_cbc_enc128:
383	aese	v0.16b,v16.16b
384	aesmc	v0.16b,v0.16b
385	st1	{v6.16b},[x1],#16
386.Lenter_cbc_enc128:
387	aese	v0.16b,v17.16b
388	aesmc	v0.16b,v0.16b
389	subs	x2,x2,#16
390	aese	v0.16b,v2.16b
391	aesmc	v0.16b,v0.16b
392	csel	x8,xzr,x8,eq
393	aese	v0.16b,v3.16b
394	aesmc	v0.16b,v0.16b
395	aese	v0.16b,v18.16b
396	aesmc	v0.16b,v0.16b
397	aese	v0.16b,v19.16b
398	aesmc	v0.16b,v0.16b
399	ld1	{v16.16b},[x0],x8
400	aese	v0.16b,v20.16b
401	aesmc	v0.16b,v0.16b
402	aese	v0.16b,v21.16b
403	aesmc	v0.16b,v0.16b
404	aese	v0.16b,v22.16b
405	aesmc	v0.16b,v0.16b
406	eor	v16.16b,v16.16b,v5.16b
407	aese	v0.16b,v23.16b
408	eor	v6.16b,v0.16b,v7.16b
409	b.hs	.Loop_cbc_enc128
410
411	st1	{v6.16b},[x1],#16
412	b	.Lcbc_done
// Decrypt path. Up to three blocks are processed per pass in v0, v1 and v18;
// v2, v3 and v19 keep untouched copies of the same input blocks for the
// chaining xor, and v6 holds the chaining value. v18 and v19 are reused as
// data registers here, so only v20-v23 and v7 of the preloaded tail keys are
// used; the earlier round keys are streamed through v16/v17 from x7.
// x2 is biased to length - 48: a borrow means fewer than three blocks, which
// go straight to the tail.
413.align	5
414.Lcbc_dec:
415	ld1	{v18.16b},[x0],#16
416	subs	x2,x2,#32		// bias
417	add	w6,w5,#2
418	orr	v3.16b,v0.16b,v0.16b
419	orr	v1.16b,v0.16b,v0.16b
420	orr	v19.16b,v18.16b,v18.16b
421	b.lo	.Lcbc_dec_tail
422
423	orr	v1.16b,v18.16b,v18.16b
424	ld1	{v18.16b},[x0],#16
425	orr	v2.16b,v0.16b,v0.16b
426	orr	v3.16b,v1.16b,v1.16b
427	orr	v19.16b,v18.16b,v18.16b
428
	// Two rounds per pass on all three blocks; w6 counts the rounds that
	// are driven from the streamed keys.
429.Loop3x_cbc_dec:
430	aesd	v0.16b,v16.16b
431	aesimc	v0.16b,v0.16b
432	aesd	v1.16b,v16.16b
433	aesimc	v1.16b,v1.16b
434	aesd	v18.16b,v16.16b
435	aesimc	v18.16b,v18.16b
436	ld1	{v16.4s},[x7],#16
437	subs	w6,w6,#2
438	aesd	v0.16b,v17.16b
439	aesimc	v0.16b,v0.16b
440	aesd	v1.16b,v17.16b
441	aesimc	v1.16b,v1.16b
442	aesd	v18.16b,v17.16b
443	aesimc	v18.16b,v18.16b
444	ld1	{v17.4s},[x7],#16
445	b.gt	.Loop3x_cbc_dec
446
	// Remaining rounds, interleaved with bookkeeping:
	//  - v4, v5, v17 = chaining inputs (v6, v2, v3) pre-xored with the
	//    last round key v7
	//  - x2 drops by three blocks; on a borrow x6 takes the negative
	//    remainder and x0 is moved back by that amount before the next
	//    three loads
	//  - v6 takes the copy of the third input block as the next chaining
	//    value, and x7 is reset to the start of the schedule
447	aesd	v0.16b,v16.16b
448	aesimc	v0.16b,v0.16b
449	aesd	v1.16b,v16.16b
450	aesimc	v1.16b,v1.16b
451	aesd	v18.16b,v16.16b
452	aesimc	v18.16b,v18.16b
453	eor	v4.16b,v6.16b,v7.16b
454	subs	x2,x2,#0x30
455	eor	v5.16b,v2.16b,v7.16b
456	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
457	aesd	v0.16b,v17.16b
458	aesimc	v0.16b,v0.16b
459	aesd	v1.16b,v17.16b
460	aesimc	v1.16b,v1.16b
461	aesd	v18.16b,v17.16b
462	aesimc	v18.16b,v18.16b
463	eor	v17.16b,v3.16b,v7.16b
464	add	x0,x0,x6		// x0 is adjusted in such way that
465					// at exit from the loop v1.16b-v18.16b
466					// are loaded with last "words"
467	orr	v6.16b,v19.16b,v19.16b
468	mov	x7,x3
469	aesd	v0.16b,v20.16b
470	aesimc	v0.16b,v0.16b
471	aesd	v1.16b,v20.16b
472	aesimc	v1.16b,v1.16b
473	aesd	v18.16b,v20.16b
474	aesimc	v18.16b,v18.16b
475	ld1	{v2.16b},[x0],#16
476	aesd	v0.16b,v21.16b
477	aesimc	v0.16b,v0.16b
478	aesd	v1.16b,v21.16b
479	aesimc	v1.16b,v1.16b
480	aesd	v18.16b,v21.16b
481	aesimc	v18.16b,v18.16b
482	ld1	{v3.16b},[x0],#16
483	aesd	v0.16b,v22.16b
484	aesimc	v0.16b,v0.16b
485	aesd	v1.16b,v22.16b
486	aesimc	v1.16b,v1.16b
487	aesd	v18.16b,v22.16b
488	aesimc	v18.16b,v18.16b
489	ld1	{v19.16b},[x0],#16
490	aesd	v0.16b,v23.16b
491	aesd	v1.16b,v23.16b
492	aesd	v18.16b,v23.16b
493	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
494	add	w6,w5,#2
	// Finish the three plaintext blocks, store them, and move the newly
	// loaded input blocks into the working registers. The b.hs still
	// sees the flags of the subs on x2 above.
495	eor	v4.16b,v4.16b,v0.16b
496	eor	v5.16b,v5.16b,v1.16b
497	eor	v18.16b,v18.16b,v17.16b
498	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
499	st1	{v4.16b},[x1],#16
500	orr	v0.16b,v2.16b,v2.16b
501	st1	{v5.16b},[x1],#16
502	orr	v1.16b,v3.16b,v3.16b
503	st1	{v18.16b},[x1],#16
504	orr	v18.16b,v19.16b,v19.16b
505	b.hs	.Loop3x_cbc_dec
506
	// x2 == -0x30 means nothing is left; otherwise one or two blocks
	// remain for the tail.
507	cmn	x2,#0x30
508	b.eq	.Lcbc_done
509	nop
510
// Tail: the same round sequence on two registers (v1 and v18). The compare
// of x2 against -0x20 partway through selects the one-block finish; the
// vector instructions after it leave the flags alone until the b.eq.
511.Lcbc_dec_tail:
512	aesd	v1.16b,v16.16b
513	aesimc	v1.16b,v1.16b
514	aesd	v18.16b,v16.16b
515	aesimc	v18.16b,v18.16b
516	ld1	{v16.4s},[x7],#16
517	subs	w6,w6,#2
518	aesd	v1.16b,v17.16b
519	aesimc	v1.16b,v1.16b
520	aesd	v18.16b,v17.16b
521	aesimc	v18.16b,v18.16b
522	ld1	{v17.4s},[x7],#16
523	b.gt	.Lcbc_dec_tail
524
525	aesd	v1.16b,v16.16b
526	aesimc	v1.16b,v1.16b
527	aesd	v18.16b,v16.16b
528	aesimc	v18.16b,v18.16b
529	aesd	v1.16b,v17.16b
530	aesimc	v1.16b,v1.16b
531	aesd	v18.16b,v17.16b
532	aesimc	v18.16b,v18.16b
533	aesd	v1.16b,v20.16b
534	aesimc	v1.16b,v1.16b
535	aesd	v18.16b,v20.16b
536	aesimc	v18.16b,v18.16b
537	cmn	x2,#0x20
538	aesd	v1.16b,v21.16b
539	aesimc	v1.16b,v1.16b
540	aesd	v18.16b,v21.16b
541	aesimc	v18.16b,v18.16b
542	eor	v5.16b,v6.16b,v7.16b
543	aesd	v1.16b,v22.16b
544	aesimc	v1.16b,v1.16b
545	aesd	v18.16b,v22.16b
546	aesimc	v18.16b,v18.16b
547	eor	v17.16b,v3.16b,v7.16b
548	aesd	v1.16b,v23.16b
549	aesd	v18.16b,v23.16b
550	b.eq	.Lcbc_dec_one
	// Two blocks: v1 is chained with v6, v18 with the saved copy in v3;
	// v19 becomes the new chaining value.
551	eor	v5.16b,v5.16b,v1.16b
552	eor	v17.16b,v17.16b,v18.16b
553	orr	v6.16b,v19.16b,v19.16b
554	st1	{v5.16b},[x1],#16
555	st1	{v17.16b},[x1],#16
556	b	.Lcbc_done
557
	// One block: only the result in v18 is used, chained with v6.
558.Lcbc_dec_one:
559	eor	v5.16b,v5.16b,v18.16b
560	orr	v6.16b,v19.16b,v19.16b
561	st1	{v5.16b},[x1],#16
562
// Write the chaining value back through x4. The abort label below skips this
// store. Only x29 is reloaded: x30 is never written in this routine.
563.Lcbc_done:
564	st1	{v6.16b},[x4]
565.Lcbc_abort:
566	ldr	x29,[sp],#16
567	ret
568.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
// aes_v8_ctr32_encrypt_blocks: CTR-mode processing of whole 16-byte blocks.
//
// In:   x0 = input, x1 = output
//       x2 = number of 16-byte blocks
//       x3 = key schedule; the round count is read from [x3,#240]
//       x4 = 16-byte initial counter block; the 32-bit word at [x4,#12] is
//            the counter that is stepped once per block
// The counter block is only read; nothing is stored back through x4.
// Clobbers: x0-x2, x5-x10, x12, v0-v7, v16-v23, condition flags.
// NOTE(review): x2 == 0 is not special-cased - the tail path would still
// load from x0 and store two blocks to x1. Callers presumably never pass 0;
// confirm.
569.globl	aes_v8_ctr32_encrypt_blocks
570.type	aes_v8_ctr32_encrypt_blocks,%function
571.align	5
572aes_v8_ctr32_encrypt_blocks:
573	stp	x29,x30,[sp,#-16]!
574	add	x29,sp,#0
575	ldr	w5,[x3,#240]
576
	// w8 = counter word, v0 = whole counter block.
577	ldr	w8, [x4, #12]
578	ld1	{v0.4s},[x4]
579
	// v16,v17 = round keys 0 and 1. x7 = x3 + 16*(rounds-4), from which
	// v20-v23 and v7 receive the last five round keys (v7 is the final
	// one). w5 ends up as rounds-6 and is copied to w6 as the loop
	// counter. The compare of x2 with 2 feeds both the csel on x12 and
	// the b.ls below; nothing in between sets flags.
	// x12 is the input post-increment used by the tail: 16, or 0 when
	// fewer than two blocks were requested.
580	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
581	sub	w5,w5,#4
582	mov	x12,#16
583	cmp	x2,#2
584	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
585	sub	w5,w5,#2
586	ld1	{v20.4s,v21.4s},[x7],#32
587	ld1	{v22.4s,v23.4s},[x7],#32
588	ld1	{v7.4s},[x7]
589	add	x7,x3,#32
590	mov	w6,w5
591	csel	x12,xzr,x12,lo
592#ifndef __ARMEB__
593	rev	w8, w8
594#endif
	// v6 keeps a copy of the counter block as a template. v1 = template
	// with counter+1 in its last word; w8 is advanced to counter+2.
	// Two blocks or fewer go straight to the tail.
595	orr	v1.16b,v0.16b,v0.16b
596	add	w10, w8, #1
597	orr	v18.16b,v0.16b,v0.16b
598	add	w8, w8, #2
599	orr	v6.16b,v0.16b,v0.16b
600	rev	w10, w10
601	mov	v1.s[3],w10
602	b.ls	.Lctr32_tail
	// Three or more blocks: v18 = template with counter+2; x2 is biased
	// by -3 for the loop. w12 is used as scratch here, which discards
	// the stride held in x12; it is set again before the tail.
603	rev	w12, w8
604	sub	x2,x2,#3		// bias
605	mov	v18.s[3],w12
606	b	.Loop3x_ctr32
607
// Main loop: three counter blocks (v0, v1, v18) are encrypted per outer
// pass. This inner part applies two rounds per pass using keys streamed
// through v16/v17 from x7.
608.align	4
609.Loop3x_ctr32:
610	aese	v0.16b,v16.16b
611	aesmc	v0.16b,v0.16b
612	aese	v1.16b,v16.16b
613	aesmc	v1.16b,v1.16b
614	aese	v18.16b,v16.16b
615	aesmc	v18.16b,v18.16b
616	ld1	{v16.4s},[x7],#16
617	subs	w6,w6,#2
618	aese	v0.16b,v17.16b
619	aesmc	v0.16b,v0.16b
620	aese	v1.16b,v17.16b
621	aesmc	v1.16b,v1.16b
622	aese	v18.16b,v17.16b
623	aesmc	v18.16b,v18.16b
624	ld1	{v17.4s},[x7],#16
625	b.gt	.Loop3x_ctr32
626
	// Remaining rounds. The in-flight states move to v4, v5 and v17 so
	// that v0, v1 and v18 can be rebuilt from the template v6 with the
	// next three counter values (w8+1, w8+2, w8+3; w8 is advanced by 3).
	// The three input blocks are loaded into v2, v3, v19 and pre-xored
	// with the last round key v7. x7 is reset to the schedule start.
627	aese	v0.16b,v16.16b
628	aesmc	v4.16b,v0.16b
629	aese	v1.16b,v16.16b
630	aesmc	v5.16b,v1.16b
631	ld1	{v2.16b},[x0],#16
632	orr	v0.16b,v6.16b,v6.16b
633	aese	v18.16b,v16.16b
634	aesmc	v18.16b,v18.16b
635	ld1	{v3.16b},[x0],#16
636	orr	v1.16b,v6.16b,v6.16b
637	aese	v4.16b,v17.16b
638	aesmc	v4.16b,v4.16b
639	aese	v5.16b,v17.16b
640	aesmc	v5.16b,v5.16b
641	ld1	{v19.16b},[x0],#16
642	mov	x7,x3
643	aese	v18.16b,v17.16b
644	aesmc	v17.16b,v18.16b
645	orr	v18.16b,v6.16b,v6.16b
646	add	w9,w8,#1
647	aese	v4.16b,v20.16b
648	aesmc	v4.16b,v4.16b
649	aese	v5.16b,v20.16b
650	aesmc	v5.16b,v5.16b
651	eor	v2.16b,v2.16b,v7.16b
652	add	w10,w8,#2
653	aese	v17.16b,v20.16b
654	aesmc	v17.16b,v17.16b
655	eor	v3.16b,v3.16b,v7.16b
656	add	w8,w8,#3
657	aese	v4.16b,v21.16b
658	aesmc	v4.16b,v4.16b
659	aese	v5.16b,v21.16b
660	aesmc	v5.16b,v5.16b
661	eor	v19.16b,v19.16b,v7.16b
662	rev	w9,w9
663	aese	v17.16b,v21.16b
664	aesmc	v17.16b,v17.16b
665	mov	v0.s[3], w9
666	rev	w10,w10
667	aese	v4.16b,v22.16b
668	aesmc	v4.16b,v4.16b
669	aese	v5.16b,v22.16b
670	aesmc	v5.16b,v5.16b
671	mov	v1.s[3], w10
672	rev	w12,w8
673	aese	v17.16b,v22.16b
674	aesmc	v17.16b,v17.16b
675	mov	v18.s[3], w12
	// x2 drops by three blocks; the b.hs at the bottom still sees these
	// flags because only vector, load/store and mov instructions follow.
676	subs	x2,x2,#3
677	aese	v4.16b,v23.16b
678	aese	v5.16b,v23.16b
679	aese	v17.16b,v23.16b
680
	// Combine keystream with the pre-xored input and store three output
	// blocks; reload round keys 0 and 1 and the round counter.
681	eor	v2.16b,v2.16b,v4.16b
682	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
683	st1	{v2.16b},[x1],#16
684	eor	v3.16b,v3.16b,v5.16b
685	mov	w6,w5
686	st1	{v3.16b},[x1],#16
687	eor	v19.16b,v19.16b,v17.16b
688	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
689	st1	{v19.16b},[x1],#16
690	b.hs	.Loop3x_ctr32
691
	// Undo the bias: x2 = blocks left (0, 1 or 2). Zero means done.
	// Otherwise set the tail stride again: 0 for one block, 16 for two.
692	adds	x2,x2,#3
693	b.eq	.Lctr32_done
694	cmp	x2,#1
695	mov	x12,#16
696	csel	x12,xzr,x12,eq
697
// Tail: one or two blocks, using the counter blocks already prepared in v0
// and v1. Both are always encrypted; the second result is only stored when
// x2 is not 1.
698.Lctr32_tail:
699	aese	v0.16b,v16.16b
700	aesmc	v0.16b,v0.16b
701	aese	v1.16b,v16.16b
702	aesmc	v1.16b,v1.16b
703	ld1	{v16.4s},[x7],#16
704	subs	w6,w6,#2
705	aese	v0.16b,v17.16b
706	aesmc	v0.16b,v0.16b
707	aese	v1.16b,v17.16b
708	aesmc	v1.16b,v1.16b
709	ld1	{v17.4s},[x7],#16
710	b.gt	.Lctr32_tail
711
712	aese	v0.16b,v16.16b
713	aesmc	v0.16b,v0.16b
714	aese	v1.16b,v16.16b
715	aesmc	v1.16b,v1.16b
716	aese	v0.16b,v17.16b
717	aesmc	v0.16b,v0.16b
718	aese	v1.16b,v17.16b
719	aesmc	v1.16b,v1.16b
	// With a stride of 0 the second load below reads the same block
	// again instead of reading beyond the input.
720	ld1	{v2.16b},[x0],x12
721	aese	v0.16b,v20.16b
722	aesmc	v0.16b,v0.16b
723	aese	v1.16b,v20.16b
724	aesmc	v1.16b,v1.16b
725	ld1	{v3.16b},[x0]
726	aese	v0.16b,v21.16b
727	aesmc	v0.16b,v0.16b
728	aese	v1.16b,v21.16b
729	aesmc	v1.16b,v1.16b
730	eor	v2.16b,v2.16b,v7.16b
731	aese	v0.16b,v22.16b
732	aesmc	v0.16b,v0.16b
733	aese	v1.16b,v22.16b
734	aesmc	v1.16b,v1.16b
735	eor	v3.16b,v3.16b,v7.16b
736	aese	v0.16b,v23.16b
737	aese	v1.16b,v23.16b
738
739	cmp	x2,#1
740	eor	v2.16b,v2.16b,v0.16b
741	eor	v3.16b,v3.16b,v1.16b
742	st1	{v2.16b},[x1],#16
743	b.eq	.Lctr32_done
744	st1	{v3.16b},[x1]
745
// Only x29 is reloaded: x30 is never written in this routine.
746.Lctr32_done:
747	ldr	x29,[sp],#16
748	ret
749.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
750#endif
751