/* Origin: /freebsd/sys/crypto/openssl/aarch64/aesv8-armx.S (revision 575878a533823aa3e5bab715928d9cdffbc4dcbc) */
/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text
.align	5
// Constant table read by the key-expansion code (via "adr x3,.Lrcon"):
//   row 0 (+0):  initial round constant 0x01 in every 32-bit lane; the
//                expansion loops double it with a per-byte shift each round
//   row 1 (+16): byte-index mask for tbl that rotates the last word of the
//                source vector and replicates it into all four lanes
//   row 2 (+32): round constant 0x1b, loaded by the 128-bit path once the
//                doubled row-0 value has been used for eight rounds
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

//-----------------------------------------------------------------------
// aes_v8_set_encrypt_key: expand a user key into an AES encryption key
// schedule using the Armv8 Crypto Extension.
//
// In:   x0 = user key bytes (16, 24 or 32 bytes are read, depending on w1)
//       w1 = key length in bits; accepted values are 128, 192 and 256
//       x2 = destination key schedule
// Out:  x0 =  0 on success
//            -1 if x0 or x2 is NULL
//            -2 if w1 is out of range or not a multiple of 64
//       On success the round keys are written consecutively from x2 and
//       the round count (10, 12 or 14) is stored as a 32-bit word at byte
//       offset 240 of the schedule.
// Internal contract (aes_v8_set_decrypt_key enters through .Lenc_key):
//       on success x2 is left pointing at schedule+240 and w12 holds the
//       round count.
// Clobbers: x0, x2, x3, w1, w12, v0-v6, flags.
//-----------------------------------------------------------------------
.globl	aes_v8_set_encrypt_key
.type	aes_v8_set_encrypt_key,%function
.align	5
aes_v8_set_encrypt_key:
.Lenc_key:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1			// x3 = pending return value: NULL argument
	cmp	x0,#0
	b.eq	.Lenc_key_abort
	cmp	x2,#0
	b.eq	.Lenc_key_abort
	mov	x3,#-2			// x3 = pending return value: bad key length
	cmp	w1,#128
	b.lt	.Lenc_key_abort
	cmp	w1,#256
	b.gt	.Lenc_key_abort
	tst	w1,#0x3f		// must be a multiple of 64
	b.ne	.Lenc_key_abort

	adr	x3,.Lrcon
	cmp	w1,#192			// flags are consumed by the three-way branch below

	eor	v0.16b,v0.16b,v0.16b	// v0 = 0; used as the aese key and as ext filler
	ld1	{v3.16b},[x0],#16	// v3 = first 16 bytes of the user key
	mov	w1,#8		// reuse w1
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon, v2 = rotate-n-splat mask

	// None of the four instructions above modify the flags set by cmp w1,#192.
	b.lt	.Loop128
	b.eq	.L192
	b	.L256

// ---- 128-bit key: 8 looped rounds plus 2 unrolled rounds ----
.align	4
.Loop128:
	tbl	v6.16b,{v3.16b},v2.16b	// rotate last word of v3 and splat it
	ext	v5.16b,v0.16b,v3.16b,#12	// v5 = v3 shifted up one word, zero filled
	st1	{v3.4s},[x2],#16	// emit current round key
	aese	v6.16b,v0.16b		// zero key: applies the S-box step to the splat
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b	// fold in the round constant
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// next round constant
	eor	v3.16b,v3.16b,v6.16b	// v3 = next round key
	b.ne	.Loop128

	ld1	{v1.4s},[x3]		// x3 = .Lrcon+32: switch to the 0x1b constant

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// last round key at schedule+160, no post-increment
	add	x2,x2,#0x50		// 160 + 0x50 = 240: x2 -> rounds field

	mov	w12,#10
	b	.Ldone

// ---- 192-bit key: 8 rounds, each emitting 24 bytes of schedule ----
.align	4
.L192:
	ld1	{v4.8b},[x0],#8		// v4 = remaining 8 bytes of the user key
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask

.Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
#ifdef __AARCH64EB__
	// Big-endian build: store the full vector, then step x2 back so the
	// net advance is 8 bytes, matching the little-endian path.
	st1	{v4.4s},[x2],#16
	sub	x2,x2,#8
#else
	st1	{v4.8b},[x2],#8
#endif
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	.Loop192

	mov	w12,#12
	add	x2,x2,#0x20		// 16 + 8*24 = 208; + 0x20 = 240: x2 -> rounds field
	b	.Ldone

// ---- 256-bit key: 7 passes, each emitting two round keys ----
.align	4
.L256:
	ld1	{v4.16b},[x0]		// v4 = second 16 bytes of the user key
	mov	w1,#7
	mov	w12,#14
	st1	{v3.4s},[x2],#16

.Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	.Ldone			// after the 7th pass x2 = schedule+240

	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	.Loop256

.Ldone:
	str	w12,[x2]		// rounds field at schedule+240
	mov	x3,#0

.Lenc_key_abort:
	mov	x0,x3			// return value
	ldr	x29,[sp],#16		// restore x29 only; x30 was never modified here
	ret
.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key

//-----------------------------------------------------------------------
// aes_v8_set_decrypt_key: build an AES decryption key schedule.
//
// In:   x0, w1, x2 as for aes_v8_set_encrypt_key (passed through unchanged
//       to .Lenc_key).
// Out:  x0 = 0 on success, otherwise the non-zero status from .Lenc_key.
//
// After the encryption schedule has been generated, the round keys are
// reversed in place (first <-> last, second <-> second-to-last, ...).
// aesimc is applied to every round key except the original first and
// last ones, which are swapped untouched.
//
// Relies on .Lenc_key leaving x2 = schedule+240 and w12 = round count.
// Clobbers: x0, x2, x4, v0, v1, flags, plus everything .Lenc_key clobbers.
//-----------------------------------------------------------------------
.globl	aes_v8_set_decrypt_key
.type	aes_v8_set_decrypt_key,%function
.align	5
aes_v8_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort		// propagate the error status in x0

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// post-index stride for walking x0 downwards
	add	x0,x2,x12,lsl#4	// end of key schedule

	// Swap the outermost pair of round keys as-is.
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

	// Swap the remaining pairs, transforming each key with aesimc.
	// x2 walks up, x0 walks down; stop when they meet.
.Loop_imc:
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	.Loop_imc

	// Loop exit leaves x0 == x2 at the middle round key (the key count,
	// rounds+1, is odd): transform it in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
//-----------------------------------------------------------------------
// aes_v8_encrypt: encrypt one 16-byte block.
//
// In:   x0 = input block (16 bytes)
//       x1 = output block (16 bytes)
//       x2 = key schedule; the round count is read from byte offset 240
// Out:  16 bytes written to [x1]; no return value is set.
// Leaf function, no stack use.
// Clobbers: x2, w3, v0-v2, flags.
//-----------------------------------------------------------------------
.globl	aes_v8_encrypt
.type	aes_v8_encrypt,%function
.align	5
aes_v8_encrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = number of rounds
	ld1	{v0.4s},[x2],#16	// v0 = round key 0
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the last two rounds are handled after the loop
	ld1	{v1.4s},[x2],#16	// v1 = round key 1

	// Two full rounds per iteration; v0/v1 are reloaded with the next
	// pair of round keys while the current pair is being applied.
.Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_enc

	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// v0 = final round key
	aese	v2.16b,v1.16b		// final round: no aesmc
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_encrypt,.-aes_v8_encrypt
//-----------------------------------------------------------------------
// aes_v8_decrypt: decrypt one 16-byte block.
//
// In:   x0 = input block (16 bytes)
//       x1 = output block (16 bytes)
//       x2 = key schedule; the round count is read from byte offset 240.
//            Round keys are consumed in ascending address order, i.e. in
//            the layout aes_v8_set_decrypt_key produces.
// Out:  16 bytes written to [x1]; no return value is set.
// Leaf function, no stack use.
// Clobbers: x2, w3, v0-v2, flags.
//-----------------------------------------------------------------------
.globl	aes_v8_decrypt
.type	aes_v8_decrypt,%function
.align	5
aes_v8_decrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = number of rounds
	ld1	{v0.4s},[x2],#16	// v0 = round key 0
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the last two rounds are handled after the loop
	ld1	{v1.4s},[x2],#16	// v1 = round key 1

	// Two full rounds per iteration; v0/v1 are reloaded with the next
	// pair of round keys while the current pair is being applied.
.Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_dec

	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// v0 = final round key
	aesd	v2.16b,v1.16b		// final round: no aesimc
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_decrypt,.-aes_v8_decrypt
288c0855eaaSJohn Baldwin.globl	aes_v8_ecb_encrypt
289c0855eaaSJohn Baldwin.type	aes_v8_ecb_encrypt,%function
290c0855eaaSJohn Baldwin.align	5
291c0855eaaSJohn Baldwinaes_v8_ecb_encrypt:
292bd9588bcSAndrew Turner	AARCH64_VALID_CALL_TARGET
293c0855eaaSJohn Baldwin	subs	x2,x2,#16
294c0855eaaSJohn Baldwin	// Original input data size bigger than 16, jump to big size processing.
295c0855eaaSJohn Baldwin	b.ne	.Lecb_big_size
296c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0]
297c0855eaaSJohn Baldwin	cmp	w4,#0					// en- or decrypting?
298c0855eaaSJohn Baldwin	ldr	w5,[x3,#240]
299c0855eaaSJohn Baldwin	ld1	{v5.4s,v6.4s},[x3],#32			// load key schedule...
300c0855eaaSJohn Baldwin
301c0855eaaSJohn Baldwin	b.eq	.Lecb_small_dec
302c0855eaaSJohn Baldwin	aese	v0.16b,v5.16b
303c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
304c0855eaaSJohn Baldwin	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
305c0855eaaSJohn Baldwin	aese	v0.16b,v6.16b
306c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
307c0855eaaSJohn Baldwin	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb processing
308c0855eaaSJohn Baldwin	b.eq	.Lecb_128_enc
309c0855eaaSJohn Baldwin.Lecb_round_loop:
310c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
311c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
312c0855eaaSJohn Baldwin	ld1	{v16.4s},[x3],#16				// load key schedule...
313c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
314c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
315c0855eaaSJohn Baldwin	ld1	{v17.4s},[x3],#16				// load key schedule...
316c0855eaaSJohn Baldwin	subs	w5,w5,#2			// bias
317c0855eaaSJohn Baldwin	b.gt	.Lecb_round_loop
318c0855eaaSJohn Baldwin.Lecb_128_enc:
319c0855eaaSJohn Baldwin	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
320c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
321c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
322c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
323c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
324c0855eaaSJohn Baldwin	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
325c0855eaaSJohn Baldwin	aese	v0.16b,v18.16b
326c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
327c0855eaaSJohn Baldwin	aese	v0.16b,v19.16b
328c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
329c0855eaaSJohn Baldwin	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
330c0855eaaSJohn Baldwin	aese	v0.16b,v20.16b
331c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
332c0855eaaSJohn Baldwin	aese	v0.16b,v21.16b
333c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
334c0855eaaSJohn Baldwin	ld1	{v7.4s},[x3]
335c0855eaaSJohn Baldwin	aese	v0.16b,v22.16b
336c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
337c0855eaaSJohn Baldwin	aese	v0.16b,v23.16b
338c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v7.16b
339c0855eaaSJohn Baldwin	st1	{v0.16b},[x1]
340c0855eaaSJohn Baldwin	b	.Lecb_Final_abort
341c0855eaaSJohn Baldwin.Lecb_small_dec:
342c0855eaaSJohn Baldwin	aesd	v0.16b,v5.16b
343c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
344c0855eaaSJohn Baldwin	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
345c0855eaaSJohn Baldwin	aesd	v0.16b,v6.16b
346c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
347c0855eaaSJohn Baldwin	subs	w5,w5,#10			// bias
348c0855eaaSJohn Baldwin	b.eq	.Lecb_128_dec
349c0855eaaSJohn Baldwin.Lecb_dec_round_loop:
350c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
351c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
352c0855eaaSJohn Baldwin	ld1	{v16.4s},[x3],#16				// load key schedule...
353c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
354c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
355c0855eaaSJohn Baldwin	ld1	{v17.4s},[x3],#16				// load key schedule...
356c0855eaaSJohn Baldwin	subs	w5,w5,#2			// bias
357c0855eaaSJohn Baldwin	b.gt	.Lecb_dec_round_loop
358c0855eaaSJohn Baldwin.Lecb_128_dec:
359c0855eaaSJohn Baldwin	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
360c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
361c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
362c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
363c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
364c0855eaaSJohn Baldwin	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
365c0855eaaSJohn Baldwin	aesd	v0.16b,v18.16b
366c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
367c0855eaaSJohn Baldwin	aesd	v0.16b,v19.16b
368c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
369c0855eaaSJohn Baldwin	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
370c0855eaaSJohn Baldwin	aesd	v0.16b,v20.16b
371c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
372c0855eaaSJohn Baldwin	aesd	v0.16b,v21.16b
373c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
374c0855eaaSJohn Baldwin	ld1	{v7.4s},[x3]
375c0855eaaSJohn Baldwin	aesd	v0.16b,v22.16b
376c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
377c0855eaaSJohn Baldwin	aesd	v0.16b,v23.16b
378c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v7.16b
379c0855eaaSJohn Baldwin	st1	{v0.16b},[x1]
380c0855eaaSJohn Baldwin	b	.Lecb_Final_abort
381c0855eaaSJohn Baldwin.Lecb_big_size:
382c0855eaaSJohn Baldwin	stp	x29,x30,[sp,#-16]!
383c0855eaaSJohn Baldwin	add	x29,sp,#0
384c0855eaaSJohn Baldwin	mov	x8,#16
385c0855eaaSJohn Baldwin	b.lo	.Lecb_done
386c0855eaaSJohn Baldwin	csel	x8,xzr,x8,eq
387c0855eaaSJohn Baldwin
388c0855eaaSJohn Baldwin	cmp	w4,#0					// en- or decrypting?
389c0855eaaSJohn Baldwin	ldr	w5,[x3,#240]
390c0855eaaSJohn Baldwin	and	x2,x2,#-16
391c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0],x8
392c0855eaaSJohn Baldwin
393c0855eaaSJohn Baldwin	ld1	{v16.4s,v17.4s},[x3]				// load key schedule...
394c0855eaaSJohn Baldwin	sub	w5,w5,#6
395c0855eaaSJohn Baldwin	add	x7,x3,x5,lsl#4				// pointer to last 7 round keys
396c0855eaaSJohn Baldwin	sub	w5,w5,#2
397c0855eaaSJohn Baldwin	ld1	{v18.4s,v19.4s},[x7],#32
398c0855eaaSJohn Baldwin	ld1	{v20.4s,v21.4s},[x7],#32
399c0855eaaSJohn Baldwin	ld1	{v22.4s,v23.4s},[x7],#32
400c0855eaaSJohn Baldwin	ld1	{v7.4s},[x7]
401c0855eaaSJohn Baldwin
402c0855eaaSJohn Baldwin	add	x7,x3,#32
403c0855eaaSJohn Baldwin	mov	w6,w5
404c0855eaaSJohn Baldwin	b.eq	.Lecb_dec
405c0855eaaSJohn Baldwin
406c0855eaaSJohn Baldwin	ld1	{v1.16b},[x0],#16
407c0855eaaSJohn Baldwin	subs	x2,x2,#32				// bias
408c0855eaaSJohn Baldwin	add	w6,w5,#2
409c0855eaaSJohn Baldwin	orr	v3.16b,v1.16b,v1.16b
410c0855eaaSJohn Baldwin	orr	v24.16b,v1.16b,v1.16b
411c0855eaaSJohn Baldwin	orr	v1.16b,v0.16b,v0.16b
412c0855eaaSJohn Baldwin	b.lo	.Lecb_enc_tail
413c0855eaaSJohn Baldwin
414c0855eaaSJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
415c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16
416c0855eaaSJohn Baldwin	cmp	x2,#32
417c0855eaaSJohn Baldwin	b.lo	.Loop3x_ecb_enc
418c0855eaaSJohn Baldwin
419c0855eaaSJohn Baldwin	ld1	{v25.16b},[x0],#16
420c0855eaaSJohn Baldwin	ld1	{v26.16b},[x0],#16
421c0855eaaSJohn Baldwin	sub	x2,x2,#32				// bias
422c0855eaaSJohn Baldwin	mov	w6,w5
423c0855eaaSJohn Baldwin
424c0855eaaSJohn Baldwin.Loop5x_ecb_enc:
425c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
426c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
427c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
428c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
429c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
430c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
431c0855eaaSJohn Baldwin	aese	v25.16b,v16.16b
432c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
433c0855eaaSJohn Baldwin	aese	v26.16b,v16.16b
434c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
435c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
436c0855eaaSJohn Baldwin	subs	w6,w6,#2
437c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
438c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
439c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
440c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
441c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
442c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
443c0855eaaSJohn Baldwin	aese	v25.16b,v17.16b
444c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
445c0855eaaSJohn Baldwin	aese	v26.16b,v17.16b
446c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
447c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
448c0855eaaSJohn Baldwin	b.gt	.Loop5x_ecb_enc
449c0855eaaSJohn Baldwin
450c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
451c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
452c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
453c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
454c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
455c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
456c0855eaaSJohn Baldwin	aese	v25.16b,v16.16b
457c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
458c0855eaaSJohn Baldwin	aese	v26.16b,v16.16b
459c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
460c0855eaaSJohn Baldwin	cmp	x2,#0x40					// because .Lecb_enc_tail4x
461c0855eaaSJohn Baldwin	sub	x2,x2,#0x50
462c0855eaaSJohn Baldwin
463c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
464c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
465c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
466c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
467c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
468c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
469c0855eaaSJohn Baldwin	aese	v25.16b,v17.16b
470c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
471c0855eaaSJohn Baldwin	aese	v26.16b,v17.16b
472c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
473c0855eaaSJohn Baldwin	csel	x6,xzr,x2,gt			// borrow x6, w6, "gt" is not typo
474c0855eaaSJohn Baldwin	mov	x7,x3
475c0855eaaSJohn Baldwin
476c0855eaaSJohn Baldwin	aese	v0.16b,v18.16b
477c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
478c0855eaaSJohn Baldwin	aese	v1.16b,v18.16b
479c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
480c0855eaaSJohn Baldwin	aese	v24.16b,v18.16b
481c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
482c0855eaaSJohn Baldwin	aese	v25.16b,v18.16b
483c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
484c0855eaaSJohn Baldwin	aese	v26.16b,v18.16b
485c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
486c0855eaaSJohn Baldwin	add	x0,x0,x6				// x0 is adjusted in such way that
487c0855eaaSJohn Baldwin							// at exit from the loop v1.16b-v26.16b
488c0855eaaSJohn Baldwin							// are loaded with last "words"
489c0855eaaSJohn Baldwin	add	x6,x2,#0x60		    // because .Lecb_enc_tail4x
490c0855eaaSJohn Baldwin
491c0855eaaSJohn Baldwin	aese	v0.16b,v19.16b
492c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
493c0855eaaSJohn Baldwin	aese	v1.16b,v19.16b
494c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
495c0855eaaSJohn Baldwin	aese	v24.16b,v19.16b
496c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
497c0855eaaSJohn Baldwin	aese	v25.16b,v19.16b
498c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
499c0855eaaSJohn Baldwin	aese	v26.16b,v19.16b
500c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
501c0855eaaSJohn Baldwin
502c0855eaaSJohn Baldwin	aese	v0.16b,v20.16b
503c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
504c0855eaaSJohn Baldwin	aese	v1.16b,v20.16b
505c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
506c0855eaaSJohn Baldwin	aese	v24.16b,v20.16b
507c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
508c0855eaaSJohn Baldwin	aese	v25.16b,v20.16b
509c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
510c0855eaaSJohn Baldwin	aese	v26.16b,v20.16b
511c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
512c0855eaaSJohn Baldwin
513c0855eaaSJohn Baldwin	aese	v0.16b,v21.16b
514c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
515c0855eaaSJohn Baldwin	aese	v1.16b,v21.16b
516c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
517c0855eaaSJohn Baldwin	aese	v24.16b,v21.16b
518c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
519c0855eaaSJohn Baldwin	aese	v25.16b,v21.16b
520c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
521c0855eaaSJohn Baldwin	aese	v26.16b,v21.16b
522c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
523c0855eaaSJohn Baldwin
524c0855eaaSJohn Baldwin	aese	v0.16b,v22.16b
525c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
526c0855eaaSJohn Baldwin	aese	v1.16b,v22.16b
527c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
528c0855eaaSJohn Baldwin	aese	v24.16b,v22.16b
529c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
530c0855eaaSJohn Baldwin	aese	v25.16b,v22.16b
531c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
532c0855eaaSJohn Baldwin	aese	v26.16b,v22.16b
533c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
534c0855eaaSJohn Baldwin
535c0855eaaSJohn Baldwin	aese	v0.16b,v23.16b
536c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16
537c0855eaaSJohn Baldwin	aese	v1.16b,v23.16b
538c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16
539c0855eaaSJohn Baldwin	aese	v24.16b,v23.16b
540c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
541c0855eaaSJohn Baldwin	aese	v25.16b,v23.16b
542c0855eaaSJohn Baldwin	ld1	{v28.16b},[x0],#16
543c0855eaaSJohn Baldwin	aese	v26.16b,v23.16b
544c0855eaaSJohn Baldwin	ld1	{v29.16b},[x0],#16
545c0855eaaSJohn Baldwin	cbz	x6,.Lecb_enc_tail4x
546c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
547c0855eaaSJohn Baldwin	eor	v4.16b,v7.16b,v0.16b
548c0855eaaSJohn Baldwin	orr	v0.16b,v2.16b,v2.16b
549c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
550c0855eaaSJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
551c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v24.16b
552c0855eaaSJohn Baldwin	orr	v24.16b,v27.16b,v27.16b
553c0855eaaSJohn Baldwin	eor	v30.16b,v7.16b,v25.16b
554c0855eaaSJohn Baldwin	orr	v25.16b,v28.16b,v28.16b
555c0855eaaSJohn Baldwin	eor	v31.16b,v7.16b,v26.16b
556c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
557c0855eaaSJohn Baldwin	orr	v26.16b,v29.16b,v29.16b
558c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
559c0855eaaSJohn Baldwin	mov	w6,w5
560c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
561c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
562c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
563c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
564c0855eaaSJohn Baldwin	b.hs	.Loop5x_ecb_enc
565c0855eaaSJohn Baldwin
566c0855eaaSJohn Baldwin	add	x2,x2,#0x50
567c0855eaaSJohn Baldwin	cbz	x2,.Lecb_done
568c0855eaaSJohn Baldwin
569c0855eaaSJohn Baldwin	add	w6,w5,#2
570c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
571c0855eaaSJohn Baldwin	orr	v0.16b,v27.16b,v27.16b
572c0855eaaSJohn Baldwin	orr	v1.16b,v28.16b,v28.16b
573c0855eaaSJohn Baldwin	orr	v24.16b,v29.16b,v29.16b
574c0855eaaSJohn Baldwin	b.lo	.Lecb_enc_tail
575c0855eaaSJohn Baldwin
576c0855eaaSJohn Baldwin	b	.Loop3x_ecb_enc
577c0855eaaSJohn Baldwin
578c0855eaaSJohn Baldwin.align	4
579c0855eaaSJohn Baldwin.Lecb_enc_tail4x:
580c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
581c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v24.16b
582c0855eaaSJohn Baldwin	eor	v30.16b,v7.16b,v25.16b
583c0855eaaSJohn Baldwin	eor	v31.16b,v7.16b,v26.16b
584c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
585c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
586c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
587c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
588c0855eaaSJohn Baldwin
589c0855eaaSJohn Baldwin	b	.Lecb_done
590c0855eaaSJohn Baldwin.align	4
591c0855eaaSJohn Baldwin.Loop3x_ecb_enc:
592c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
593c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
594c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
595c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
596c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
597c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
598c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
599c0855eaaSJohn Baldwin	subs	w6,w6,#2
600c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
601c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
602c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
603c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
604c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
605c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
606c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
607c0855eaaSJohn Baldwin	b.gt	.Loop3x_ecb_enc
608c0855eaaSJohn Baldwin
609c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
610c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
611c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
612c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
613c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
614c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
615c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
616c0855eaaSJohn Baldwin	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
617c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
618c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
619c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
620c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
621c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
622c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
623c0855eaaSJohn Baldwin	add	x0,x0,x6			// x0 is adjusted in such way that
624c0855eaaSJohn Baldwin						// at exit from the loop v1.16b-v24.16b
625c0855eaaSJohn Baldwin						// are loaded with last "words"
626c0855eaaSJohn Baldwin	mov	x7,x3
627c0855eaaSJohn Baldwin	aese	v0.16b,v20.16b
628c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
629c0855eaaSJohn Baldwin	aese	v1.16b,v20.16b
630c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
631c0855eaaSJohn Baldwin	aese	v24.16b,v20.16b
632c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
633c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16
634c0855eaaSJohn Baldwin	aese	v0.16b,v21.16b
635c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
636c0855eaaSJohn Baldwin	aese	v1.16b,v21.16b
637c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
638c0855eaaSJohn Baldwin	aese	v24.16b,v21.16b
639c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
640c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16
641c0855eaaSJohn Baldwin	aese	v0.16b,v22.16b
642c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
643c0855eaaSJohn Baldwin	aese	v1.16b,v22.16b
644c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
645c0855eaaSJohn Baldwin	aese	v24.16b,v22.16b
646c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
647c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
648c0855eaaSJohn Baldwin	aese	v0.16b,v23.16b
649c0855eaaSJohn Baldwin	aese	v1.16b,v23.16b
650c0855eaaSJohn Baldwin	aese	v24.16b,v23.16b
651c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
652c0855eaaSJohn Baldwin	add	w6,w5,#2
653c0855eaaSJohn Baldwin	eor	v4.16b,v7.16b,v0.16b
654c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
655c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v7.16b
656c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
657c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
658c0855eaaSJohn Baldwin	orr	v0.16b,v2.16b,v2.16b
659c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
660c0855eaaSJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
661c0855eaaSJohn Baldwin	st1	{v24.16b},[x1],#16
662c0855eaaSJohn Baldwin	orr	v24.16b,v27.16b,v27.16b
663c0855eaaSJohn Baldwin	b.hs	.Loop3x_ecb_enc
664c0855eaaSJohn Baldwin
665c0855eaaSJohn Baldwin	cmn	x2,#0x30
666c0855eaaSJohn Baldwin	b.eq	.Lecb_done
667c0855eaaSJohn Baldwin	nop
668c0855eaaSJohn Baldwin
669c0855eaaSJohn Baldwin.Lecb_enc_tail:
670c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
671c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
672c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
673c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
674c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
675c0855eaaSJohn Baldwin	subs	w6,w6,#2
676c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
677c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
678c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
679c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
680c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
681c0855eaaSJohn Baldwin	b.gt	.Lecb_enc_tail
682c0855eaaSJohn Baldwin
683c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
684c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
685c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
686c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
687c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
688c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
689c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
690c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
691c0855eaaSJohn Baldwin	aese	v1.16b,v20.16b
692c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
693c0855eaaSJohn Baldwin	aese	v24.16b,v20.16b
694c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
695c0855eaaSJohn Baldwin	cmn	x2,#0x20
696c0855eaaSJohn Baldwin	aese	v1.16b,v21.16b
697c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
698c0855eaaSJohn Baldwin	aese	v24.16b,v21.16b
699c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
700c0855eaaSJohn Baldwin	aese	v1.16b,v22.16b
701c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
702c0855eaaSJohn Baldwin	aese	v24.16b,v22.16b
703c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
704c0855eaaSJohn Baldwin	aese	v1.16b,v23.16b
705c0855eaaSJohn Baldwin	aese	v24.16b,v23.16b
706c0855eaaSJohn Baldwin	b.eq	.Lecb_enc_one
707c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
708c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v24.16b
709c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
710c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
711c0855eaaSJohn Baldwin	b	.Lecb_done
712c0855eaaSJohn Baldwin
713c0855eaaSJohn Baldwin.Lecb_enc_one:
714c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v24.16b
715c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
716c0855eaaSJohn Baldwin	b	.Lecb_done
717c0855eaaSJohn Baldwin.align	5
718c0855eaaSJohn Baldwin.Lecb_dec:
719c0855eaaSJohn Baldwin	ld1	{v1.16b},[x0],#16
720c0855eaaSJohn Baldwin	subs	x2,x2,#32			// bias
721c0855eaaSJohn Baldwin	add	w6,w5,#2
722c0855eaaSJohn Baldwin	orr	v3.16b,v1.16b,v1.16b
723c0855eaaSJohn Baldwin	orr	v24.16b,v1.16b,v1.16b
724c0855eaaSJohn Baldwin	orr	v1.16b,v0.16b,v0.16b
725c0855eaaSJohn Baldwin	b.lo	.Lecb_dec_tail
726c0855eaaSJohn Baldwin
727c0855eaaSJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
728c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16
729c0855eaaSJohn Baldwin	cmp	x2,#32
730c0855eaaSJohn Baldwin	b.lo	.Loop3x_ecb_dec
731c0855eaaSJohn Baldwin
732c0855eaaSJohn Baldwin	ld1	{v25.16b},[x0],#16
733c0855eaaSJohn Baldwin	ld1	{v26.16b},[x0],#16
734c0855eaaSJohn Baldwin	sub	x2,x2,#32				// bias
735c0855eaaSJohn Baldwin	mov	w6,w5
736c0855eaaSJohn Baldwin
737c0855eaaSJohn Baldwin.Loop5x_ecb_dec:
738c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
739c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
740c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
741c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
742c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
743c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
744c0855eaaSJohn Baldwin	aesd	v25.16b,v16.16b
745c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
746c0855eaaSJohn Baldwin	aesd	v26.16b,v16.16b
747c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
748c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
749c0855eaaSJohn Baldwin	subs	w6,w6,#2
750c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
751c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
752c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
753c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
754c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
755c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
756c0855eaaSJohn Baldwin	aesd	v25.16b,v17.16b
757c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
758c0855eaaSJohn Baldwin	aesd	v26.16b,v17.16b
759c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
760c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
761c0855eaaSJohn Baldwin	b.gt	.Loop5x_ecb_dec
762c0855eaaSJohn Baldwin
763c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
764c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
765c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
766c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
767c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
768c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
769c0855eaaSJohn Baldwin	aesd	v25.16b,v16.16b
770c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
771c0855eaaSJohn Baldwin	aesd	v26.16b,v16.16b
772c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
773c0855eaaSJohn Baldwin	cmp	x2,#0x40				// because .Lecb_tail4x
774c0855eaaSJohn Baldwin	sub	x2,x2,#0x50
775c0855eaaSJohn Baldwin
776c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
777c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
778c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
779c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
780c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
781c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
782c0855eaaSJohn Baldwin	aesd	v25.16b,v17.16b
783c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
784c0855eaaSJohn Baldwin	aesd	v26.16b,v17.16b
785c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
786c0855eaaSJohn Baldwin	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
787c0855eaaSJohn Baldwin	mov	x7,x3
788c0855eaaSJohn Baldwin
789c0855eaaSJohn Baldwin	aesd	v0.16b,v18.16b
790c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
791c0855eaaSJohn Baldwin	aesd	v1.16b,v18.16b
792c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
793c0855eaaSJohn Baldwin	aesd	v24.16b,v18.16b
794c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
795c0855eaaSJohn Baldwin	aesd	v25.16b,v18.16b
796c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
797c0855eaaSJohn Baldwin	aesd	v26.16b,v18.16b
798c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
799c0855eaaSJohn Baldwin	add	x0,x0,x6				// x0 is adjusted in such way that
800c0855eaaSJohn Baldwin							// at exit from the loop v1.16b-v26.16b
801c0855eaaSJohn Baldwin							// are loaded with last "words"
802c0855eaaSJohn Baldwin	add	x6,x2,#0x60			// because .Lecb_tail4x
803c0855eaaSJohn Baldwin
804c0855eaaSJohn Baldwin	aesd	v0.16b,v19.16b
805c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
806c0855eaaSJohn Baldwin	aesd	v1.16b,v19.16b
807c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
808c0855eaaSJohn Baldwin	aesd	v24.16b,v19.16b
809c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
810c0855eaaSJohn Baldwin	aesd	v25.16b,v19.16b
811c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
812c0855eaaSJohn Baldwin	aesd	v26.16b,v19.16b
813c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
814c0855eaaSJohn Baldwin
815c0855eaaSJohn Baldwin	aesd	v0.16b,v20.16b
816c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
817c0855eaaSJohn Baldwin	aesd	v1.16b,v20.16b
818c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
819c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
820c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
821c0855eaaSJohn Baldwin	aesd	v25.16b,v20.16b
822c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
823c0855eaaSJohn Baldwin	aesd	v26.16b,v20.16b
824c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
825c0855eaaSJohn Baldwin
826c0855eaaSJohn Baldwin	aesd	v0.16b,v21.16b
827c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
828c0855eaaSJohn Baldwin	aesd	v1.16b,v21.16b
829c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
830c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
831c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
832c0855eaaSJohn Baldwin	aesd	v25.16b,v21.16b
833c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
834c0855eaaSJohn Baldwin	aesd	v26.16b,v21.16b
835c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
836c0855eaaSJohn Baldwin
837c0855eaaSJohn Baldwin	aesd	v0.16b,v22.16b
838c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
839c0855eaaSJohn Baldwin	aesd	v1.16b,v22.16b
840c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
841c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
842c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
843c0855eaaSJohn Baldwin	aesd	v25.16b,v22.16b
844c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
845c0855eaaSJohn Baldwin	aesd	v26.16b,v22.16b
846c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
847c0855eaaSJohn Baldwin
848c0855eaaSJohn Baldwin	aesd	v0.16b,v23.16b
849c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16
850c0855eaaSJohn Baldwin	aesd	v1.16b,v23.16b
851c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16
852c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
853c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
854c0855eaaSJohn Baldwin	aesd	v25.16b,v23.16b
855c0855eaaSJohn Baldwin	ld1	{v28.16b},[x0],#16
856c0855eaaSJohn Baldwin	aesd	v26.16b,v23.16b
857c0855eaaSJohn Baldwin	ld1	{v29.16b},[x0],#16
858c0855eaaSJohn Baldwin	cbz	x6,.Lecb_tail4x
859c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
860c0855eaaSJohn Baldwin	eor	v4.16b,v7.16b,v0.16b
861c0855eaaSJohn Baldwin	orr	v0.16b,v2.16b,v2.16b
862c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
863c0855eaaSJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
864c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v24.16b
865c0855eaaSJohn Baldwin	orr	v24.16b,v27.16b,v27.16b
866c0855eaaSJohn Baldwin	eor	v30.16b,v7.16b,v25.16b
867c0855eaaSJohn Baldwin	orr	v25.16b,v28.16b,v28.16b
868c0855eaaSJohn Baldwin	eor	v31.16b,v7.16b,v26.16b
869c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
870c0855eaaSJohn Baldwin	orr	v26.16b,v29.16b,v29.16b
871c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
872c0855eaaSJohn Baldwin	mov	w6,w5
873c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
874c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
875c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
876c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
877c0855eaaSJohn Baldwin	b.hs	.Loop5x_ecb_dec
878c0855eaaSJohn Baldwin
879c0855eaaSJohn Baldwin	add	x2,x2,#0x50
880c0855eaaSJohn Baldwin	cbz	x2,.Lecb_done
881c0855eaaSJohn Baldwin
882c0855eaaSJohn Baldwin	add	w6,w5,#2
883c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
884c0855eaaSJohn Baldwin	orr	v0.16b,v27.16b,v27.16b
885c0855eaaSJohn Baldwin	orr	v1.16b,v28.16b,v28.16b
886c0855eaaSJohn Baldwin	orr	v24.16b,v29.16b,v29.16b
887c0855eaaSJohn Baldwin	b.lo	.Lecb_dec_tail
888c0855eaaSJohn Baldwin
889c0855eaaSJohn Baldwin	b	.Loop3x_ecb_dec
890c0855eaaSJohn Baldwin
891c0855eaaSJohn Baldwin.align	4
892c0855eaaSJohn Baldwin.Lecb_tail4x:
893c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
894c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v24.16b
895c0855eaaSJohn Baldwin	eor	v30.16b,v7.16b,v25.16b
896c0855eaaSJohn Baldwin	eor	v31.16b,v7.16b,v26.16b
897c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
898c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
899c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
900c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
901c0855eaaSJohn Baldwin
902c0855eaaSJohn Baldwin	b	.Lecb_done
903c0855eaaSJohn Baldwin.align	4
904c0855eaaSJohn Baldwin.Loop3x_ecb_dec:
905c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
906c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
907c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
908c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
909c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
910c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
911c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
912c0855eaaSJohn Baldwin	subs	w6,w6,#2
913c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
914c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
915c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
916c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
917c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
918c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
919c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
920c0855eaaSJohn Baldwin	b.gt	.Loop3x_ecb_dec
921c0855eaaSJohn Baldwin
922c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
923c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
924c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
925c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
926c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
927c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
928c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
929c0855eaaSJohn Baldwin	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
930c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
931c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
932c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
933c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
934c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
935c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
936c0855eaaSJohn Baldwin	add	x0,x0,x6 			// x0 is adjusted in such way that
937c0855eaaSJohn Baldwin						// at exit from the loop v1.16b-v24.16b
938c0855eaaSJohn Baldwin						// are loaded with last "words"
939c0855eaaSJohn Baldwin	mov	x7,x3
940c0855eaaSJohn Baldwin	aesd	v0.16b,v20.16b
941c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
942c0855eaaSJohn Baldwin	aesd	v1.16b,v20.16b
943c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
944c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
945c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
946c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16
947c0855eaaSJohn Baldwin	aesd	v0.16b,v21.16b
948c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
949c0855eaaSJohn Baldwin	aesd	v1.16b,v21.16b
950c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
951c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
952c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
953c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16
954c0855eaaSJohn Baldwin	aesd	v0.16b,v22.16b
955c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
956c0855eaaSJohn Baldwin	aesd	v1.16b,v22.16b
957c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
958c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
959c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
960c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
961c0855eaaSJohn Baldwin	aesd	v0.16b,v23.16b
962c0855eaaSJohn Baldwin	aesd	v1.16b,v23.16b
963c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
964c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
965c0855eaaSJohn Baldwin	add	w6,w5,#2
966c0855eaaSJohn Baldwin	eor	v4.16b,v7.16b,v0.16b
967c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
968c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v7.16b
969c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
970c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
971c0855eaaSJohn Baldwin	orr	v0.16b,v2.16b,v2.16b
972c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
973c0855eaaSJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
974c0855eaaSJohn Baldwin	st1	{v24.16b},[x1],#16
975c0855eaaSJohn Baldwin	orr	v24.16b,v27.16b,v27.16b
976c0855eaaSJohn Baldwin	b.hs	.Loop3x_ecb_dec
977c0855eaaSJohn Baldwin
978c0855eaaSJohn Baldwin	cmn	x2,#0x30
979c0855eaaSJohn Baldwin	b.eq	.Lecb_done
980c0855eaaSJohn Baldwin	nop
981c0855eaaSJohn Baldwin
982c0855eaaSJohn Baldwin.Lecb_dec_tail:
983c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
984c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
985c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
986c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
987c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
988c0855eaaSJohn Baldwin	subs	w6,w6,#2
989c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
990c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
991c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
992c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
993c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
994c0855eaaSJohn Baldwin	b.gt	.Lecb_dec_tail
995c0855eaaSJohn Baldwin
996c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
997c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
998c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
999c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1000c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
1001c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1002c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
1003c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1004c0855eaaSJohn Baldwin	aesd	v1.16b,v20.16b
1005c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1006c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
1007c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1008c0855eaaSJohn Baldwin	cmn	x2,#0x20
1009c0855eaaSJohn Baldwin	aesd	v1.16b,v21.16b
1010c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1011c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
1012c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1013c0855eaaSJohn Baldwin	aesd	v1.16b,v22.16b
1014c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1015c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
1016c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1017c0855eaaSJohn Baldwin	aesd	v1.16b,v23.16b
1018c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
1019c0855eaaSJohn Baldwin	b.eq	.Lecb_dec_one
1020c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v1.16b
1021c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v24.16b
1022c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
1023c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
1024c0855eaaSJohn Baldwin	b	.Lecb_done
1025c0855eaaSJohn Baldwin
1026c0855eaaSJohn Baldwin.Lecb_dec_one:
1027c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v24.16b
1028c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
1029c0855eaaSJohn Baldwin
1030c0855eaaSJohn Baldwin.Lecb_done:
1031c0855eaaSJohn Baldwin	ldr	x29,[sp],#16
1032c0855eaaSJohn Baldwin.Lecb_Final_abort:
1033c0855eaaSJohn Baldwin	ret
1034c0855eaaSJohn Baldwin.size	aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
1035bc3d5698SJohn Baldwin.globl	aes_v8_cbc_encrypt
1036bc3d5698SJohn Baldwin.type	aes_v8_cbc_encrypt,%function
1037bc3d5698SJohn Baldwin.align	5
1038bc3d5698SJohn Baldwinaes_v8_cbc_encrypt:
1039bd9588bcSAndrew Turner	AARCH64_VALID_CALL_TARGET
1040bd9588bcSAndrew Turner	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1041bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1042bc3d5698SJohn Baldwin	add	x29,sp,#0
1043bc3d5698SJohn Baldwin	subs	x2,x2,#16
1044bc3d5698SJohn Baldwin	mov	x8,#16
1045bc3d5698SJohn Baldwin	b.lo	.Lcbc_abort
1046bc3d5698SJohn Baldwin	csel	x8,xzr,x8,eq
1047bc3d5698SJohn Baldwin
1048bc3d5698SJohn Baldwin	cmp	w5,#0			// en- or decrypting?
1049bc3d5698SJohn Baldwin	ldr	w5,[x3,#240]
1050bc3d5698SJohn Baldwin	and	x2,x2,#-16
1051bc3d5698SJohn Baldwin	ld1	{v6.16b},[x4]
1052bc3d5698SJohn Baldwin	ld1	{v0.16b},[x0],x8
1053bc3d5698SJohn Baldwin
1054bc3d5698SJohn Baldwin	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1055bc3d5698SJohn Baldwin	sub	w5,w5,#6
1056bc3d5698SJohn Baldwin	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
1057bc3d5698SJohn Baldwin	sub	w5,w5,#2
1058bc3d5698SJohn Baldwin	ld1	{v18.4s,v19.4s},[x7],#32
1059bc3d5698SJohn Baldwin	ld1	{v20.4s,v21.4s},[x7],#32
1060bc3d5698SJohn Baldwin	ld1	{v22.4s,v23.4s},[x7],#32
1061bc3d5698SJohn Baldwin	ld1	{v7.4s},[x7]
1062bc3d5698SJohn Baldwin
1063bc3d5698SJohn Baldwin	add	x7,x3,#32
1064bc3d5698SJohn Baldwin	mov	w6,w5
1065bc3d5698SJohn Baldwin	b.eq	.Lcbc_dec
1066bc3d5698SJohn Baldwin
1067bc3d5698SJohn Baldwin	cmp	w5,#2
1068bc3d5698SJohn Baldwin	eor	v0.16b,v0.16b,v6.16b
1069bc3d5698SJohn Baldwin	eor	v5.16b,v16.16b,v7.16b
1070bc3d5698SJohn Baldwin	b.eq	.Lcbc_enc128
1071bc3d5698SJohn Baldwin
1072bc3d5698SJohn Baldwin	ld1	{v2.4s,v3.4s},[x7]
1073bc3d5698SJohn Baldwin	add	x7,x3,#16
1074bc3d5698SJohn Baldwin	add	x6,x3,#16*4
1075bc3d5698SJohn Baldwin	add	x12,x3,#16*5
1076bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1077bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1078bc3d5698SJohn Baldwin	add	x14,x3,#16*6
1079bc3d5698SJohn Baldwin	add	x3,x3,#16*7
1080bc3d5698SJohn Baldwin	b	.Lenter_cbc_enc
1081bc3d5698SJohn Baldwin
1082bc3d5698SJohn Baldwin.align	4
1083bc3d5698SJohn Baldwin.Loop_cbc_enc:
1084bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1085bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1086bc3d5698SJohn Baldwin	st1	{v6.16b},[x1],#16
1087bc3d5698SJohn Baldwin.Lenter_cbc_enc:
1088bc3d5698SJohn Baldwin	aese	v0.16b,v17.16b
1089bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1090bc3d5698SJohn Baldwin	aese	v0.16b,v2.16b
1091bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1092bc3d5698SJohn Baldwin	ld1	{v16.4s},[x6]
1093bc3d5698SJohn Baldwin	cmp	w5,#4
1094bc3d5698SJohn Baldwin	aese	v0.16b,v3.16b
1095bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1096bc3d5698SJohn Baldwin	ld1	{v17.4s},[x12]
1097bc3d5698SJohn Baldwin	b.eq	.Lcbc_enc192
1098bc3d5698SJohn Baldwin
1099bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1100bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1101bc3d5698SJohn Baldwin	ld1	{v16.4s},[x14]
1102bc3d5698SJohn Baldwin	aese	v0.16b,v17.16b
1103bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1104bc3d5698SJohn Baldwin	ld1	{v17.4s},[x3]
1105bc3d5698SJohn Baldwin	nop
1106bc3d5698SJohn Baldwin
1107bc3d5698SJohn Baldwin.Lcbc_enc192:
1108bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1109bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1110bc3d5698SJohn Baldwin	subs	x2,x2,#16
1111bc3d5698SJohn Baldwin	aese	v0.16b,v17.16b
1112bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1113bc3d5698SJohn Baldwin	csel	x8,xzr,x8,eq
1114bc3d5698SJohn Baldwin	aese	v0.16b,v18.16b
1115bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1116bc3d5698SJohn Baldwin	aese	v0.16b,v19.16b
1117bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1118bc3d5698SJohn Baldwin	ld1	{v16.16b},[x0],x8
1119bc3d5698SJohn Baldwin	aese	v0.16b,v20.16b
1120bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1121bc3d5698SJohn Baldwin	eor	v16.16b,v16.16b,v5.16b
1122bc3d5698SJohn Baldwin	aese	v0.16b,v21.16b
1123bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1124bc3d5698SJohn Baldwin	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
1125bc3d5698SJohn Baldwin	aese	v0.16b,v22.16b
1126bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1127bc3d5698SJohn Baldwin	aese	v0.16b,v23.16b
1128bc3d5698SJohn Baldwin	eor	v6.16b,v0.16b,v7.16b
1129bc3d5698SJohn Baldwin	b.hs	.Loop_cbc_enc
1130bc3d5698SJohn Baldwin
1131bc3d5698SJohn Baldwin	st1	{v6.16b},[x1],#16
1132bc3d5698SJohn Baldwin	b	.Lcbc_done
1133bc3d5698SJohn Baldwin
1134bc3d5698SJohn Baldwin.align	5
1135bc3d5698SJohn Baldwin.Lcbc_enc128:
1136bc3d5698SJohn Baldwin	ld1	{v2.4s,v3.4s},[x7]
1137bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1138bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1139bc3d5698SJohn Baldwin	b	.Lenter_cbc_enc128
1140bc3d5698SJohn Baldwin.Loop_cbc_enc128:
1141bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1142bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1143bc3d5698SJohn Baldwin	st1	{v6.16b},[x1],#16
1144bc3d5698SJohn Baldwin.Lenter_cbc_enc128:
1145bc3d5698SJohn Baldwin	aese	v0.16b,v17.16b
1146bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1147bc3d5698SJohn Baldwin	subs	x2,x2,#16
1148bc3d5698SJohn Baldwin	aese	v0.16b,v2.16b
1149bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1150bc3d5698SJohn Baldwin	csel	x8,xzr,x8,eq
1151bc3d5698SJohn Baldwin	aese	v0.16b,v3.16b
1152bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1153bc3d5698SJohn Baldwin	aese	v0.16b,v18.16b
1154bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1155bc3d5698SJohn Baldwin	aese	v0.16b,v19.16b
1156bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1157bc3d5698SJohn Baldwin	ld1	{v16.16b},[x0],x8
1158bc3d5698SJohn Baldwin	aese	v0.16b,v20.16b
1159bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1160bc3d5698SJohn Baldwin	aese	v0.16b,v21.16b
1161bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1162bc3d5698SJohn Baldwin	aese	v0.16b,v22.16b
1163bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1164bc3d5698SJohn Baldwin	eor	v16.16b,v16.16b,v5.16b
1165bc3d5698SJohn Baldwin	aese	v0.16b,v23.16b
1166bc3d5698SJohn Baldwin	eor	v6.16b,v0.16b,v7.16b
1167bc3d5698SJohn Baldwin	b.hs	.Loop_cbc_enc128
1168bc3d5698SJohn Baldwin
1169bc3d5698SJohn Baldwin	st1	{v6.16b},[x1],#16
1170bc3d5698SJohn Baldwin	b	.Lcbc_done
1171bc3d5698SJohn Baldwin.align	5
1172bc3d5698SJohn Baldwin.Lcbc_dec:
1173c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16
1174bc3d5698SJohn Baldwin	subs	x2,x2,#32		// bias
1175bc3d5698SJohn Baldwin	add	w6,w5,#2
1176bc3d5698SJohn Baldwin	orr	v3.16b,v0.16b,v0.16b
1177bc3d5698SJohn Baldwin	orr	v1.16b,v0.16b,v0.16b
1178c0855eaaSJohn Baldwin	orr	v27.16b,v24.16b,v24.16b
1179bc3d5698SJohn Baldwin	b.lo	.Lcbc_dec_tail
1180bc3d5698SJohn Baldwin
1181c0855eaaSJohn Baldwin	orr	v1.16b,v24.16b,v24.16b
1182c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16
1183bc3d5698SJohn Baldwin	orr	v2.16b,v0.16b,v0.16b
1184bc3d5698SJohn Baldwin	orr	v3.16b,v1.16b,v1.16b
1185c0855eaaSJohn Baldwin	orr	v27.16b,v24.16b,v24.16b
1186c0855eaaSJohn Baldwin	cmp	x2,#32
1187c0855eaaSJohn Baldwin	b.lo	.Loop3x_cbc_dec
1188bc3d5698SJohn Baldwin
1189c0855eaaSJohn Baldwin	ld1	{v25.16b},[x0],#16
1190c0855eaaSJohn Baldwin	ld1	{v26.16b},[x0],#16
1191c0855eaaSJohn Baldwin	sub	x2,x2,#32		// bias
1192c0855eaaSJohn Baldwin	mov	w6,w5
1193c0855eaaSJohn Baldwin	orr	v28.16b,v25.16b,v25.16b
1194c0855eaaSJohn Baldwin	orr	v29.16b,v26.16b,v26.16b
1195c0855eaaSJohn Baldwin
1196c0855eaaSJohn Baldwin.Loop5x_cbc_dec:
1197bc3d5698SJohn Baldwin	aesd	v0.16b,v16.16b
1198bc3d5698SJohn Baldwin	aesimc	v0.16b,v0.16b
1199bc3d5698SJohn Baldwin	aesd	v1.16b,v16.16b
1200bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1201c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
1202c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1203c0855eaaSJohn Baldwin	aesd	v25.16b,v16.16b
1204c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1205c0855eaaSJohn Baldwin	aesd	v26.16b,v16.16b
1206c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1207bc3d5698SJohn Baldwin	ld1	{v16.4s},[x7],#16
1208bc3d5698SJohn Baldwin	subs	w6,w6,#2
1209bc3d5698SJohn Baldwin	aesd	v0.16b,v17.16b
1210bc3d5698SJohn Baldwin	aesimc	v0.16b,v0.16b
1211bc3d5698SJohn Baldwin	aesd	v1.16b,v17.16b
1212bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1213c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
1214c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1215c0855eaaSJohn Baldwin	aesd	v25.16b,v17.16b
1216c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1217c0855eaaSJohn Baldwin	aesd	v26.16b,v17.16b
1218c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1219c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
1220c0855eaaSJohn Baldwin	b.gt	.Loop5x_cbc_dec
1221c0855eaaSJohn Baldwin
1222c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
1223c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1224c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
1225c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1226c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
1227c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1228c0855eaaSJohn Baldwin	aesd	v25.16b,v16.16b
1229c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1230c0855eaaSJohn Baldwin	aesd	v26.16b,v16.16b
1231c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1232c0855eaaSJohn Baldwin	cmp	x2,#0x40		// because .Lcbc_tail4x
1233c0855eaaSJohn Baldwin	sub	x2,x2,#0x50
1234c0855eaaSJohn Baldwin
1235c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
1236c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1237c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
1238c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1239c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
1240c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1241c0855eaaSJohn Baldwin	aesd	v25.16b,v17.16b
1242c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1243c0855eaaSJohn Baldwin	aesd	v26.16b,v17.16b
1244c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1245c0855eaaSJohn Baldwin	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
1246c0855eaaSJohn Baldwin	mov	x7,x3
1247c0855eaaSJohn Baldwin
1248c0855eaaSJohn Baldwin	aesd	v0.16b,v18.16b
1249c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1250c0855eaaSJohn Baldwin	aesd	v1.16b,v18.16b
1251c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1252c0855eaaSJohn Baldwin	aesd	v24.16b,v18.16b
1253c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1254c0855eaaSJohn Baldwin	aesd	v25.16b,v18.16b
1255c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1256c0855eaaSJohn Baldwin	aesd	v26.16b,v18.16b
1257c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1258c0855eaaSJohn Baldwin	add	x0,x0,x6		// x0 is adjusted in such way that
1259c0855eaaSJohn Baldwin					// at exit from the loop v1.16b-v26.16b
1260c0855eaaSJohn Baldwin					// are loaded with last "words"
1261c0855eaaSJohn Baldwin	add	x6,x2,#0x60		// because .Lcbc_tail4x
1262c0855eaaSJohn Baldwin
1263c0855eaaSJohn Baldwin	aesd	v0.16b,v19.16b
1264c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1265c0855eaaSJohn Baldwin	aesd	v1.16b,v19.16b
1266c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1267c0855eaaSJohn Baldwin	aesd	v24.16b,v19.16b
1268c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1269c0855eaaSJohn Baldwin	aesd	v25.16b,v19.16b
1270c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1271c0855eaaSJohn Baldwin	aesd	v26.16b,v19.16b
1272c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1273c0855eaaSJohn Baldwin
1274c0855eaaSJohn Baldwin	aesd	v0.16b,v20.16b
1275c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1276c0855eaaSJohn Baldwin	aesd	v1.16b,v20.16b
1277c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1278c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
1279c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1280c0855eaaSJohn Baldwin	aesd	v25.16b,v20.16b
1281c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1282c0855eaaSJohn Baldwin	aesd	v26.16b,v20.16b
1283c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1284c0855eaaSJohn Baldwin
1285c0855eaaSJohn Baldwin	aesd	v0.16b,v21.16b
1286c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1287c0855eaaSJohn Baldwin	aesd	v1.16b,v21.16b
1288c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1289c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
1290c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1291c0855eaaSJohn Baldwin	aesd	v25.16b,v21.16b
1292c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1293c0855eaaSJohn Baldwin	aesd	v26.16b,v21.16b
1294c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1295c0855eaaSJohn Baldwin
1296c0855eaaSJohn Baldwin	aesd	v0.16b,v22.16b
1297c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1298c0855eaaSJohn Baldwin	aesd	v1.16b,v22.16b
1299c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1300c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
1301c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1302c0855eaaSJohn Baldwin	aesd	v25.16b,v22.16b
1303c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
1304c0855eaaSJohn Baldwin	aesd	v26.16b,v22.16b
1305c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
1306c0855eaaSJohn Baldwin
1307c0855eaaSJohn Baldwin	eor	v4.16b,v6.16b,v7.16b
1308c0855eaaSJohn Baldwin	aesd	v0.16b,v23.16b
1309c0855eaaSJohn Baldwin	eor	v5.16b,v2.16b,v7.16b
1310c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16
1311c0855eaaSJohn Baldwin	aesd	v1.16b,v23.16b
1312c0855eaaSJohn Baldwin	eor	v17.16b,v3.16b,v7.16b
1313c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16
1314c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
1315c0855eaaSJohn Baldwin	eor	v30.16b,v27.16b,v7.16b
1316c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
1317c0855eaaSJohn Baldwin	aesd	v25.16b,v23.16b
1318c0855eaaSJohn Baldwin	eor	v31.16b,v28.16b,v7.16b
1319c0855eaaSJohn Baldwin	ld1	{v28.16b},[x0],#16
1320c0855eaaSJohn Baldwin	aesd	v26.16b,v23.16b
1321c0855eaaSJohn Baldwin	orr	v6.16b,v29.16b,v29.16b
1322c0855eaaSJohn Baldwin	ld1	{v29.16b},[x0],#16
1323c0855eaaSJohn Baldwin	cbz	x6,.Lcbc_tail4x
1324c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1325c0855eaaSJohn Baldwin	eor	v4.16b,v4.16b,v0.16b
1326c0855eaaSJohn Baldwin	orr	v0.16b,v2.16b,v2.16b
1327c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
1328c0855eaaSJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
1329c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v24.16b
1330c0855eaaSJohn Baldwin	orr	v24.16b,v27.16b,v27.16b
1331c0855eaaSJohn Baldwin	eor	v30.16b,v30.16b,v25.16b
1332c0855eaaSJohn Baldwin	orr	v25.16b,v28.16b,v28.16b
1333c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v26.16b
1334c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
1335c0855eaaSJohn Baldwin	orr	v26.16b,v29.16b,v29.16b
1336c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
1337c0855eaaSJohn Baldwin	mov	w6,w5
1338c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
1339c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1340c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
1341c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
1342c0855eaaSJohn Baldwin	b.hs	.Loop5x_cbc_dec
1343c0855eaaSJohn Baldwin
1344c0855eaaSJohn Baldwin	add	x2,x2,#0x50
1345c0855eaaSJohn Baldwin	cbz	x2,.Lcbc_done
1346c0855eaaSJohn Baldwin
1347c0855eaaSJohn Baldwin	add	w6,w5,#2
1348c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
1349c0855eaaSJohn Baldwin	orr	v0.16b,v27.16b,v27.16b
1350c0855eaaSJohn Baldwin	orr	v2.16b,v27.16b,v27.16b
1351c0855eaaSJohn Baldwin	orr	v1.16b,v28.16b,v28.16b
1352c0855eaaSJohn Baldwin	orr	v3.16b,v28.16b,v28.16b
1353c0855eaaSJohn Baldwin	orr	v24.16b,v29.16b,v29.16b
1354c0855eaaSJohn Baldwin	orr	v27.16b,v29.16b,v29.16b
1355c0855eaaSJohn Baldwin	b.lo	.Lcbc_dec_tail
1356c0855eaaSJohn Baldwin
1357c0855eaaSJohn Baldwin	b	.Loop3x_cbc_dec
1358c0855eaaSJohn Baldwin
1359c0855eaaSJohn Baldwin.align	4
1360c0855eaaSJohn Baldwin.Lcbc_tail4x:
1361c0855eaaSJohn Baldwin	eor	v5.16b,v4.16b,v1.16b
1362c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v24.16b
1363c0855eaaSJohn Baldwin	eor	v30.16b,v30.16b,v25.16b
1364c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v26.16b
1365c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
1366c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
1367c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
1368c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
1369c0855eaaSJohn Baldwin
1370c0855eaaSJohn Baldwin	b	.Lcbc_done
1371c0855eaaSJohn Baldwin.align	4
1372c0855eaaSJohn Baldwin.Loop3x_cbc_dec:
1373c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
1374c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1375c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
1376c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1377c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
1378c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1379c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
1380c0855eaaSJohn Baldwin	subs	w6,w6,#2
1381c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
1382c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
1383c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
1384c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
1385c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
1386c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1387bc3d5698SJohn Baldwin	ld1	{v17.4s},[x7],#16
1388bc3d5698SJohn Baldwin	b.gt	.Loop3x_cbc_dec
1389bc3d5698SJohn Baldwin
1390bc3d5698SJohn Baldwin	aesd	v0.16b,v16.16b
1391bc3d5698SJohn Baldwin	aesimc	v0.16b,v0.16b
1392bc3d5698SJohn Baldwin	aesd	v1.16b,v16.16b
1393bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1394c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
1395c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1396bc3d5698SJohn Baldwin	eor	v4.16b,v6.16b,v7.16b
1397bc3d5698SJohn Baldwin	subs	x2,x2,#0x30
1398bc3d5698SJohn Baldwin	eor	v5.16b,v2.16b,v7.16b
1399bc3d5698SJohn Baldwin	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
1400bc3d5698SJohn Baldwin	aesd	v0.16b,v17.16b
1401bc3d5698SJohn Baldwin	aesimc	v0.16b,v0.16b
1402bc3d5698SJohn Baldwin	aesd	v1.16b,v17.16b
1403bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1404c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
1405c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1406bc3d5698SJohn Baldwin	eor	v17.16b,v3.16b,v7.16b
1407bc3d5698SJohn Baldwin	add	x0,x0,x6		// x0 is adjusted in such way that
1408c0855eaaSJohn Baldwin					// at exit from the loop v1.16b-v24.16b
1409bc3d5698SJohn Baldwin					// are loaded with last "words"
1410c0855eaaSJohn Baldwin	orr	v6.16b,v27.16b,v27.16b
1411bc3d5698SJohn Baldwin	mov	x7,x3
1412bc3d5698SJohn Baldwin	aesd	v0.16b,v20.16b
1413bc3d5698SJohn Baldwin	aesimc	v0.16b,v0.16b
1414bc3d5698SJohn Baldwin	aesd	v1.16b,v20.16b
1415bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1416c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
1417c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1418bc3d5698SJohn Baldwin	ld1	{v2.16b},[x0],#16
1419bc3d5698SJohn Baldwin	aesd	v0.16b,v21.16b
1420bc3d5698SJohn Baldwin	aesimc	v0.16b,v0.16b
1421bc3d5698SJohn Baldwin	aesd	v1.16b,v21.16b
1422bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1423c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
1424c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1425bc3d5698SJohn Baldwin	ld1	{v3.16b},[x0],#16
1426bc3d5698SJohn Baldwin	aesd	v0.16b,v22.16b
1427bc3d5698SJohn Baldwin	aesimc	v0.16b,v0.16b
1428bc3d5698SJohn Baldwin	aesd	v1.16b,v22.16b
1429bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1430c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
1431c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1432c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
1433bc3d5698SJohn Baldwin	aesd	v0.16b,v23.16b
1434bc3d5698SJohn Baldwin	aesd	v1.16b,v23.16b
1435c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
1436bc3d5698SJohn Baldwin	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1437bc3d5698SJohn Baldwin	add	w6,w5,#2
1438bc3d5698SJohn Baldwin	eor	v4.16b,v4.16b,v0.16b
1439bc3d5698SJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
1440c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v17.16b
1441bc3d5698SJohn Baldwin	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1442bc3d5698SJohn Baldwin	st1	{v4.16b},[x1],#16
1443bc3d5698SJohn Baldwin	orr	v0.16b,v2.16b,v2.16b
1444bc3d5698SJohn Baldwin	st1	{v5.16b},[x1],#16
1445bc3d5698SJohn Baldwin	orr	v1.16b,v3.16b,v3.16b
1446c0855eaaSJohn Baldwin	st1	{v24.16b},[x1],#16
1447c0855eaaSJohn Baldwin	orr	v24.16b,v27.16b,v27.16b
1448bc3d5698SJohn Baldwin	b.hs	.Loop3x_cbc_dec
1449bc3d5698SJohn Baldwin
1450bc3d5698SJohn Baldwin	cmn	x2,#0x30
1451bc3d5698SJohn Baldwin	b.eq	.Lcbc_done
1452bc3d5698SJohn Baldwin	nop
1453bc3d5698SJohn Baldwin
1454bc3d5698SJohn Baldwin.Lcbc_dec_tail:
1455bc3d5698SJohn Baldwin	aesd	v1.16b,v16.16b
1456bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1457c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
1458c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1459bc3d5698SJohn Baldwin	ld1	{v16.4s},[x7],#16
1460bc3d5698SJohn Baldwin	subs	w6,w6,#2
1461bc3d5698SJohn Baldwin	aesd	v1.16b,v17.16b
1462bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1463c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
1464c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1465bc3d5698SJohn Baldwin	ld1	{v17.4s},[x7],#16
1466bc3d5698SJohn Baldwin	b.gt	.Lcbc_dec_tail
1467bc3d5698SJohn Baldwin
1468bc3d5698SJohn Baldwin	aesd	v1.16b,v16.16b
1469bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1470c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
1471c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1472bc3d5698SJohn Baldwin	aesd	v1.16b,v17.16b
1473bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1474c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
1475c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1476bc3d5698SJohn Baldwin	aesd	v1.16b,v20.16b
1477bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1478c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
1479c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1480bc3d5698SJohn Baldwin	cmn	x2,#0x20
1481bc3d5698SJohn Baldwin	aesd	v1.16b,v21.16b
1482bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1483c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
1484c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1485bc3d5698SJohn Baldwin	eor	v5.16b,v6.16b,v7.16b
1486bc3d5698SJohn Baldwin	aesd	v1.16b,v22.16b
1487bc3d5698SJohn Baldwin	aesimc	v1.16b,v1.16b
1488c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
1489c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
1490bc3d5698SJohn Baldwin	eor	v17.16b,v3.16b,v7.16b
1491bc3d5698SJohn Baldwin	aesd	v1.16b,v23.16b
1492c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
1493bc3d5698SJohn Baldwin	b.eq	.Lcbc_dec_one
1494bc3d5698SJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
1495c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v24.16b
1496c0855eaaSJohn Baldwin	orr	v6.16b,v27.16b,v27.16b
1497bc3d5698SJohn Baldwin	st1	{v5.16b},[x1],#16
1498bc3d5698SJohn Baldwin	st1	{v17.16b},[x1],#16
1499bc3d5698SJohn Baldwin	b	.Lcbc_done
1500bc3d5698SJohn Baldwin
1501bc3d5698SJohn Baldwin.Lcbc_dec_one:
1502c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v24.16b
1503c0855eaaSJohn Baldwin	orr	v6.16b,v27.16b,v27.16b
1504bc3d5698SJohn Baldwin	st1	{v5.16b},[x1],#16
1505bc3d5698SJohn Baldwin
1506bc3d5698SJohn Baldwin.Lcbc_done:
1507bc3d5698SJohn Baldwin	st1	{v6.16b},[x4]
1508bc3d5698SJohn Baldwin.Lcbc_abort:
1509bc3d5698SJohn Baldwin	ldr	x29,[sp],#16
1510bc3d5698SJohn Baldwin	ret
1511bc3d5698SJohn Baldwin.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
1512bc3d5698SJohn Baldwin.globl	aes_v8_ctr32_encrypt_blocks
1513bc3d5698SJohn Baldwin.type	aes_v8_ctr32_encrypt_blocks,%function
1514bc3d5698SJohn Baldwin.align	5
// aes_v8_ctr32_encrypt_blocks
//
// Encrypts successive counter blocks with the AES round instructions
// (aese/aesmc) and XORs each result with a 16-byte input block, storing
// the result to the output.  The counter is the last 32-bit word of the
// counter block and is advanced by one per block using 32-bit arithmetic.
//
// Register use as seen in this routine:
//   x0  input pointer  (16 bytes read per block)
//   x1  output pointer (16 bytes written per block)
//   x2  number of 16-byte blocks to process
//   x3  key schedule: round keys start at offset 0, a 32-bit round count
//       is read from offset 240
//   x4  pointer to the 16-byte counter block; it is only read here, the
//       advanced counter is not stored back
//
// Blocks are handled five at a time (.Loop5x_ctr32), then three at a time
// (.Loop3x_ctr32), then a final one or two (.Lctr32_tail).
//
// NOTE(review): with x2 == 0 the entry path falls into .Lctr32_tail, which
// stores two blocks whenever x2 != 1; callers presumably never pass a zero
// block count -- confirm.
1515bc3d5698SJohn Baldwinaes_v8_ctr32_encrypt_blocks:
1516bd9588bcSAndrew Turner	AARCH64_VALID_CALL_TARGET
1517bd9588bcSAndrew Turner	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1518bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1519bc3d5698SJohn Baldwin	add	x29,sp,#0
	// w5 = round count stored at offset 240 of the key schedule.
1520bc3d5698SJohn Baldwin	ldr	w5,[x3,#240]
1521bc3d5698SJohn Baldwin
	// w8 = last 32-bit word of the counter block (the counter itself);
	// v0 = the whole 16-byte counter block.
1522bc3d5698SJohn Baldwin	ldr	w8, [x4, #12]
1523*575878a5SEd Maste#ifdef __AARCH64EB__
1524c3c73b4fSJung-uk Kim	ld1	{v0.16b},[x4]
1525c3c73b4fSJung-uk Kim#else
1526bc3d5698SJohn Baldwin	ld1	{v0.4s},[x4]
1527c3c73b4fSJung-uk Kim#endif
1528bc3d5698SJohn Baldwin	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1529bc3d5698SJohn Baldwin	sub	w5,w5,#4
1530bc3d5698SJohn Baldwin	mov	x12,#16
	// The flags set by this compare are consumed by the csel and the
	// b.ls below; nothing in between modifies them.
1531bc3d5698SJohn Baldwin	cmp	x2,#2
1532bc3d5698SJohn Baldwin	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	// w5 = rounds - 6: counter for the loops that apply two rounds per
	// iteration (the last five round keys are applied separately).
1533bc3d5698SJohn Baldwin	sub	w5,w5,#2
	// v20-v23 and v7 hold the last five round keys; v7 (the final one) is
	// XORed into the data instead of being fed to aese.
1534bc3d5698SJohn Baldwin	ld1	{v20.4s,v21.4s},[x7],#32
1535bc3d5698SJohn Baldwin	ld1	{v22.4s,v23.4s},[x7],#32
1536bc3d5698SJohn Baldwin	ld1	{v7.4s},[x7]
	// x7 = running pointer into the key schedule, starting at the third
	// round key (the first two are already in v16/v17).
1537bc3d5698SJohn Baldwin	add	x7,x3,#32
1538bc3d5698SJohn Baldwin	mov	w6,w5
	// x12 = input post-increment used by the tail: 0 when fewer than two
	// blocks were requested, otherwise 16.
1539bc3d5698SJohn Baldwin	csel	x12,xzr,x12,lo
	// On little-endian builds byte-swap the counter word so that it can be
	// incremented with plain adds; it is swapped back before insertion.
1540*575878a5SEd Maste#ifndef __AARCH64EB__
1541bc3d5698SJohn Baldwin	rev	w8, w8
1542bc3d5698SJohn Baldwin#endif
	// v0 = counter block as loaded, v1 = same block with counter+1,
	// v6 = untouched copy used as the template for later counter blocks.
	// w8 is advanced to counter+2 (the value destined for v18).
1543f443d080SJung-uk Kim	orr	v1.16b,v0.16b,v0.16b
1544bc3d5698SJohn Baldwin	add	w10, w8, #1
1545f443d080SJung-uk Kim	orr	v18.16b,v0.16b,v0.16b
1546f443d080SJung-uk Kim	add	w8, w8, #2
1547bc3d5698SJohn Baldwin	orr	v6.16b,v0.16b,v0.16b
1548bc3d5698SJohn Baldwin	rev	w10, w10
1549f443d080SJung-uk Kim	mov	v1.s[3],w10
	// Two blocks or fewer: only v0/v1 are needed, go straight to the tail.
1550bc3d5698SJohn Baldwin	b.ls	.Lctr32_tail
	// v18 = counter block with counter+2 (this overwrites w12/x12, which is
	// recomputed before the tail is entered from the loops).
1551bc3d5698SJohn Baldwin	rev	w12, w8
1552bc3d5698SJohn Baldwin	sub	x2,x2,#3		// bias
1553f443d080SJung-uk Kim	mov	v18.s[3],w12
	// x2 already has the bias of 3 applied; when it is below 32 the 5x
	// loop is skipped and the 3x loop is used directly.
1554c0855eaaSJohn Baldwin	cmp	x2,#32
1555c0855eaaSJohn Baldwin	b.lo	.Loop3x_ctr32
1556c0855eaaSJohn Baldwin
	// Set up two more counter blocks for the 5x loop: v24 = counter+3,
	// v25 = counter+4.  w8 ends up holding the counter value of v25.
1557c0855eaaSJohn Baldwin	add	w13,w8,#1
1558c0855eaaSJohn Baldwin	add	w14,w8,#2
1559c0855eaaSJohn Baldwin	orr	v24.16b,v0.16b,v0.16b
1560c0855eaaSJohn Baldwin	rev	w13,w13
1561c0855eaaSJohn Baldwin	orr	v25.16b,v0.16b,v0.16b
1562c0855eaaSJohn Baldwin	rev	w14,w14
1563c0855eaaSJohn Baldwin	mov	v24.s[3],w13
1564c0855eaaSJohn Baldwin	sub	x2,x2,#2		// bias
1565c0855eaaSJohn Baldwin	mov	v25.s[3],w14
1566c0855eaaSJohn Baldwin	add	w8,w8,#2
1567c0855eaaSJohn Baldwin	b	.Loop5x_ctr32
1568c0855eaaSJohn Baldwin
1569c0855eaaSJohn Baldwin.align	4
	// Five-block loop.  The inner part applies two rounds per pass to
	// v0, v1, v18, v24 and v25 while w6 counts down by 2 and x7 walks the
	// key schedule.
1570c0855eaaSJohn Baldwin.Loop5x_ctr32:
1571c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
1572c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1573c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
1574c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
1575c0855eaaSJohn Baldwin	aese	v18.16b,v16.16b
1576c0855eaaSJohn Baldwin	aesmc	v18.16b,v18.16b
1577c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
1578c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
1579c0855eaaSJohn Baldwin	aese	v25.16b,v16.16b
1580c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
1581c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
1582c0855eaaSJohn Baldwin	subs	w6,w6,#2
1583c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
1584c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1585c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
1586c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
1587c0855eaaSJohn Baldwin	aese	v18.16b,v17.16b
1588c0855eaaSJohn Baldwin	aesmc	v18.16b,v18.16b
1589c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
1590c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
1591c0855eaaSJohn Baldwin	aese	v25.16b,v17.16b
1592c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
1593c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
1594c0855eaaSJohn Baldwin	b.gt	.Loop5x_ctr32
1595c0855eaaSJohn Baldwin
	// Remaining rounds for this group of five.  x7 is rewound to the start
	// of the key schedule so v16/v17 can be reloaded for the next group as
	// soon as their current contents have been used.
1596c0855eaaSJohn Baldwin	mov	x7,x3
1597c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
1598c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1599c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
1600c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
1601c0855eaaSJohn Baldwin	aese	v18.16b,v16.16b
1602c0855eaaSJohn Baldwin	aesmc	v18.16b,v18.16b
1603c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
1604c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
1605c0855eaaSJohn Baldwin	aese	v25.16b,v16.16b
1606c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
1607c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1608c0855eaaSJohn Baldwin
1609c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
1610c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1611c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
1612c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
1613c0855eaaSJohn Baldwin	aese	v18.16b,v17.16b
1614c0855eaaSJohn Baldwin	aesmc	v18.16b,v18.16b
1615c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
1616c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
1617c0855eaaSJohn Baldwin	aese	v25.16b,v17.16b
1618c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
1619c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1620c0855eaaSJohn Baldwin
	// Interleaved with the next round: w9, w10, w12, w13, w14 = the next
	// five counter values (w8+1 .. w8+5), byte-swapped for insertion into
	// the counter blocks.
1621c0855eaaSJohn Baldwin	aese	v0.16b,v20.16b
1622c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1623c0855eaaSJohn Baldwin	add	w9,w8,#1
1624c0855eaaSJohn Baldwin	add	w10,w8,#2
1625c0855eaaSJohn Baldwin	aese	v1.16b,v20.16b
1626c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
1627c0855eaaSJohn Baldwin	add	w12,w8,#3
1628c0855eaaSJohn Baldwin	add	w13,w8,#4
1629c0855eaaSJohn Baldwin	aese	v18.16b,v20.16b
1630c0855eaaSJohn Baldwin	aesmc	v18.16b,v18.16b
1631c0855eaaSJohn Baldwin	add	w14,w8,#5
1632c0855eaaSJohn Baldwin	rev	w9,w9
1633c0855eaaSJohn Baldwin	aese	v24.16b,v20.16b
1634c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
1635c0855eaaSJohn Baldwin	rev	w10,w10
1636c0855eaaSJohn Baldwin	rev	w12,w12
1637c0855eaaSJohn Baldwin	aese	v25.16b,v20.16b
1638c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
1639c0855eaaSJohn Baldwin	rev	w13,w13
1640c0855eaaSJohn Baldwin	rev	w14,w14
1641c0855eaaSJohn Baldwin
1642c0855eaaSJohn Baldwin	aese	v0.16b,v21.16b
1643c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1644c0855eaaSJohn Baldwin	aese	v1.16b,v21.16b
1645c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
1646c0855eaaSJohn Baldwin	aese	v18.16b,v21.16b
1647c0855eaaSJohn Baldwin	aesmc	v18.16b,v18.16b
1648c0855eaaSJohn Baldwin	aese	v24.16b,v21.16b
1649c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
1650c0855eaaSJohn Baldwin	aese	v25.16b,v21.16b
1651c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
1652c0855eaaSJohn Baldwin
	// Load the five input blocks into v2, v3, v19, v26, v27 while the
	// next round is applied.
1653c0855eaaSJohn Baldwin	aese	v0.16b,v22.16b
1654c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1655c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16
1656c0855eaaSJohn Baldwin	aese	v1.16b,v22.16b
1657c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
1658c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16
1659c0855eaaSJohn Baldwin	aese	v18.16b,v22.16b
1660c0855eaaSJohn Baldwin	aesmc	v18.16b,v18.16b
1661c0855eaaSJohn Baldwin	ld1	{v19.16b},[x0],#16
1662c0855eaaSJohn Baldwin	aese	v24.16b,v22.16b
1663c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
1664c0855eaaSJohn Baldwin	ld1	{v26.16b},[x0],#16
1665c0855eaaSJohn Baldwin	aese	v25.16b,v22.16b
1666c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
1667c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
1668c0855eaaSJohn Baldwin
	// Last aese (no aesmc).  The final round key v7 is XORed into the
	// input blocks here and the aese results are XORed in just below, so
	// each output is input ^ v7 ^ aese-result.
1669c0855eaaSJohn Baldwin	aese	v0.16b,v23.16b
1670c0855eaaSJohn Baldwin	eor	v2.16b,v2.16b,v7.16b
1671c0855eaaSJohn Baldwin	aese	v1.16b,v23.16b
1672c0855eaaSJohn Baldwin	eor	v3.16b,v3.16b,v7.16b
1673c0855eaaSJohn Baldwin	aese	v18.16b,v23.16b
1674c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v7.16b
1675c0855eaaSJohn Baldwin	aese	v24.16b,v23.16b
1676c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v7.16b
1677c0855eaaSJohn Baldwin	aese	v25.16b,v23.16b
1678c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v7.16b
1679c0855eaaSJohn Baldwin
	// Combine with the input and reset each state register to the
	// template counter block held in v6.
1680c0855eaaSJohn Baldwin	eor	v2.16b,v2.16b,v0.16b
1681c0855eaaSJohn Baldwin	orr	v0.16b,v6.16b,v6.16b
1682c0855eaaSJohn Baldwin	eor	v3.16b,v3.16b,v1.16b
1683c0855eaaSJohn Baldwin	orr	v1.16b,v6.16b,v6.16b
1684c0855eaaSJohn Baldwin	eor	v19.16b,v19.16b,v18.16b
1685c0855eaaSJohn Baldwin	orr	v18.16b,v6.16b,v6.16b
1686c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v24.16b
1687c0855eaaSJohn Baldwin	orr	v24.16b,v6.16b,v6.16b
1688c0855eaaSJohn Baldwin	eor	v27.16b,v27.16b,v25.16b
1689c0855eaaSJohn Baldwin	orr	v25.16b,v6.16b,v6.16b
1690c0855eaaSJohn Baldwin
	// Store the five output blocks and insert the next five counter
	// values into lane 3 of the state registers.
1691c0855eaaSJohn Baldwin	st1	{v2.16b},[x1],#16
1692c0855eaaSJohn Baldwin	mov	v0.s[3],w9
1693c0855eaaSJohn Baldwin	st1	{v3.16b},[x1],#16
1694c0855eaaSJohn Baldwin	mov	v1.s[3],w10
1695c0855eaaSJohn Baldwin	st1	{v19.16b},[x1],#16
1696c0855eaaSJohn Baldwin	mov	v18.s[3],w12
1697c0855eaaSJohn Baldwin	st1	{v26.16b},[x1],#16
1698c0855eaaSJohn Baldwin	mov	v24.s[3],w13
1699c0855eaaSJohn Baldwin	st1	{v27.16b},[x1],#16
1700c0855eaaSJohn Baldwin	mov	v25.s[3],w14
1701c0855eaaSJohn Baldwin
	// Reset the round counter.  Because of the bias of 5 applied before
	// the loop, x2 now equals the number of blocks still to do.
1702c0855eaaSJohn Baldwin	mov	w6,w5
1703c0855eaaSJohn Baldwin	cbz	x2,.Lctr32_done
1704c0855eaaSJohn Baldwin
	// Go round again while at least five blocks remain.
1705c0855eaaSJohn Baldwin	add	w8,w8,#5
1706c0855eaaSJohn Baldwin	subs	x2,x2,#5
1707c0855eaaSJohn Baldwin	b.hs	.Loop5x_ctr32
1708c0855eaaSJohn Baldwin
	// Fewer than five blocks left: undo the last adjustment of x2 and w8.
1709c0855eaaSJohn Baldwin	add	x2,x2,#5
1710c0855eaaSJohn Baldwin	sub	w8,w8,#5
1711c0855eaaSJohn Baldwin
	// One or two blocks left -> tail (x12 = 0 for a single block, else 16);
	// three or four -> 3x loop.
1712c0855eaaSJohn Baldwin	cmp	x2,#2
1713c0855eaaSJohn Baldwin	mov	x12,#16
1714c0855eaaSJohn Baldwin	csel	x12,xzr,x12,lo
1715c0855eaaSJohn Baldwin	b.ls	.Lctr32_tail
1716c0855eaaSJohn Baldwin
	// w8 is moved on to the counter value already held in v18, which is
	// what the 3x loop expects.
1717c0855eaaSJohn Baldwin	sub	x2,x2,#3		// bias
1718c0855eaaSJohn Baldwin	add	w8,w8,#3
1719bc3d5698SJohn Baldwin	b	.Loop3x_ctr32
1720bc3d5698SJohn Baldwin
1721bc3d5698SJohn Baldwin.align	4
	// Three-block loop.  The inner part applies two rounds per pass to
	// v0, v1 and v18 while w6 counts down by 2 and x7 walks the key
	// schedule.  On entry w8 holds the counter value of v18.
1722bc3d5698SJohn Baldwin.Loop3x_ctr32:
1723bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1724bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1725bc3d5698SJohn Baldwin	aese	v1.16b,v16.16b
1726bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1727bc3d5698SJohn Baldwin	aese	v18.16b,v16.16b
1728bc3d5698SJohn Baldwin	aesmc	v18.16b,v18.16b
1729bc3d5698SJohn Baldwin	ld1	{v16.4s},[x7],#16
1730bc3d5698SJohn Baldwin	subs	w6,w6,#2
1731bc3d5698SJohn Baldwin	aese	v0.16b,v17.16b
1732bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1733bc3d5698SJohn Baldwin	aese	v1.16b,v17.16b
1734bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1735bc3d5698SJohn Baldwin	aese	v18.16b,v17.16b
1736bc3d5698SJohn Baldwin	aesmc	v18.16b,v18.16b
1737bc3d5698SJohn Baldwin	ld1	{v17.4s},[x7],#16
1738bc3d5698SJohn Baldwin	b.gt	.Loop3x_ctr32
1739bc3d5698SJohn Baldwin
	// Remaining rounds.  The in-flight states are moved to v4, v5 and v17
	// so that v0, v1 and v18 can be refilled from the template v6 with the
	// next counter values while these rounds complete.
1740bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1741bc3d5698SJohn Baldwin	aesmc	v4.16b,v0.16b
1742bc3d5698SJohn Baldwin	aese	v1.16b,v16.16b
1743bc3d5698SJohn Baldwin	aesmc	v5.16b,v1.16b
1744bc3d5698SJohn Baldwin	ld1	{v2.16b},[x0],#16
1745f443d080SJung-uk Kim	orr	v0.16b,v6.16b,v6.16b
1746bc3d5698SJohn Baldwin	aese	v18.16b,v16.16b
1747bc3d5698SJohn Baldwin	aesmc	v18.16b,v18.16b
1748bc3d5698SJohn Baldwin	ld1	{v3.16b},[x0],#16
1749f443d080SJung-uk Kim	orr	v1.16b,v6.16b,v6.16b
1750bc3d5698SJohn Baldwin	aese	v4.16b,v17.16b
1751bc3d5698SJohn Baldwin	aesmc	v4.16b,v4.16b
1752bc3d5698SJohn Baldwin	aese	v5.16b,v17.16b
1753bc3d5698SJohn Baldwin	aesmc	v5.16b,v5.16b
1754bc3d5698SJohn Baldwin	ld1	{v19.16b},[x0],#16
1755bc3d5698SJohn Baldwin	mov	x7,x3
	// v17 is used as a round key for the last time here; from now on it
	// carries the third block's state until it is reloaded below.
1756bc3d5698SJohn Baldwin	aese	v18.16b,v17.16b
1757bc3d5698SJohn Baldwin	aesmc	v17.16b,v18.16b
1758f443d080SJung-uk Kim	orr	v18.16b,v6.16b,v6.16b
	// Next counter values: w9 = w8+1, w10 = w8+2, and w8 itself advances
	// by 3 (interleaved with the rounds that follow).
1759f443d080SJung-uk Kim	add	w9,w8,#1
1760bc3d5698SJohn Baldwin	aese	v4.16b,v20.16b
1761bc3d5698SJohn Baldwin	aesmc	v4.16b,v4.16b
1762bc3d5698SJohn Baldwin	aese	v5.16b,v20.16b
1763bc3d5698SJohn Baldwin	aesmc	v5.16b,v5.16b
1764bc3d5698SJohn Baldwin	eor	v2.16b,v2.16b,v7.16b
1765bc3d5698SJohn Baldwin	add	w10,w8,#2
1766bc3d5698SJohn Baldwin	aese	v17.16b,v20.16b
1767bc3d5698SJohn Baldwin	aesmc	v17.16b,v17.16b
1768bc3d5698SJohn Baldwin	eor	v3.16b,v3.16b,v7.16b
1769bc3d5698SJohn Baldwin	add	w8,w8,#3
1770bc3d5698SJohn Baldwin	aese	v4.16b,v21.16b
1771bc3d5698SJohn Baldwin	aesmc	v4.16b,v4.16b
1772bc3d5698SJohn Baldwin	aese	v5.16b,v21.16b
1773bc3d5698SJohn Baldwin	aesmc	v5.16b,v5.16b
1774bc3d5698SJohn Baldwin	eor	v19.16b,v19.16b,v7.16b
1775f443d080SJung-uk Kim	rev	w9,w9
1776bc3d5698SJohn Baldwin	aese	v17.16b,v21.16b
1777bc3d5698SJohn Baldwin	aesmc	v17.16b,v17.16b
1778f443d080SJung-uk Kim	mov	v0.s[3], w9
1779bc3d5698SJohn Baldwin	rev	w10,w10
1780bc3d5698SJohn Baldwin	aese	v4.16b,v22.16b
1781bc3d5698SJohn Baldwin	aesmc	v4.16b,v4.16b
1782bc3d5698SJohn Baldwin	aese	v5.16b,v22.16b
1783bc3d5698SJohn Baldwin	aesmc	v5.16b,v5.16b
1784f443d080SJung-uk Kim	mov	v1.s[3], w10
1785f443d080SJung-uk Kim	rev	w12,w8
1786bc3d5698SJohn Baldwin	aese	v17.16b,v22.16b
1787bc3d5698SJohn Baldwin	aesmc	v17.16b,v17.16b
1788f443d080SJung-uk Kim	mov	v18.s[3], w12
	// The flags from this subs are consumed by the b.hs at the bottom of
	// the loop; no instruction in between modifies them.
1789bc3d5698SJohn Baldwin	subs	x2,x2,#3
1790bc3d5698SJohn Baldwin	aese	v4.16b,v23.16b
1791bc3d5698SJohn Baldwin	aese	v5.16b,v23.16b
1792bc3d5698SJohn Baldwin	aese	v17.16b,v23.16b
1793bc3d5698SJohn Baldwin
	// Combine with the input (already XORed with v7), store three output
	// blocks, and reset w6/v16/v17 for the next pass.
1794bc3d5698SJohn Baldwin	eor	v2.16b,v2.16b,v4.16b
1795bc3d5698SJohn Baldwin	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1796bc3d5698SJohn Baldwin	st1	{v2.16b},[x1],#16
1797bc3d5698SJohn Baldwin	eor	v3.16b,v3.16b,v5.16b
1798bc3d5698SJohn Baldwin	mov	w6,w5
1799bc3d5698SJohn Baldwin	st1	{v3.16b},[x1],#16
1800bc3d5698SJohn Baldwin	eor	v19.16b,v19.16b,v17.16b
1801bc3d5698SJohn Baldwin	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1802bc3d5698SJohn Baldwin	st1	{v19.16b},[x1],#16
1803bc3d5698SJohn Baldwin	b.hs	.Loop3x_ctr32
1804bc3d5698SJohn Baldwin
	// Remove the bias: x2 = blocks left (0, 1 or 2).  Zero means done;
	// otherwise x12 = 0 for one block, 16 for two.
1805bc3d5698SJohn Baldwin	adds	x2,x2,#3
1806bc3d5698SJohn Baldwin	b.eq	.Lctr32_done
1807bc3d5698SJohn Baldwin	cmp	x2,#1
1808bc3d5698SJohn Baldwin	mov	x12,#16
1809bc3d5698SJohn Baldwin	csel	x12,xzr,x12,eq
1810bc3d5698SJohn Baldwin
	// Tail: always encrypts both v0 and v1, then stores one or two output
	// blocks depending on x2.
1811bc3d5698SJohn Baldwin.Lctr32_tail:
1812bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1813bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1814bc3d5698SJohn Baldwin	aese	v1.16b,v16.16b
1815bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1816bc3d5698SJohn Baldwin	ld1	{v16.4s},[x7],#16
1817bc3d5698SJohn Baldwin	subs	w6,w6,#2
1818bc3d5698SJohn Baldwin	aese	v0.16b,v17.16b
1819bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1820bc3d5698SJohn Baldwin	aese	v1.16b,v17.16b
1821bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1822bc3d5698SJohn Baldwin	ld1	{v17.4s},[x7],#16
1823bc3d5698SJohn Baldwin	b.gt	.Lctr32_tail
1824bc3d5698SJohn Baldwin
1825bc3d5698SJohn Baldwin	aese	v0.16b,v16.16b
1826bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1827bc3d5698SJohn Baldwin	aese	v1.16b,v16.16b
1828bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1829bc3d5698SJohn Baldwin	aese	v0.16b,v17.16b
1830bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1831bc3d5698SJohn Baldwin	aese	v1.16b,v17.16b
1832bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
	// x0 advances by x12: 16 when a second block exists, otherwise 0 so
	// that the second load below re-reads the same block instead of
	// reading beyond it.
1833bc3d5698SJohn Baldwin	ld1	{v2.16b},[x0],x12
1834bc3d5698SJohn Baldwin	aese	v0.16b,v20.16b
1835bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1836bc3d5698SJohn Baldwin	aese	v1.16b,v20.16b
1837bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1838bc3d5698SJohn Baldwin	ld1	{v3.16b},[x0]
1839bc3d5698SJohn Baldwin	aese	v0.16b,v21.16b
1840bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1841bc3d5698SJohn Baldwin	aese	v1.16b,v21.16b
1842bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1843bc3d5698SJohn Baldwin	eor	v2.16b,v2.16b,v7.16b
1844bc3d5698SJohn Baldwin	aese	v0.16b,v22.16b
1845bc3d5698SJohn Baldwin	aesmc	v0.16b,v0.16b
1846bc3d5698SJohn Baldwin	aese	v1.16b,v22.16b
1847bc3d5698SJohn Baldwin	aesmc	v1.16b,v1.16b
1848bc3d5698SJohn Baldwin	eor	v3.16b,v3.16b,v7.16b
1849bc3d5698SJohn Baldwin	aese	v0.16b,v23.16b
1850bc3d5698SJohn Baldwin	aese	v1.16b,v23.16b
1851bc3d5698SJohn Baldwin
	// The first block is always stored; the second store is skipped when
	// exactly one block was left.
1852bc3d5698SJohn Baldwin	cmp	x2,#1
1853bc3d5698SJohn Baldwin	eor	v2.16b,v2.16b,v0.16b
1854bc3d5698SJohn Baldwin	eor	v3.16b,v3.16b,v1.16b
1855bc3d5698SJohn Baldwin	st1	{v2.16b},[x1],#16
1856bc3d5698SJohn Baldwin	b.eq	.Lctr32_done
1857bc3d5698SJohn Baldwin	st1	{v3.16b},[x1]
1858bc3d5698SJohn Baldwin
	// Epilogue: reload x29 and release the 16-byte frame; x30 is left as
	// it is (see the note at function entry).
1859bc3d5698SJohn Baldwin.Lctr32_done:
1860bc3d5698SJohn Baldwin	ldr	x29,[sp],#16
1861bc3d5698SJohn Baldwin	ret
1862bc3d5698SJohn Baldwin.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
1863c0855eaaSJohn Baldwin.globl	aes_v8_xts_encrypt
1864c0855eaaSJohn Baldwin.type	aes_v8_xts_encrypt,%function
1865c0855eaaSJohn Baldwin.align	5
1866c0855eaaSJohn Baldwinaes_v8_xts_encrypt:
1867bd9588bcSAndrew Turner	AARCH64_VALID_CALL_TARGET
1868c0855eaaSJohn Baldwin	cmp	x2,#16
1869c0855eaaSJohn Baldwin	// Original input data size bigger than 16, jump to big size processing.
1870c0855eaaSJohn Baldwin	b.ne	.Lxts_enc_big_size
1871c0855eaaSJohn Baldwin	// Encrypt the iv with key2, as the first XEX iv.
1872c0855eaaSJohn Baldwin	ldr	w6,[x4,#240]
1873c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
1874c0855eaaSJohn Baldwin	ld1	{v6.16b},[x5]
1875c0855eaaSJohn Baldwin	sub	w6,w6,#2
1876c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
1877c0855eaaSJohn Baldwin
1878c0855eaaSJohn Baldwin.Loop_enc_iv_enc:
1879c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
1880c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
1881c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
1882c0855eaaSJohn Baldwin	subs	w6,w6,#2
1883c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b
1884c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
1885c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
1886c0855eaaSJohn Baldwin	b.gt	.Loop_enc_iv_enc
1887c0855eaaSJohn Baldwin
1888c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
1889c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
1890c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4]
1891c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b
1892c0855eaaSJohn Baldwin	eor	v6.16b,v6.16b,v0.16b
1893c0855eaaSJohn Baldwin
1894c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0]
1895c0855eaaSJohn Baldwin	eor	v0.16b,v6.16b,v0.16b
1896c0855eaaSJohn Baldwin
1897c0855eaaSJohn Baldwin	ldr	w6,[x3,#240]
1898c0855eaaSJohn Baldwin	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...
1899c0855eaaSJohn Baldwin
1900c0855eaaSJohn Baldwin	aese	v0.16b,v28.16b
1901c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1902c0855eaaSJohn Baldwin	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
1903c0855eaaSJohn Baldwin	aese	v0.16b,v29.16b
1904c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1905c0855eaaSJohn Baldwin	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
1906c0855eaaSJohn Baldwin	b.eq	.Lxts_128_enc
1907c0855eaaSJohn Baldwin.Lxts_enc_round_loop:
1908c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
1909c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1910c0855eaaSJohn Baldwin	ld1	{v16.4s},[x3],#16		// load key schedule...
1911c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
1912c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1913c0855eaaSJohn Baldwin	ld1	{v17.4s},[x3],#16		// load key schedule...
1914c0855eaaSJohn Baldwin	subs	w6,w6,#2		// bias
1915c0855eaaSJohn Baldwin	b.gt	.Lxts_enc_round_loop
1916c0855eaaSJohn Baldwin.Lxts_128_enc:
1917c0855eaaSJohn Baldwin	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
1918c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
1919c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1920c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
1921c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1922c0855eaaSJohn Baldwin	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
1923c0855eaaSJohn Baldwin	aese	v0.16b,v18.16b
1924c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1925c0855eaaSJohn Baldwin	aese	v0.16b,v19.16b
1926c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1927c0855eaaSJohn Baldwin	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
1928c0855eaaSJohn Baldwin	aese	v0.16b,v20.16b
1929c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1930c0855eaaSJohn Baldwin	aese	v0.16b,v21.16b
1931c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1932c0855eaaSJohn Baldwin	ld1	{v7.4s},[x3]
1933c0855eaaSJohn Baldwin	aese	v0.16b,v22.16b
1934c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
1935c0855eaaSJohn Baldwin	aese	v0.16b,v23.16b
1936c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v7.16b
1937c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v6.16b
1938c0855eaaSJohn Baldwin	st1	{v0.16b},[x1]
1939c0855eaaSJohn Baldwin	b	.Lxts_enc_final_abort
1940c0855eaaSJohn Baldwin
1941c0855eaaSJohn Baldwin.align	4
1942c0855eaaSJohn Baldwin.Lxts_enc_big_size:
1943c0855eaaSJohn Baldwin	stp	x19,x20,[sp,#-64]!
1944c0855eaaSJohn Baldwin	stp	x21,x22,[sp,#48]
1945c0855eaaSJohn Baldwin	stp	d8,d9,[sp,#32]
1946c0855eaaSJohn Baldwin	stp	d10,d11,[sp,#16]
1947c0855eaaSJohn Baldwin
1948c0855eaaSJohn Baldwin	// tailcnt store the tail value of length%16.
1949c0855eaaSJohn Baldwin	and	x21,x2,#0xf
1950c0855eaaSJohn Baldwin	and	x2,x2,#-16
1951c0855eaaSJohn Baldwin	subs	x2,x2,#16
1952c0855eaaSJohn Baldwin	mov	x8,#16
1953c0855eaaSJohn Baldwin	b.lo	.Lxts_abort
1954c0855eaaSJohn Baldwin	csel	x8,xzr,x8,eq
1955c0855eaaSJohn Baldwin
1956c0855eaaSJohn Baldwin	// Firstly, encrypt the iv with key2, as the first iv of XEX.
1957c0855eaaSJohn Baldwin	ldr	w6,[x4,#240]
1958c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
1959c0855eaaSJohn Baldwin	ld1	{v6.16b},[x5]
1960c0855eaaSJohn Baldwin	sub	w6,w6,#2
1961c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
1962c0855eaaSJohn Baldwin
1963c0855eaaSJohn Baldwin.Loop_iv_enc:
1964c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
1965c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
1966c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
1967c0855eaaSJohn Baldwin	subs	w6,w6,#2
1968c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b
1969c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
1970c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
1971c0855eaaSJohn Baldwin	b.gt	.Loop_iv_enc
1972c0855eaaSJohn Baldwin
1973c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
1974c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
1975c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4]
1976c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b
1977c0855eaaSJohn Baldwin	eor	v6.16b,v6.16b,v0.16b
1978c0855eaaSJohn Baldwin
1979c0855eaaSJohn Baldwin	// The iv for second block
1980c0855eaaSJohn Baldwin	// x9- iv(low), x10 - iv(high)
1981c0855eaaSJohn Baldwin	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
1982c0855eaaSJohn Baldwin	fmov	x9,d6
1983c0855eaaSJohn Baldwin	fmov	x10,v6.d[1]
1984c0855eaaSJohn Baldwin	mov	w19,#0x87
1985c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
1986c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
1987c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31
1988c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1
1989c0855eaaSJohn Baldwin	fmov	d8,x9
1990c0855eaaSJohn Baldwin	fmov	v8.d[1],x10
1991c0855eaaSJohn Baldwin
1992c0855eaaSJohn Baldwin	ldr	w5,[x3,#240]		// next starting point
1993c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0],x8
1994c0855eaaSJohn Baldwin
1995c0855eaaSJohn Baldwin	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
1996c0855eaaSJohn Baldwin	sub	w5,w5,#6
1997c0855eaaSJohn Baldwin	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
1998c0855eaaSJohn Baldwin	sub	w5,w5,#2
1999c0855eaaSJohn Baldwin	ld1	{v18.4s,v19.4s},[x7],#32
2000c0855eaaSJohn Baldwin	ld1	{v20.4s,v21.4s},[x7],#32
2001c0855eaaSJohn Baldwin	ld1	{v22.4s,v23.4s},[x7],#32
2002c0855eaaSJohn Baldwin	ld1	{v7.4s},[x7]
2003c0855eaaSJohn Baldwin
2004c0855eaaSJohn Baldwin	add	x7,x3,#32
2005c0855eaaSJohn Baldwin	mov	w6,w5
2006c0855eaaSJohn Baldwin
2007c0855eaaSJohn Baldwin	// Encryption
2008c0855eaaSJohn Baldwin.Lxts_enc:
2009c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16
2010c0855eaaSJohn Baldwin	subs	x2,x2,#32			// bias
2011c0855eaaSJohn Baldwin	add	w6,w5,#2
2012c0855eaaSJohn Baldwin	orr	v3.16b,v0.16b,v0.16b
2013c0855eaaSJohn Baldwin	orr	v1.16b,v0.16b,v0.16b
2014c0855eaaSJohn Baldwin	orr	v28.16b,v0.16b,v0.16b
2015c0855eaaSJohn Baldwin	orr	v27.16b,v24.16b,v24.16b
2016c0855eaaSJohn Baldwin	orr	v29.16b,v24.16b,v24.16b
2017c0855eaaSJohn Baldwin	b.lo	.Lxts_inner_enc_tail
2018c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
2019c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v8.16b
2020c0855eaaSJohn Baldwin
2021c0855eaaSJohn Baldwin	// The iv for third block
2022c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2023c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2024c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31
2025c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1
2026c0855eaaSJohn Baldwin	fmov	d9,x9
2027c0855eaaSJohn Baldwin	fmov	v9.d[1],x10
2028c0855eaaSJohn Baldwin
2029c0855eaaSJohn Baldwin
2030c0855eaaSJohn Baldwin	orr	v1.16b,v24.16b,v24.16b
2031c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16
2032c0855eaaSJohn Baldwin	orr	v2.16b,v0.16b,v0.16b
2033c0855eaaSJohn Baldwin	orr	v3.16b,v1.16b,v1.16b
2034c0855eaaSJohn Baldwin	eor	v27.16b,v24.16b,v9.16b 		// the third block
2035c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v9.16b
2036c0855eaaSJohn Baldwin	cmp	x2,#32
2037c0855eaaSJohn Baldwin	b.lo	.Lxts_outer_enc_tail
2038c0855eaaSJohn Baldwin
2039c0855eaaSJohn Baldwin	// The iv for fourth block
2040c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2041c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2042c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31
2043c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1
2044c0855eaaSJohn Baldwin	fmov	d10,x9
2045c0855eaaSJohn Baldwin	fmov	v10.d[1],x10
2046c0855eaaSJohn Baldwin
2047c0855eaaSJohn Baldwin	ld1	{v25.16b},[x0],#16
2048c0855eaaSJohn Baldwin	// The iv for fifth block
2049c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2050c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2051c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31
2052c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1
2053c0855eaaSJohn Baldwin	fmov	d11,x9
2054c0855eaaSJohn Baldwin	fmov	v11.d[1],x10
2055c0855eaaSJohn Baldwin
2056c0855eaaSJohn Baldwin	ld1	{v26.16b},[x0],#16
2057c0855eaaSJohn Baldwin	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2058c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v11.16b
2059c0855eaaSJohn Baldwin	sub	x2,x2,#32			// bias
2060c0855eaaSJohn Baldwin	mov	w6,w5
2061c0855eaaSJohn Baldwin	b	.Loop5x_xts_enc
2062c0855eaaSJohn Baldwin
// .Loop5x_xts_enc: AES-XTS encryption of five blocks in parallel.
// The five lanes live in v0, v1, v24, v25 and v26.  v16/v17 hold the current
// pair of round keys (refilled through x7), v18-v23 hold the keys for the
// final rounds, and w6 counts the rounds left for the inner loop.
// Once the rounds are done each lane is XORed with v7 and its tweak
// (v6, v8, v9, v10, v11), the tweaks are advanced in x10:x9, and the next
// five input blocks are loaded from x0 and pre-XORed with the new tweaks.
// NOTE(review): v7 is presumably the last round key and w19 the reduction
// constant 0x87; both are set up before this chunk - confirm.
2063c0855eaaSJohn Baldwin.align	4
2064c0855eaaSJohn Baldwin.Loop5x_xts_enc:
2065c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
2066c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2067c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
2068c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2069c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
2070c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2071c0855eaaSJohn Baldwin	aese	v25.16b,v16.16b
2072c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2073c0855eaaSJohn Baldwin	aese	v26.16b,v16.16b
2074c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2075c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16	// fetch the next round key into v16
2076c0855eaaSJohn Baldwin	subs	w6,w6,#2	// two rounds are consumed per pass
2077c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
2078c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2079c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
2080c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2081c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
2082c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2083c0855eaaSJohn Baldwin	aese	v25.16b,v17.16b
2084c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2085c0855eaaSJohn Baldwin	aese	v26.16b,v17.16b
2086c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2087c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16	// fetch the next round key into v17
2088c0855eaaSJohn Baldwin	b.gt	.Loop5x_xts_enc	// flags come from the subs on w6 above
2089c0855eaaSJohn Baldwin
2090c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
2091c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2092c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
2093c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2094c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
2095c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2096c0855eaaSJohn Baldwin	aese	v25.16b,v16.16b
2097c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2098c0855eaaSJohn Baldwin	aese	v26.16b,v16.16b
2099c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2100c0855eaaSJohn Baldwin	subs	x2,x2,#0x50			// because .Lxts_enc_tail4x; these flags feed the csel and the b.hs below
2101c0855eaaSJohn Baldwin
2102c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
2103c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2104c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
2105c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2106c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
2107c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2108c0855eaaSJohn Baldwin	aese	v25.16b,v17.16b
2109c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2110c0855eaaSJohn Baldwin	aese	v26.16b,v17.16b
2111c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2112c0855eaaSJohn Baldwin	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo: x6 = 0 if x2 > 0, else x6 = x2
2113c0855eaaSJohn Baldwin	mov	x7,x3	// rewind the round-key pointer to the start of the schedule
2114c0855eaaSJohn Baldwin
2115c0855eaaSJohn Baldwin	aese	v0.16b,v18.16b
2116c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2117c0855eaaSJohn Baldwin	aese	v1.16b,v18.16b
2118c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2119c0855eaaSJohn Baldwin	aese	v24.16b,v18.16b
2120c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2121c0855eaaSJohn Baldwin	aese	v25.16b,v18.16b
2122c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2123c0855eaaSJohn Baldwin	aese	v26.16b,v18.16b
2124c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2125c0855eaaSJohn Baldwin	add	x0,x0,x6		// x0 is adjusted in such way that
2126c0855eaaSJohn Baldwin						// at exit from the loop v1.16b-v26.16b
2127c0855eaaSJohn Baldwin						// are loaded with last "words"
2128c0855eaaSJohn Baldwin	add	x6,x2,#0x60		// because .Lxts_enc_tail4x: x6 becomes 0 only if x2 was -0x10 on entry to this pass
2129c0855eaaSJohn Baldwin
2130c0855eaaSJohn Baldwin	aese	v0.16b,v19.16b
2131c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2132c0855eaaSJohn Baldwin	aese	v1.16b,v19.16b
2133c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2134c0855eaaSJohn Baldwin	aese	v24.16b,v19.16b
2135c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2136c0855eaaSJohn Baldwin	aese	v25.16b,v19.16b
2137c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2138c0855eaaSJohn Baldwin	aese	v26.16b,v19.16b
2139c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2140c0855eaaSJohn Baldwin
2141c0855eaaSJohn Baldwin	aese	v0.16b,v20.16b
2142c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2143c0855eaaSJohn Baldwin	aese	v1.16b,v20.16b
2144c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2145c0855eaaSJohn Baldwin	aese	v24.16b,v20.16b
2146c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2147c0855eaaSJohn Baldwin	aese	v25.16b,v20.16b
2148c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2149c0855eaaSJohn Baldwin	aese	v26.16b,v20.16b
2150c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2151c0855eaaSJohn Baldwin
2152c0855eaaSJohn Baldwin	aese	v0.16b,v21.16b
2153c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2154c0855eaaSJohn Baldwin	aese	v1.16b,v21.16b
2155c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2156c0855eaaSJohn Baldwin	aese	v24.16b,v21.16b
2157c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2158c0855eaaSJohn Baldwin	aese	v25.16b,v21.16b
2159c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2160c0855eaaSJohn Baldwin	aese	v26.16b,v21.16b
2161c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2162c0855eaaSJohn Baldwin
2163c0855eaaSJohn Baldwin	aese	v0.16b,v22.16b
2164c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2165c0855eaaSJohn Baldwin	aese	v1.16b,v22.16b
2166c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2167c0855eaaSJohn Baldwin	aese	v24.16b,v22.16b
2168c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2169c0855eaaSJohn Baldwin	aese	v25.16b,v22.16b
2170c0855eaaSJohn Baldwin	aesmc	v25.16b,v25.16b
2171c0855eaaSJohn Baldwin	aese	v26.16b,v22.16b
2172c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2173c0855eaaSJohn Baldwin
2174c0855eaaSJohn Baldwin	eor	v4.16b,v7.16b,v6.16b	// v4 = v7 ^ current tweak of lane 0, captured before v6 is advanced
2175c0855eaaSJohn Baldwin	aese	v0.16b,v23.16b	// last aese of lane 0; no aesmc follows
2176c0855eaaSJohn Baldwin	// The iv for first block of one iteration
2177c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32	// w22 = upper word of x10, so bit 31 is the top tweak bit
2178c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63	// x10 = (x10 << 1) | (x9 >> 63)
2179c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31	// w11 = w19 if the top tweak bit was set, else 0
2180c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1	// x9 = (x9 << 1) ^ w11
2181c0855eaaSJohn Baldwin	fmov	d6,x9
2182c0855eaaSJohn Baldwin	fmov	v6.d[1],x10
2183c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v8.16b	// v5 = v7 ^ current tweak of lane 1
2184c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16	// next input block for lane 0
2185c0855eaaSJohn Baldwin	aese	v1.16b,v23.16b
2186c0855eaaSJohn Baldwin	// The iv for second block
2187c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2188c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2189c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31
2190c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1
2191c0855eaaSJohn Baldwin	fmov	d8,x9
2192c0855eaaSJohn Baldwin	fmov	v8.d[1],x10
2193c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v9.16b	// v17 reused as scratch: v7 ^ current tweak of lane 2
2194c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16	// next input block for lane 1
2195c0855eaaSJohn Baldwin	aese	v24.16b,v23.16b
2196c0855eaaSJohn Baldwin	// The iv for third block
2197c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2198c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2199c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31
2200c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1
2201c0855eaaSJohn Baldwin	fmov	d9,x9
2202c0855eaaSJohn Baldwin	fmov	v9.d[1],x10
2203c0855eaaSJohn Baldwin	eor	v30.16b,v7.16b,v10.16b	// v30 = v7 ^ current tweak of lane 3
2204c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16	// next input block for lane 2
2205c0855eaaSJohn Baldwin	aese	v25.16b,v23.16b
2206c0855eaaSJohn Baldwin	// The iv for fourth block
2207c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2208c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2209c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31
2210c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1
2211c0855eaaSJohn Baldwin	fmov	d10,x9
2212c0855eaaSJohn Baldwin	fmov	v10.d[1],x10
2213c0855eaaSJohn Baldwin	eor	v31.16b,v7.16b,v11.16b	// v31 = v7 ^ current tweak of lane 4
2214c0855eaaSJohn Baldwin	ld1	{v28.16b},[x0],#16	// next input block for lane 3
2215c0855eaaSJohn Baldwin	aese	v26.16b,v23.16b
2216c0855eaaSJohn Baldwin
2217c0855eaaSJohn Baldwin	// The iv for fifth block
2218c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2219c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2220c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2221c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2222c0855eaaSJohn Baldwin	fmov	d11,x9
2223c0855eaaSJohn Baldwin	fmov	v11.d[1],x10
2224c0855eaaSJohn Baldwin
2225c0855eaaSJohn Baldwin	ld1	{v29.16b},[x0],#16	// next input block for lane 4
2226c0855eaaSJohn Baldwin	cbz	x6,.Lxts_enc_tail4x	// x6 == 0: this pass was entered with x2 == -0x10
2227c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2228c0855eaaSJohn Baldwin	eor	v4.16b,v4.16b,v0.16b	// finished output of lane 0
2229c0855eaaSJohn Baldwin	eor	v0.16b,v2.16b,v6.16b	// next input of lane 0 ^ its new tweak
2230c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
2231c0855eaaSJohn Baldwin	eor	v1.16b,v3.16b,v8.16b
2232c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v24.16b
2233c0855eaaSJohn Baldwin	eor	v24.16b,v27.16b,v9.16b
2234c0855eaaSJohn Baldwin	eor	v30.16b,v30.16b,v25.16b
2235c0855eaaSJohn Baldwin	eor	v25.16b,v28.16b,v10.16b
2236c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v26.16b
2237c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
2238c0855eaaSJohn Baldwin	eor	v26.16b,v29.16b,v11.16b
2239c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
2240c0855eaaSJohn Baldwin	mov	w6,w5	// reset the inner-loop round counter
2241c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
2242c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2243c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
2244c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
2245c0855eaaSJohn Baldwin	b.hs	.Loop5x_xts_enc	// carry still from subs x2,x2,#0x50: no borrow, so run another pass
2246c0855eaaSJohn Baldwin
2247c0855eaaSJohn Baldwin
2248c0855eaaSJohn Baldwin	// If left 4 blocks, borrow the five block's processing.
	// x2 == -0x10 here means exactly four blocks remain.  The tweaks are
	// shifted up one register (v6 is duplicated into v8) and the five-lane
	// loop is run once more; lane 0 (v0) is a filler whose result
	// .Lxts_enc_tail4x never stores.
2249c0855eaaSJohn Baldwin	cmn	x2,#0x10	// sets Z when x2 == -0x10
2250c0855eaaSJohn Baldwin	b.ne	.Loop5x_enc_after
2251c0855eaaSJohn Baldwin	orr	v11.16b,v10.16b,v10.16b	// orr vA,vB,vB is a plain register copy
2252c0855eaaSJohn Baldwin	orr	v10.16b,v9.16b,v9.16b
2253c0855eaaSJohn Baldwin	orr	v9.16b,v8.16b,v8.16b
2254c0855eaaSJohn Baldwin	orr	v8.16b,v6.16b,v6.16b
2255c0855eaaSJohn Baldwin	fmov	x9,d11	// continue the scalar tweak chain from v11
2256c0855eaaSJohn Baldwin	fmov	x10,v11.d[1]
2257c0855eaaSJohn Baldwin	eor	v0.16b,v6.16b,v2.16b	// filler lane
2258c0855eaaSJohn Baldwin	eor	v1.16b,v8.16b,v3.16b
2259c0855eaaSJohn Baldwin	eor	v24.16b,v27.16b,v9.16b
2260c0855eaaSJohn Baldwin	eor	v25.16b,v28.16b,v10.16b
2261c0855eaaSJohn Baldwin	eor	v26.16b,v29.16b,v11.16b
2262c0855eaaSJohn Baldwin	b.eq	.Loop5x_xts_enc	// Z still set by the cmn above, so this is always taken
2263c0855eaaSJohn Baldwin
	// Fewer than four blocks remain: undo the 0x50 bias on x2 and dispatch
	// on the number of bytes left.
2264c0855eaaSJohn Baldwin.Loop5x_enc_after:
2265c0855eaaSJohn Baldwin	add	x2,x2,#0x50
2266c0855eaaSJohn Baldwin	cbz	x2,.Lxts_enc_done	// no whole blocks left
2267c0855eaaSJohn Baldwin
2268c0855eaaSJohn Baldwin	add	w6,w5,#2	// round counter for the tail loops
2269c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
2270c0855eaaSJohn Baldwin	b.lo	.Lxts_inner_enc_tail	// fewer than 0x30 bytes (three blocks) left
2271c0855eaaSJohn Baldwin
	// Three blocks: v27, v28, v29 with tweaks v6, v8, v9.
2272c0855eaaSJohn Baldwin	eor	v0.16b,v6.16b,v27.16b
2273c0855eaaSJohn Baldwin	eor	v1.16b,v8.16b,v28.16b
2274c0855eaaSJohn Baldwin	eor	v24.16b,v29.16b,v9.16b
2275c0855eaaSJohn Baldwin	b	.Lxts_outer_enc_tail
2276c0855eaaSJohn Baldwin
// .Lxts_enc_tail4x: finish the borrowed four-block pass of .Loop5x_xts_enc.
// Only lanes v1, v24, v25 and v26 are written out; each is XORed with the
// matching "v7 ^ tweak" value the loop left in v5, v17, v30 and v31.
// Lane 0 (v0 / v4) is not stored here.
2277c0855eaaSJohn Baldwin.align	4
2278c0855eaaSJohn Baldwin.Lxts_enc_tail4x:
2279c0855eaaSJohn Baldwin	add	x0,x0,#16	// advance the input pointer by one block
2280c0855eaaSJohn Baldwin	eor	v5.16b,v1.16b,v5.16b
2281c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
2282c0855eaaSJohn Baldwin	eor	v17.16b,v24.16b,v17.16b
2283c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
2284c0855eaaSJohn Baldwin	eor	v30.16b,v25.16b,v30.16b
2285c0855eaaSJohn Baldwin	eor	v31.16b,v26.16b,v31.16b
2286c0855eaaSJohn Baldwin	st1	{v30.16b,v31.16b},[x1],#32
2287c0855eaaSJohn Baldwin
2288c0855eaaSJohn Baldwin	b	.Lxts_enc_done
// .Lxts_outer_enc_tail: encrypt three blocks in lanes v0, v1 and v24.
// Their tweaks are v6, v8 and v9.  The rounds run two at a time from
// v16/v17 (refilled through x7, counted by w6) and finish with v20-v23.
// The results are XORed with v7 and the tweak, then stored to x1.
// A fresh tweak derived from v9 is left in v6.  If x2 shows nothing is
// left the code branches to .Lxts_enc_done; otherwise it falls through
// into .Lxts_encxor_one.
2289c0855eaaSJohn Baldwin.align	4
2290c0855eaaSJohn Baldwin.Lxts_outer_enc_tail:
2291c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
2292c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2293c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
2294c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2295c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
2296c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2297c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
2298c0855eaaSJohn Baldwin	subs	w6,w6,#2	// two rounds are consumed per pass
2299c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
2300c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2301c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
2302c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2303c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
2304c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2305c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
2306c0855eaaSJohn Baldwin	b.gt	.Lxts_outer_enc_tail	// flags come from the subs on w6 above
2307c0855eaaSJohn Baldwin
2308c0855eaaSJohn Baldwin	aese	v0.16b,v16.16b
2309c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2310c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
2311c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2312c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
2313c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2314c0855eaaSJohn Baldwin	eor	v4.16b,v6.16b,v7.16b	// v4 = tweak of lane 0 ^ v7, captured before v6 is replaced
2315c0855eaaSJohn Baldwin	subs	x2,x2,#0x30	// these flags feed the csel below
2316c0855eaaSJohn Baldwin	// The iv for first block
	// (derived from v9, the last tweak used by this group)
2317c0855eaaSJohn Baldwin	fmov	x9,d9
2318c0855eaaSJohn Baldwin	fmov	x10,v9.d[1]
2319c0855eaaSJohn Baldwin	//mov	w19,#0x87
2320c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32	// w22 = upper word of x10, so bit 31 is the top tweak bit
2321c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63	// x10 = (x10 << 1) | (x9 >> 63)
2322c0855eaaSJohn Baldwin	and	w11,w19,w22,asr#31	// w11 = w19 if the top tweak bit was set, else 0
2323c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl#1	// x9 = (x9 << 1) ^ w11
2324c0855eaaSJohn Baldwin	fmov	d6,x9
2325c0855eaaSJohn Baldwin	fmov	v6.d[1],x10
2326c0855eaaSJohn Baldwin	eor	v5.16b,v8.16b,v7.16b	// v5 = tweak of lane 1 ^ v7
2327c0855eaaSJohn Baldwin	csel	x6,x2,x6,lo       // x6, w6, is zero at this point
2328c0855eaaSJohn Baldwin	aese	v0.16b,v17.16b
2329c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2330c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
2331c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2332c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
2333c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2334c0855eaaSJohn Baldwin	eor	v17.16b,v9.16b,v7.16b	// v17 reused as scratch: tweak of lane 2 ^ v7
2335c0855eaaSJohn Baldwin
2336c0855eaaSJohn Baldwin	add	x6,x6,#0x20
2337c0855eaaSJohn Baldwin	add	x0,x0,x6	// reposition x0 for the load into v27 below
2338c0855eaaSJohn Baldwin	mov	x7,x3	// rewind the round-key pointer to the start of the schedule
2339c0855eaaSJohn Baldwin
2340c0855eaaSJohn Baldwin	aese	v0.16b,v20.16b
2341c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2342c0855eaaSJohn Baldwin	aese	v1.16b,v20.16b
2343c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2344c0855eaaSJohn Baldwin	aese	v24.16b,v20.16b
2345c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2346c0855eaaSJohn Baldwin	aese	v0.16b,v21.16b
2347c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2348c0855eaaSJohn Baldwin	aese	v1.16b,v21.16b
2349c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2350c0855eaaSJohn Baldwin	aese	v24.16b,v21.16b
2351c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2352c0855eaaSJohn Baldwin	aese	v0.16b,v22.16b
2353c0855eaaSJohn Baldwin	aesmc	v0.16b,v0.16b
2354c0855eaaSJohn Baldwin	aese	v1.16b,v22.16b
2355c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2356c0855eaaSJohn Baldwin	aese	v24.16b,v22.16b
2357c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2358c0855eaaSJohn Baldwin	aese	v0.16b,v23.16b	// last aese of each lane; no aesmc follows
2359c0855eaaSJohn Baldwin	aese	v1.16b,v23.16b
2360c0855eaaSJohn Baldwin	aese	v24.16b,v23.16b
2361c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
2362c0855eaaSJohn Baldwin	add	w6,w5,#2	// reset the round counter for a following tail pass
2363c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16                // re-pre-load rndkey[0]
2364c0855eaaSJohn Baldwin	eor	v4.16b,v4.16b,v0.16b
2365c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
2366c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v17.16b
2367c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16                // re-pre-load rndkey[1]
2368c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
2369c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
2370c0855eaaSJohn Baldwin	st1	{v24.16b},[x1],#16
2371c0855eaaSJohn Baldwin	cmn	x2,#0x30	// x2 == -0x30: no whole block left after these three
2372c0855eaaSJohn Baldwin	b.eq	.Lxts_enc_done
// .Lxts_encxor_one copies v3/v27 into v28/v29, the registers the inner tail
// reads.  It is only reached by falling through from the code above it.
2373c0855eaaSJohn Baldwin.Lxts_encxor_one:
2374c0855eaaSJohn Baldwin	orr	v28.16b,v3.16b,v3.16b
2375c0855eaaSJohn Baldwin	orr	v29.16b,v27.16b,v27.16b
2376c0855eaaSJohn Baldwin	nop
2377c0855eaaSJohn Baldwin
// .Lxts_inner_enc_tail: encrypt the last one or two whole blocks in lanes
// v1 and v24.  x2 == -0x10 selects the two-block case (v28 with tweak v6,
// v29 with tweak v8).  Otherwise only lane v24 is meaningful and v29 is
// paired with tweak v6.
2378c0855eaaSJohn Baldwin.Lxts_inner_enc_tail:
2379c0855eaaSJohn Baldwin	cmn	x2,#0x10
2380c0855eaaSJohn Baldwin	eor	v1.16b,v28.16b,v6.16b
2381c0855eaaSJohn Baldwin	eor	v24.16b,v29.16b,v8.16b
2382c0855eaaSJohn Baldwin	b.eq	.Lxts_enc_tail_loop	// two blocks: keep both lanes as set up
2383c0855eaaSJohn Baldwin	eor	v24.16b,v29.16b,v6.16b	// one block: redo lane v24 with tweak v6
2384c0855eaaSJohn Baldwin.Lxts_enc_tail_loop:
2385c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
2386c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2387c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
2388c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2389c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
2390c0855eaaSJohn Baldwin	subs	w6,w6,#2	// two rounds are consumed per pass
2391c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
2392c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2393c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
2394c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2395c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
2396c0855eaaSJohn Baldwin	b.gt	.Lxts_enc_tail_loop	// flags come from the subs on w6 above
2397c0855eaaSJohn Baldwin
2398c0855eaaSJohn Baldwin	aese	v1.16b,v16.16b
2399c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2400c0855eaaSJohn Baldwin	aese	v24.16b,v16.16b
2401c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2402c0855eaaSJohn Baldwin	aese	v1.16b,v17.16b
2403c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2404c0855eaaSJohn Baldwin	aese	v24.16b,v17.16b
2405c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2406c0855eaaSJohn Baldwin	aese	v1.16b,v20.16b
2407c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2408c0855eaaSJohn Baldwin	aese	v24.16b,v20.16b
2409c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2410c0855eaaSJohn Baldwin	cmn	x2,#0x20	// Z set when x2 == -0x20 (single block); consumed by the b.eq below
2411c0855eaaSJohn Baldwin	aese	v1.16b,v21.16b
2412c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2413c0855eaaSJohn Baldwin	aese	v24.16b,v21.16b
2414c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2415c0855eaaSJohn Baldwin	eor	v5.16b,v6.16b,v7.16b	// v5 = tweak v6 ^ v7
2416c0855eaaSJohn Baldwin	aese	v1.16b,v22.16b
2417c0855eaaSJohn Baldwin	aesmc	v1.16b,v1.16b
2418c0855eaaSJohn Baldwin	aese	v24.16b,v22.16b
2419c0855eaaSJohn Baldwin	aesmc	v24.16b,v24.16b
2420c0855eaaSJohn Baldwin	eor	v17.16b,v8.16b,v7.16b	// v17 reused as scratch: tweak v8 ^ v7
2421c0855eaaSJohn Baldwin	aese	v1.16b,v23.16b	// last aese of each lane; no aesmc follows
2422c0855eaaSJohn Baldwin	aese	v24.16b,v23.16b
2423c0855eaaSJohn Baldwin	b.eq	.Lxts_enc_one
	// Two-block case: store both results, then derive the following tweak
	// from v8 and leave it in v6 for .Lxts_enc_done.
2424c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
2425c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
2426c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v24.16b
2427c0855eaaSJohn Baldwin	orr	v6.16b,v8.16b,v8.16b
2428c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
2429c0855eaaSJohn Baldwin	fmov	x9,d8
2430c0855eaaSJohn Baldwin	fmov	x10,v8.d[1]
2431c0855eaaSJohn Baldwin	mov	w19,#0x87
2432c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32	// w22 = upper word of x10, so bit 31 is the top tweak bit
2433c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63	// x10 = (x10 << 1) | (x9 >> 63)
2434c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31	// w11 = 0x87 if the top tweak bit was set, else 0
2435c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1	// x9 = (x9 << 1) ^ w11
2436c0855eaaSJohn Baldwin	fmov	d6,x9
2437c0855eaaSJohn Baldwin	fmov	v6.d[1],x10
2438c0855eaaSJohn Baldwin	b	.Lxts_enc_done
2439c0855eaaSJohn Baldwin
// .Lxts_enc_one: single-block finish of the inner tail.  Lane v24 is
// combined with v5 (tweak v6 ^ v7, prepared by the inner tail) and stored.
// The tweak in v6 is then advanced one step and left in v6 for
// .Lxts_enc_done.
2440c0855eaaSJohn Baldwin.Lxts_enc_one:
2441c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v24.16b
2442c0855eaaSJohn Baldwin	orr	v6.16b,v6.16b,v6.16b	// copies v6 onto itself; the value is unchanged
2443c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
2444c0855eaaSJohn Baldwin	fmov	x9,d6
2445c0855eaaSJohn Baldwin	fmov	x10,v6.d[1]
2446c0855eaaSJohn Baldwin	mov	w19,#0x87
2447c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32	// w22 = upper word of x10, so bit 31 is the top tweak bit
2448c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63	// x10 = (x10 << 1) | (x9 >> 63)
2449c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31	// w11 = 0x87 if the top tweak bit was set, else 0
2450c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1	// x9 = (x9 << 1) ^ w11
2451c0855eaaSJohn Baldwin	fmov	d6,x9
2452c0855eaaSJohn Baldwin	fmov	v6.d[1],x10
2453c0855eaaSJohn Baldwin	b	.Lxts_enc_done
// .Lxts_enc_done: all whole blocks have been written.  If the low four bits
// of x21 are zero, go straight to the epilogue.  Otherwise do ciphertext
// stealing: swap the x21 trailing input bytes with the head of the last
// output block, then encrypt that rebuilt block in place with tweak v6.
// NOTE(review): x21 presumably holds the input length modulo 16, as it
// does in the decrypt routine; it is set before this chunk - confirm.
2454c0855eaaSJohn Baldwin.align	5
2455c0855eaaSJohn Baldwin.Lxts_enc_done:
2456c0855eaaSJohn Baldwin	// Process the tail block with cipher stealing.
2457c0855eaaSJohn Baldwin	tst	x21,#0xf
2458c0855eaaSJohn Baldwin	b.eq	.Lxts_abort	// no partial block: restore registers and return
2459c0855eaaSJohn Baldwin
2460c0855eaaSJohn Baldwin	mov	x20,x0	// x20 = address of the trailing input bytes
2461c0855eaaSJohn Baldwin	mov	x13,x1	// x13 = address where the partial output goes
2462c0855eaaSJohn Baldwin	sub	x1,x1,#16	// x1 = last whole output block
	// Walk the byte index x21-1 down to 0.  Each step moves one byte of
	// the last output block to the partial output and replaces it with
	// the matching trailing input byte.
2463c0855eaaSJohn Baldwin.composite_enc_loop:
2464c0855eaaSJohn Baldwin	subs	x21,x21,#1
2465c0855eaaSJohn Baldwin	ldrb	w15,[x1,x21]
2466c0855eaaSJohn Baldwin	ldrb	w14,[x20,x21]
2467c0855eaaSJohn Baldwin	strb	w15,[x13,x21]
2468c0855eaaSJohn Baldwin	strb	w14,[x1,x21]
2469c0855eaaSJohn Baldwin	b.gt	.composite_enc_loop	// flags come from the subs above
2470c0855eaaSJohn Baldwin.Lxts_enc_load_done:
2471c0855eaaSJohn Baldwin	ld1	{v26.16b},[x1]	// the rebuilt (composite) block
2472c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v6.16b	// XOR with the tweak before encryption
2473c0855eaaSJohn Baldwin
2474c0855eaaSJohn Baldwin	// Encrypt the composite block to get the second-to-last ciphertext block
2475c0855eaaSJohn Baldwin	ldr	w6,[x3,#240]		// round count, stored at offset 240 of the key schedule
2476c0855eaaSJohn Baldwin	ld1	{v0.4s},[x3],#16
2477c0855eaaSJohn Baldwin	sub	w6,w6,#2
2478c0855eaaSJohn Baldwin	ld1	{v1.4s},[x3],#16		// load key schedule...
2479c0855eaaSJohn Baldwin.Loop_final_enc:
2480c0855eaaSJohn Baldwin	aese	v26.16b,v0.16b
2481c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2482c0855eaaSJohn Baldwin	ld1	{v0.4s},[x3],#16
2483c0855eaaSJohn Baldwin	subs	w6,w6,#2	// two rounds are consumed per pass
2484c0855eaaSJohn Baldwin	aese	v26.16b,v1.16b
2485c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2486c0855eaaSJohn Baldwin	ld1	{v1.4s},[x3],#16
2487c0855eaaSJohn Baldwin	b.gt	.Loop_final_enc
2488c0855eaaSJohn Baldwin
2489c0855eaaSJohn Baldwin	aese	v26.16b,v0.16b
2490c0855eaaSJohn Baldwin	aesmc	v26.16b,v26.16b
2491c0855eaaSJohn Baldwin	ld1	{v0.4s},[x3]	// last round key
2492c0855eaaSJohn Baldwin	aese	v26.16b,v1.16b	// last aese; no aesmc follows
2493c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v0.16b
2494c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v6.16b	// XOR with the tweak after encryption
2495c0855eaaSJohn Baldwin	st1	{v26.16b},[x1]
2496c0855eaaSJohn Baldwin
	// Epilogue: reload x19-x22 and d8-d11 from the 64-byte frame and
	// release it.
2497c0855eaaSJohn Baldwin.Lxts_abort:
2498c0855eaaSJohn Baldwin	ldp	x21,x22,[sp,#48]
2499c0855eaaSJohn Baldwin	ldp	d8,d9,[sp,#32]
2500c0855eaaSJohn Baldwin	ldp	d10,d11,[sp,#16]
2501c0855eaaSJohn Baldwin	ldp	x19,x20,[sp],#64
2502c0855eaaSJohn Baldwin.Lxts_enc_final_abort:
2503c0855eaaSJohn Baldwin	ret
2504c0855eaaSJohn Baldwin.size	aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
// aes_v8_xts_decrypt: AES-XTS decryption entry point.
// Register use visible in this routine:
//   x0 = input pointer, x1 = output pointer, x2 = length in bytes,
//   x3 = key schedule for the data (round count at offset 240),
//   x4 = key schedule used to encrypt the IV (round count at offset 240),
//   x5 = pointer to the 16-byte IV.
// A length of exactly 16 bytes is handled inline below and saves no
// registers.  Every other length branches to .Lxts_dec_big_size.
2505c0855eaaSJohn Baldwin.globl	aes_v8_xts_decrypt
2506c0855eaaSJohn Baldwin.type	aes_v8_xts_decrypt,%function
2507c0855eaaSJohn Baldwin.align	5
2508c0855eaaSJohn Baldwinaes_v8_xts_decrypt:
2509bd9588bcAndrew Turner	AARCH64_VALID_CALL_TARGET
2510c0855eaaSJohn Baldwin	cmp	x2,#16
2511c0855eaaSJohn Baldwin	// Any input size other than exactly 16 bytes jumps to the general path.
2512c0855eaaSJohn Baldwin	b.ne	.Lxts_dec_big_size
2513c0855eaaSJohn Baldwin	// Encrypt the iv with key2, as the first XEX iv.
2514c0855eaaSJohn Baldwin	ldr	w6,[x4,#240]	// round count of key2
2515c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
2516c0855eaaSJohn Baldwin	ld1	{v6.16b},[x5]	// v6 = IV, becomes the tweak
2517c0855eaaSJohn Baldwin	sub	w6,w6,#2
2518c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
2519c0855eaaSJohn Baldwin
2520c0855eaaSJohn Baldwin.Loop_dec_small_iv_enc:
2521c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
2522c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
2523c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
2524c0855eaaSJohn Baldwin	subs	w6,w6,#2	// two rounds are consumed per pass
2525c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b
2526c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
2527c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
2528c0855eaaSJohn Baldwin	b.gt	.Loop_dec_small_iv_enc
2529c0855eaaSJohn Baldwin
2530c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
2531c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
2532c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4]	// last round key of key2
2533c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b	// last aese; no aesmc follows
2534c0855eaaSJohn Baldwin	eor	v6.16b,v6.16b,v0.16b	// v6 = encrypted IV = tweak
2535c0855eaaSJohn Baldwin
2536c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0]	// the single input block
2537c0855eaaSJohn Baldwin	eor	v0.16b,v6.16b,v0.16b	// XOR with the tweak before decryption
2538c0855eaaSJohn Baldwin
2539c0855eaaSJohn Baldwin	ldr	w6,[x3,#240]	// round count of key1
2540c0855eaaSJohn Baldwin	ld1	{v28.4s,v29.4s},[x3],#32			// load key schedule...
2541c0855eaaSJohn Baldwin
2542c0855eaaSJohn Baldwin	aesd	v0.16b,v28.16b
2543c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2544c0855eaaSJohn Baldwin	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
2545c0855eaaSJohn Baldwin	aesd	v0.16b,v29.16b
2546c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2547c0855eaaSJohn Baldwin	subs	w6,w6,#10			// bias
2548c0855eaaSJohn Baldwin	b.eq	.Lxts_128_dec	// exactly 10 rounds: skip the extra-round loop
	// More than 10 rounds: apply two extra rounds per pass.
2549c0855eaaSJohn Baldwin.Lxts_dec_round_loop:
2550c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
2551c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2552c0855eaaSJohn Baldwin	ld1	{v16.4s},[x3],#16			// load key schedule...
2553c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
2554c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2555c0855eaaSJohn Baldwin	ld1	{v17.4s},[x3],#16			// load key schedule...
2556c0855eaaSJohn Baldwin	subs	w6,w6,#2			// bias
2557c0855eaaSJohn Baldwin	b.gt	.Lxts_dec_round_loop
	// Final eight rounds (v16-v23), then the last round key in v7.
2558c0855eaaSJohn Baldwin.Lxts_128_dec:
2559c0855eaaSJohn Baldwin	ld1	{v18.4s,v19.4s},[x3],#32			// load key schedule...
2560c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
2561c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2562c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
2563c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2564c0855eaaSJohn Baldwin	ld1	{v20.4s,v21.4s},[x3],#32			// load key schedule...
2565c0855eaaSJohn Baldwin	aesd	v0.16b,v18.16b
2566c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2567c0855eaaSJohn Baldwin	aesd	v0.16b,v19.16b
2568c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2569c0855eaaSJohn Baldwin	ld1	{v22.4s,v23.4s},[x3],#32			// load key schedule...
2570c0855eaaSJohn Baldwin	aesd	v0.16b,v20.16b
2571c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2572c0855eaaSJohn Baldwin	aesd	v0.16b,v21.16b
2573c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2574c0855eaaSJohn Baldwin	ld1	{v7.4s},[x3]	// last round key
2575c0855eaaSJohn Baldwin	aesd	v0.16b,v22.16b
2576c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2577c0855eaaSJohn Baldwin	aesd	v0.16b,v23.16b	// last aesd; no aesimc follows
2578c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v7.16b
2579c0855eaaSJohn Baldwin	eor	v0.16b,v6.16b,v0.16b	// XOR with the tweak after decryption
2580c0855eaaSJohn Baldwin	st1	{v0.16b},[x1]
2581c0855eaaSJohn Baldwin	b	.Lxts_dec_final_abort
// .Lxts_dec_big_size: general decrypt path.  Saves x19-x22 and d8-d11 in a
// 64-byte frame and splits the length into x21 (length & 15) and x2 (whole
// block bytes minus 16).  It then encrypts the IV with the key at x4 to get
// the first tweak in v6, derives tweaks two to four (v8, v9, v10), and
// preloads the round keys used by the block loops.
2582c0855eaaSJohn Baldwin.Lxts_dec_big_size:
2583c0855eaaSJohn Baldwin	stp	x19,x20,[sp,#-64]!
2584c0855eaaSJohn Baldwin	stp	x21,x22,[sp,#48]
2585c0855eaaSJohn Baldwin	stp	d8,d9,[sp,#32]
2586c0855eaaSJohn Baldwin	stp	d10,d11,[sp,#16]
2587c0855eaaSJohn Baldwin
2588c0855eaaSJohn Baldwin	and	x21,x2,#0xf	// x21 = bytes in the trailing partial block
2589c0855eaaSJohn Baldwin	and	x2,x2,#-16	// x2 = length rounded down to whole blocks
2590c0855eaaSJohn Baldwin	subs	x2,x2,#16
2591c0855eaaSJohn Baldwin	mov	x8,#16	// x8 = post-increment step for the first load in .Lxts_dec_begin
2592c0855eaaSJohn Baldwin	b.lo	.Lxts_dec_abort	// borrow from the subs above: not even one whole block
2593c0855eaaSJohn Baldwin
2594c0855eaaSJohn Baldwin	// Encrypt the iv with key2, as the first XEX iv
2595c0855eaaSJohn Baldwin	ldr	w6,[x4,#240]	// round count of key2
2596c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
2597c0855eaaSJohn Baldwin	ld1	{v6.16b},[x5]	// v6 = IV, becomes the first tweak
2598c0855eaaSJohn Baldwin	sub	w6,w6,#2
2599c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
2600c0855eaaSJohn Baldwin
2601c0855eaaSJohn Baldwin.Loop_dec_iv_enc:
2602c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
2603c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
2604c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4],#16
2605c0855eaaSJohn Baldwin	subs	w6,w6,#2	// two rounds are consumed per pass
2606c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b
2607c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
2608c0855eaaSJohn Baldwin	ld1	{v1.4s},[x4],#16
2609c0855eaaSJohn Baldwin	b.gt	.Loop_dec_iv_enc
2610c0855eaaSJohn Baldwin
2611c0855eaaSJohn Baldwin	aese	v6.16b,v0.16b
2612c0855eaaSJohn Baldwin	aesmc	v6.16b,v6.16b
2613c0855eaaSJohn Baldwin	ld1	{v0.4s},[x4]	// last round key of key2
2614c0855eaaSJohn Baldwin	aese	v6.16b,v1.16b	// last aese; no aesmc follows
2615c0855eaaSJohn Baldwin	eor	v6.16b,v6.16b,v0.16b	// v6 = encrypted IV = first tweak
2616c0855eaaSJohn Baldwin
2617c0855eaaSJohn Baldwin	// The iv for second block
2618c0855eaaSJohn Baldwin	// x9- iv(low), x10 - iv(high)
2619c0855eaaSJohn Baldwin	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	// Each step below shifts the 128-bit value x10:x9 left by one bit and
	// XORs 0x87 into the low byte when a bit is shifted out of the top.
2620c0855eaaSJohn Baldwin	fmov	x9,d6
2621c0855eaaSJohn Baldwin	fmov	x10,v6.d[1]
2622c0855eaaSJohn Baldwin	mov	w19,#0x87
2623c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32	// w22 = upper word of x10, so bit 31 is the top tweak bit
2624c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63	// x10 = (x10 << 1) | (x9 >> 63)
2625c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31	// w11 = 0x87 if the top tweak bit was set, else 0
2626c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1	// x9 = (x9 << 1) ^ w11
2627c0855eaaSJohn Baldwin	fmov	d8,x9
2628c0855eaaSJohn Baldwin	fmov	v8.d[1],x10
2629c0855eaaSJohn Baldwin
2630c0855eaaSJohn Baldwin	ldr	w5,[x3,#240]		// load rounds number (x5 is reused; the IV was already read)
2631c0855eaaSJohn Baldwin
2632c0855eaaSJohn Baldwin	// The iv for third block
2633c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2634c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2635c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2636c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2637c0855eaaSJohn Baldwin	fmov	d9,x9
2638c0855eaaSJohn Baldwin	fmov	v9.d[1],x10
2639c0855eaaSJohn Baldwin
2640c0855eaaSJohn Baldwin	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
2641c0855eaaSJohn Baldwin	sub	w5,w5,#6
2642c0855eaaSJohn Baldwin	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
2643c0855eaaSJohn Baldwin	sub	w5,w5,#2	// w5 = rounds - 8, the inner-loop round count
2644c0855eaaSJohn Baldwin	ld1	{v18.4s,v19.4s},[x7],#32		// load key schedule...
2645c0855eaaSJohn Baldwin	ld1	{v20.4s,v21.4s},[x7],#32
2646c0855eaaSJohn Baldwin	ld1	{v22.4s,v23.4s},[x7],#32
2647c0855eaaSJohn Baldwin	ld1	{v7.4s},[x7]	// v7 = last round key
2648c0855eaaSJohn Baldwin
2649c0855eaaSJohn Baldwin	// The iv for fourth block
2650c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2651c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2652c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2653c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2654c0855eaaSJohn Baldwin	fmov	d10,x9
2655c0855eaaSJohn Baldwin	fmov	v10.d[1],x10
2656c0855eaaSJohn Baldwin
2657c0855eaaSJohn Baldwin	add	x7,x3,#32	// x7 = third round key (v16/v17 already hold the first two)
2658c0855eaaSJohn Baldwin	mov	w6,w5
2659c0855eaaSJohn Baldwin	b	.Lxts_dec
2660c0855eaaSJohn Baldwin
2661c0855eaaSJohn Baldwin	// Decryption
	// .Lxts_dec: if there is a trailing partial block (x21 & 15 != 0),
	// take a further 16 bytes off x2 before choosing a path.  If that
	// subtraction borrows, go to .Lxts_done (outside this chunk) with the
	// first block already loaded in v0.
	// .Lxts_dec_begin loads the first blocks and dispatches on how many
	// remain: the inner tail, the outer tail, or the five-block loop.
2662c0855eaaSJohn Baldwin.align	5
2663c0855eaaSJohn Baldwin.Lxts_dec:
2664c0855eaaSJohn Baldwin	tst	x21,#0xf
2665c0855eaaSJohn Baldwin	b.eq	.Lxts_dec_begin
2666c0855eaaSJohn Baldwin	subs	x2,x2,#16	// these flags feed both the csel and the b.lo below
2667c0855eaaSJohn Baldwin	csel	x8,xzr,x8,eq	// x2 == 0: use a zero step, so v0 and v24 below load the same block
2668c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0],#16
2669c0855eaaSJohn Baldwin	b.lo	.Lxts_done
2670c0855eaaSJohn Baldwin	sub	x0,x0,#16	// undo the post-increment of the load above
2671c0855eaaSJohn Baldwin.Lxts_dec_begin:
2672c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0],x8	// first block; x0 advances by x8 (16 or 0)
2673c0855eaaSJohn Baldwin	subs	x2,x2,#32			// bias
2674c0855eaaSJohn Baldwin	add	w6,w5,#2	// round counter for the tail loops
2675c0855eaaSJohn Baldwin	orr	v3.16b,v0.16b,v0.16b	// orr vA,vB,vB is a plain register copy
2676c0855eaaSJohn Baldwin	orr	v1.16b,v0.16b,v0.16b
2677c0855eaaSJohn Baldwin	orr	v28.16b,v0.16b,v0.16b
2678c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16	// second block
2679c0855eaaSJohn Baldwin	orr	v27.16b,v24.16b,v24.16b
2680c0855eaaSJohn Baldwin	orr	v29.16b,v24.16b,v24.16b
2681c0855eaaSJohn Baldwin	b.lo	.Lxts_inner_dec_tail	// borrow from the subs above (the loads and copies leave flags alone)
2682c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v6.16b			// before decrypt, xor with iv
2683c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v8.16b
2684c0855eaaSJohn Baldwin
2685c0855eaaSJohn Baldwin	orr	v1.16b,v24.16b,v24.16b	// lane v1 = second block ^ second tweak
2686c0855eaaSJohn Baldwin	ld1	{v24.16b},[x0],#16	// third block
2687c0855eaaSJohn Baldwin	orr	v2.16b,v0.16b,v0.16b
2688c0855eaaSJohn Baldwin	orr	v3.16b,v1.16b,v1.16b
2689c0855eaaSJohn Baldwin	eor	v27.16b,v24.16b,v9.16b			// third block xor with third iv
2690c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v9.16b
2691c0855eaaSJohn Baldwin	cmp	x2,#32
2692c0855eaaSJohn Baldwin	b.lo	.Lxts_outer_dec_tail	// x2 < 32: take the outer tail rather than the five-block loop
2693c0855eaaSJohn Baldwin
2694c0855eaaSJohn Baldwin	ld1	{v25.16b},[x0],#16	// fourth block
2695c0855eaaSJohn Baldwin
2696c0855eaaSJohn Baldwin	// The iv for fifth block
2697c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32	// w22 = upper word of x10, so bit 31 is the top tweak bit
2698c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63	// x10 = (x10 << 1) | (x9 >> 63)
2699c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31	// w11 = w19 if the top tweak bit was set, else 0
2700c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1	// x9 = (x9 << 1) ^ w11
2701c0855eaaSJohn Baldwin	fmov	d11,x9
2702c0855eaaSJohn Baldwin	fmov	v11.d[1],x10
2703c0855eaaSJohn Baldwin
2704c0855eaaSJohn Baldwin	ld1	{v26.16b},[x0],#16	// fifth block
2705c0855eaaSJohn Baldwin	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2706c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v11.16b
2707c0855eaaSJohn Baldwin	sub	x2,x2,#32			// bias
2708c0855eaaSJohn Baldwin	mov	w6,w5	// inner-loop round counter for the five-block loop
2709c0855eaaSJohn Baldwin	b	.Loop5x_xts_dec
2710c0855eaaSJohn Baldwin
2711c0855eaaSJohn Baldwin.align	4
2712c0855eaaSJohn Baldwin.Loop5x_xts_dec:
2713c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
2714c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2715c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
2716c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2717c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
2718c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2719c0855eaaSJohn Baldwin	aesd	v25.16b,v16.16b
2720c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2721c0855eaaSJohn Baldwin	aesd	v26.16b,v16.16b
2722c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2723c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16		// load key schedule...
2724c0855eaaSJohn Baldwin	subs	w6,w6,#2
2725c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
2726c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2727c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
2728c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2729c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
2730c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2731c0855eaaSJohn Baldwin	aesd	v25.16b,v17.16b
2732c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2733c0855eaaSJohn Baldwin	aesd	v26.16b,v17.16b
2734c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2735c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16		// load key schedule...
2736c0855eaaSJohn Baldwin	b.gt	.Loop5x_xts_dec
2737c0855eaaSJohn Baldwin
2738c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
2739c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2740c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
2741c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2742c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
2743c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2744c0855eaaSJohn Baldwin	aesd	v25.16b,v16.16b
2745c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2746c0855eaaSJohn Baldwin	aesd	v26.16b,v16.16b
2747c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2748c0855eaaSJohn Baldwin	subs	x2,x2,#0x50			// because .Lxts_dec_tail4x
2749c0855eaaSJohn Baldwin
2750c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
2751c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2752c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
2753c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2754c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
2755c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2756c0855eaaSJohn Baldwin	aesd	v25.16b,v17.16b
2757c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2758c0855eaaSJohn Baldwin	aesd	v26.16b,v17.16b
2759c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2760c0855eaaSJohn Baldwin	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2761c0855eaaSJohn Baldwin	mov	x7,x3
2762c0855eaaSJohn Baldwin
2763c0855eaaSJohn Baldwin	aesd	v0.16b,v18.16b
2764c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2765c0855eaaSJohn Baldwin	aesd	v1.16b,v18.16b
2766c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2767c0855eaaSJohn Baldwin	aesd	v24.16b,v18.16b
2768c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2769c0855eaaSJohn Baldwin	aesd	v25.16b,v18.16b
2770c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2771c0855eaaSJohn Baldwin	aesd	v26.16b,v18.16b
2772c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2773c0855eaaSJohn Baldwin	add	x0,x0,x6		// x0 is adjusted in such way that
2774c0855eaaSJohn Baldwin						// at exit from the loop v1.16b-v26.16b
2775c0855eaaSJohn Baldwin						// are loaded with last "words"
2776c0855eaaSJohn Baldwin	add	x6,x2,#0x60		// because .Lxts_dec_tail4x
2777c0855eaaSJohn Baldwin
2778c0855eaaSJohn Baldwin	aesd	v0.16b,v19.16b
2779c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2780c0855eaaSJohn Baldwin	aesd	v1.16b,v19.16b
2781c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2782c0855eaaSJohn Baldwin	aesd	v24.16b,v19.16b
2783c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2784c0855eaaSJohn Baldwin	aesd	v25.16b,v19.16b
2785c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2786c0855eaaSJohn Baldwin	aesd	v26.16b,v19.16b
2787c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2788c0855eaaSJohn Baldwin
2789c0855eaaSJohn Baldwin	aesd	v0.16b,v20.16b
2790c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2791c0855eaaSJohn Baldwin	aesd	v1.16b,v20.16b
2792c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2793c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
2794c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2795c0855eaaSJohn Baldwin	aesd	v25.16b,v20.16b
2796c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2797c0855eaaSJohn Baldwin	aesd	v26.16b,v20.16b
2798c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2799c0855eaaSJohn Baldwin
2800c0855eaaSJohn Baldwin	aesd	v0.16b,v21.16b
2801c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2802c0855eaaSJohn Baldwin	aesd	v1.16b,v21.16b
2803c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2804c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
2805c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2806c0855eaaSJohn Baldwin	aesd	v25.16b,v21.16b
2807c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2808c0855eaaSJohn Baldwin	aesd	v26.16b,v21.16b
2809c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2810c0855eaaSJohn Baldwin
2811c0855eaaSJohn Baldwin	aesd	v0.16b,v22.16b
2812c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2813c0855eaaSJohn Baldwin	aesd	v1.16b,v22.16b
2814c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2815c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
2816c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2817c0855eaaSJohn Baldwin	aesd	v25.16b,v22.16b
2818c0855eaaSJohn Baldwin	aesimc	v25.16b,v25.16b
2819c0855eaaSJohn Baldwin	aesd	v26.16b,v22.16b
2820c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
2821c0855eaaSJohn Baldwin
2822c0855eaaSJohn Baldwin	eor	v4.16b,v7.16b,v6.16b
2823c0855eaaSJohn Baldwin	aesd	v0.16b,v23.16b
2824c0855eaaSJohn Baldwin	// The iv for first block of next iteration.
2825c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2826c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2827c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2828c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2829c0855eaaSJohn Baldwin	fmov	d6,x9
2830c0855eaaSJohn Baldwin	fmov	v6.d[1],x10
2831c0855eaaSJohn Baldwin	eor	v5.16b,v7.16b,v8.16b
2832c0855eaaSJohn Baldwin	ld1	{v2.16b},[x0],#16
2833c0855eaaSJohn Baldwin	aesd	v1.16b,v23.16b
2834c0855eaaSJohn Baldwin	// The iv for second block
2835c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2836c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2837c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2838c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2839c0855eaaSJohn Baldwin	fmov	d8,x9
2840c0855eaaSJohn Baldwin	fmov	v8.d[1],x10
2841c0855eaaSJohn Baldwin	eor	v17.16b,v7.16b,v9.16b
2842c0855eaaSJohn Baldwin	ld1	{v3.16b},[x0],#16
2843c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
2844c0855eaaSJohn Baldwin	// The iv for third block
2845c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2846c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2847c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2848c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2849c0855eaaSJohn Baldwin	fmov	d9,x9
2850c0855eaaSJohn Baldwin	fmov	v9.d[1],x10
2851c0855eaaSJohn Baldwin	eor	v30.16b,v7.16b,v10.16b
2852c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
2853c0855eaaSJohn Baldwin	aesd	v25.16b,v23.16b
2854c0855eaaSJohn Baldwin	// The iv for fourth block
2855c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2856c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2857c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2858c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2859c0855eaaSJohn Baldwin	fmov	d10,x9
2860c0855eaaSJohn Baldwin	fmov	v10.d[1],x10
2861c0855eaaSJohn Baldwin	eor	v31.16b,v7.16b,v11.16b
2862c0855eaaSJohn Baldwin	ld1	{v28.16b},[x0],#16
2863c0855eaaSJohn Baldwin	aesd	v26.16b,v23.16b
2864c0855eaaSJohn Baldwin
2865c0855eaaSJohn Baldwin	// The iv for fifth block
2866c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2867c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2868c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2869c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2870c0855eaaSJohn Baldwin	fmov	d11,x9
2871c0855eaaSJohn Baldwin	fmov	v11.d[1],x10
2872c0855eaaSJohn Baldwin
2873c0855eaaSJohn Baldwin	ld1	{v29.16b},[x0],#16
2874c0855eaaSJohn Baldwin	cbz	x6,.Lxts_dec_tail4x
2875c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2876c0855eaaSJohn Baldwin	eor	v4.16b,v4.16b,v0.16b
2877c0855eaaSJohn Baldwin	eor	v0.16b,v2.16b,v6.16b
2878c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
2879c0855eaaSJohn Baldwin	eor	v1.16b,v3.16b,v8.16b
2880c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v24.16b
2881c0855eaaSJohn Baldwin	eor	v24.16b,v27.16b,v9.16b
2882c0855eaaSJohn Baldwin	eor	v30.16b,v30.16b,v25.16b
2883c0855eaaSJohn Baldwin	eor	v25.16b,v28.16b,v10.16b
2884c0855eaaSJohn Baldwin	eor	v31.16b,v31.16b,v26.16b
2885c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
2886c0855eaaSJohn Baldwin	eor	v26.16b,v29.16b,v11.16b
2887c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
2888c0855eaaSJohn Baldwin	mov	w6,w5
2889c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
2890c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2891c0855eaaSJohn Baldwin	st1	{v30.16b},[x1],#16
2892c0855eaaSJohn Baldwin	st1	{v31.16b},[x1],#16
2893c0855eaaSJohn Baldwin	b.hs	.Loop5x_xts_dec
2894c0855eaaSJohn Baldwin
2895c0855eaaSJohn Baldwin	cmn	x2,#0x10
2896c0855eaaSJohn Baldwin	b.ne	.Loop5x_dec_after
2897c0855eaaSJohn Baldwin	// If x2 is equal to -0x10, there are 4 blocks left.
2898c0855eaaSJohn Baldwin	// After special handling, reuse the five-block processing loop.
2899c0855eaaSJohn Baldwin	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
2900c0855eaaSJohn Baldwin	orr	v11.16b,v10.16b,v10.16b
2901c0855eaaSJohn Baldwin	orr	v10.16b,v9.16b,v9.16b
2902c0855eaaSJohn Baldwin	orr	v9.16b,v8.16b,v8.16b
2903c0855eaaSJohn Baldwin	orr	v8.16b,v6.16b,v6.16b
2904c0855eaaSJohn Baldwin	fmov	x9,d11
2905c0855eaaSJohn Baldwin	fmov	x10,v11.d[1]
2906c0855eaaSJohn Baldwin	eor	v0.16b,v6.16b,v2.16b
2907c0855eaaSJohn Baldwin	eor	v1.16b,v8.16b,v3.16b
2908c0855eaaSJohn Baldwin	eor	v24.16b,v27.16b,v9.16b
2909c0855eaaSJohn Baldwin	eor	v25.16b,v28.16b,v10.16b
2910c0855eaaSJohn Baldwin	eor	v26.16b,v29.16b,v11.16b
2911c0855eaaSJohn Baldwin	b.eq	.Loop5x_xts_dec
2912c0855eaaSJohn Baldwin
2913c0855eaaSJohn Baldwin.Loop5x_dec_after:
2914c0855eaaSJohn Baldwin	add	x2,x2,#0x50
2915c0855eaaSJohn Baldwin	cbz	x2,.Lxts_done
2916c0855eaaSJohn Baldwin
2917c0855eaaSJohn Baldwin	add	w6,w5,#2
2918c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
2919c0855eaaSJohn Baldwin	b.lo	.Lxts_inner_dec_tail
2920c0855eaaSJohn Baldwin
2921c0855eaaSJohn Baldwin	eor	v0.16b,v6.16b,v27.16b
2922c0855eaaSJohn Baldwin	eor	v1.16b,v8.16b,v28.16b
2923c0855eaaSJohn Baldwin	eor	v24.16b,v29.16b,v9.16b
2924c0855eaaSJohn Baldwin	b	.Lxts_outer_dec_tail
2925c0855eaaSJohn Baldwin
2926c0855eaaSJohn Baldwin.align	4
2927c0855eaaSJohn Baldwin.Lxts_dec_tail4x:
2928c0855eaaSJohn Baldwin	add	x0,x0,#16
2929c0855eaaSJohn Baldwin	tst	x21,#0xf
2930c0855eaaSJohn Baldwin	eor	v5.16b,v1.16b,v4.16b
2931c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
2932c0855eaaSJohn Baldwin	eor	v17.16b,v24.16b,v17.16b
2933c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
2934c0855eaaSJohn Baldwin	eor	v30.16b,v25.16b,v30.16b
2935c0855eaaSJohn Baldwin	eor	v31.16b,v26.16b,v31.16b
2936c0855eaaSJohn Baldwin	st1	{v30.16b,v31.16b},[x1],#32
2937c0855eaaSJohn Baldwin
2938c0855eaaSJohn Baldwin	b.eq	.Lxts_dec_abort
2939c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0],#16
2940c0855eaaSJohn Baldwin	b	.Lxts_done
2941c0855eaaSJohn Baldwin.align	4
2942c0855eaaSJohn Baldwin.Lxts_outer_dec_tail:
2943c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
2944c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2945c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
2946c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2947c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
2948c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2949c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
2950c0855eaaSJohn Baldwin	subs	w6,w6,#2
2951c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
2952c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2953c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
2954c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2955c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
2956c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2957c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
2958c0855eaaSJohn Baldwin	b.gt	.Lxts_outer_dec_tail
2959c0855eaaSJohn Baldwin
2960c0855eaaSJohn Baldwin	aesd	v0.16b,v16.16b
2961c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2962c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
2963c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2964c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
2965c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2966c0855eaaSJohn Baldwin	eor	v4.16b,v6.16b,v7.16b
2967c0855eaaSJohn Baldwin	subs	x2,x2,#0x30
2968c0855eaaSJohn Baldwin	// The iv for first block
2969c0855eaaSJohn Baldwin	fmov	x9,d9
2970c0855eaaSJohn Baldwin	fmov	x10,v9.d[1]
2971c0855eaaSJohn Baldwin	mov	w19,#0x87
2972c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2973c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2974c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2975c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2976c0855eaaSJohn Baldwin	fmov	d6,x9
2977c0855eaaSJohn Baldwin	fmov	v6.d[1],x10
2978c0855eaaSJohn Baldwin	eor	v5.16b,v8.16b,v7.16b
2979c0855eaaSJohn Baldwin	csel	x6,x2,x6,lo	// x6, w6, is zero at this point
2980c0855eaaSJohn Baldwin	aesd	v0.16b,v17.16b
2981c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
2982c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
2983c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
2984c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
2985c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
2986c0855eaaSJohn Baldwin	eor	v17.16b,v9.16b,v7.16b
2987c0855eaaSJohn Baldwin	// The iv for second block
2988c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
2989c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
2990c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
2991c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
2992c0855eaaSJohn Baldwin	fmov	d8,x9
2993c0855eaaSJohn Baldwin	fmov	v8.d[1],x10
2994c0855eaaSJohn Baldwin
2995c0855eaaSJohn Baldwin	add	x6,x6,#0x20
2996c0855eaaSJohn Baldwin	add	x0,x0,x6		// x0 is adjusted to the last data
2997c0855eaaSJohn Baldwin
2998c0855eaaSJohn Baldwin	mov	x7,x3
2999c0855eaaSJohn Baldwin
3000c0855eaaSJohn Baldwin	// The iv for third block
3001c0855eaaSJohn Baldwin	extr	x22,x10,x10,#32
3002c0855eaaSJohn Baldwin	extr	x10,x10,x9,#63
3003c0855eaaSJohn Baldwin	and	w11,w19,w22,asr #31
3004c0855eaaSJohn Baldwin	eor	x9,x11,x9,lsl #1
3005c0855eaaSJohn Baldwin	fmov	d9,x9
3006c0855eaaSJohn Baldwin	fmov	v9.d[1],x10
3007c0855eaaSJohn Baldwin
3008c0855eaaSJohn Baldwin	aesd	v0.16b,v20.16b
3009c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
3010c0855eaaSJohn Baldwin	aesd	v1.16b,v20.16b
3011c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3012c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
3013c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3014c0855eaaSJohn Baldwin	aesd	v0.16b,v21.16b
3015c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
3016c0855eaaSJohn Baldwin	aesd	v1.16b,v21.16b
3017c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3018c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
3019c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3020c0855eaaSJohn Baldwin	aesd	v0.16b,v22.16b
3021c0855eaaSJohn Baldwin	aesimc	v0.16b,v0.16b
3022c0855eaaSJohn Baldwin	aesd	v1.16b,v22.16b
3023c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3024c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
3025c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3026c0855eaaSJohn Baldwin	ld1	{v27.16b},[x0],#16
3027c0855eaaSJohn Baldwin	aesd	v0.16b,v23.16b
3028c0855eaaSJohn Baldwin	aesd	v1.16b,v23.16b
3029c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
3030c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
3031c0855eaaSJohn Baldwin	add	w6,w5,#2
3032c0855eaaSJohn Baldwin	eor	v4.16b,v4.16b,v0.16b
3033c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
3034c0855eaaSJohn Baldwin	eor	v24.16b,v24.16b,v17.16b
3035c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
3036c0855eaaSJohn Baldwin	st1	{v4.16b},[x1],#16
3037c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
3038c0855eaaSJohn Baldwin	st1	{v24.16b},[x1],#16
3039c0855eaaSJohn Baldwin
3040c0855eaaSJohn Baldwin	cmn	x2,#0x30
3041c0855eaaSJohn Baldwin	add	x2,x2,#0x30
3042c0855eaaSJohn Baldwin	b.eq	.Lxts_done
3043c0855eaaSJohn Baldwin	sub	x2,x2,#0x30
3044c0855eaaSJohn Baldwin	orr	v28.16b,v3.16b,v3.16b
3045c0855eaaSJohn Baldwin	orr	v29.16b,v27.16b,v27.16b
3046c0855eaaSJohn Baldwin	nop
3047c0855eaaSJohn Baldwin
3048c0855eaaSJohn Baldwin.Lxts_inner_dec_tail:
3049c0855eaaSJohn Baldwin	// x2 == -0x10 means two blocks left.
3050c0855eaaSJohn Baldwin	cmn	x2,#0x10
3051c0855eaaSJohn Baldwin	eor	v1.16b,v28.16b,v6.16b
3052c0855eaaSJohn Baldwin	eor	v24.16b,v29.16b,v8.16b
3053c0855eaaSJohn Baldwin	b.eq	.Lxts_dec_tail_loop
3054c0855eaaSJohn Baldwin	eor	v24.16b,v29.16b,v6.16b
3055c0855eaaSJohn Baldwin.Lxts_dec_tail_loop:
3056c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
3057c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3058c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
3059c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3060c0855eaaSJohn Baldwin	ld1	{v16.4s},[x7],#16
3061c0855eaaSJohn Baldwin	subs	w6,w6,#2
3062c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
3063c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3064c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
3065c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3066c0855eaaSJohn Baldwin	ld1	{v17.4s},[x7],#16
3067c0855eaaSJohn Baldwin	b.gt	.Lxts_dec_tail_loop
3068c0855eaaSJohn Baldwin
3069c0855eaaSJohn Baldwin	aesd	v1.16b,v16.16b
3070c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3071c0855eaaSJohn Baldwin	aesd	v24.16b,v16.16b
3072c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3073c0855eaaSJohn Baldwin	aesd	v1.16b,v17.16b
3074c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3075c0855eaaSJohn Baldwin	aesd	v24.16b,v17.16b
3076c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3077c0855eaaSJohn Baldwin	aesd	v1.16b,v20.16b
3078c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3079c0855eaaSJohn Baldwin	aesd	v24.16b,v20.16b
3080c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3081c0855eaaSJohn Baldwin	cmn	x2,#0x20
3082c0855eaaSJohn Baldwin	aesd	v1.16b,v21.16b
3083c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3084c0855eaaSJohn Baldwin	aesd	v24.16b,v21.16b
3085c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3086c0855eaaSJohn Baldwin	eor	v5.16b,v6.16b,v7.16b
3087c0855eaaSJohn Baldwin	aesd	v1.16b,v22.16b
3088c0855eaaSJohn Baldwin	aesimc	v1.16b,v1.16b
3089c0855eaaSJohn Baldwin	aesd	v24.16b,v22.16b
3090c0855eaaSJohn Baldwin	aesimc	v24.16b,v24.16b
3091c0855eaaSJohn Baldwin	eor	v17.16b,v8.16b,v7.16b
3092c0855eaaSJohn Baldwin	aesd	v1.16b,v23.16b
3093c0855eaaSJohn Baldwin	aesd	v24.16b,v23.16b
3094c0855eaaSJohn Baldwin	b.eq	.Lxts_dec_one
3095c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v1.16b
3096c0855eaaSJohn Baldwin	eor	v17.16b,v17.16b,v24.16b
3097c0855eaaSJohn Baldwin	orr	v6.16b,v9.16b,v9.16b
3098c0855eaaSJohn Baldwin	orr	v8.16b,v10.16b,v10.16b
3099c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
3100c0855eaaSJohn Baldwin	st1	{v17.16b},[x1],#16
3101c0855eaaSJohn Baldwin	add	x2,x2,#16
3102c0855eaaSJohn Baldwin	b	.Lxts_done
3103c0855eaaSJohn Baldwin
3104c0855eaaSJohn Baldwin.Lxts_dec_one:
3105c0855eaaSJohn Baldwin	eor	v5.16b,v5.16b,v24.16b
3106c0855eaaSJohn Baldwin	orr	v6.16b,v8.16b,v8.16b
3107c0855eaaSJohn Baldwin	orr	v8.16b,v9.16b,v9.16b
3108c0855eaaSJohn Baldwin	st1	{v5.16b},[x1],#16
3109c0855eaaSJohn Baldwin	add	x2,x2,#32
3110c0855eaaSJohn Baldwin
3111c0855eaaSJohn Baldwin.Lxts_done:
3112c0855eaaSJohn Baldwin	tst	x21,#0xf
3113c0855eaaSJohn Baldwin	b.eq	.Lxts_dec_abort
3114c0855eaaSJohn Baldwin	// Processing the last two blocks with cipher stealing.
3115c0855eaaSJohn Baldwin	mov	x7,x3
3116c0855eaaSJohn Baldwin	cbnz	x2,.Lxts_dec_1st_done
3117c0855eaaSJohn Baldwin	ld1	{v0.16b},[x0],#16
3118c0855eaaSJohn Baldwin
3119c0855eaaSJohn Baldwin	// Decrypt the second-to-last block to get the last plaintext block
3120c0855eaaSJohn Baldwin.Lxts_dec_1st_done:
3121c0855eaaSJohn Baldwin	eor	v26.16b,v0.16b,v8.16b
3122c0855eaaSJohn Baldwin	ldr	w6,[x3,#240]
3123c0855eaaSJohn Baldwin	ld1	{v0.4s},[x3],#16
3124c0855eaaSJohn Baldwin	sub	w6,w6,#2
3125c0855eaaSJohn Baldwin	ld1	{v1.4s},[x3],#16
3126c0855eaaSJohn Baldwin.Loop_final_2nd_dec:
3127c0855eaaSJohn Baldwin	aesd	v26.16b,v0.16b
3128c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
3129c0855eaaSJohn Baldwin	ld1	{v0.4s},[x3],#16		// load key schedule...
3130c0855eaaSJohn Baldwin	subs	w6,w6,#2
3131c0855eaaSJohn Baldwin	aesd	v26.16b,v1.16b
3132c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
3133c0855eaaSJohn Baldwin	ld1	{v1.4s},[x3],#16		// load key schedule...
3134c0855eaaSJohn Baldwin	b.gt	.Loop_final_2nd_dec
3135c0855eaaSJohn Baldwin
3136c0855eaaSJohn Baldwin	aesd	v26.16b,v0.16b
3137c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
3138c0855eaaSJohn Baldwin	ld1	{v0.4s},[x3]
3139c0855eaaSJohn Baldwin	aesd	v26.16b,v1.16b
3140c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v0.16b
3141c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v8.16b
3142c0855eaaSJohn Baldwin	st1	{v26.16b},[x1]
3143c0855eaaSJohn Baldwin
3144c0855eaaSJohn Baldwin	mov	x20,x0
3145c0855eaaSJohn Baldwin	add	x13,x1,#16
3146c0855eaaSJohn Baldwin
3147c0855eaaSJohn Baldwin	// Merge the trailing partial block (the tailcnt bytes that do not fill 16 bytes)
3148c0855eaaSJohn Baldwin	// into the second-to-last plaintext block to form the last encrypted block.
3149c0855eaaSJohn Baldwin.composite_dec_loop:
3150c0855eaaSJohn Baldwin	subs	x21,x21,#1
3151c0855eaaSJohn Baldwin	ldrb	w15,[x1,x21]
3152c0855eaaSJohn Baldwin	ldrb	w14,[x20,x21]
3153c0855eaaSJohn Baldwin	strb	w15,[x13,x21]
3154c0855eaaSJohn Baldwin	strb	w14,[x1,x21]
3155c0855eaaSJohn Baldwin	b.gt	.composite_dec_loop
3156c0855eaaSJohn Baldwin.Lxts_dec_load_done:
3157c0855eaaSJohn Baldwin	ld1	{v26.16b},[x1]
3158c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v6.16b
3159c0855eaaSJohn Baldwin
3160c0855eaaSJohn Baldwin	// Decrypt the composite block to get the second-to-last plaintext block
3161c0855eaaSJohn Baldwin	ldr	w6,[x7,#240]
3162c0855eaaSJohn Baldwin	ld1	{v0.4s},[x7],#16
3163c0855eaaSJohn Baldwin	sub	w6,w6,#2
3164c0855eaaSJohn Baldwin	ld1	{v1.4s},[x7],#16
3165c0855eaaSJohn Baldwin.Loop_final_dec:
3166c0855eaaSJohn Baldwin	aesd	v26.16b,v0.16b
3167c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
3168c0855eaaSJohn Baldwin	ld1	{v0.4s},[x7],#16		// load key schedule...
3169c0855eaaSJohn Baldwin	subs	w6,w6,#2
3170c0855eaaSJohn Baldwin	aesd	v26.16b,v1.16b
3171c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
3172c0855eaaSJohn Baldwin	ld1	{v1.4s},[x7],#16		// load key schedule...
3173c0855eaaSJohn Baldwin	b.gt	.Loop_final_dec
3174c0855eaaSJohn Baldwin
3175c0855eaaSJohn Baldwin	aesd	v26.16b,v0.16b
3176c0855eaaSJohn Baldwin	aesimc	v26.16b,v26.16b
3177c0855eaaSJohn Baldwin	ld1	{v0.4s},[x7]
3178c0855eaaSJohn Baldwin	aesd	v26.16b,v1.16b
3179c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v0.16b
3180c0855eaaSJohn Baldwin	eor	v26.16b,v26.16b,v6.16b
3181c0855eaaSJohn Baldwin	st1	{v26.16b},[x1]
3182c0855eaaSJohn Baldwin
3183c0855eaaSJohn Baldwin.Lxts_dec_abort:
3184c0855eaaSJohn Baldwin	ldp	x21,x22,[sp,#48]
3185c0855eaaSJohn Baldwin	ldp	d8,d9,[sp,#32]
3186c0855eaaSJohn Baldwin	ldp	d10,d11,[sp,#16]
3187c0855eaaSJohn Baldwin	ldp	x19,x20,[sp],#64
3188c0855eaaSJohn Baldwin
3189c0855eaaSJohn Baldwin.Lxts_dec_final_abort:
3190c0855eaaSJohn Baldwin	ret
3191c0855eaaSJohn Baldwin.size	aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
3192bc3d5698SJohn Baldwin#endif
3193