/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD and AESE on AARCH64
//
// Dec 2022
//
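//
// Note on the approach (inferred from the code, not stated in this file):
// the SM4 S-box is affine-equivalent to the AES S-box, so each S-box
// layer below is computed with the AESE instruction rather than a table
// in memory.  AESE is issued with an all-zero round key, contributing
// only its SubBytes/ShiftRows steps; the .Lsbox_magic constants appear
// to supply the byte shuffle that pre-compensates ShiftRows and the
// nibble-wise lookup tables for the input/output affine transforms.
//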
// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

.type	_vpsm4_ex_consts,%object
.align	7
_vpsm4_ex_consts:
.Lck:
.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.quad	0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad	0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
.quad	0x0101010101010187,0x0101010101010101
.Lsbox_magic:
.quad	0x0b0e0104070a0d00,0x0306090c0f020508
.quad	0x62185a2042387a00,0x22581a6002783a40
.quad	0x15df62a89e54e923,0xc10bb67c4a803df7
.quad	0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad	0x6404462679195b3b,0xe383c1a1fe9edcbc
.quad	0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size	_vpsm4_ex_consts,.-_vpsm4_ex_consts
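// Note on the constants above and the key schedule below (inferred from
// the SM4 specification and the register usage here; hedged): .Lck holds
// the 32 round constants CK[i], .Lfk the FK system parameters, and
// .Lshuffles a byte permutation that rotates the four-word key state by
// one word each round.  _vpsm4_ex_set_key computes
//   K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i]), rk[i] = K[i+4],
// where T' uses the key-schedule linear transform
//   L'(B) = B ^ (B <<< 13) ^ (B <<< 23);
// below, "ror #19" is rotl 13 and "ror #9" is rotl 23.  With w2 == 0 the
// round keys are written in reverse order (x1 starts at byte offset 124,
// stepping by -4), which yields the decryption schedule.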
.type	_vpsm4_ex_set_key,%function
.align	4
_vpsm4_ex_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{v5.4s},[x0]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	adrp	x5,.Lshuffles
	add	x5,x5,#:lo12:.Lshuffles
	ld1	{v7.2d},[x5]
	adrp	x5,.Lfk
	add	x5,x5,#:lo12:.Lfk
	ld1	{v6.2d},[x5]
	eor	v5.16b,v5.16b,v6.16b
	mov	x6,#32
	adrp	x5,.Lck
	add	x5,x5,#:lo12:.Lck
	movi	v0.16b,#64
	cbnz	w2,1f
	add	x1,x1,124
1:
	mov	w7,v5.s[1]
	ldr	w8,[x5],#4
	eor	w8,w8,w7
	mov	w7,v5.s[2]
	eor	w8,w8,w7
	mov	w7,v5.s[3]
	eor	w8,w8,w7
	// optimize sbox using AESE instruction
	mov	v4.s[0],w8
	tbl	v0.16b, {v4.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	mov	w7,v0.s[0]
	eor	w8,w7,w7,ror #19
	eor	w8,w8,w7,ror #9
	mov	w7,v5.s[0]
	eor	w8,w8,w7
	mov	v5.s[0],w8
	cbz	w2,2f
	str	w8,[x1],#4
	b	3f
2:
	str	w8,[x1],#-4
3:
	tbl	v5.16b,{v5.16b},v7.16b
	subs	x6,x6,#1
	b.ne	1b
	ret
.size	_vpsm4_ex_set_key,.-_vpsm4_ex_set_key
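// Note (inferred): _vpsm4_ex_enc_4blks keeps four blocks column-wise in
// v4-v7 -- vN.s[j] is word N of block j -- so one "dup" broadcast of a
// round key covers all four blocks.  Eight loop iterations of four SM4
// rounds each give the full 32 rounds.  The round's linear transform
//   L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
// is built from ushr/sli pairs, each pair forming one 32-bit rotate
// per lane.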
.type	_vpsm4_ex_enc_4blks,%function
.align	4
_vpsm4_ex_enc_4blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	dup	v12.4s,w7
	dup	v13.4s,w8

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	v14.16b,v6.16b,v7.16b
	eor	v12.16b,v5.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v12.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	eor	v4.16b,v4.16b,v12.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	v14.16b,v14.16b,v4.16b
	eor	v13.16b,v14.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v13.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v13.4s,32-2
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v0.4s,v13.4s,2
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v0.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v13.16b

	dup	v12.4s,w7
	dup	v13.4s,w8

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	v14.16b,v4.16b,v5.16b
	eor	v12.16b,v7.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v12.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	eor	v6.16b,v6.16b,v12.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	v14.16b,v14.16b,v6.16b
	eor	v13.16b,v14.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v13.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v13.4s,32-2
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v0.4s,v13.4s,2
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v0.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v7.16b,v7.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
	ret
.size	_vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks
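// Note (inferred): _vpsm4_ex_enc_8blks is the same round loop widened to
// eight blocks -- v4-v7 hold blocks 0-3 and v8-v11 blocks 4-7 -- with the
// two S-box evaluations per round interleaved to hide AESE/tbl latency.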
.type	_vpsm4_ex_enc_8blks,%function
.align	4
_vpsm4_ex_enc_8blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	v12.4s,w7
	eor	v14.16b,v6.16b,v7.16b
	eor	v15.16b,v10.16b,v11.16b
	eor	v0.16b,v5.16b,v12.16b
	eor	v1.16b,v9.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v4.16b,v4.16b,v12.16b
	eor	v8.16b,v8.16b,v13.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v4.16b
	eor	v15.16b,v15.16b,v8.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v12.16b
	eor	v9.16b,v9.16b,v13.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	v12.4s,w7
	eor	v14.16b,v4.16b,v5.16b
	eor	v15.16b,v8.16b,v9.16b
	eor	v0.16b,v7.16b,v12.16b
	eor	v1.16b,v11.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v6.16b,v6.16b,v12.16b
	eor	v10.16b,v10.16b,v13.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v10.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v7.16b,v7.16b,v12.16b
	eor	v11.16b,v11.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v8.16b
#else
	mov	v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v9.16b
#else
	mov	v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v10.16b
#else
	mov	v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v11.16b
#else
	mov	v4.16b,v11.16b
#endif
	ret
.size	_vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks
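// Note (inferred from register usage; the prototypes below are an
// assumption for illustration, not stated in this file):
//   vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key)
//   vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key)
// Both wrap _vpsm4_ex_set_key; only the direction flag passed in w2
// differs, selecting forward vs. reversed round-key order.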
.globl	vpsm4_ex_set_encrypt_key
.type	vpsm4_ex_set_encrypt_key,%function
.align	5
vpsm4_ex_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key
.globl	vpsm4_ex_set_decrypt_key
.type	vpsm4_ex_set_decrypt_key,%function
.align	5
vpsm4_ex_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key
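// Note (inferred): vpsm4_ex_encrypt/vpsm4_ex_decrypt handle one 16-byte
// block (x0 = in, x1 = out, x2 = round keys).  The "ror #32-n" forms are
// left-rotates by n; encrypt and decrypt share the same round code since
// the decryption schedule already stores its round keys reversed.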
.globl	vpsm4_ex_encrypt
.type	vpsm4_ex_encrypt,%function
.align	5
vpsm4_ex_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_encrypt,.-vpsm4_ex_encrypt
.globl	vpsm4_ex_decrypt
.type	vpsm4_ex_decrypt,%function
.align	5
vpsm4_ex_decrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_decrypt,.-vpsm4_ex_decrypt
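// Note (inferred): the ECB path consumes 8, then 4, then 1-3 blocks per
// pass.  "ld4" de-interleaves four blocks so each vector register holds
// the same word position of all four (the layout _vpsm4_ex_enc_4blks
// expects), and "st4" re-interleaves the results on output.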
.globl	vpsm4_ex_ecb_encrypt
.type	vpsm4_ex_ecb_encrypt,%function
.align	5
vpsm4_ex_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
.Lecb_8_blocks_process:
	cmp	w2,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_ex_enc_8blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	w2,w2,#4
1:
	// process last block
	cmp	w2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	b	100f
1:	//	process last 2 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
	cmp	w2,#2
	b.gt	1f
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1]
	b	100f
1:	//	process last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt
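// Note (inferred): in CBC, encryption is inherently serial -- each block
// is XORed with the previous ciphertext before the cipher -- so it runs
// block-at-a-time through the scalar round loop; decryption has no such
// dependency and reuses the 4/8-block parallel bodies, XORing with the
// saved ciphertexts afterwards.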
.globl	vpsm4_ex_cbc_encrypt
.type	vpsm4_ex_cbc_encrypt,%function
.align	5
vpsm4_ex_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	x2,x2,4
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
	cbz	w5,.Ldec
	ld1	{v3.4s},[x4]
.Lcbc_4_blocks_enc:
	cmp	w2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor	v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
	eor	v5.16b,v5.16b,v4.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v5.s[0]
	mov	w13,v5.s[1]
	mov	w14,v5.s[2]
	mov	w15,v5.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v5.s[0],w15
	mov	v5.s[1],w14
	mov	v5.s[2],w13
	mov	v5.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v6.16b,v6.16b,v5.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v6.s[0]
	mov	w13,v6.s[1]
	mov	w14,v6.s[2]
	mov	w15,v6.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v6.s[0],w15
	mov	v6.s[1],w14
	mov	v6.s[2],w13
	mov	v6.s[3],w12
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	eor	v7.16b,v7.16b,v6.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v7.s[0]
	mov	w13,v7.s[1]
	mov	w14,v7.s[2]
	mov	w15,v7.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v7.s[0],w15
	mov	v7.s[1],w14
	mov	v7.s[2],w13
	mov	v7.s[3],w12
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	orr	v3.16b,v7.16b,v7.16b
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#4
	b.ne	.Lcbc_4_blocks_enc
	b	2f
1:
	subs	w2,w2,#1
	b.lt	2f
	ld1	{v4.4s},[x0],#16
	eor	v3.16b,v3.16b,v4.16b
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	st1	{v3.4s},[x1],#16
	b	1b
2:
	// save back IV
	st1	{v3.4s},[x4]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp	w2,#8
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
	add	x10,x0,#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_ex_enc_8blks
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	ld1	{v15.4s},[x4]
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	// note ivec1 and vtmpx[3] are reusing the same register
	// care needs to be taken to avoid conflict
	eor	v0.16b,v0.16b,v15.16b
	ld1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor	v1.16b,v1.16b,v8.16b
	eor	v2.16b,v2.16b,v9.16b
	eor	v3.16b,v3.16b,v10.16b
	// save back IV
	st1	{v15.4s}, [x4]
	eor	v4.16b,v4.16b,v11.16b
	eor	v5.16b,v5.16b,v12.16b
	eor	v6.16b,v6.16b,v13.16b
	eor	v7.16b,v7.16b,v14.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lcbc_8_blocks_dec
	b.eq	100f
1:
	ld1	{v15.4s},[x4]
.Lcbc_4_blocks_dec:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	orr	v15.16b,v7.16b,v7.16b
	eor	v2.16b,v2.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	subs	w2,w2,#4
	b.gt	.Lcbc_4_blocks_dec
	// save back IV
	st1	{v7.4s}, [x4]
	b	100f
1:	//	last block
	subs	w2,w2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	// save back IV
	st1	{v4.4s}, [x4]
#ifndef __AARCH64EB__
	rev32	v8.16b,v4.16b
#else
	mov	v8.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v8.s[0]
	mov	w13,v8.s[1]
	mov	w14,v8.s[2]
	mov	w15,v8.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v8.s[0],w15
	mov	v8.s[1],w14
	mov	v8.s[2],w13
	mov	v8.s[3],w12
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	eor	v8.16b,v8.16b,v15.16b
	st1	{v8.4s},[x1],#16
	b	100f
1:	//	last two blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0]
	add	x10,x0,#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
	subs	w2,w2,1
	b.gt	1f
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	ld1	{v4.4s,v5.4s},[x0],#32
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save back IV
	st1	{v5.4s}, [x4]
	b	100f
1:	//	last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	eor	v2.16b,v2.16b,v5.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save back IV
	st1	{v6.4s}, [x4]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt
2151.globl	vpsm4_ex_ctr32_encrypt_blocks
2152.type	vpsm4_ex_ctr32_encrypt_blocks,%function
2153.align	5
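// CTR32 entry; judging from the register usage below the arguments
// are: x0 input, x1 output, w2 number of 16-byte blocks, x3 round-key
// schedule, x4 16-byte counter block (only the low 32-bit word of the
// counter is incremented, as usual for ctr32 kernels).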
2154vpsm4_ex_ctr32_encrypt_blocks:
2155	AARCH64_VALID_CALL_TARGET
2156	ld1	{v3.4s},[x4]
2157#ifndef __AARCH64EB__
2158	rev32	v3.16b,v3.16b
2159#endif
2160	adrp	x9, .Lsbox_magic
2161	ldr	q26, [x9, #:lo12:.Lsbox_magic]
2162	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
2163	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
2164	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
2165	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
2166	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
2167	cmp	w2,#1
2168	b.ne	1f
2169	// fast path for a single block, avoiding the
2170	// context-saving overhead
2171	mov	x10,x3
2172	mov	w11,#8
2173	mov	w12,v3.s[0]
2174	mov	w13,v3.s[1]
2175	mov	w14,v3.s[2]
2176	mov	w15,v3.s[3]
217710:
2178	ldp	w7,w8,[x10],8
2179	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2180	eor	w6,w14,w15
2181	eor	w9,w7,w13
2182	eor	w6,w6,w9
2183	mov	v3.s[0],w6
2184	// optimize sbox using AESE instruction
2185	tbl	v0.16b, {v3.16b}, v26.16b
2186	ushr	v2.16b, v0.16b, 4
2187	and	v0.16b, v0.16b, v31.16b
2188	tbl	v0.16b, {v28.16b}, v0.16b
2189	tbl	v2.16b, {v27.16b}, v2.16b
2190	eor	v0.16b, v0.16b, v2.16b
2191	eor	v1.16b, v1.16b, v1.16b
2192	aese	v0.16b,v1.16b
2193	ushr	v2.16b, v0.16b, 4
2194	and	v0.16b, v0.16b, v31.16b
2195	tbl	v0.16b, {v30.16b}, v0.16b
2196	tbl	v2.16b, {v29.16b}, v2.16b
2197	eor	v0.16b, v0.16b, v2.16b
2198
2199	mov	w7,v0.s[0]
2200	eor	w6,w7,w7,ror #32-2
2201	eor	w6,w6,w7,ror #32-10
2202	eor	w6,w6,w7,ror #32-18
2203	eor	w6,w6,w7,ror #32-24
2204	eor	w12,w12,w6
2205	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2206	eor	w6,w14,w15
2207	eor	w9,w12,w8
2208	eor	w6,w6,w9
2209	mov	v3.s[0],w6
2210	// optimize sbox using AESE instruction
2211	tbl	v0.16b, {v3.16b}, v26.16b
2212	ushr	v2.16b, v0.16b, 4
2213	and	v0.16b, v0.16b, v31.16b
2214	tbl	v0.16b, {v28.16b}, v0.16b
2215	tbl	v2.16b, {v27.16b}, v2.16b
2216	eor	v0.16b, v0.16b, v2.16b
2217	eor	v1.16b, v1.16b, v1.16b
2218	aese	v0.16b,v1.16b
2219	ushr	v2.16b, v0.16b, 4
2220	and	v0.16b, v0.16b, v31.16b
2221	tbl	v0.16b, {v30.16b}, v0.16b
2222	tbl	v2.16b, {v29.16b}, v2.16b
2223	eor	v0.16b, v0.16b, v2.16b
2224
2225	mov	w7,v0.s[0]
2226	eor	w6,w7,w7,ror #32-2
2227	eor	w6,w6,w7,ror #32-10
2228	eor	w6,w6,w7,ror #32-18
2229	eor	w6,w6,w7,ror #32-24
2230	ldp	w7,w8,[x10],8
2231	eor	w13,w13,w6
2232	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2233	eor	w6,w12,w13
2234	eor	w9,w7,w15
2235	eor	w6,w6,w9
2236	mov	v3.s[0],w6
2237	// optimize sbox using AESE instruction
2238	tbl	v0.16b, {v3.16b}, v26.16b
2239	ushr	v2.16b, v0.16b, 4
2240	and	v0.16b, v0.16b, v31.16b
2241	tbl	v0.16b, {v28.16b}, v0.16b
2242	tbl	v2.16b, {v27.16b}, v2.16b
2243	eor	v0.16b, v0.16b, v2.16b
2244	eor	v1.16b, v1.16b, v1.16b
2245	aese	v0.16b,v1.16b
2246	ushr	v2.16b, v0.16b, 4
2247	and	v0.16b, v0.16b, v31.16b
2248	tbl	v0.16b, {v30.16b}, v0.16b
2249	tbl	v2.16b, {v29.16b}, v2.16b
2250	eor	v0.16b, v0.16b, v2.16b
2251
2252	mov	w7,v0.s[0]
2253	eor	w6,w7,w7,ror #32-2
2254	eor	w6,w6,w7,ror #32-10
2255	eor	w6,w6,w7,ror #32-18
2256	eor	w6,w6,w7,ror #32-24
2257	eor	w14,w14,w6
2258	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2259	eor	w6,w12,w13
2260	eor	w9,w14,w8
2261	eor	w6,w6,w9
2262	mov	v3.s[0],w6
2263	// optimize sbox using AESE instruction
2264	tbl	v0.16b, {v3.16b}, v26.16b
2265	ushr	v2.16b, v0.16b, 4
2266	and	v0.16b, v0.16b, v31.16b
2267	tbl	v0.16b, {v28.16b}, v0.16b
2268	tbl	v2.16b, {v27.16b}, v2.16b
2269	eor	v0.16b, v0.16b, v2.16b
2270	eor	v1.16b, v1.16b, v1.16b
2271	aese	v0.16b,v1.16b
2272	ushr	v2.16b, v0.16b, 4
2273	and	v0.16b, v0.16b, v31.16b
2274	tbl	v0.16b, {v30.16b}, v0.16b
2275	tbl	v2.16b, {v29.16b}, v2.16b
2276	eor	v0.16b, v0.16b, v2.16b
2277
2278	mov	w7,v0.s[0]
2279	eor	w6,w7,w7,ror #32-2
2280	eor	w6,w6,w7,ror #32-10
2281	eor	w6,w6,w7,ror #32-18
2282	eor	w6,w6,w7,ror #32-24
2283	eor	w15,w15,w6
2284	subs	w11,w11,#1
2285	b.ne	10b
2286	mov	v3.s[0],w15
2287	mov	v3.s[1],w14
2288	mov	v3.s[2],w13
2289	mov	v3.s[3],w12
2290#ifndef __AARCH64EB__
2291	rev32	v3.16b,v3.16b
2292#endif
2293	ld1	{v4.4s},[x0]
2294	eor	v4.16b,v4.16b,v3.16b
2295	st1	{v4.4s},[x1]
2296	ret
22971:
2298	AARCH64_SIGN_LINK_REGISTER
2299	stp	d8,d9,[sp,#-80]!
2300	stp	d10,d11,[sp,#16]
2301	stp	d12,d13,[sp,#32]
2302	stp	d14,d15,[sp,#48]
2303	stp	x29,x30,[sp,#64]
2304	mov	w12,v3.s[0]
2305	mov	w13,v3.s[1]
2306	mov	w14,v3.s[2]
2307	mov	w5,v3.s[3]
2308.Lctr32_4_blocks_process:
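	// within a batch the three high counter words are constant,
	// so they are dup'ed across all lanes; only the low word (w5)
	// is incremented per block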
2309	cmp	w2,#4
2310	b.lt	1f
2311	dup	v4.4s,w12
2312	dup	v5.4s,w13
2313	dup	v6.4s,w14
2314	mov	v7.s[0],w5
2315	add	w5,w5,#1
2316	mov	v7.s[1],w5
2317	add	w5,w5,#1
2318	mov	v7.s[2],w5
2319	add	w5,w5,#1
2320	mov	v7.s[3],w5
2321	add	w5,w5,#1
2322	cmp	w2,#8
2323	b.ge	.Lctr32_8_blocks_process
2324	bl	_vpsm4_ex_enc_4blks
2325	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2326	eor	v0.16b,v0.16b,v12.16b
2327	eor	v1.16b,v1.16b,v13.16b
2328	eor	v2.16b,v2.16b,v14.16b
2329	eor	v3.16b,v3.16b,v15.16b
2330	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2331	subs	w2,w2,#4
2332	b.ne	.Lctr32_4_blocks_process
2333	b	100f
2334.Lctr32_8_blocks_process:
2335	dup	v8.4s,w12
2336	dup	v9.4s,w13
2337	dup	v10.4s,w14
2338	mov	v11.s[0],w5
2339	add	w5,w5,#1
2340	mov	v11.s[1],w5
2341	add	w5,w5,#1
2342	mov	v11.s[2],w5
2343	add	w5,w5,#1
2344	mov	v11.s[3],w5
2345	add	w5,w5,#1
2346	bl	_vpsm4_ex_enc_8blks
2347	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2348	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2349	eor	v0.16b,v0.16b,v12.16b
2350	eor	v1.16b,v1.16b,v13.16b
2351	eor	v2.16b,v2.16b,v14.16b
2352	eor	v3.16b,v3.16b,v15.16b
2353	eor	v4.16b,v4.16b,v8.16b
2354	eor	v5.16b,v5.16b,v9.16b
2355	eor	v6.16b,v6.16b,v10.16b
2356	eor	v7.16b,v7.16b,v11.16b
2357	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2358	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2359	subs	w2,w2,#8
2360	b.ne	.Lctr32_4_blocks_process
2361	b	100f
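	// fewer than 4 blocks remain; the code below dispatches on a
	// 1, 2 or 3 block tail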
23621:	//	last block processing
2363	subs	w2,w2,#1
2364	b.lt	100f
2365	b.gt	1f
2366	mov	v3.s[0],w12
2367	mov	v3.s[1],w13
2368	mov	v3.s[2],w14
2369	mov	v3.s[3],w5
2370	mov	x10,x3
2371	mov	w11,#8
2372	mov	w12,v3.s[0]
2373	mov	w13,v3.s[1]
2374	mov	w14,v3.s[2]
2375	mov	w15,v3.s[3]
237610:
2377	ldp	w7,w8,[x10],8
2378	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2379	eor	w6,w14,w15
2380	eor	w9,w7,w13
2381	eor	w6,w6,w9
2382	mov	v3.s[0],w6
2383	// optimize sbox using AESE instruction
2384	tbl	v0.16b, {v3.16b}, v26.16b
2385	ushr	v2.16b, v0.16b, 4
2386	and	v0.16b, v0.16b, v31.16b
2387	tbl	v0.16b, {v28.16b}, v0.16b
2388	tbl	v2.16b, {v27.16b}, v2.16b
2389	eor	v0.16b, v0.16b, v2.16b
2390	eor	v1.16b, v1.16b, v1.16b
2391	aese	v0.16b,v1.16b
2392	ushr	v2.16b, v0.16b, 4
2393	and	v0.16b, v0.16b, v31.16b
2394	tbl	v0.16b, {v30.16b}, v0.16b
2395	tbl	v2.16b, {v29.16b}, v2.16b
2396	eor	v0.16b, v0.16b, v2.16b
2397
2398	mov	w7,v0.s[0]
2399	eor	w6,w7,w7,ror #32-2
2400	eor	w6,w6,w7,ror #32-10
2401	eor	w6,w6,w7,ror #32-18
2402	eor	w6,w6,w7,ror #32-24
2403	eor	w12,w12,w6
2404	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2405	eor	w6,w14,w15
2406	eor	w9,w12,w8
2407	eor	w6,w6,w9
2408	mov	v3.s[0],w6
2409	// optimize sbox using AESE instruction
2410	tbl	v0.16b, {v3.16b}, v26.16b
2411	ushr	v2.16b, v0.16b, 4
2412	and	v0.16b, v0.16b, v31.16b
2413	tbl	v0.16b, {v28.16b}, v0.16b
2414	tbl	v2.16b, {v27.16b}, v2.16b
2415	eor	v0.16b, v0.16b, v2.16b
2416	eor	v1.16b, v1.16b, v1.16b
2417	aese	v0.16b,v1.16b
2418	ushr	v2.16b, v0.16b, 4
2419	and	v0.16b, v0.16b, v31.16b
2420	tbl	v0.16b, {v30.16b}, v0.16b
2421	tbl	v2.16b, {v29.16b}, v2.16b
2422	eor	v0.16b, v0.16b, v2.16b
2423
2424	mov	w7,v0.s[0]
2425	eor	w6,w7,w7,ror #32-2
2426	eor	w6,w6,w7,ror #32-10
2427	eor	w6,w6,w7,ror #32-18
2428	eor	w6,w6,w7,ror #32-24
2429	ldp	w7,w8,[x10],8
2430	eor	w13,w13,w6
2431	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2432	eor	w6,w12,w13
2433	eor	w9,w7,w15
2434	eor	w6,w6,w9
2435	mov	v3.s[0],w6
2436	// optimize sbox using AESE instruction
2437	tbl	v0.16b, {v3.16b}, v26.16b
2438	ushr	v2.16b, v0.16b, 4
2439	and	v0.16b, v0.16b, v31.16b
2440	tbl	v0.16b, {v28.16b}, v0.16b
2441	tbl	v2.16b, {v27.16b}, v2.16b
2442	eor	v0.16b, v0.16b, v2.16b
2443	eor	v1.16b, v1.16b, v1.16b
2444	aese	v0.16b,v1.16b
2445	ushr	v2.16b, v0.16b, 4
2446	and	v0.16b, v0.16b, v31.16b
2447	tbl	v0.16b, {v30.16b}, v0.16b
2448	tbl	v2.16b, {v29.16b}, v2.16b
2449	eor	v0.16b, v0.16b, v2.16b
2450
2451	mov	w7,v0.s[0]
2452	eor	w6,w7,w7,ror #32-2
2453	eor	w6,w6,w7,ror #32-10
2454	eor	w6,w6,w7,ror #32-18
2455	eor	w6,w6,w7,ror #32-24
2456	eor	w14,w14,w6
2457	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2458	eor	w6,w12,w13
2459	eor	w9,w14,w8
2460	eor	w6,w6,w9
2461	mov	v3.s[0],w6
2462	// optimize sbox using AESE instruction
2463	tbl	v0.16b, {v3.16b}, v26.16b
2464	ushr	v2.16b, v0.16b, 4
2465	and	v0.16b, v0.16b, v31.16b
2466	tbl	v0.16b, {v28.16b}, v0.16b
2467	tbl	v2.16b, {v27.16b}, v2.16b
2468	eor	v0.16b, v0.16b, v2.16b
2469	eor	v1.16b, v1.16b, v1.16b
2470	aese	v0.16b,v1.16b
2471	ushr	v2.16b, v0.16b, 4
2472	and	v0.16b, v0.16b, v31.16b
2473	tbl	v0.16b, {v30.16b}, v0.16b
2474	tbl	v2.16b, {v29.16b}, v2.16b
2475	eor	v0.16b, v0.16b, v2.16b
2476
2477	mov	w7,v0.s[0]
2478	eor	w6,w7,w7,ror #32-2
2479	eor	w6,w6,w7,ror #32-10
2480	eor	w6,w6,w7,ror #32-18
2481	eor	w6,w6,w7,ror #32-24
2482	eor	w15,w15,w6
2483	subs	w11,w11,#1
2484	b.ne	10b
2485	mov	v3.s[0],w15
2486	mov	v3.s[1],w14
2487	mov	v3.s[2],w13
2488	mov	v3.s[3],w12
2489#ifndef __AARCH64EB__
2490	rev32	v3.16b,v3.16b
2491#endif
2492	ld1	{v4.4s},[x0]
2493	eor	v4.16b,v4.16b,v3.16b
2494	st1	{v4.4s},[x1]
2495	b	100f
24961:	//	last 2 blocks processing
2497	dup	v4.4s,w12
2498	dup	v5.4s,w13
2499	dup	v6.4s,w14
2500	mov	v7.s[0],w5
2501	add	w5,w5,#1
2502	mov	v7.s[1],w5
2503	subs	w2,w2,#1
2504	b.ne	1f
2505	bl	_vpsm4_ex_enc_4blks
2506	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2507	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2508	eor	v0.16b,v0.16b,v12.16b
2509	eor	v1.16b,v1.16b,v13.16b
2510	eor	v2.16b,v2.16b,v14.16b
2511	eor	v3.16b,v3.16b,v15.16b
2512	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2513	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2514	b	100f
25151:	//	last 3 blocks processing
2516	add	w5,w5,#1
2517	mov	v7.s[2],w5
2518	bl	_vpsm4_ex_enc_4blks
2519	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2520	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2521	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
2522	eor	v0.16b,v0.16b,v12.16b
2523	eor	v1.16b,v1.16b,v13.16b
2524	eor	v2.16b,v2.16b,v14.16b
2525	eor	v3.16b,v3.16b,v15.16b
2526	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2527	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2528	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
2529100:
2530	ldp	d10,d11,[sp,#16]
2531	ldp	d12,d13,[sp,#32]
2532	ldp	d14,d15,[sp,#48]
2533	ldp	x29,x30,[sp,#64]
2534	ldp	d8,d9,[sp],#80
2535	AARCH64_VALIDATE_LINK_REGISTER
2536	ret
2537.size	vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks
2538.globl	vpsm4_ex_xts_encrypt_gb
2539.type	vpsm4_ex_xts_encrypt_gb,%function
2540.align	5
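// XTS entry (GB/T 17964 variant); judging from the register usage
// below the arguments are: x0 input, x1 output, x2 byte length,
// x3 data-key schedule, x4 tweak-key schedule, x5 16-byte iv,
// w6 direction flag (1 encrypt, 0 decrypt).  The GB variant keeps
// the tweak in a bit-reversed representation for the GF(2^128)
// multiplications, hence the rbit instructions throughout.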
2541vpsm4_ex_xts_encrypt_gb:
2542	AARCH64_SIGN_LINK_REGISTER
2543	stp	x15, x16, [sp, #-0x10]!
2544	stp	x17, x18, [sp, #-0x10]!
2545	stp	x19, x20, [sp, #-0x10]!
2546	stp	x21, x22, [sp, #-0x10]!
2547	stp	x23, x24, [sp, #-0x10]!
2548	stp	x25, x26, [sp, #-0x10]!
2549	stp	x27, x28, [sp, #-0x10]!
2550	stp	x29, x30, [sp, #-0x10]!
2551	stp	d8, d9, [sp, #-0x10]!
2552	stp	d10, d11, [sp, #-0x10]!
2553	stp	d12, d13, [sp, #-0x10]!
2554	stp	d14, d15, [sp, #-0x10]!
2555	mov	x26,x3
2556	mov	x27,x4
2557	mov	w28,w6
2558	ld1	{v16.4s}, [x5]
2559	mov	x3,x27
2560	adrp	x9, .Lsbox_magic
2561	ldr	q26, [x9, #:lo12:.Lsbox_magic]
2562	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
2563	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
2564	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
2565	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
2566	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
2567#ifndef __AARCH64EB__
2568	rev32	v16.16b,v16.16b
2569#endif
2570	mov	x10,x3
2571	mov	w11,#8
2572	mov	w12,v16.s[0]
2573	mov	w13,v16.s[1]
2574	mov	w14,v16.s[2]
2575	mov	w15,v16.s[3]
257610:
2577	ldp	w7,w8,[x10],8
2578	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2579	eor	w6,w14,w15
2580	eor	w9,w7,w13
2581	eor	w6,w6,w9
2582	mov	v3.s[0],w6
2583	// optimize sbox using AESE instruction
2584	tbl	v0.16b, {v3.16b}, v26.16b
2585	ushr	v2.16b, v0.16b, 4
2586	and	v0.16b, v0.16b, v31.16b
2587	tbl	v0.16b, {v28.16b}, v0.16b
2588	tbl	v2.16b, {v27.16b}, v2.16b
2589	eor	v0.16b, v0.16b, v2.16b
2590	eor	v1.16b, v1.16b, v1.16b
2591	aese	v0.16b,v1.16b
2592	ushr	v2.16b, v0.16b, 4
2593	and	v0.16b, v0.16b, v31.16b
2594	tbl	v0.16b, {v30.16b}, v0.16b
2595	tbl	v2.16b, {v29.16b}, v2.16b
2596	eor	v0.16b, v0.16b, v2.16b
2597
2598	mov	w7,v0.s[0]
2599	eor	w6,w7,w7,ror #32-2
2600	eor	w6,w6,w7,ror #32-10
2601	eor	w6,w6,w7,ror #32-18
2602	eor	w6,w6,w7,ror #32-24
2603	eor	w12,w12,w6
2604	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2605	eor	w6,w14,w15
2606	eor	w9,w12,w8
2607	eor	w6,w6,w9
2608	mov	v3.s[0],w6
2609	// optimize sbox using AESE instruction
2610	tbl	v0.16b, {v3.16b}, v26.16b
2611	ushr	v2.16b, v0.16b, 4
2612	and	v0.16b, v0.16b, v31.16b
2613	tbl	v0.16b, {v28.16b}, v0.16b
2614	tbl	v2.16b, {v27.16b}, v2.16b
2615	eor	v0.16b, v0.16b, v2.16b
2616	eor	v1.16b, v1.16b, v1.16b
2617	aese	v0.16b,v1.16b
2618	ushr	v2.16b, v0.16b, 4
2619	and	v0.16b, v0.16b, v31.16b
2620	tbl	v0.16b, {v30.16b}, v0.16b
2621	tbl	v2.16b, {v29.16b}, v2.16b
2622	eor	v0.16b, v0.16b, v2.16b
2623
2624	mov	w7,v0.s[0]
2625	eor	w6,w7,w7,ror #32-2
2626	eor	w6,w6,w7,ror #32-10
2627	eor	w6,w6,w7,ror #32-18
2628	eor	w6,w6,w7,ror #32-24
2629	ldp	w7,w8,[x10],8
2630	eor	w13,w13,w6
2631	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2632	eor	w6,w12,w13
2633	eor	w9,w7,w15
2634	eor	w6,w6,w9
2635	mov	v3.s[0],w6
2636	// optimize sbox using AESE instruction
2637	tbl	v0.16b, {v3.16b}, v26.16b
2638	ushr	v2.16b, v0.16b, 4
2639	and	v0.16b, v0.16b, v31.16b
2640	tbl	v0.16b, {v28.16b}, v0.16b
2641	tbl	v2.16b, {v27.16b}, v2.16b
2642	eor	v0.16b, v0.16b, v2.16b
2643	eor	v1.16b, v1.16b, v1.16b
2644	aese	v0.16b,v1.16b
2645	ushr	v2.16b, v0.16b, 4
2646	and	v0.16b, v0.16b, v31.16b
2647	tbl	v0.16b, {v30.16b}, v0.16b
2648	tbl	v2.16b, {v29.16b}, v2.16b
2649	eor	v0.16b, v0.16b, v2.16b
2650
2651	mov	w7,v0.s[0]
2652	eor	w6,w7,w7,ror #32-2
2653	eor	w6,w6,w7,ror #32-10
2654	eor	w6,w6,w7,ror #32-18
2655	eor	w6,w6,w7,ror #32-24
2656	eor	w14,w14,w6
2657	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2658	eor	w6,w12,w13
2659	eor	w9,w14,w8
2660	eor	w6,w6,w9
2661	mov	v3.s[0],w6
2662	// optimize sbox using AESE instruction
2663	tbl	v0.16b, {v3.16b}, v26.16b
2664	ushr	v2.16b, v0.16b, 4
2665	and	v0.16b, v0.16b, v31.16b
2666	tbl	v0.16b, {v28.16b}, v0.16b
2667	tbl	v2.16b, {v27.16b}, v2.16b
2668	eor	v0.16b, v0.16b, v2.16b
2669	eor	v1.16b, v1.16b, v1.16b
2670	aese	v0.16b,v1.16b
2671	ushr	v2.16b, v0.16b, 4
2672	and	v0.16b, v0.16b, v31.16b
2673	tbl	v0.16b, {v30.16b}, v0.16b
2674	tbl	v2.16b, {v29.16b}, v2.16b
2675	eor	v0.16b, v0.16b, v2.16b
2676
2677	mov	w7,v0.s[0]
2678	eor	w6,w7,w7,ror #32-2
2679	eor	w6,w6,w7,ror #32-10
2680	eor	w6,w6,w7,ror #32-18
2681	eor	w6,w6,w7,ror #32-24
2682	eor	w15,w15,w6
2683	subs	w11,w11,#1
2684	b.ne	10b
2685	mov	v16.s[0],w15
2686	mov	v16.s[1],w14
2687	mov	v16.s[2],w13
2688	mov	v16.s[3],w12
2689#ifndef __AARCH64EB__
2690	rev32	v16.16b,v16.16b
2691#endif
2692	mov	x3,x26
2693	and	x29,x2,#0x0F
2694	// convert length into blocks
2695	lsr	x2,x2,4
2696	cmp	x2,#1
2697	b.lt	.return_gb
2698
2699	cmp	x29,0
2700	// If the encryption/decryption length is a multiple of 16,
2701	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2702	b.eq	.xts_encrypt_blocks_gb
2703
2704	// If the encryption/decryption length is not a multiple of 16,
2705	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb,
2706	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2707	subs	x2,x2,#1
2708	b.eq	.only_2blks_tweak_gb
2709.xts_encrypt_blocks_gb:
2710	rbit	v16.16b,v16.16b
2711#ifdef __AARCH64EB__
2712	rev32	v16.16b,v16.16b
2713#endif
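	// derive the next seven tweaks in general registers: each
	// extr/and/eor triple computes t = (t << 1) ^ (0x87 if bit 127
	// was set), i.e. multiplication by x modulo x^128+x^7+x^2+x+1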
2714	mov	x12,v16.d[0]
2715	mov	x13,v16.d[1]
2716	mov	w7,0x87
2717	extr	x9,x13,x13,#32
2718	extr	x15,x13,x12,#63
2719	and	w8,w7,w9,asr#31
2720	eor	x14,x8,x12,lsl#1
2721	mov	w7,0x87
2722	extr	x9,x15,x15,#32
2723	extr	x17,x15,x14,#63
2724	and	w8,w7,w9,asr#31
2725	eor	x16,x8,x14,lsl#1
2726	mov	w7,0x87
2727	extr	x9,x17,x17,#32
2728	extr	x19,x17,x16,#63
2729	and	w8,w7,w9,asr#31
2730	eor	x18,x8,x16,lsl#1
2731	mov	w7,0x87
2732	extr	x9,x19,x19,#32
2733	extr	x21,x19,x18,#63
2734	and	w8,w7,w9,asr#31
2735	eor	x20,x8,x18,lsl#1
2736	mov	w7,0x87
2737	extr	x9,x21,x21,#32
2738	extr	x23,x21,x20,#63
2739	and	w8,w7,w9,asr#31
2740	eor	x22,x8,x20,lsl#1
2741	mov	w7,0x87
2742	extr	x9,x23,x23,#32
2743	extr	x25,x23,x22,#63
2744	and	w8,w7,w9,asr#31
2745	eor	x24,x8,x22,lsl#1
2746	mov	w7,0x87
2747	extr	x9,x25,x25,#32
2748	extr	x27,x25,x24,#63
2749	and	w8,w7,w9,asr#31
2750	eor	x26,x8,x24,lsl#1
2751.Lxts_8_blocks_process_gb:
2752	cmp	x2,#8
2753	mov	v16.d[0],x12
2754	mov	v16.d[1],x13
2755#ifdef __AARCH64EB__
2756	rev32	v16.16b,v16.16b
2757#endif
2758	mov	w7,0x87
2759	extr	x9,x27,x27,#32
2760	extr	x13,x27,x26,#63
2761	and	w8,w7,w9,asr#31
2762	eor	x12,x8,x26,lsl#1
2763	mov	v17.d[0],x14
2764	mov	v17.d[1],x15
2765#ifdef __AARCH64EB__
2766	rev32	v17.16b,v17.16b
2767#endif
2768	mov	w7,0x87
2769	extr	x9,x13,x13,#32
2770	extr	x15,x13,x12,#63
2771	and	w8,w7,w9,asr#31
2772	eor	x14,x8,x12,lsl#1
2773	mov	v18.d[0],x16
2774	mov	v18.d[1],x17
2775#ifdef __AARCH64EB__
2776	rev32	v18.16b,v18.16b
2777#endif
2778	mov	w7,0x87
2779	extr	x9,x15,x15,#32
2780	extr	x17,x15,x14,#63
2781	and	w8,w7,w9,asr#31
2782	eor	x16,x8,x14,lsl#1
2783	mov	v19.d[0],x18
2784	mov	v19.d[1],x19
2785#ifdef __AARCH64EB__
2786	rev32	v19.16b,v19.16b
2787#endif
2788	mov	w7,0x87
2789	extr	x9,x17,x17,#32
2790	extr	x19,x17,x16,#63
2791	and	w8,w7,w9,asr#31
2792	eor	x18,x8,x16,lsl#1
2793	mov	v20.d[0],x20
2794	mov	v20.d[1],x21
2795#ifdef __AARCH64EB__
2796	rev32	v20.16b,v20.16b
2797#endif
2798	mov	w7,0x87
2799	extr	x9,x19,x19,#32
2800	extr	x21,x19,x18,#63
2801	and	w8,w7,w9,asr#31
2802	eor	x20,x8,x18,lsl#1
2803	mov	v21.d[0],x22
2804	mov	v21.d[1],x23
2805#ifdef __AARCH64EB__
2806	rev32	v21.16b,v21.16b
2807#endif
2808	mov	w7,0x87
2809	extr	x9,x21,x21,#32
2810	extr	x23,x21,x20,#63
2811	and	w8,w7,w9,asr#31
2812	eor	x22,x8,x20,lsl#1
2813	mov	v22.d[0],x24
2814	mov	v22.d[1],x25
2815#ifdef __AARCH64EB__
2816	rev32	v22.16b,v22.16b
2817#endif
2818	mov	w7,0x87
2819	extr	x9,x23,x23,#32
2820	extr	x25,x23,x22,#63
2821	and	w8,w7,w9,asr#31
2822	eor	x24,x8,x22,lsl#1
2823	mov	v23.d[0],x26
2824	mov	v23.d[1],x27
2825#ifdef __AARCH64EB__
2826	rev32	v23.16b,v23.16b
2827#endif
2828	mov	w7,0x87
2829	extr	x9,x25,x25,#32
2830	extr	x27,x25,x24,#63
2831	and	w8,w7,w9,asr#31
2832	eor	x26,x8,x24,lsl#1
2833	b.lt	.Lxts_4_blocks_process_gb
2834	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2835	rbit	v16.16b,v16.16b
2836	rbit	v17.16b,v17.16b
2837	rbit	v18.16b,v18.16b
2838	rbit	v19.16b,v19.16b
2839	eor	v4.16b, v4.16b, v16.16b
2840	eor	v5.16b, v5.16b, v17.16b
2841	eor	v6.16b, v6.16b, v18.16b
2842	eor	v7.16b, v7.16b, v19.16b
2843	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2844	rbit	v20.16b,v20.16b
2845	rbit	v21.16b,v21.16b
2846	rbit	v22.16b,v22.16b
2847	rbit	v23.16b,v23.16b
2848	eor	v8.16b, v8.16b, v20.16b
2849	eor	v9.16b, v9.16b, v21.16b
2850	eor	v10.16b, v10.16b, v22.16b
2851	eor	v11.16b, v11.16b, v23.16b
2852#ifndef __AARCH64EB__
2853	rev32	v4.16b,v4.16b
2854#endif
2855#ifndef __AARCH64EB__
2856	rev32	v5.16b,v5.16b
2857#endif
2858#ifndef __AARCH64EB__
2859	rev32	v6.16b,v6.16b
2860#endif
2861#ifndef __AARCH64EB__
2862	rev32	v7.16b,v7.16b
2863#endif
2864#ifndef __AARCH64EB__
2865	rev32	v8.16b,v8.16b
2866#endif
2867#ifndef __AARCH64EB__
2868	rev32	v9.16b,v9.16b
2869#endif
2870#ifndef __AARCH64EB__
2871	rev32	v10.16b,v10.16b
2872#endif
2873#ifndef __AARCH64EB__
2874	rev32	v11.16b,v11.16b
2875#endif
2876	zip1	v0.4s,v4.4s,v5.4s
2877	zip2	v1.4s,v4.4s,v5.4s
2878	zip1	v2.4s,v6.4s,v7.4s
2879	zip2	v3.4s,v6.4s,v7.4s
2880	zip1	v4.2d,v0.2d,v2.2d
2881	zip2	v5.2d,v0.2d,v2.2d
2882	zip1	v6.2d,v1.2d,v3.2d
2883	zip2	v7.2d,v1.2d,v3.2d
2884	zip1	v0.4s,v8.4s,v9.4s
2885	zip2	v1.4s,v8.4s,v9.4s
2886	zip1	v2.4s,v10.4s,v11.4s
2887	zip2	v3.4s,v10.4s,v11.4s
2888	zip1	v8.2d,v0.2d,v2.2d
2889	zip2	v9.2d,v0.2d,v2.2d
2890	zip1	v10.2d,v1.2d,v3.2d
2891	zip2	v11.2d,v1.2d,v3.2d
2892	bl	_vpsm4_ex_enc_8blks
2893	zip1	v8.4s,v0.4s,v1.4s
2894	zip2	v9.4s,v0.4s,v1.4s
2895	zip1	v10.4s,v2.4s,v3.4s
2896	zip2	v11.4s,v2.4s,v3.4s
2897	zip1	v0.2d,v8.2d,v10.2d
2898	zip2	v1.2d,v8.2d,v10.2d
2899	zip1	v2.2d,v9.2d,v11.2d
2900	zip2	v3.2d,v9.2d,v11.2d
2901	zip1	v8.4s,v4.4s,v5.4s
2902	zip2	v9.4s,v4.4s,v5.4s
2903	zip1	v10.4s,v6.4s,v7.4s
2904	zip2	v11.4s,v6.4s,v7.4s
2905	zip1	v4.2d,v8.2d,v10.2d
2906	zip2	v5.2d,v8.2d,v10.2d
2907	zip1	v6.2d,v9.2d,v11.2d
2908	zip2	v7.2d,v9.2d,v11.2d
2909	eor	v0.16b, v0.16b, v16.16b
2910	eor	v1.16b, v1.16b, v17.16b
2911	eor	v2.16b, v2.16b, v18.16b
2912	eor	v3.16b, v3.16b, v19.16b
2913	eor	v4.16b, v4.16b, v20.16b
2914	eor	v5.16b, v5.16b, v21.16b
2915	eor	v6.16b, v6.16b, v22.16b
2916	eor	v7.16b, v7.16b, v23.16b
2917
2918	// save the last tweak
2919	mov	v25.16b,v23.16b
2920	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2921	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2922	subs	x2,x2,#8
2923	b.gt	.Lxts_8_blocks_process_gb
2924	b	100f
2925.Lxts_4_blocks_process_gb:
2926	cmp	x2,#4
2927	b.lt	1f
2928	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2929	rbit	v16.16b,v16.16b
2930	rbit	v17.16b,v17.16b
2931	rbit	v18.16b,v18.16b
2932	rbit	v19.16b,v19.16b
2933	eor	v4.16b, v4.16b, v16.16b
2934	eor	v5.16b, v5.16b, v17.16b
2935	eor	v6.16b, v6.16b, v18.16b
2936	eor	v7.16b, v7.16b, v19.16b
2937#ifndef __AARCH64EB__
2938	rev32	v4.16b,v4.16b
2939#endif
2940#ifndef __AARCH64EB__
2941	rev32	v5.16b,v5.16b
2942#endif
2943#ifndef __AARCH64EB__
2944	rev32	v6.16b,v6.16b
2945#endif
2946#ifndef __AARCH64EB__
2947	rev32	v7.16b,v7.16b
2948#endif
2949	zip1	v0.4s,v4.4s,v5.4s
2950	zip2	v1.4s,v4.4s,v5.4s
2951	zip1	v2.4s,v6.4s,v7.4s
2952	zip2	v3.4s,v6.4s,v7.4s
2953	zip1	v4.2d,v0.2d,v2.2d
2954	zip2	v5.2d,v0.2d,v2.2d
2955	zip1	v6.2d,v1.2d,v3.2d
2956	zip2	v7.2d,v1.2d,v3.2d
2957	bl	_vpsm4_ex_enc_4blks
2958	zip1	v4.4s,v0.4s,v1.4s
2959	zip2	v5.4s,v0.4s,v1.4s
2960	zip1	v6.4s,v2.4s,v3.4s
2961	zip2	v7.4s,v2.4s,v3.4s
2962	zip1	v0.2d,v4.2d,v6.2d
2963	zip2	v1.2d,v4.2d,v6.2d
2964	zip1	v2.2d,v5.2d,v7.2d
2965	zip2	v3.2d,v5.2d,v7.2d
2966	eor	v0.16b, v0.16b, v16.16b
2967	eor	v1.16b, v1.16b, v17.16b
2968	eor	v2.16b, v2.16b, v18.16b
2969	eor	v3.16b, v3.16b, v19.16b
2970	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2971	sub	x2,x2,#4
2972	mov	v16.16b,v20.16b
2973	mov	v17.16b,v21.16b
2974	mov	v18.16b,v22.16b
2975	// save the last tweak
2976	mov	v25.16b,v19.16b
29771:
2978	// process last block
2979	cmp	x2,#1
2980	b.lt	100f
2981	b.gt	1f
2982	ld1	{v4.4s},[x0],#16
2983	rbit	v16.16b,v16.16b
2984	eor	v4.16b, v4.16b, v16.16b
2985#ifndef __AARCH64EB__
2986	rev32	v4.16b,v4.16b
2987#endif
2988	mov	x10,x3
2989	mov	w11,#8
2990	mov	w12,v4.s[0]
2991	mov	w13,v4.s[1]
2992	mov	w14,v4.s[2]
2993	mov	w15,v4.s[3]
299410:
2995	ldp	w7,w8,[x10],8
2996	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2997	eor	w6,w14,w15
2998	eor	w9,w7,w13
2999	eor	w6,w6,w9
3000	mov	v3.s[0],w6
3001	// optimize sbox using AESE instruction
3002	tbl	v0.16b, {v3.16b}, v26.16b
3003	ushr	v2.16b, v0.16b, 4
3004	and	v0.16b, v0.16b, v31.16b
3005	tbl	v0.16b, {v28.16b}, v0.16b
3006	tbl	v2.16b, {v27.16b}, v2.16b
3007	eor	v0.16b, v0.16b, v2.16b
3008	eor	v1.16b, v1.16b, v1.16b
3009	aese	v0.16b,v1.16b
3010	ushr	v2.16b, v0.16b, 4
3011	and	v0.16b, v0.16b, v31.16b
3012	tbl	v0.16b, {v30.16b}, v0.16b
3013	tbl	v2.16b, {v29.16b}, v2.16b
3014	eor	v0.16b, v0.16b, v2.16b
3015
3016	mov	w7,v0.s[0]
3017	eor	w6,w7,w7,ror #32-2
3018	eor	w6,w6,w7,ror #32-10
3019	eor	w6,w6,w7,ror #32-18
3020	eor	w6,w6,w7,ror #32-24
3021	eor	w12,w12,w6
3022	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3023	eor	w6,w14,w15
3024	eor	w9,w12,w8
3025	eor	w6,w6,w9
3026	mov	v3.s[0],w6
3027	// optimize sbox using AESE instruction
3028	tbl	v0.16b, {v3.16b}, v26.16b
3029	ushr	v2.16b, v0.16b, 4
3030	and	v0.16b, v0.16b, v31.16b
3031	tbl	v0.16b, {v28.16b}, v0.16b
3032	tbl	v2.16b, {v27.16b}, v2.16b
3033	eor	v0.16b, v0.16b, v2.16b
3034	eor	v1.16b, v1.16b, v1.16b
3035	aese	v0.16b,v1.16b
3036	ushr	v2.16b, v0.16b, 4
3037	and	v0.16b, v0.16b, v31.16b
3038	tbl	v0.16b, {v30.16b}, v0.16b
3039	tbl	v2.16b, {v29.16b}, v2.16b
3040	eor	v0.16b, v0.16b, v2.16b
3041
3042	mov	w7,v0.s[0]
3043	eor	w6,w7,w7,ror #32-2
3044	eor	w6,w6,w7,ror #32-10
3045	eor	w6,w6,w7,ror #32-18
3046	eor	w6,w6,w7,ror #32-24
3047	ldp	w7,w8,[x10],8
3048	eor	w13,w13,w6
3049	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3050	eor	w6,w12,w13
3051	eor	w9,w7,w15
3052	eor	w6,w6,w9
3053	mov	v3.s[0],w6
3054	// optimize sbox using AESE instruction
3055	tbl	v0.16b, {v3.16b}, v26.16b
3056	ushr	v2.16b, v0.16b, 4
3057	and	v0.16b, v0.16b, v31.16b
3058	tbl	v0.16b, {v28.16b}, v0.16b
3059	tbl	v2.16b, {v27.16b}, v2.16b
3060	eor	v0.16b, v0.16b, v2.16b
3061	eor	v1.16b, v1.16b, v1.16b
3062	aese	v0.16b,v1.16b
3063	ushr	v2.16b, v0.16b, 4
3064	and	v0.16b, v0.16b, v31.16b
3065	tbl	v0.16b, {v30.16b}, v0.16b
3066	tbl	v2.16b, {v29.16b}, v2.16b
3067	eor	v0.16b, v0.16b, v2.16b
3068
3069	mov	w7,v0.s[0]
3070	eor	w6,w7,w7,ror #32-2
3071	eor	w6,w6,w7,ror #32-10
3072	eor	w6,w6,w7,ror #32-18
3073	eor	w6,w6,w7,ror #32-24
3074	eor	w14,w14,w6
3075	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3076	eor	w6,w12,w13
3077	eor	w9,w14,w8
3078	eor	w6,w6,w9
3079	mov	v3.s[0],w6
3080	// optimize sbox using AESE instruction
3081	tbl	v0.16b, {v3.16b}, v26.16b
3082	ushr	v2.16b, v0.16b, 4
3083	and	v0.16b, v0.16b, v31.16b
3084	tbl	v0.16b, {v28.16b}, v0.16b
3085	tbl	v2.16b, {v27.16b}, v2.16b
3086	eor	v0.16b, v0.16b, v2.16b
3087	eor	v1.16b, v1.16b, v1.16b
3088	aese	v0.16b,v1.16b
3089	ushr	v2.16b, v0.16b, 4
3090	and	v0.16b, v0.16b, v31.16b
3091	tbl	v0.16b, {v30.16b}, v0.16b
3092	tbl	v2.16b, {v29.16b}, v2.16b
3093	eor	v0.16b, v0.16b, v2.16b
3094
3095	mov	w7,v0.s[0]
3096	eor	w6,w7,w7,ror #32-2
3097	eor	w6,w6,w7,ror #32-10
3098	eor	w6,w6,w7,ror #32-18
3099	eor	w6,w6,w7,ror #32-24
3100	eor	w15,w15,w6
3101	subs	w11,w11,#1
3102	b.ne	10b
3103	mov	v4.s[0],w15
3104	mov	v4.s[1],w14
3105	mov	v4.s[2],w13
3106	mov	v4.s[3],w12
3107#ifndef __AARCH64EB__
3108	rev32	v4.16b,v4.16b
3109#endif
3110	eor	v4.16b, v4.16b, v16.16b
3111	st1	{v4.4s},[x1],#16
3112	// save the last tweak
3113	mov	v25.16b,v16.16b
3114	b	100f
31151:	//	process last 2 blocks
3116	cmp	x2,#2
3117	b.gt	1f
3118	ld1	{v4.4s,v5.4s},[x0],#32
3119	rbit	v16.16b,v16.16b
3120	rbit	v17.16b,v17.16b
3121	eor	v4.16b, v4.16b, v16.16b
3122	eor	v5.16b, v5.16b, v17.16b
3123#ifndef __AARCH64EB__
3124	rev32	v4.16b,v4.16b
3125#endif
3126#ifndef __AARCH64EB__
3127	rev32	v5.16b,v5.16b
3128#endif
3129	zip1	v0.4s,v4.4s,v5.4s
3130	zip2	v1.4s,v4.4s,v5.4s
3131	zip1	v2.4s,v6.4s,v7.4s
3132	zip2	v3.4s,v6.4s,v7.4s
3133	zip1	v4.2d,v0.2d,v2.2d
3134	zip2	v5.2d,v0.2d,v2.2d
3135	zip1	v6.2d,v1.2d,v3.2d
3136	zip2	v7.2d,v1.2d,v3.2d
3137	bl	_vpsm4_ex_enc_4blks
3138	zip1	v4.4s,v0.4s,v1.4s
3139	zip2	v5.4s,v0.4s,v1.4s
3140	zip1	v6.4s,v2.4s,v3.4s
3141	zip2	v7.4s,v2.4s,v3.4s
3142	zip1	v0.2d,v4.2d,v6.2d
3143	zip2	v1.2d,v4.2d,v6.2d
3144	zip1	v2.2d,v5.2d,v7.2d
3145	zip2	v3.2d,v5.2d,v7.2d
3146	eor	v0.16b, v0.16b, v16.16b
3147	eor	v1.16b, v1.16b, v17.16b
3148	st1	{v0.4s,v1.4s},[x1],#32
3149	// save the last tweak
3150	mov	v25.16b,v17.16b
3151	b	100f
31521:	//	process last 3 blocks
3153	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
3154	rbit	v16.16b,v16.16b
3155	rbit	v17.16b,v17.16b
3156	rbit	v18.16b,v18.16b
3157	eor	v4.16b, v4.16b, v16.16b
3158	eor	v5.16b, v5.16b, v17.16b
3159	eor	v6.16b, v6.16b, v18.16b
3160#ifndef __AARCH64EB__
3161	rev32	v4.16b,v4.16b
3162#endif
3163#ifndef __AARCH64EB__
3164	rev32	v5.16b,v5.16b
3165#endif
3166#ifndef __AARCH64EB__
3167	rev32	v6.16b,v6.16b
3168#endif
3169	zip1	v0.4s,v4.4s,v5.4s
3170	zip2	v1.4s,v4.4s,v5.4s
3171	zip1	v2.4s,v6.4s,v7.4s
3172	zip2	v3.4s,v6.4s,v7.4s
3173	zip1	v4.2d,v0.2d,v2.2d
3174	zip2	v5.2d,v0.2d,v2.2d
3175	zip1	v6.2d,v1.2d,v3.2d
3176	zip2	v7.2d,v1.2d,v3.2d
3177	bl	_vpsm4_ex_enc_4blks
3178	zip1	v4.4s,v0.4s,v1.4s
3179	zip2	v5.4s,v0.4s,v1.4s
3180	zip1	v6.4s,v2.4s,v3.4s
3181	zip2	v7.4s,v2.4s,v3.4s
3182	zip1	v0.2d,v4.2d,v6.2d
3183	zip2	v1.2d,v4.2d,v6.2d
3184	zip1	v2.2d,v5.2d,v7.2d
3185	zip2	v3.2d,v5.2d,v7.2d
3186	eor	v0.16b, v0.16b, v16.16b
3187	eor	v1.16b, v1.16b, v17.16b
3188	eor	v2.16b, v2.16b, v18.16b
3189	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
3190	// save the last tweak
3191	mov	v25.16b,v18.16b
3192100:
3193	cmp	x29,0
3194	b.eq	.return_gb
3195
3196// This branch calculates the last two tweaks
3197// when the encryption/decryption length is larger than 32
3198.last_2blks_tweak_gb:
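	// vector mul-by-x: shl #1 shifts every byte left, ext/ushr #7
	// extracts each byte's carry bit, and the mul by .Lxts_magic
	// ({0x87,1,...,1}) feeds each carry into the next byte while
	// folding the carry out of the top byte back in as 0x87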
3199#ifdef __AARCH64EB__
3200	rev32	v25.16b,v25.16b
3201#endif
3202	rbit	v2.16b,v25.16b
3203	adrp	x9, .Lxts_magic
3204	ldr	q0, [x9, #:lo12:.Lxts_magic]
3205	shl	v17.16b, v2.16b, #1
3206	ext	v1.16b, v2.16b, v2.16b,#15
3207	ushr	v1.16b, v1.16b, #7
3208	mul	v1.16b, v1.16b, v0.16b
3209	eor	v17.16b, v17.16b, v1.16b
3210	rbit	v17.16b,v17.16b
3211	rbit	v2.16b,v17.16b
3212	adrp	x9, .Lxts_magic
3213	ldr	q0, [x9, #:lo12:.Lxts_magic]
3214	shl	v18.16b, v2.16b, #1
3215	ext	v1.16b, v2.16b, v2.16b,#15
3216	ushr	v1.16b, v1.16b, #7
3217	mul	v1.16b, v1.16b, v0.16b
3218	eor	v18.16b, v18.16b, v1.16b
3219	rbit	v18.16b,v18.16b
3220	b	.check_dec_gb
3221
3222
3223// This branch calculates the last two tweaks
3224// when the encryption/decryption length is exactly 32, in which case only two tweaks are needed
3225.only_2blks_tweak_gb:
3226	mov	v17.16b,v16.16b
3227#ifdef __AARCH64EB__
3228	rev32	v17.16b,v17.16b
3229#endif
3230	rbit	v2.16b,v17.16b
3231	adrp	x9, .Lxts_magic
3232	ldr	q0, [x9, #:lo12:.Lxts_magic]
3233	shl	v18.16b, v2.16b, #1
3234	ext	v1.16b, v2.16b, v2.16b,#15
3235	ushr	v1.16b, v1.16b, #7
3236	mul	v1.16b, v1.16b, v0.16b
3237	eor	v18.16b, v18.16b, v1.16b
3238	rbit	v18.16b,v18.16b
3239	b	.check_dec_gb
3240
3241
3242// Determine whether encryption or decryption is required.
3243// The last two tweaks need to be swapped for decryption.
3244.check_dec_gb:
3245	// encryption:1 decryption:0
3246	cmp	w28,1
3247	b.eq	.process_last_2blks_gb
3248	mov	v0.16b,v17.16b
3249	mov	v17.16b,v18.16b
3250	mov	v18.16b,v0.16b
3251
3252.process_last_2blks_gb:
3253#ifdef __AARCH64EB__
3254	rev32	v17.16b,v17.16b
3255#endif
3256#ifdef __AARCH64EB__
3257	rev32	v18.16b,v18.16b
3258#endif
3259	ld1	{v4.4s},[x0],#16
3260	eor	v4.16b, v4.16b, v17.16b
3261#ifndef __AARCH64EB__
3262	rev32	v4.16b,v4.16b
3263#endif
3264	mov	x10,x3
3265	mov	w11,#8
3266	mov	w12,v4.s[0]
3267	mov	w13,v4.s[1]
3268	mov	w14,v4.s[2]
3269	mov	w15,v4.s[3]
327010:
3271	ldp	w7,w8,[x10],8
3272	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3273	eor	w6,w14,w15
3274	eor	w9,w7,w13
3275	eor	w6,w6,w9
3276	mov	v3.s[0],w6
3277	// optimize sbox using AESE instruction
3278	tbl	v0.16b, {v3.16b}, v26.16b
3279	ushr	v2.16b, v0.16b, 4
3280	and	v0.16b, v0.16b, v31.16b
3281	tbl	v0.16b, {v28.16b}, v0.16b
3282	tbl	v2.16b, {v27.16b}, v2.16b
3283	eor	v0.16b, v0.16b, v2.16b
3284	eor	v1.16b, v1.16b, v1.16b
3285	aese	v0.16b,v1.16b
3286	ushr	v2.16b, v0.16b, 4
3287	and	v0.16b, v0.16b, v31.16b
3288	tbl	v0.16b, {v30.16b}, v0.16b
3289	tbl	v2.16b, {v29.16b}, v2.16b
3290	eor	v0.16b, v0.16b, v2.16b
3291
3292	mov	w7,v0.s[0]
3293	eor	w6,w7,w7,ror #32-2
3294	eor	w6,w6,w7,ror #32-10
3295	eor	w6,w6,w7,ror #32-18
3296	eor	w6,w6,w7,ror #32-24
3297	eor	w12,w12,w6
3298	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3299	eor	w6,w14,w15
3300	eor	w9,w12,w8
3301	eor	w6,w6,w9
3302	mov	v3.s[0],w6
3303	// optimize sbox using AESE instruction
3304	tbl	v0.16b, {v3.16b}, v26.16b
3305	ushr	v2.16b, v0.16b, 4
3306	and	v0.16b, v0.16b, v31.16b
3307	tbl	v0.16b, {v28.16b}, v0.16b
3308	tbl	v2.16b, {v27.16b}, v2.16b
3309	eor	v0.16b, v0.16b, v2.16b
3310	eor	v1.16b, v1.16b, v1.16b
3311	aese	v0.16b,v1.16b
3312	ushr	v2.16b, v0.16b, 4
3313	and	v0.16b, v0.16b, v31.16b
3314	tbl	v0.16b, {v30.16b}, v0.16b
3315	tbl	v2.16b, {v29.16b}, v2.16b
3316	eor	v0.16b, v0.16b, v2.16b
3317
3318	mov	w7,v0.s[0]
3319	eor	w6,w7,w7,ror #32-2
3320	eor	w6,w6,w7,ror #32-10
3321	eor	w6,w6,w7,ror #32-18
3322	eor	w6,w6,w7,ror #32-24
3323	ldp	w7,w8,[x10],8
3324	eor	w13,w13,w6
3325	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3326	eor	w6,w12,w13
3327	eor	w9,w7,w15
3328	eor	w6,w6,w9
3329	mov	v3.s[0],w6
3330	// optimize sbox using AESE instruction
3331	tbl	v0.16b, {v3.16b}, v26.16b
3332	ushr	v2.16b, v0.16b, 4
3333	and	v0.16b, v0.16b, v31.16b
3334	tbl	v0.16b, {v28.16b}, v0.16b
3335	tbl	v2.16b, {v27.16b}, v2.16b
3336	eor	v0.16b, v0.16b, v2.16b
3337	eor	v1.16b, v1.16b, v1.16b
3338	aese	v0.16b,v1.16b
3339	ushr	v2.16b, v0.16b, 4
3340	and	v0.16b, v0.16b, v31.16b
3341	tbl	v0.16b, {v30.16b}, v0.16b
3342	tbl	v2.16b, {v29.16b}, v2.16b
3343	eor	v0.16b, v0.16b, v2.16b
3344
3345	mov	w7,v0.s[0]
3346	eor	w6,w7,w7,ror #32-2
3347	eor	w6,w6,w7,ror #32-10
3348	eor	w6,w6,w7,ror #32-18
3349	eor	w6,w6,w7,ror #32-24
3350	eor	w14,w14,w6
3351	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3352	eor	w6,w12,w13
3353	eor	w9,w14,w8
3354	eor	w6,w6,w9
3355	mov	v3.s[0],w6
3356	// optimize sbox using AESE instruction
3357	tbl	v0.16b, {v3.16b}, v26.16b
3358	ushr	v2.16b, v0.16b, 4
3359	and	v0.16b, v0.16b, v31.16b
3360	tbl	v0.16b, {v28.16b}, v0.16b
3361	tbl	v2.16b, {v27.16b}, v2.16b
3362	eor	v0.16b, v0.16b, v2.16b
3363	eor	v1.16b, v1.16b, v1.16b
3364	aese	v0.16b,v1.16b
3365	ushr	v2.16b, v0.16b, 4
3366	and	v0.16b, v0.16b, v31.16b
3367	tbl	v0.16b, {v30.16b}, v0.16b
3368	tbl	v2.16b, {v29.16b}, v2.16b
3369	eor	v0.16b, v0.16b, v2.16b
3370
3371	mov	w7,v0.s[0]
3372	eor	w6,w7,w7,ror #32-2
3373	eor	w6,w6,w7,ror #32-10
3374	eor	w6,w6,w7,ror #32-18
3375	eor	w6,w6,w7,ror #32-24
3376	eor	w15,w15,w6
3377	subs	w11,w11,#1
3378	b.ne	10b
3379	mov	v4.s[0],w15
3380	mov	v4.s[1],w14
3381	mov	v4.s[2],w13
3382	mov	v4.s[3],w12
3383#ifndef __AARCH64EB__
3384	rev32	v4.16b,v4.16b
3385#endif
3386	eor	v4.16b, v4.16b, v17.16b
3387	st1	{v4.4s},[x1],#16
3388
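	// ciphertext stealing: swap the tail of the last full
	// ciphertext block with the remaining partial input bytes,
	// then re-encrypt that block with the final tweak (v18)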
3389	sub	x26,x1,16
3390.loop_gb:
3391	subs	x29,x29,1
3392	ldrb	w7,[x26,x29]
3393	ldrb	w8,[x0,x29]
3394	strb	w8,[x26,x29]
3395	strb	w7,[x1,x29]
3396	b.gt	.loop_gb
3397	ld1	{v4.4s}, [x26]
3398	eor	v4.16b, v4.16b, v18.16b
3399#ifndef __AARCH64EB__
3400	rev32	v4.16b,v4.16b
3401#endif
3402	mov	x10,x3
3403	mov	w11,#8
3404	mov	w12,v4.s[0]
3405	mov	w13,v4.s[1]
3406	mov	w14,v4.s[2]
3407	mov	w15,v4.s[3]
340810:
3409	ldp	w7,w8,[x10],8
3410	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3411	eor	w6,w14,w15
3412	eor	w9,w7,w13
3413	eor	w6,w6,w9
3414	mov	v3.s[0],w6
3415	// optimize sbox using AESE instruction
3416	tbl	v0.16b, {v3.16b}, v26.16b
3417	ushr	v2.16b, v0.16b, 4
3418	and	v0.16b, v0.16b, v31.16b
3419	tbl	v0.16b, {v28.16b}, v0.16b
3420	tbl	v2.16b, {v27.16b}, v2.16b
3421	eor	v0.16b, v0.16b, v2.16b
3422	eor	v1.16b, v1.16b, v1.16b
3423	aese	v0.16b,v1.16b
3424	ushr	v2.16b, v0.16b, 4
3425	and	v0.16b, v0.16b, v31.16b
3426	tbl	v0.16b, {v30.16b}, v0.16b
3427	tbl	v2.16b, {v29.16b}, v2.16b
3428	eor	v0.16b, v0.16b, v2.16b
3429
3430	mov	w7,v0.s[0]
3431	eor	w6,w7,w7,ror #32-2
3432	eor	w6,w6,w7,ror #32-10
3433	eor	w6,w6,w7,ror #32-18
3434	eor	w6,w6,w7,ror #32-24
3435	eor	w12,w12,w6
3436	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3437	eor	w6,w14,w15
3438	eor	w9,w12,w8
3439	eor	w6,w6,w9
3440	mov	v3.s[0],w6
3441	// optimize sbox using AESE instruction
3442	tbl	v0.16b, {v3.16b}, v26.16b
3443	ushr	v2.16b, v0.16b, 4
3444	and	v0.16b, v0.16b, v31.16b
3445	tbl	v0.16b, {v28.16b}, v0.16b
3446	tbl	v2.16b, {v27.16b}, v2.16b
3447	eor	v0.16b, v0.16b, v2.16b
3448	eor	v1.16b, v1.16b, v1.16b
3449	aese	v0.16b,v1.16b
3450	ushr	v2.16b, v0.16b, 4
3451	and	v0.16b, v0.16b, v31.16b
3452	tbl	v0.16b, {v30.16b}, v0.16b
3453	tbl	v2.16b, {v29.16b}, v2.16b
3454	eor	v0.16b, v0.16b, v2.16b
3455
3456	mov	w7,v0.s[0]
3457	eor	w6,w7,w7,ror #32-2
3458	eor	w6,w6,w7,ror #32-10
3459	eor	w6,w6,w7,ror #32-18
3460	eor	w6,w6,w7,ror #32-24
3461	ldp	w7,w8,[x10],8
3462	eor	w13,w13,w6
3463	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3464	eor	w6,w12,w13
3465	eor	w9,w7,w15
3466	eor	w6,w6,w9
3467	mov	v3.s[0],w6
3468	// optimize sbox using AESE instruction
3469	tbl	v0.16b, {v3.16b}, v26.16b
3470	ushr	v2.16b, v0.16b, 4
3471	and	v0.16b, v0.16b, v31.16b
3472	tbl	v0.16b, {v28.16b}, v0.16b
3473	tbl	v2.16b, {v27.16b}, v2.16b
3474	eor	v0.16b, v0.16b, v2.16b
3475	eor	v1.16b, v1.16b, v1.16b
3476	aese	v0.16b,v1.16b
3477	ushr	v2.16b, v0.16b, 4
3478	and	v0.16b, v0.16b, v31.16b
3479	tbl	v0.16b, {v30.16b}, v0.16b
3480	tbl	v2.16b, {v29.16b}, v2.16b
3481	eor	v0.16b, v0.16b, v2.16b
3482
3483	mov	w7,v0.s[0]
3484	eor	w6,w7,w7,ror #32-2
3485	eor	w6,w6,w7,ror #32-10
3486	eor	w6,w6,w7,ror #32-18
3487	eor	w6,w6,w7,ror #32-24
3488	eor	w14,w14,w6
3489	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3490	eor	w6,w12,w13
3491	eor	w9,w14,w8
3492	eor	w6,w6,w9
3493	mov	v3.s[0],w6
3494	// optimize sbox using AESE instruction
3495	tbl	v0.16b, {v3.16b}, v26.16b
3496	ushr	v2.16b, v0.16b, 4
3497	and	v0.16b, v0.16b, v31.16b
3498	tbl	v0.16b, {v28.16b}, v0.16b
3499	tbl	v2.16b, {v27.16b}, v2.16b
3500	eor	v0.16b, v0.16b, v2.16b
3501	eor	v1.16b, v1.16b, v1.16b
3502	aese	v0.16b,v1.16b
3503	ushr	v2.16b, v0.16b, 4
3504	and	v0.16b, v0.16b, v31.16b
3505	tbl	v0.16b, {v30.16b}, v0.16b
3506	tbl	v2.16b, {v29.16b}, v2.16b
3507	eor	v0.16b, v0.16b, v2.16b
3508
3509	mov	w7,v0.s[0]
3510	eor	w6,w7,w7,ror #32-2
3511	eor	w6,w6,w7,ror #32-10
3512	eor	w6,w6,w7,ror #32-18
3513	eor	w6,w6,w7,ror #32-24
3514	eor	w15,w15,w6
3515	subs	w11,w11,#1
3516	b.ne	10b
3517	mov	v4.s[0],w15
3518	mov	v4.s[1],w14
3519	mov	v4.s[2],w13
3520	mov	v4.s[3],w12
3521#ifndef __AARCH64EB__
3522	rev32	v4.16b,v4.16b
3523#endif
3524	eor	v4.16b, v4.16b, v18.16b
3525	st1	{v4.4s}, [x26]
3526.return_gb:
3527	ldp	d14, d15, [sp], #0x10
3528	ldp	d12, d13, [sp], #0x10
3529	ldp	d10, d11, [sp], #0x10
3530	ldp	d8, d9, [sp], #0x10
3531	ldp	x29, x30, [sp], #0x10
3532	ldp	x27, x28, [sp], #0x10
3533	ldp	x25, x26, [sp], #0x10
3534	ldp	x23, x24, [sp], #0x10
3535	ldp	x21, x22, [sp], #0x10
3536	ldp	x19, x20, [sp], #0x10
3537	ldp	x17, x18, [sp], #0x10
3538	ldp	x15, x16, [sp], #0x10
3539	AARCH64_VALIDATE_LINK_REGISTER
3540	ret
3541.size	vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb
3542.globl	vpsm4_ex_xts_encrypt
3543.type	vpsm4_ex_xts_encrypt,%function
3544.align	5
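// XTS entry (standard IEEE P1619 tweak update); the arguments match
// vpsm4_ex_xts_encrypt_gb above, but the tweak chain runs on the
// natural bit order, so no rbit is needed.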
3545vpsm4_ex_xts_encrypt:
3546	AARCH64_SIGN_LINK_REGISTER
3547	stp	x15, x16, [sp, #-0x10]!
3548	stp	x17, x18, [sp, #-0x10]!
3549	stp	x19, x20, [sp, #-0x10]!
3550	stp	x21, x22, [sp, #-0x10]!
3551	stp	x23, x24, [sp, #-0x10]!
3552	stp	x25, x26, [sp, #-0x10]!
3553	stp	x27, x28, [sp, #-0x10]!
3554	stp	x29, x30, [sp, #-0x10]!
3555	stp	d8, d9, [sp, #-0x10]!
3556	stp	d10, d11, [sp, #-0x10]!
3557	stp	d12, d13, [sp, #-0x10]!
3558	stp	d14, d15, [sp, #-0x10]!
3559	mov	x26,x3
3560	mov	x27,x4
3561	mov	w28,w6
3562	ld1	{v16.4s}, [x5]
3563	mov	x3,x27
3564	adrp	x9, .Lsbox_magic
3565	ldr	q26, [x9, #:lo12:.Lsbox_magic]
3566	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
3567	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
3568	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
3569	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
3570	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
3571#ifndef __AARCH64EB__
3572	rev32	v16.16b,v16.16b
3573#endif
3574	mov	x10,x3
3575	mov	w11,#8
3576	mov	w12,v16.s[0]
3577	mov	w13,v16.s[1]
3578	mov	w14,v16.s[2]
3579	mov	w15,v16.s[3]
358010:
3581	ldp	w7,w8,[x10],8
3582	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3583	eor	w6,w14,w15
3584	eor	w9,w7,w13
3585	eor	w6,w6,w9
3586	mov	v3.s[0],w6
3587	// optimize sbox using AESE instruction
3588	tbl	v0.16b, {v3.16b}, v26.16b
3589	ushr	v2.16b, v0.16b, 4
3590	and	v0.16b, v0.16b, v31.16b
3591	tbl	v0.16b, {v28.16b}, v0.16b
3592	tbl	v2.16b, {v27.16b}, v2.16b
3593	eor	v0.16b, v0.16b, v2.16b
3594	eor	v1.16b, v1.16b, v1.16b
3595	aese	v0.16b,v1.16b
3596	ushr	v2.16b, v0.16b, 4
3597	and	v0.16b, v0.16b, v31.16b
3598	tbl	v0.16b, {v30.16b}, v0.16b
3599	tbl	v2.16b, {v29.16b}, v2.16b
3600	eor	v0.16b, v0.16b, v2.16b
3601
3602	mov	w7,v0.s[0]
3603	eor	w6,w7,w7,ror #32-2
3604	eor	w6,w6,w7,ror #32-10
3605	eor	w6,w6,w7,ror #32-18
3606	eor	w6,w6,w7,ror #32-24
3607	eor	w12,w12,w6
3608	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3609	eor	w6,w14,w15
3610	eor	w9,w12,w8
3611	eor	w6,w6,w9
3612	mov	v3.s[0],w6
3613	// optimize sbox using AESE instruction
3614	tbl	v0.16b, {v3.16b}, v26.16b
3615	ushr	v2.16b, v0.16b, 4
3616	and	v0.16b, v0.16b, v31.16b
3617	tbl	v0.16b, {v28.16b}, v0.16b
3618	tbl	v2.16b, {v27.16b}, v2.16b
3619	eor	v0.16b, v0.16b, v2.16b
3620	eor	v1.16b, v1.16b, v1.16b
3621	aese	v0.16b,v1.16b
3622	ushr	v2.16b, v0.16b, 4
3623	and	v0.16b, v0.16b, v31.16b
3624	tbl	v0.16b, {v30.16b}, v0.16b
3625	tbl	v2.16b, {v29.16b}, v2.16b
3626	eor	v0.16b, v0.16b, v2.16b
3627
3628	mov	w7,v0.s[0]
3629	eor	w6,w7,w7,ror #32-2
3630	eor	w6,w6,w7,ror #32-10
3631	eor	w6,w6,w7,ror #32-18
3632	eor	w6,w6,w7,ror #32-24
3633	ldp	w7,w8,[x10],8
3634	eor	w13,w13,w6
3635	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3636	eor	w6,w12,w13
3637	eor	w9,w7,w15
3638	eor	w6,w6,w9
3639	mov	v3.s[0],w6
3640	// optimize sbox using AESE instruction
3641	tbl	v0.16b, {v3.16b}, v26.16b
3642	ushr	v2.16b, v0.16b, 4
3643	and	v0.16b, v0.16b, v31.16b
3644	tbl	v0.16b, {v28.16b}, v0.16b
3645	tbl	v2.16b, {v27.16b}, v2.16b
3646	eor	v0.16b, v0.16b, v2.16b
3647	eor	v1.16b, v1.16b, v1.16b
3648	aese	v0.16b,v1.16b
3649	ushr	v2.16b, v0.16b, 4
3650	and	v0.16b, v0.16b, v31.16b
3651	tbl	v0.16b, {v30.16b}, v0.16b
3652	tbl	v2.16b, {v29.16b}, v2.16b
3653	eor	v0.16b, v0.16b, v2.16b
3654
3655	mov	w7,v0.s[0]
3656	eor	w6,w7,w7,ror #32-2
3657	eor	w6,w6,w7,ror #32-10
3658	eor	w6,w6,w7,ror #32-18
3659	eor	w6,w6,w7,ror #32-24
3660	eor	w14,w14,w6
3661	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3662	eor	w6,w12,w13
3663	eor	w9,w14,w8
3664	eor	w6,w6,w9
3665	mov	v3.s[0],w6
3666	// optimize sbox using AESE instruction
3667	tbl	v0.16b, {v3.16b}, v26.16b
3668	ushr	v2.16b, v0.16b, 4
3669	and	v0.16b, v0.16b, v31.16b
3670	tbl	v0.16b, {v28.16b}, v0.16b
3671	tbl	v2.16b, {v27.16b}, v2.16b
3672	eor	v0.16b, v0.16b, v2.16b
3673	eor	v1.16b, v1.16b, v1.16b
3674	aese	v0.16b,v1.16b
3675	ushr	v2.16b, v0.16b, 4
3676	and	v0.16b, v0.16b, v31.16b
3677	tbl	v0.16b, {v30.16b}, v0.16b
3678	tbl	v2.16b, {v29.16b}, v2.16b
3679	eor	v0.16b, v0.16b, v2.16b
3680
3681	mov	w7,v0.s[0]
3682	eor	w6,w7,w7,ror #32-2
3683	eor	w6,w6,w7,ror #32-10
3684	eor	w6,w6,w7,ror #32-18
3685	eor	w6,w6,w7,ror #32-24
3686	eor	w15,w15,w6
3687	subs	w11,w11,#1
3688	b.ne	10b
3689	mov	v16.s[0],w15
3690	mov	v16.s[1],w14
3691	mov	v16.s[2],w13
3692	mov	v16.s[3],w12
3693#ifndef __AARCH64EB__
3694	rev32	v16.16b,v16.16b
3695#endif
3696	mov	x3,x26
3697	and	x29,x2,#0x0F
3698	// convert length into blocks
3699	lsr	x2,x2,4
3700	cmp	x2,#1
3701	b.lt	.return
3702
3703	cmp	x29,0
3704	// If the encryption/decryption length is a multiple of 16,
3705	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
3706	b.eq	.xts_encrypt_blocks
3707
3708	// If the encryption/decryption length is not a multiple of 16,
3709	// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak,
3710	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks
3711	subs	x2,x2,#1
3712	b.eq	.only_2blks_tweak
3713.xts_encrypt_blocks:
3714#ifdef __AARCH64EB__
3715	rev32	v16.16b,v16.16b
3716#endif
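	// derive the next seven tweaks as in the GB path, but on the
	// natural (non-bit-reversed) representation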
3717	mov	x12,v16.d[0]
3718	mov	x13,v16.d[1]
3719	mov	w7,0x87
3720	extr	x9,x13,x13,#32
3721	extr	x15,x13,x12,#63
3722	and	w8,w7,w9,asr#31
3723	eor	x14,x8,x12,lsl#1
3724	mov	w7,0x87
3725	extr	x9,x15,x15,#32
3726	extr	x17,x15,x14,#63
3727	and	w8,w7,w9,asr#31
3728	eor	x16,x8,x14,lsl#1
3729	mov	w7,0x87
3730	extr	x9,x17,x17,#32
3731	extr	x19,x17,x16,#63
3732	and	w8,w7,w9,asr#31
3733	eor	x18,x8,x16,lsl#1
3734	mov	w7,0x87
3735	extr	x9,x19,x19,#32
3736	extr	x21,x19,x18,#63
3737	and	w8,w7,w9,asr#31
3738	eor	x20,x8,x18,lsl#1
3739	mov	w7,0x87
3740	extr	x9,x21,x21,#32
3741	extr	x23,x21,x20,#63
3742	and	w8,w7,w9,asr#31
3743	eor	x22,x8,x20,lsl#1
3744	mov	w7,0x87
3745	extr	x9,x23,x23,#32
3746	extr	x25,x23,x22,#63
3747	and	w8,w7,w9,asr#31
3748	eor	x24,x8,x22,lsl#1
3749	mov	w7,0x87
3750	extr	x9,x25,x25,#32
3751	extr	x27,x25,x24,#63
3752	and	w8,w7,w9,asr#31
3753	eor	x26,x8,x24,lsl#1
3754.Lxts_8_blocks_process:
3755	cmp	x2,#8
3756	mov	v16.d[0],x12
3757	mov	v16.d[1],x13
3758#ifdef __AARCH64EB__
3759	rev32	v16.16b,v16.16b
3760#endif
3761	mov	w7,0x87
3762	extr	x9,x27,x27,#32
3763	extr	x13,x27,x26,#63
3764	and	w8,w7,w9,asr#31
3765	eor	x12,x8,x26,lsl#1
3766	mov	v17.d[0],x14
3767	mov	v17.d[1],x15
3768#ifdef __AARCH64EB__
3769	rev32	v17.16b,v17.16b
3770#endif
3771	mov	w7,0x87
3772	extr	x9,x13,x13,#32
3773	extr	x15,x13,x12,#63
3774	and	w8,w7,w9,asr#31
3775	eor	x14,x8,x12,lsl#1
3776	mov	v18.d[0],x16
3777	mov	v18.d[1],x17
3778#ifdef __AARCH64EB__
3779	rev32	v18.16b,v18.16b
3780#endif
3781	mov	w7,0x87
3782	extr	x9,x15,x15,#32
3783	extr	x17,x15,x14,#63
3784	and	w8,w7,w9,asr#31
3785	eor	x16,x8,x14,lsl#1
3786	mov	v19.d[0],x18
3787	mov	v19.d[1],x19
3788#ifdef __AARCH64EB__
3789	rev32	v19.16b,v19.16b
3790#endif
3791	mov	w7,0x87
3792	extr	x9,x17,x17,#32
3793	extr	x19,x17,x16,#63
3794	and	w8,w7,w9,asr#31
3795	eor	x18,x8,x16,lsl#1
3796	mov	v20.d[0],x20
3797	mov	v20.d[1],x21
3798#ifdef __AARCH64EB__
3799	rev32	v20.16b,v20.16b
3800#endif
3801	mov	w7,0x87
3802	extr	x9,x19,x19,#32
3803	extr	x21,x19,x18,#63
3804	and	w8,w7,w9,asr#31
3805	eor	x20,x8,x18,lsl#1
3806	mov	v21.d[0],x22
3807	mov	v21.d[1],x23
3808#ifdef __AARCH64EB__
3809	rev32	v21.16b,v21.16b
3810#endif
3811	mov	w7,0x87
3812	extr	x9,x21,x21,#32
3813	extr	x23,x21,x20,#63
3814	and	w8,w7,w9,asr#31
3815	eor	x22,x8,x20,lsl#1
3816	mov	v22.d[0],x24
3817	mov	v22.d[1],x25
3818#ifdef __AARCH64EB__
3819	rev32	v22.16b,v22.16b
3820#endif
3821	mov	w7,0x87
3822	extr	x9,x23,x23,#32
3823	extr	x25,x23,x22,#63
3824	and	w8,w7,w9,asr#31
3825	eor	x24,x8,x22,lsl#1
3826	mov	v23.d[0],x26
3827	mov	v23.d[1],x27
3828#ifdef __AARCH64EB__
3829	rev32	v23.16b,v23.16b
3830#endif
3831	mov	w7,0x87
3832	extr	x9,x25,x25,#32
3833	extr	x27,x25,x24,#63
3834	and	w8,w7,w9,asr#31
3835	eor	x26,x8,x24,lsl#1
3836	b.lt	.Lxts_4_blocks_process
3837	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3838	eor	v4.16b, v4.16b, v16.16b
3839	eor	v5.16b, v5.16b, v17.16b
3840	eor	v6.16b, v6.16b, v18.16b
3841	eor	v7.16b, v7.16b, v19.16b
3842	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
3843	eor	v8.16b, v8.16b, v20.16b
3844	eor	v9.16b, v9.16b, v21.16b
3845	eor	v10.16b, v10.16b, v22.16b
3846	eor	v11.16b, v11.16b, v23.16b
3847#ifndef __AARCH64EB__
3848	rev32	v4.16b,v4.16b
3849#endif
3850#ifndef __AARCH64EB__
3851	rev32	v5.16b,v5.16b
3852#endif
3853#ifndef __AARCH64EB__
3854	rev32	v6.16b,v6.16b
3855#endif
3856#ifndef __AARCH64EB__
3857	rev32	v7.16b,v7.16b
3858#endif
3859#ifndef __AARCH64EB__
3860	rev32	v8.16b,v8.16b
3861#endif
3862#ifndef __AARCH64EB__
3863	rev32	v9.16b,v9.16b
3864#endif
3865#ifndef __AARCH64EB__
3866	rev32	v10.16b,v10.16b
3867#endif
3868#ifndef __AARCH64EB__
3869	rev32	v11.16b,v11.16b
3870#endif
3871	zip1	v0.4s,v4.4s,v5.4s
3872	zip2	v1.4s,v4.4s,v5.4s
3873	zip1	v2.4s,v6.4s,v7.4s
3874	zip2	v3.4s,v6.4s,v7.4s
3875	zip1	v4.2d,v0.2d,v2.2d
3876	zip2	v5.2d,v0.2d,v2.2d
3877	zip1	v6.2d,v1.2d,v3.2d
3878	zip2	v7.2d,v1.2d,v3.2d
3879	zip1	v0.4s,v8.4s,v9.4s
3880	zip2	v1.4s,v8.4s,v9.4s
3881	zip1	v2.4s,v10.4s,v11.4s
3882	zip2	v3.4s,v10.4s,v11.4s
3883	zip1	v8.2d,v0.2d,v2.2d
3884	zip2	v9.2d,v0.2d,v2.2d
3885	zip1	v10.2d,v1.2d,v3.2d
3886	zip2	v11.2d,v1.2d,v3.2d
3887	bl	_vpsm4_ex_enc_8blks
3888	zip1	v8.4s,v0.4s,v1.4s
3889	zip2	v9.4s,v0.4s,v1.4s
3890	zip1	v10.4s,v2.4s,v3.4s
3891	zip2	v11.4s,v2.4s,v3.4s
3892	zip1	v0.2d,v8.2d,v10.2d
3893	zip2	v1.2d,v8.2d,v10.2d
3894	zip1	v2.2d,v9.2d,v11.2d
3895	zip2	v3.2d,v9.2d,v11.2d
3896	zip1	v8.4s,v4.4s,v5.4s
3897	zip2	v9.4s,v4.4s,v5.4s
3898	zip1	v10.4s,v6.4s,v7.4s
3899	zip2	v11.4s,v6.4s,v7.4s
3900	zip1	v4.2d,v8.2d,v10.2d
3901	zip2	v5.2d,v8.2d,v10.2d
3902	zip1	v6.2d,v9.2d,v11.2d
3903	zip2	v7.2d,v9.2d,v11.2d
3904	eor	v0.16b, v0.16b, v16.16b
3905	eor	v1.16b, v1.16b, v17.16b
3906	eor	v2.16b, v2.16b, v18.16b
3907	eor	v3.16b, v3.16b, v19.16b
3908	eor	v4.16b, v4.16b, v20.16b
3909	eor	v5.16b, v5.16b, v21.16b
3910	eor	v6.16b, v6.16b, v22.16b
3911	eor	v7.16b, v7.16b, v23.16b
3912
3913	// save the last tweak
3914	mov	v25.16b,v23.16b
3915	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3916	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
3917	subs	x2,x2,#8
3918	b.gt	.Lxts_8_blocks_process
3919	b	100f
3920.Lxts_4_blocks_process:
3921	cmp	x2,#4
3922	b.lt	1f
3923	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3924	eor	v4.16b, v4.16b, v16.16b
3925	eor	v5.16b, v5.16b, v17.16b
3926	eor	v6.16b, v6.16b, v18.16b
3927	eor	v7.16b, v7.16b, v19.16b
3928#ifndef __AARCH64EB__
3929	rev32	v4.16b,v4.16b
3930#endif
3931#ifndef __AARCH64EB__
3932	rev32	v5.16b,v5.16b
3933#endif
3934#ifndef __AARCH64EB__
3935	rev32	v6.16b,v6.16b
3936#endif
3937#ifndef __AARCH64EB__
3938	rev32	v7.16b,v7.16b
3939#endif
3940	zip1	v0.4s,v4.4s,v5.4s
3941	zip2	v1.4s,v4.4s,v5.4s
3942	zip1	v2.4s,v6.4s,v7.4s
3943	zip2	v3.4s,v6.4s,v7.4s
3944	zip1	v4.2d,v0.2d,v2.2d
3945	zip2	v5.2d,v0.2d,v2.2d
3946	zip1	v6.2d,v1.2d,v3.2d
3947	zip2	v7.2d,v1.2d,v3.2d
3948	bl	_vpsm4_ex_enc_4blks
3949	zip1	v4.4s,v0.4s,v1.4s
3950	zip2	v5.4s,v0.4s,v1.4s
3951	zip1	v6.4s,v2.4s,v3.4s
3952	zip2	v7.4s,v2.4s,v3.4s
3953	zip1	v0.2d,v4.2d,v6.2d
3954	zip2	v1.2d,v4.2d,v6.2d
3955	zip1	v2.2d,v5.2d,v7.2d
3956	zip2	v3.2d,v5.2d,v7.2d
3957	eor	v0.16b, v0.16b, v16.16b
3958	eor	v1.16b, v1.16b, v17.16b
3959	eor	v2.16b, v2.16b, v18.16b
3960	eor	v3.16b, v3.16b, v19.16b
3961	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3962	sub	x2,x2,#4
3963	mov	v16.16b,v20.16b
3964	mov	v17.16b,v21.16b
3965	mov	v18.16b,v22.16b
3966	// save the last tweak
3967	mov	v25.16b,v19.16b
39681:
3969	// process last block
3970	cmp	x2,#1
3971	b.lt	100f
3972	b.gt	1f
3973	ld1	{v4.4s},[x0],#16
3974	eor	v4.16b, v4.16b, v16.16b
3975#ifndef __AARCH64EB__
3976	rev32	v4.16b,v4.16b
3977#endif
3978	mov	x10,x3
3979	mov	w11,#8
3980	mov	w12,v4.s[0]
3981	mov	w13,v4.s[1]
3982	mov	w14,v4.s[2]
3983	mov	w15,v4.s[3]
398410:
3985	ldp	w7,w8,[x10],8
3986	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3987	eor	w6,w14,w15
3988	eor	w9,w7,w13
3989	eor	w6,w6,w9
3990	mov	v3.s[0],w6
3991	// optimize sbox using AESE instruction
3992	tbl	v0.16b, {v3.16b}, v26.16b
3993	ushr	v2.16b, v0.16b, 4
3994	and	v0.16b, v0.16b, v31.16b
3995	tbl	v0.16b, {v28.16b}, v0.16b
3996	tbl	v2.16b, {v27.16b}, v2.16b
3997	eor	v0.16b, v0.16b, v2.16b
3998	eor	v1.16b, v1.16b, v1.16b
3999	aese	v0.16b,v1.16b
4000	ushr	v2.16b, v0.16b, 4
4001	and	v0.16b, v0.16b, v31.16b
4002	tbl	v0.16b, {v30.16b}, v0.16b
4003	tbl	v2.16b, {v29.16b}, v2.16b
4004	eor	v0.16b, v0.16b, v2.16b
4005
4006	mov	w7,v0.s[0]
4007	eor	w6,w7,w7,ror #32-2
4008	eor	w6,w6,w7,ror #32-10
4009	eor	w6,w6,w7,ror #32-18
4010	eor	w6,w6,w7,ror #32-24
4011	eor	w12,w12,w6
4012	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
4013	eor	w6,w14,w15
4014	eor	w9,w12,w8
4015	eor	w6,w6,w9
4016	mov	v3.s[0],w6
4017	// optimize sbox using AESE instruction
4018	tbl	v0.16b, {v3.16b}, v26.16b
4019	ushr	v2.16b, v0.16b, 4
4020	and	v0.16b, v0.16b, v31.16b
4021	tbl	v0.16b, {v28.16b}, v0.16b
4022	tbl	v2.16b, {v27.16b}, v2.16b
4023	eor	v0.16b, v0.16b, v2.16b
4024	eor	v1.16b, v1.16b, v1.16b
4025	aese	v0.16b,v1.16b
4026	ushr	v2.16b, v0.16b, 4
4027	and	v0.16b, v0.16b, v31.16b
4028	tbl	v0.16b, {v30.16b}, v0.16b
4029	tbl	v2.16b, {v29.16b}, v2.16b
4030	eor	v0.16b, v0.16b, v2.16b
4031
4032	mov	w7,v0.s[0]
4033	eor	w6,w7,w7,ror #32-2
4034	eor	w6,w6,w7,ror #32-10
4035	eor	w6,w6,w7,ror #32-18
4036	eor	w6,w6,w7,ror #32-24
4037	ldp	w7,w8,[x10],8
4038	eor	w13,w13,w6
4039	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
4040	eor	w6,w12,w13
4041	eor	w9,w7,w15
4042	eor	w6,w6,w9
4043	mov	v3.s[0],w6
4044	// optimize sbox using AESE instruction
4045	tbl	v0.16b, {v3.16b}, v26.16b
4046	ushr	v2.16b, v0.16b, 4
4047	and	v0.16b, v0.16b, v31.16b
4048	tbl	v0.16b, {v28.16b}, v0.16b
4049	tbl	v2.16b, {v27.16b}, v2.16b
4050	eor	v0.16b, v0.16b, v2.16b
4051	eor	v1.16b, v1.16b, v1.16b
4052	aese	v0.16b,v1.16b
4053	ushr	v2.16b, v0.16b, 4
4054	and	v0.16b, v0.16b, v31.16b
4055	tbl	v0.16b, {v30.16b}, v0.16b
4056	tbl	v2.16b, {v29.16b}, v2.16b
4057	eor	v0.16b, v0.16b, v2.16b
4058
4059	mov	w7,v0.s[0]
4060	eor	w6,w7,w7,ror #32-2
4061	eor	w6,w6,w7,ror #32-10
4062	eor	w6,w6,w7,ror #32-18
4063	eor	w6,w6,w7,ror #32-24
4064	eor	w14,w14,w6
4065	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
4066	eor	w6,w12,w13
4067	eor	w9,w14,w8
4068	eor	w6,w6,w9
4069	mov	v3.s[0],w6
4070	// optimize sbox using AESE instruction
4071	tbl	v0.16b, {v3.16b}, v26.16b
4072	ushr	v2.16b, v0.16b, 4
4073	and	v0.16b, v0.16b, v31.16b
4074	tbl	v0.16b, {v28.16b}, v0.16b
4075	tbl	v2.16b, {v27.16b}, v2.16b
4076	eor	v0.16b, v0.16b, v2.16b
4077	eor	v1.16b, v1.16b, v1.16b
4078	aese	v0.16b,v1.16b
4079	ushr	v2.16b, v0.16b, 4
4080	and	v0.16b, v0.16b, v31.16b
4081	tbl	v0.16b, {v30.16b}, v0.16b
4082	tbl	v2.16b, {v29.16b}, v2.16b
4083	eor	v0.16b, v0.16b, v2.16b
4084
4085	mov	w7,v0.s[0]
4086	eor	w6,w7,w7,ror #32-2
4087	eor	w6,w6,w7,ror #32-10
4088	eor	w6,w6,w7,ror #32-18
4089	eor	w6,w6,w7,ror #32-24
4090	eor	w15,w15,w6
4091	subs	w11,w11,#1
4092	b.ne	10b
4093	mov	v4.s[0],w15
4094	mov	v4.s[1],w14
4095	mov	v4.s[2],w13
4096	mov	v4.s[3],w12
4097#ifndef __AARCH64EB__
4098	rev32	v4.16b,v4.16b
4099#endif
4100	eor	v4.16b, v4.16b, v16.16b
4101	st1	{v4.4s},[x1],#16
4102	// save the last tweak
4103	mov	v25.16b,v16.16b
4104	b	100f
41051:	//	process last 2 blocks
4106	cmp	x2,#2
4107	b.gt	1f
4108	ld1	{v4.4s,v5.4s},[x0],#32
4109	eor	v4.16b, v4.16b, v16.16b
4110	eor	v5.16b, v5.16b, v17.16b
4111#ifndef __AARCH64EB__
4112	rev32	v4.16b,v4.16b
4113#endif
4114#ifndef __AARCH64EB__
4115	rev32	v5.16b,v5.16b
4116#endif
4117	zip1	v0.4s,v4.4s,v5.4s
4118	zip2	v1.4s,v4.4s,v5.4s
4119	zip1	v2.4s,v6.4s,v7.4s
4120	zip2	v3.4s,v6.4s,v7.4s
4121	zip1	v4.2d,v0.2d,v2.2d
4122	zip2	v5.2d,v0.2d,v2.2d
4123	zip1	v6.2d,v1.2d,v3.2d
4124	zip2	v7.2d,v1.2d,v3.2d
4125	bl	_vpsm4_ex_enc_4blks
4126	zip1	v4.4s,v0.4s,v1.4s
4127	zip2	v5.4s,v0.4s,v1.4s
4128	zip1	v6.4s,v2.4s,v3.4s
4129	zip2	v7.4s,v2.4s,v3.4s
4130	zip1	v0.2d,v4.2d,v6.2d
4131	zip2	v1.2d,v4.2d,v6.2d
4132	zip1	v2.2d,v5.2d,v7.2d
4133	zip2	v3.2d,v5.2d,v7.2d
4134	eor	v0.16b, v0.16b, v16.16b
4135	eor	v1.16b, v1.16b, v17.16b
4136	st1	{v0.4s,v1.4s},[x1],#32
4137	// save the last tweak
4138	mov	v25.16b,v17.16b
4139	b	100f
41401:	//	process last 3 blocks
4141	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
4142	eor	v4.16b, v4.16b, v16.16b
4143	eor	v5.16b, v5.16b, v17.16b
4144	eor	v6.16b, v6.16b, v18.16b
4145#ifndef __AARCH64EB__
4146	rev32	v4.16b,v4.16b
4147#endif
4148#ifndef __AARCH64EB__
4149	rev32	v5.16b,v5.16b
4150#endif
4151#ifndef __AARCH64EB__
4152	rev32	v6.16b,v6.16b
4153#endif
4154	zip1	v0.4s,v4.4s,v5.4s
4155	zip2	v1.4s,v4.4s,v5.4s
4156	zip1	v2.4s,v6.4s,v7.4s
4157	zip2	v3.4s,v6.4s,v7.4s
4158	zip1	v4.2d,v0.2d,v2.2d
4159	zip2	v5.2d,v0.2d,v2.2d
4160	zip1	v6.2d,v1.2d,v3.2d
4161	zip2	v7.2d,v1.2d,v3.2d
4162	bl	_vpsm4_ex_enc_4blks
4163	zip1	v4.4s,v0.4s,v1.4s
4164	zip2	v5.4s,v0.4s,v1.4s
4165	zip1	v6.4s,v2.4s,v3.4s
4166	zip2	v7.4s,v2.4s,v3.4s
4167	zip1	v0.2d,v4.2d,v6.2d
4168	zip2	v1.2d,v4.2d,v6.2d
4169	zip1	v2.2d,v5.2d,v7.2d
4170	zip2	v3.2d,v5.2d,v7.2d
4171	eor	v0.16b, v0.16b, v16.16b
4172	eor	v1.16b, v1.16b, v17.16b
4173	eor	v2.16b, v2.16b, v18.16b
4174	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
4175	// save the last tweak
4176	mov	v25.16b,v18.16b
4177100:
4178	cmp	x29,0
4179	b.eq	.return
4180
4181// This branch calculates the last two tweaks
4182// when the encryption/decryption length is larger than 32
4183.last_2blks_tweak:
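	// same vector mul-by-x as in .last_2blks_tweak_gb, minus the
	// rbit bit reversals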
4184#ifdef __AARCH64EB__
4185	rev32	v25.16b,v25.16b
4186#endif
4187	mov	v2.16b,v25.16b
4188	adrp	x9, .Lxts_magic
4189	ldr	q0, [x9, #:lo12:.Lxts_magic]
4190	shl	v17.16b, v2.16b, #1
4191	ext	v1.16b, v2.16b, v2.16b,#15
4192	ushr	v1.16b, v1.16b, #7
4193	mul	v1.16b, v1.16b, v0.16b
4194	eor	v17.16b, v17.16b, v1.16b
4195	mov	v2.16b,v17.16b
4196	adrp	x9, .Lxts_magic
4197	ldr	q0, [x9, #:lo12:.Lxts_magic]
4198	shl	v18.16b, v2.16b, #1
4199	ext	v1.16b, v2.16b, v2.16b,#15
4200	ushr	v1.16b, v1.16b, #7
4201	mul	v1.16b, v1.16b, v0.16b
4202	eor	v18.16b, v18.16b, v1.16b
4203	b	.check_dec
4204
4205
4206// This branch calculates the last two tweaks
4207// when the encryption/decryption length is exactly 32, in which case only two tweaks are needed
4208.only_2blks_tweak:
4209	mov	v17.16b,v16.16b
4210#ifdef __AARCH64EB__
4211	rev32	v17.16b,v17.16b
4212#endif
4213	mov	v2.16b,v17.16b
4214	adrp	x9, .Lxts_magic
4215	ldr	q0, [x9, #:lo12:.Lxts_magic]
4216	shl	v18.16b, v2.16b, #1
4217	ext	v1.16b, v2.16b, v2.16b,#15
4218	ushr	v1.16b, v1.16b, #7
4219	mul	v1.16b, v1.16b, v0.16b
4220	eor	v18.16b, v18.16b, v1.16b
4221	b	.check_dec
4222
4223
4224// Determine whether encryption or decryption is required.
4225// The last two tweaks need to be swapped for decryption.
4226.check_dec:
4227	// encryption:1 decryption:0
4228	cmp	w28,1
4229	b.eq	.process_last_2blks
4230	mov	v0.16b,v17.16b
4231	mov	v17.16b,v18.16b
4232	mov	v18.16b,v0.16b
4233
4234.process_last_2blks:
4235#ifdef __AARCH64EB__
4236	rev32	v17.16b,v17.16b
4237#endif
4238#ifdef __AARCH64EB__
4239	rev32	v18.16b,v18.16b
4240#endif
4241	ld1	{v4.4s},[x0],#16
4242	eor	v4.16b, v4.16b, v17.16b
4243#ifndef __AARCH64EB__
4244	rev32	v4.16b,v4.16b
4245#endif
4246	mov	x10,x3
4247	mov	w11,#8
4248	mov	w12,v4.s[0]
4249	mov	w13,v4.s[1]
4250	mov	w14,v4.s[2]
4251	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

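	// The tbl/aese/tbl sequence above evaluates the SM4 S-box through
	// the AES S-box, using the affine equivalence between the two (both
	// are an inversion in GF(2^8) composed with affine maps). Roughly:
	// the first tbl (v26) applies an inverse-ShiftRows byte permutation
	// to pre-compensate the ShiftRows step inside AESE; the ushr/and
	// split each byte into nibbles so two 16-entry tbl lookups (v27/v28)
	// can apply the input affine map; aese with a zero round key
	// supplies AES SubBytes; and the final nibble lookups (v29/v30)
	// apply the output affine map. The .Lsbox_magic constants hold
	// these tables.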
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v17.16b
	st1	{v4.4s},[x1],#16

	sub	x26,x1,16
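	// The byte loop below does the ciphertext-stealing swap: x26 points
	// at the ciphertext block just stored, x29 holds the tail length
	// (1-15), and each remaining input byte is exchanged with the
	// corresponding byte of that block. A rough C sketch (illustrative
	// only, not part of the generated code):
	//   for (i = tail_len; i-- > 0; ) {
	//       uint8_t c = lastc[i]; // byte of last full ciphertext block
	//       lastc[i]  = in[i];    // steal: patch block to re-encrypt
	//       out[i]    = c;        // becomes the final partial block
	//   }
	// The patched block at x26 is then run through SM4 once more with
	// the second tweak (v18) and stored back in place.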
.loop:
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.loop
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v18.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v18.16b
	st1	{v4.4s}, [x26]
.return:
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt
