/* Do not modify. This file is auto-generated from vpsm4-armv8.pl. */
// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD on aarch64
//
// Feb 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
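
// The SM4 S-box here is realized with ASIMD table lookups rather than a
// dedicated instruction: the 256-byte table is kept in v16-v31, and a
// full 8-bit lookup is composed from four 64-byte tbl/tbx steps, with
// the index reduced by 64 between steps so that each quarter of the
// table can be addressed in turn.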
#include "arm_arch.h"
.arch	armv8-a
.text

.section	.rodata
.type	_vpsm4_consts,%object
.align	7
_vpsm4_consts:
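// The SM4 S-box (256 bytes); loaded into v16-v31 and indexed via tbl/tbx.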
.Lsbox:
.byte	0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
.byte	0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
.byte	0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
.byte	0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
.byte	0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
.byte	0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
.byte	0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
.byte	0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
.byte	0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
.byte	0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
.byte	0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
.byte	0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
.byte	0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
.byte	0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
.byte	0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
.byte	0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
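// Key-schedule constants: byte j of CK[i] equals ((4*i + j) * 7) mod 256.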
.Lck:
.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
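// System parameter FK = {0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC},
// packed as two little-endian 64-bit words.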
.Lfk:
.quad	0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad	0x0B0A090807060504,0x030201000F0E0D0C
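// 0x87 is the usual GF(2^128) reduction byte for XTS tweak updates
// (x^128 + x^7 + x^2 + x + 1); evidently kept here for the module's XTS
// code paths.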
.Lxts_magic:
.quad	0x0101010101010187,0x0101010101010101

.size	_vpsm4_consts,.-_vpsm4_consts

.previous

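// _vpsm4_set_key: x0 = user key, x1 = round-key output, w2 = direction
// (non-zero stores the 32 round keys in encryption order; zero stores
// them reversed, for decryption).  This is the standard SM4 schedule:
// (K0..K3) = MK ^ FK, then rk[i] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i]),
// where T' applies the S-box followed by L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
// (ror #19 == rol #13 and ror #9 == rol #23 below).  The .Lshuffles mask
// rotates the working key vector by one word per iteration.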
.type	_vpsm4_set_key,%function
.align	4
_vpsm4_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{v5.4s},[x0]
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	adrp	x5,.Lshuffles
	add	x5,x5,#:lo12:.Lshuffles
	ld1	{v7.2d},[x5]
	adrp	x5,.Lfk
	add	x5,x5,#:lo12:.Lfk
	ld1	{v6.2d},[x5]
	eor	v5.16b,v5.16b,v6.16b
	mov	x6,#32
	adrp	x5,.Lck
	add	x5,x5,#:lo12:.Lck
	movi	v0.16b,#64
	cbnz	w2,1f
	add	x1,x1,124
1:
	mov	w7,v5.s[1]
	ldr	w8,[x5],#4
	eor	w8,w8,w7
	mov	w7,v5.s[2]
	eor	w8,w8,w7
	mov	w7,v5.s[3]
	eor	w8,w8,w7
	// sbox lookup
	mov	v4.s[0],w8
	tbl	v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b
	sub	v4.16b,v4.16b,v0.16b
	tbx	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b
	sub	v4.16b,v4.16b,v0.16b
	tbx	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b
	sub	v4.16b,v4.16b,v0.16b
	tbx	v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b
	mov	w7,v1.s[0]
	eor	w8,w7,w7,ror #19
	eor	w8,w8,w7,ror #9
	mov	w7,v5.s[0]
	eor	w8,w8,w7
	mov	v5.s[0],w8
	cbz	w2,2f
	str	w8,[x1],#4
	b	3f
2:
	str	w8,[x1],#-4
3:
	tbl	v5.16b,{v5.16b},v7.16b
	subs	x6,x6,#1
	b.ne	1b
	ret
.size	_vpsm4_set_key,.-_vpsm4_set_key
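// _vpsm4_enc_4blks: run the 32 SM4 rounds over four blocks held
// column-wise in v4-v7 (one 32-bit word lane per block), with the round
// keys at x3.  Each quarter-round computes B0 ^= L(SBOX(B1^B2^B3^rk)),
// where L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24); the
// rotates are built from ushr/sli pairs, and the S-box from four tbl
// lookups whose results are summed (out-of-range tbl lanes read as
// zero, so the adds act as a lane-wise select).  Results come back
// byte-swapped into v0-v3.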
.type	_vpsm4_enc_4blks,%function
.align	4
_vpsm4_enc_4blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	dup	v12.4s,w7
	dup	v13.4s,w8

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	v14.16b,v6.16b,v7.16b
	eor	v12.16b,v5.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v12.16b,v0.16b
	sub	v1.16b,v12.16b,v1.16b
	sub	v2.16b,v12.16b,v2.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v12.2d
	add	v12.2d,v0.2d,v2.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v12.4s,32-10
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v12.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v12.4s,32-18
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v12.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v12.4s,24
	eor	v12.16b,v2.16b,v1.16b
	eor	v4.16b,v4.16b,v12.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	v14.16b,v14.16b,v4.16b
	eor	v13.16b,v14.16b,v13.16b
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v13.16b,v0.16b
	sub	v1.16b,v13.16b,v1.16b
	sub	v2.16b,v13.16b,v2.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v13.2d
	add	v13.2d,v0.2d,v2.2d

	ushr	v0.4s,v13.4s,32-2
	sli	v0.4s,v13.4s,2
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v13.16b
	sli	v2.4s,v13.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v13.4s,32-18
	sli	v0.4s,v13.4s,18
	ushr	v2.4s,v13.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v1.16b
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v13.16b

	dup	v12.4s,w7
	dup	v13.4s,w8

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	v14.16b,v4.16b,v5.16b
	eor	v12.16b,v7.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v12.16b,v0.16b
	sub	v1.16b,v12.16b,v1.16b
	sub	v2.16b,v12.16b,v2.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v12.2d
	add	v12.2d,v0.2d,v2.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v12.4s,32-10
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v12.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v12.4s,32-18
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v12.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v12.4s,24
	eor	v12.16b,v2.16b,v1.16b
	eor	v6.16b,v6.16b,v12.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	v14.16b,v14.16b,v6.16b
	eor	v13.16b,v14.16b,v13.16b
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v13.16b,v0.16b
	sub	v1.16b,v13.16b,v1.16b
	sub	v2.16b,v13.16b,v2.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v13.2d
	add	v13.2d,v0.2d,v2.2d

	ushr	v0.4s,v13.4s,32-2
	sli	v0.4s,v13.4s,2
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v13.16b
	sli	v2.4s,v13.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v13.4s,32-18
	sli	v0.4s,v13.4s,18
	ushr	v2.4s,v13.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v1.16b
	eor	v7.16b,v7.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
	ret
.size	_vpsm4_enc_4blks,.-_vpsm4_enc_4blks
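// _vpsm4_enc_8blks: same round structure as _vpsm4_enc_4blks, but
// interleaved over two groups of four blocks (v4-v7 and v8-v11) to hide
// the tbl and shift latencies; results land in v0-v7.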
.type	_vpsm4_enc_8blks,%function
.align	4
_vpsm4_enc_8blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	v12.4s,w7
	eor	v14.16b,v6.16b,v7.16b
	eor	v15.16b,v10.16b,v11.16b
	eor	v0.16b,v5.16b,v12.16b
	eor	v1.16b,v9.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	eor	v4.16b,v4.16b,v12.16b
	eor	v8.16b,v8.16b,v13.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v4.16b
	eor	v15.16b,v15.16b,v8.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v12.16b
	eor	v9.16b,v9.16b,v13.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	v12.4s,w7
	eor	v14.16b,v4.16b,v5.16b
	eor	v15.16b,v8.16b,v9.16b
	eor	v0.16b,v7.16b,v12.16b
	eor	v1.16b,v11.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	eor	v6.16b,v6.16b,v12.16b
	eor	v10.16b,v10.16b,v13.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v10.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	eor	v7.16b,v7.16b,v12.16b
	eor	v11.16b,v11.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v8.16b
#else
	mov	v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v9.16b
#else
	mov	v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v10.16b
#else
	mov	v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v11.16b
#else
	mov	v4.16b,v11.16b
#endif
	ret
.size	_vpsm4_enc_8blks,.-_vpsm4_enc_8blks
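// Public key-schedule entry points: thin wrappers that set the
// direction flag in w2 and call the common _vpsm4_set_key.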
.globl	vpsm4_set_encrypt_key
.type	vpsm4_set_encrypt_key,%function
.align	5
vpsm4_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key
.globl	vpsm4_set_decrypt_key
.type	vpsm4_set_decrypt_key,%function
.align	5
vpsm4_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key
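// vpsm4_encrypt/vpsm4_decrypt: single-block operations (in = x0,
// out = x1, round keys = x2).  The state is kept in w12-w15, and each
// round performs one scalar S-box lookup through the vector tables.
// The two bodies are identical; direction is decided purely by the
// round-key order chosen at key-schedule time.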
.globl	vpsm4_encrypt
.type	vpsm4_encrypt,%function
.align	5
vpsm4_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_encrypt,.-vpsm4_encrypt
.globl	vpsm4_decrypt
.type	vpsm4_decrypt,%function
.align	5
vpsm4_decrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_decrypt,.-vpsm4_decrypt
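// vpsm4_ecb_encrypt: x0 = in, x1 = out, x2 = length in bytes (converted
// to blocks below), x3 = round keys.  Bulk data goes through the 8- and
// 4-block helpers, with ld4/st4 transposing blocks into the column-wise
// layout those helpers expect; a 1-3 block tail is handled separately.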
.globl	vpsm4_ecb_encrypt
.type	vpsm4_ecb_encrypt,%function
.align	5
vpsm4_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
.Lecb_8_blocks_process:
	cmp	w2,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_enc_8blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	w2,w2,#4
1:
	// process last block
	cmp	w2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	b	100f
1:	//	process last 2 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
	cmp	w2,#2
	b.gt	1f
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1]
	b	100f
1:	//	process last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt
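// vpsm4_cbc_encrypt: x0 = in, x1 = out, x2 = length in bytes,
// x3 = round keys, x4 = ivec, w5 = enc flag.  CBC encryption is
// inherently serial, so blocks are chained and enciphered one at a
// time; decryption has no such dependency and runs 8 or 4 blocks in
// parallel, XORing the preceding ciphertexts into the output afterwards.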
.globl	vpsm4_cbc_encrypt
.type	vpsm4_cbc_encrypt,%function
.align	5
vpsm4_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	x2,x2,4
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
	cbz	w5,.Ldec
	ld1	{v3.4s},[x4]
.Lcbc_4_blocks_enc:
	cmp	w2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor	v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
	eor	v5.16b,v5.16b,v4.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v5.s[0]
	mov	w13,v5.s[1]
	mov	w14,v5.s[2]
	mov	w15,v5.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v5.s[0],w15
	mov	v5.s[1],w14
	mov	v5.s[2],w13
	mov	v5.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v6.16b,v6.16b,v5.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v6.s[0]
	mov	w13,v6.s[1]
	mov	w14,v6.s[2]
	mov	w15,v6.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v6.s[0],w15
	mov	v6.s[1],w14
	mov	v6.s[2],w13
	mov	v6.s[3],w12
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	eor	v7.16b,v7.16b,v6.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v7.s[0]
	mov	w13,v7.s[1]
	mov	w14,v7.s[2]
	mov	w15,v7.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v7.s[0],w15
	mov	v7.s[1],w14
	mov	v7.s[2],w13
	mov	v7.s[3],w12
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	orr	v3.16b,v7.16b,v7.16b
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#4
	b.ne	.Lcbc_4_blocks_enc
	b	2f
1:
	subs	w2,w2,#1
	b.lt	2f
	ld1	{v4.4s},[x0],#16
	eor	v3.16b,v3.16b,v4.16b
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	st1	{v3.4s},[x1],#16
	b	1b
2:
	// save back IV
	st1	{v3.4s},[x4]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp	w2,#8
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
	add	x10,x0,#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_enc_8blks
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	ld1	{v15.4s},[x4]
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	// note: ivec1 and vtmpx[3] reuse the same register,
	// so care must be taken to avoid a conflict
2021	eor	v0.16b,v0.16b,v15.16b
2022	ld1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2023	eor	v1.16b,v1.16b,v8.16b
2024	eor	v2.16b,v2.16b,v9.16b
2025	eor	v3.16b,v3.16b,v10.16b
2026	// save back IV
2027	st1	{v15.4s}, [x4]
2028	eor	v4.16b,v4.16b,v11.16b
2029	eor	v5.16b,v5.16b,v12.16b
2030	eor	v6.16b,v6.16b,v13.16b
2031	eor	v7.16b,v7.16b,v14.16b
2032	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2033	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2034	subs	w2,w2,#8
2035	b.gt	.Lcbc_8_blocks_dec
2036	b.eq	100f
20371:
2038	ld1	{v15.4s},[x4]
2039.Lcbc_4_blocks_dec:
2040	cmp	w2,#4
2041	b.lt	1f
2042	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
2043#ifndef __AARCH64EB__
2044	rev32	v4.16b,v4.16b
2045#endif
2046#ifndef __AARCH64EB__
2047	rev32	v5.16b,v5.16b
2048#endif
2049#ifndef __AARCH64EB__
2050	rev32	v6.16b,v6.16b
2051#endif
2052#ifndef __AARCH64EB__
2053	rev32	v7.16b,v7.16b
2054#endif
2055	bl	_vpsm4_enc_4blks
2056	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2057	zip1	v8.4s,v0.4s,v1.4s
2058	zip2	v9.4s,v0.4s,v1.4s
2059	zip1	v10.4s,v2.4s,v3.4s
2060	zip2	v11.4s,v2.4s,v3.4s
2061	zip1	v0.2d,v8.2d,v10.2d
2062	zip2	v1.2d,v8.2d,v10.2d
2063	zip1	v2.2d,v9.2d,v11.2d
2064	zip2	v3.2d,v9.2d,v11.2d
2065	eor	v0.16b,v0.16b,v15.16b
2066	eor	v1.16b,v1.16b,v4.16b
2067	orr	v15.16b,v7.16b,v7.16b
2068	eor	v2.16b,v2.16b,v5.16b
2069	eor	v3.16b,v3.16b,v6.16b
2070	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2071	subs	w2,w2,#4
2072	b.gt	.Lcbc_4_blocks_dec
2073	// save back IV
2074	st1	{v7.4s}, [x4]
2075	b	100f
20761:	//	last block
2077	subs	w2,w2,#1
2078	b.lt	100f
2079	b.gt	1f
2080	ld1	{v4.4s},[x0],#16
2081	// save back IV
2082	st1	{v4.4s}, [x4]
2083#ifndef __AARCH64EB__
2084	rev32	v8.16b,v4.16b
2085#else
2086	mov	v8.16b,v4.16b
2087#endif
2088	mov	x10,x3
2089	mov	w11,#8
2090	mov	w12,v8.s[0]
2091	mov	w13,v8.s[1]
2092	mov	w14,v8.s[2]
2093	mov	w15,v8.s[3]
209410:
2095	ldp	w7,w8,[x10],8
2096	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2097	eor	w6,w14,w15
2098	eor	w9,w7,w13
2099	eor	w6,w6,w9
2100	movi	v1.16b,#64
2101	movi	v2.16b,#128
2102	movi	v3.16b,#192
2103	mov	v0.s[0],w6
2104
2105	sub	v1.16b,v0.16b,v1.16b
2106	sub	v2.16b,v0.16b,v2.16b
2107	sub	v3.16b,v0.16b,v3.16b
2108
2109	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2110	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2111	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2112	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2113
2114	mov	w6,v0.s[0]
2115	mov	w7,v1.s[0]
2116	mov	w9,v2.s[0]
2117	add	w7,w6,w7
2118	mov	w6,v3.s[0]
2119	add	w7,w7,w9
2120	add	w7,w7,w6
2121
2122	eor	w6,w7,w7,ror #32-2
2123	eor	w6,w6,w7,ror #32-10
2124	eor	w6,w6,w7,ror #32-18
2125	eor	w6,w6,w7,ror #32-24
2126	eor	w12,w12,w6
2127	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2128	eor	w6,w14,w15
2129	eor	w9,w12,w8
2130	eor	w6,w6,w9
2131	movi	v1.16b,#64
2132	movi	v2.16b,#128
2133	movi	v3.16b,#192
2134	mov	v0.s[0],w6
2135
2136	sub	v1.16b,v0.16b,v1.16b
2137	sub	v2.16b,v0.16b,v2.16b
2138	sub	v3.16b,v0.16b,v3.16b
2139
2140	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2141	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2142	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2143	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2144
2145	mov	w6,v0.s[0]
2146	mov	w7,v1.s[0]
2147	mov	w9,v2.s[0]
2148	add	w7,w6,w7
2149	mov	w6,v3.s[0]
2150	add	w7,w7,w9
2151	add	w7,w7,w6
2152
2153	eor	w6,w7,w7,ror #32-2
2154	eor	w6,w6,w7,ror #32-10
2155	eor	w6,w6,w7,ror #32-18
2156	eor	w6,w6,w7,ror #32-24
2157	ldp	w7,w8,[x10],8
2158	eor	w13,w13,w6
2159	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2160	eor	w6,w12,w13
2161	eor	w9,w7,w15
2162	eor	w6,w6,w9
2163	movi	v1.16b,#64
2164	movi	v2.16b,#128
2165	movi	v3.16b,#192
2166	mov	v0.s[0],w6
2167
2168	sub	v1.16b,v0.16b,v1.16b
2169	sub	v2.16b,v0.16b,v2.16b
2170	sub	v3.16b,v0.16b,v3.16b
2171
2172	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2173	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2174	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2175	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2176
2177	mov	w6,v0.s[0]
2178	mov	w7,v1.s[0]
2179	mov	w9,v2.s[0]
2180	add	w7,w6,w7
2181	mov	w6,v3.s[0]
2182	add	w7,w7,w9
2183	add	w7,w7,w6
2184
2185	eor	w6,w7,w7,ror #32-2
2186	eor	w6,w6,w7,ror #32-10
2187	eor	w6,w6,w7,ror #32-18
2188	eor	w6,w6,w7,ror #32-24
2189	eor	w14,w14,w6
2190	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2191	eor	w6,w12,w13
2192	eor	w9,w14,w8
2193	eor	w6,w6,w9
2194	movi	v1.16b,#64
2195	movi	v2.16b,#128
2196	movi	v3.16b,#192
2197	mov	v0.s[0],w6
2198
2199	sub	v1.16b,v0.16b,v1.16b
2200	sub	v2.16b,v0.16b,v2.16b
2201	sub	v3.16b,v0.16b,v3.16b
2202
2203	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2204	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2205	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2206	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2207
2208	mov	w6,v0.s[0]
2209	mov	w7,v1.s[0]
2210	mov	w9,v2.s[0]
2211	add	w7,w6,w7
2212	mov	w6,v3.s[0]
2213	add	w7,w7,w9
2214	add	w7,w7,w6
2215
2216	eor	w6,w7,w7,ror #32-2
2217	eor	w6,w6,w7,ror #32-10
2218	eor	w6,w6,w7,ror #32-18
2219	eor	w6,w6,w7,ror #32-24
2220	eor	w15,w15,w6
2221	subs	w11,w11,#1
2222	b.ne	10b
2223	mov	v8.s[0],w15
2224	mov	v8.s[1],w14
2225	mov	v8.s[2],w13
2226	mov	v8.s[3],w12
2227#ifndef __AARCH64EB__
2228	rev32	v8.16b,v8.16b
2229#endif
2230	eor	v8.16b,v8.16b,v15.16b
2231	st1	{v8.4s},[x1],#16
2232	b	100f
22331:	//	last two blocks
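	// ld4 of a single structure lane splits one 16-byte block
	// across v4..v7 (one 32-bit word per register), so loading two
	// blocks into lanes 0 and 1 transposes them into the
	// word-sliced layout _vpsm4_enc_4blks expects, with no
	// explicit zip/unzip step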
2234	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0]
2235	add	x10,x0,#16
2236	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
2237	subs	w2,w2,1
2238	b.gt	1f
2239#ifndef __AARCH64EB__
2240	rev32	v4.16b,v4.16b
2241#endif
2242#ifndef __AARCH64EB__
2243	rev32	v5.16b,v5.16b
2244#endif
2245#ifndef __AARCH64EB__
2246	rev32	v6.16b,v6.16b
2247#endif
2248#ifndef __AARCH64EB__
2249	rev32	v7.16b,v7.16b
2250#endif
2251	bl	_vpsm4_enc_4blks
2252	ld1	{v4.4s,v5.4s},[x0],#32
2253	zip1	v8.4s,v0.4s,v1.4s
2254	zip2	v9.4s,v0.4s,v1.4s
2255	zip1	v10.4s,v2.4s,v3.4s
2256	zip2	v11.4s,v2.4s,v3.4s
2257	zip1	v0.2d,v8.2d,v10.2d
2258	zip2	v1.2d,v8.2d,v10.2d
2259	zip1	v2.2d,v9.2d,v11.2d
2260	zip2	v3.2d,v9.2d,v11.2d
2261	eor	v0.16b,v0.16b,v15.16b
2262	eor	v1.16b,v1.16b,v4.16b
2263	st1	{v0.4s,v1.4s},[x1],#32
2264	// save back IV
2265	st1	{v5.4s}, [x4]
2266	b	100f
22671:	//	last 3 blocks
2268	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x10]
2269#ifndef __AARCH64EB__
2270	rev32	v4.16b,v4.16b
2271#endif
2272#ifndef __AARCH64EB__
2273	rev32	v5.16b,v5.16b
2274#endif
2275#ifndef __AARCH64EB__
2276	rev32	v6.16b,v6.16b
2277#endif
2278#ifndef __AARCH64EB__
2279	rev32	v7.16b,v7.16b
2280#endif
2281	bl	_vpsm4_enc_4blks
2282	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
2283	zip1	v8.4s,v0.4s,v1.4s
2284	zip2	v9.4s,v0.4s,v1.4s
2285	zip1	v10.4s,v2.4s,v3.4s
2286	zip2	v11.4s,v2.4s,v3.4s
2287	zip1	v0.2d,v8.2d,v10.2d
2288	zip2	v1.2d,v8.2d,v10.2d
2289	zip1	v2.2d,v9.2d,v11.2d
2290	zip2	v3.2d,v9.2d,v11.2d
2291	eor	v0.16b,v0.16b,v15.16b
2292	eor	v1.16b,v1.16b,v4.16b
2293	eor	v2.16b,v2.16b,v5.16b
2294	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
2295	// save back IV
2296	st1	{v6.4s}, [x4]
2297100:
2298	ldp	d10,d11,[sp,#16]
2299	ldp	d12,d13,[sp,#32]
2300	ldp	d14,d15,[sp,#48]
2301	ldp	x29,x30,[sp,#64]
2302	ldp	d8,d9,[sp],#80
2303	AARCH64_VALIDATE_LINK_REGISTER
2304	ret
2305.size	vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt
2306.globl	vpsm4_ctr32_encrypt_blocks
2307.type	vpsm4_ctr32_encrypt_blocks,%function
2308.align	5
2309vpsm4_ctr32_encrypt_blocks:
2310	AARCH64_VALID_CALL_TARGET
2311	ld1	{v3.4s},[x4]
2312#ifndef __AARCH64EB__
2313	rev32	v3.16b,v3.16b
2314#endif
2315	adrp	x10,.Lsbox
2316	add	x10,x10,#:lo12:.Lsbox
2317	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
2318	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
2319	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
2320	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
2321	cmp	w2,#1
2322	b.ne	1f
2323	// fast path for a single block, avoiding the
2324	// overhead of saving and restoring context
2325	mov	x10,x3
2326	mov	w11,#8
2327	mov	w12,v3.s[0]
2328	mov	w13,v3.s[1]
2329	mov	w14,v3.s[2]
2330	mov	w15,v3.s[3]
233110:
2332	ldp	w7,w8,[x10],8
2333	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2334	eor	w6,w14,w15
2335	eor	w9,w7,w13
2336	eor	w6,w6,w9
2337	movi	v1.16b,#64
2338	movi	v2.16b,#128
2339	movi	v3.16b,#192
2340	mov	v0.s[0],w6
2341
2342	sub	v1.16b,v0.16b,v1.16b
2343	sub	v2.16b,v0.16b,v2.16b
2344	sub	v3.16b,v0.16b,v3.16b
2345
2346	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2347	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2348	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2349	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2350
2351	mov	w6,v0.s[0]
2352	mov	w7,v1.s[0]
2353	mov	w9,v2.s[0]
2354	add	w7,w6,w7
2355	mov	w6,v3.s[0]
2356	add	w7,w7,w9
2357	add	w7,w7,w6
2358
2359	eor	w6,w7,w7,ror #32-2
2360	eor	w6,w6,w7,ror #32-10
2361	eor	w6,w6,w7,ror #32-18
2362	eor	w6,w6,w7,ror #32-24
2363	eor	w12,w12,w6
2364	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2365	eor	w6,w14,w15
2366	eor	w9,w12,w8
2367	eor	w6,w6,w9
2368	movi	v1.16b,#64
2369	movi	v2.16b,#128
2370	movi	v3.16b,#192
2371	mov	v0.s[0],w6
2372
2373	sub	v1.16b,v0.16b,v1.16b
2374	sub	v2.16b,v0.16b,v2.16b
2375	sub	v3.16b,v0.16b,v3.16b
2376
2377	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2378	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2379	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2380	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2381
2382	mov	w6,v0.s[0]
2383	mov	w7,v1.s[0]
2384	mov	w9,v2.s[0]
2385	add	w7,w6,w7
2386	mov	w6,v3.s[0]
2387	add	w7,w7,w9
2388	add	w7,w7,w6
2389
2390	eor	w6,w7,w7,ror #32-2
2391	eor	w6,w6,w7,ror #32-10
2392	eor	w6,w6,w7,ror #32-18
2393	eor	w6,w6,w7,ror #32-24
2394	ldp	w7,w8,[x10],8
2395	eor	w13,w13,w6
2396	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2397	eor	w6,w12,w13
2398	eor	w9,w7,w15
2399	eor	w6,w6,w9
2400	movi	v1.16b,#64
2401	movi	v2.16b,#128
2402	movi	v3.16b,#192
2403	mov	v0.s[0],w6
2404
2405	sub	v1.16b,v0.16b,v1.16b
2406	sub	v2.16b,v0.16b,v2.16b
2407	sub	v3.16b,v0.16b,v3.16b
2408
2409	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2410	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2411	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2412	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2413
2414	mov	w6,v0.s[0]
2415	mov	w7,v1.s[0]
2416	mov	w9,v2.s[0]
2417	add	w7,w6,w7
2418	mov	w6,v3.s[0]
2419	add	w7,w7,w9
2420	add	w7,w7,w6
2421
2422	eor	w6,w7,w7,ror #32-2
2423	eor	w6,w6,w7,ror #32-10
2424	eor	w6,w6,w7,ror #32-18
2425	eor	w6,w6,w7,ror #32-24
2426	eor	w14,w14,w6
2427	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2428	eor	w6,w12,w13
2429	eor	w9,w14,w8
2430	eor	w6,w6,w9
2431	movi	v1.16b,#64
2432	movi	v2.16b,#128
2433	movi	v3.16b,#192
2434	mov	v0.s[0],w6
2435
2436	sub	v1.16b,v0.16b,v1.16b
2437	sub	v2.16b,v0.16b,v2.16b
2438	sub	v3.16b,v0.16b,v3.16b
2439
2440	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2441	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2442	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2443	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2444
2445	mov	w6,v0.s[0]
2446	mov	w7,v1.s[0]
2447	mov	w9,v2.s[0]
2448	add	w7,w6,w7
2449	mov	w6,v3.s[0]
2450	add	w7,w7,w9
2451	add	w7,w7,w6
2452
2453	eor	w6,w7,w7,ror #32-2
2454	eor	w6,w6,w7,ror #32-10
2455	eor	w6,w6,w7,ror #32-18
2456	eor	w6,w6,w7,ror #32-24
2457	eor	w15,w15,w6
2458	subs	w11,w11,#1
2459	b.ne	10b
2460	mov	v3.s[0],w15
2461	mov	v3.s[1],w14
2462	mov	v3.s[2],w13
2463	mov	v3.s[3],w12
2464#ifndef __AARCH64EB__
2465	rev32	v3.16b,v3.16b
2466#endif
2467	ld1	{v4.4s},[x0]
2468	eor	v4.16b,v4.16b,v3.16b
2469	st1	{v4.4s},[x1]
2470	ret
24711:
2472	AARCH64_SIGN_LINK_REGISTER
2473	stp	d8,d9,[sp,#-80]!
2474	stp	d10,d11,[sp,#16]
2475	stp	d12,d13,[sp,#32]
2476	stp	d14,d15,[sp,#48]
2477	stp	x29,x30,[sp,#64]
2478	mov	w12,v3.s[0]
2479	mov	w13,v3.s[1]
2480	mov	w14,v3.s[2]
2481	mov	w5,v3.s[3]
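	// CTR-32 mode: the first three words of the counter block stay
	// fixed in w12..w14; only the last 32-bit word (w5) is
	// incremented per block, per the usual *_ctr32 convention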
2482.Lctr32_4_blocks_process:
2483	cmp	w2,#4
2484	b.lt	1f
2485	dup	v4.4s,w12
2486	dup	v5.4s,w13
2487	dup	v6.4s,w14
2488	mov	v7.s[0],w5
2489	add	w5,w5,#1
2490	mov	v7.s[1],w5
2491	add	w5,w5,#1
2492	mov	v7.s[2],w5
2493	add	w5,w5,#1
2494	mov	v7.s[3],w5
2495	add	w5,w5,#1
2496	cmp	w2,#8
2497	b.ge	.Lctr32_8_blocks_process
2498	bl	_vpsm4_enc_4blks
2499	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2500	eor	v0.16b,v0.16b,v12.16b
2501	eor	v1.16b,v1.16b,v13.16b
2502	eor	v2.16b,v2.16b,v14.16b
2503	eor	v3.16b,v3.16b,v15.16b
2504	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2505	subs	w2,w2,#4
2506	b.ne	.Lctr32_4_blocks_process
2507	b	100f
2508.Lctr32_8_blocks_process:
2509	dup	v8.4s,w12
2510	dup	v9.4s,w13
2511	dup	v10.4s,w14
2512	mov	v11.s[0],w5
2513	add	w5,w5,#1
2514	mov	v11.s[1],w5
2515	add	w5,w5,#1
2516	mov	v11.s[2],w5
2517	add	w5,w5,#1
2518	mov	v11.s[3],w5
2519	add	w5,w5,#1
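	// eight counter blocks are now word-sliced across v4..v7 and
	// v8..v11; encrypt them all, then XOR the keystream with the
	// input.  The ld4/st4 below de-interleave and re-interleave
	// the data so it lines up with the word-sliced keystream.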
2520	bl	_vpsm4_enc_8blks
2521	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2522	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2523	eor	v0.16b,v0.16b,v12.16b
2524	eor	v1.16b,v1.16b,v13.16b
2525	eor	v2.16b,v2.16b,v14.16b
2526	eor	v3.16b,v3.16b,v15.16b
2527	eor	v4.16b,v4.16b,v8.16b
2528	eor	v5.16b,v5.16b,v9.16b
2529	eor	v6.16b,v6.16b,v10.16b
2530	eor	v7.16b,v7.16b,v11.16b
2531	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2532	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2533	subs	w2,w2,#8
2534	b.ne	.Lctr32_4_blocks_process
2535	b	100f
25361:	//	last block processing
2537	subs	w2,w2,#1
2538	b.lt	100f
2539	b.gt	1f
2540	mov	v3.s[0],w12
2541	mov	v3.s[1],w13
2542	mov	v3.s[2],w14
2543	mov	v3.s[3],w5
2544	mov	x10,x3
2545	mov	w11,#8
2546	mov	w12,v3.s[0]
2547	mov	w13,v3.s[1]
2548	mov	w14,v3.s[2]
2549	mov	w15,v3.s[3]
255010:
2551	ldp	w7,w8,[x10],8
2552	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2553	eor	w6,w14,w15
2554	eor	w9,w7,w13
2555	eor	w6,w6,w9
2556	movi	v1.16b,#64
2557	movi	v2.16b,#128
2558	movi	v3.16b,#192
2559	mov	v0.s[0],w6
2560
2561	sub	v1.16b,v0.16b,v1.16b
2562	sub	v2.16b,v0.16b,v2.16b
2563	sub	v3.16b,v0.16b,v3.16b
2564
2565	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2566	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2567	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2568	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2569
2570	mov	w6,v0.s[0]
2571	mov	w7,v1.s[0]
2572	mov	w9,v2.s[0]
2573	add	w7,w6,w7
2574	mov	w6,v3.s[0]
2575	add	w7,w7,w9
2576	add	w7,w7,w6
2577
2578	eor	w6,w7,w7,ror #32-2
2579	eor	w6,w6,w7,ror #32-10
2580	eor	w6,w6,w7,ror #32-18
2581	eor	w6,w6,w7,ror #32-24
2582	eor	w12,w12,w6
2583	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2584	eor	w6,w14,w15
2585	eor	w9,w12,w8
2586	eor	w6,w6,w9
2587	movi	v1.16b,#64
2588	movi	v2.16b,#128
2589	movi	v3.16b,#192
2590	mov	v0.s[0],w6
2591
2592	sub	v1.16b,v0.16b,v1.16b
2593	sub	v2.16b,v0.16b,v2.16b
2594	sub	v3.16b,v0.16b,v3.16b
2595
2596	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2597	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2598	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2599	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2600
2601	mov	w6,v0.s[0]
2602	mov	w7,v1.s[0]
2603	mov	w9,v2.s[0]
2604	add	w7,w6,w7
2605	mov	w6,v3.s[0]
2606	add	w7,w7,w9
2607	add	w7,w7,w6
2608
2609	eor	w6,w7,w7,ror #32-2
2610	eor	w6,w6,w7,ror #32-10
2611	eor	w6,w6,w7,ror #32-18
2612	eor	w6,w6,w7,ror #32-24
2613	ldp	w7,w8,[x10],8
2614	eor	w13,w13,w6
2615	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2616	eor	w6,w12,w13
2617	eor	w9,w7,w15
2618	eor	w6,w6,w9
2619	movi	v1.16b,#64
2620	movi	v2.16b,#128
2621	movi	v3.16b,#192
2622	mov	v0.s[0],w6
2623
2624	sub	v1.16b,v0.16b,v1.16b
2625	sub	v2.16b,v0.16b,v2.16b
2626	sub	v3.16b,v0.16b,v3.16b
2627
2628	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2629	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2630	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2631	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2632
2633	mov	w6,v0.s[0]
2634	mov	w7,v1.s[0]
2635	mov	w9,v2.s[0]
2636	add	w7,w6,w7
2637	mov	w6,v3.s[0]
2638	add	w7,w7,w9
2639	add	w7,w7,w6
2640
2641	eor	w6,w7,w7,ror #32-2
2642	eor	w6,w6,w7,ror #32-10
2643	eor	w6,w6,w7,ror #32-18
2644	eor	w6,w6,w7,ror #32-24
2645	eor	w14,w14,w6
2646	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2647	eor	w6,w12,w13
2648	eor	w9,w14,w8
2649	eor	w6,w6,w9
2650	movi	v1.16b,#64
2651	movi	v2.16b,#128
2652	movi	v3.16b,#192
2653	mov	v0.s[0],w6
2654
2655	sub	v1.16b,v0.16b,v1.16b
2656	sub	v2.16b,v0.16b,v2.16b
2657	sub	v3.16b,v0.16b,v3.16b
2658
2659	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2660	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2661	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2662	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2663
2664	mov	w6,v0.s[0]
2665	mov	w7,v1.s[0]
2666	mov	w9,v2.s[0]
2667	add	w7,w6,w7
2668	mov	w6,v3.s[0]
2669	add	w7,w7,w9
2670	add	w7,w7,w6
2671
2672	eor	w6,w7,w7,ror #32-2
2673	eor	w6,w6,w7,ror #32-10
2674	eor	w6,w6,w7,ror #32-18
2675	eor	w6,w6,w7,ror #32-24
2676	eor	w15,w15,w6
2677	subs	w11,w11,#1
2678	b.ne	10b
2679	mov	v3.s[0],w15
2680	mov	v3.s[1],w14
2681	mov	v3.s[2],w13
2682	mov	v3.s[3],w12
2683#ifndef __AARCH64EB__
2684	rev32	v3.16b,v3.16b
2685#endif
2686	ld1	{v4.4s},[x0]
2687	eor	v4.16b,v4.16b,v3.16b
2688	st1	{v4.4s},[x1]
2689	b	100f
26901:	//	last 2 blocks processing
2691	dup	v4.4s,w12
2692	dup	v5.4s,w13
2693	dup	v6.4s,w14
2694	mov	v7.s[0],w5
2695	add	w5,w5,#1
2696	mov	v7.s[1],w5
2697	subs	w2,w2,#1
2698	b.ne	1f
2699	bl	_vpsm4_enc_4blks
2700	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2701	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2702	eor	v0.16b,v0.16b,v12.16b
2703	eor	v1.16b,v1.16b,v13.16b
2704	eor	v2.16b,v2.16b,v14.16b
2705	eor	v3.16b,v3.16b,v15.16b
2706	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2707	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2708	b	100f
27091:	//	last 3 blocks processing
2710	add	w5,w5,#1
2711	mov	v7.s[2],w5
2712	bl	_vpsm4_enc_4blks
2713	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2714	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2715	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
2716	eor	v0.16b,v0.16b,v12.16b
2717	eor	v1.16b,v1.16b,v13.16b
2718	eor	v2.16b,v2.16b,v14.16b
2719	eor	v3.16b,v3.16b,v15.16b
2720	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2721	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2722	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
2723100:
2724	ldp	d10,d11,[sp,#16]
2725	ldp	d12,d13,[sp,#32]
2726	ldp	d14,d15,[sp,#48]
2727	ldp	x29,x30,[sp,#64]
2728	ldp	d8,d9,[sp],#80
2729	AARCH64_VALIDATE_LINK_REGISTER
2730	ret
2731.size	vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks
2732.globl	vpsm4_xts_encrypt_gb
2733.type	vpsm4_xts_encrypt_gb,%function
2734.align	5
2735vpsm4_xts_encrypt_gb:
2736	AARCH64_SIGN_LINK_REGISTER
2737	stp	x15, x16, [sp, #-0x10]!
2738	stp	x17, x18, [sp, #-0x10]!
2739	stp	x19, x20, [sp, #-0x10]!
2740	stp	x21, x22, [sp, #-0x10]!
2741	stp	x23, x24, [sp, #-0x10]!
2742	stp	x25, x26, [sp, #-0x10]!
2743	stp	x27, x28, [sp, #-0x10]!
2744	stp	x29, x30, [sp, #-0x10]!
2745	stp	d8, d9, [sp, #-0x10]!
2746	stp	d10, d11, [sp, #-0x10]!
2747	stp	d12, d13, [sp, #-0x10]!
2748	stp	d14, d15, [sp, #-0x10]!
2749	mov	x26,x3
2750	mov	x27,x4
2751	mov	w28,w6
2752	ld1	{v8.4s}, [x5]
2753	mov	x3,x27
2754	adrp	x10,.Lsbox
2755	add	x10,x10,#:lo12:.Lsbox
2756	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
2757	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
2758	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
2759	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
2760#ifndef __AARCH64EB__
2761	rev32	v8.16b,v8.16b
2762#endif
2763	mov	x10,x3
2764	mov	w11,#8
2765	mov	w12,v8.s[0]
2766	mov	w13,v8.s[1]
2767	mov	w14,v8.s[2]
2768	mov	w15,v8.s[3]
276910:
2770	ldp	w7,w8,[x10],8
2771	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2772	eor	w6,w14,w15
2773	eor	w9,w7,w13
2774	eor	w6,w6,w9
2775	movi	v1.16b,#64
2776	movi	v2.16b,#128
2777	movi	v3.16b,#192
2778	mov	v0.s[0],w6
2779
2780	sub	v1.16b,v0.16b,v1.16b
2781	sub	v2.16b,v0.16b,v2.16b
2782	sub	v3.16b,v0.16b,v3.16b
2783
2784	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2785	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2786	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2787	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2788
2789	mov	w6,v0.s[0]
2790	mov	w7,v1.s[0]
2791	mov	w9,v2.s[0]
2792	add	w7,w6,w7
2793	mov	w6,v3.s[0]
2794	add	w7,w7,w9
2795	add	w7,w7,w6
2796
2797	eor	w6,w7,w7,ror #32-2
2798	eor	w6,w6,w7,ror #32-10
2799	eor	w6,w6,w7,ror #32-18
2800	eor	w6,w6,w7,ror #32-24
2801	eor	w12,w12,w6
2802	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2803	eor	w6,w14,w15
2804	eor	w9,w12,w8
2805	eor	w6,w6,w9
2806	movi	v1.16b,#64
2807	movi	v2.16b,#128
2808	movi	v3.16b,#192
2809	mov	v0.s[0],w6
2810
2811	sub	v1.16b,v0.16b,v1.16b
2812	sub	v2.16b,v0.16b,v2.16b
2813	sub	v3.16b,v0.16b,v3.16b
2814
2815	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2816	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2817	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2818	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2819
2820	mov	w6,v0.s[0]
2821	mov	w7,v1.s[0]
2822	mov	w9,v2.s[0]
2823	add	w7,w6,w7
2824	mov	w6,v3.s[0]
2825	add	w7,w7,w9
2826	add	w7,w7,w6
2827
2828	eor	w6,w7,w7,ror #32-2
2829	eor	w6,w6,w7,ror #32-10
2830	eor	w6,w6,w7,ror #32-18
2831	eor	w6,w6,w7,ror #32-24
2832	ldp	w7,w8,[x10],8
2833	eor	w13,w13,w6
2834	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2835	eor	w6,w12,w13
2836	eor	w9,w7,w15
2837	eor	w6,w6,w9
2838	movi	v1.16b,#64
2839	movi	v2.16b,#128
2840	movi	v3.16b,#192
2841	mov	v0.s[0],w6
2842
2843	sub	v1.16b,v0.16b,v1.16b
2844	sub	v2.16b,v0.16b,v2.16b
2845	sub	v3.16b,v0.16b,v3.16b
2846
2847	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2848	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2849	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2850	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2851
2852	mov	w6,v0.s[0]
2853	mov	w7,v1.s[0]
2854	mov	w9,v2.s[0]
2855	add	w7,w6,w7
2856	mov	w6,v3.s[0]
2857	add	w7,w7,w9
2858	add	w7,w7,w6
2859
2860	eor	w6,w7,w7,ror #32-2
2861	eor	w6,w6,w7,ror #32-10
2862	eor	w6,w6,w7,ror #32-18
2863	eor	w6,w6,w7,ror #32-24
2864	eor	w14,w14,w6
2865	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2866	eor	w6,w12,w13
2867	eor	w9,w14,w8
2868	eor	w6,w6,w9
2869	movi	v1.16b,#64
2870	movi	v2.16b,#128
2871	movi	v3.16b,#192
2872	mov	v0.s[0],w6
2873
2874	sub	v1.16b,v0.16b,v1.16b
2875	sub	v2.16b,v0.16b,v2.16b
2876	sub	v3.16b,v0.16b,v3.16b
2877
2878	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2879	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2880	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2881	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2882
2883	mov	w6,v0.s[0]
2884	mov	w7,v1.s[0]
2885	mov	w9,v2.s[0]
2886	add	w7,w6,w7
2887	mov	w6,v3.s[0]
2888	add	w7,w7,w9
2889	add	w7,w7,w6
2890
2891	eor	w6,w7,w7,ror #32-2
2892	eor	w6,w6,w7,ror #32-10
2893	eor	w6,w6,w7,ror #32-18
2894	eor	w6,w6,w7,ror #32-24
2895	eor	w15,w15,w6
2896	subs	w11,w11,#1
2897	b.ne	10b
2898	mov	v8.s[0],w15
2899	mov	v8.s[1],w14
2900	mov	v8.s[2],w13
2901	mov	v8.s[3],w12
2902#ifndef __AARCH64EB__
2903	rev32	v8.16b,v8.16b
2904#endif
2905	mov	x3,x26
2906	and	x29,x2,#0x0F
2907	// convert length into blocks
2908	lsr	x2,x2,4
2909	cmp	x2,#1
2910	b.lt	.return_gb
2911
2912	cmp	x29,0
2913	// If the encryption/decryption length is a multiple of 16,
2914	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2915	b.eq	.xts_encrypt_blocks_gb
2916
2917	// If the encryption/decryption length is not a multiple of 16,
2918	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb;
2919	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2920	subs	x2,x2,#1
2921	b.eq	.only_2blks_tweak_gb
2922.xts_encrypt_blocks_gb:
2923	rbit	v8.16b,v8.16b
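	// the _gb variant (presumably the GB/T flavour of XTS, hence
	// the suffix) performs the tweak multiplication in bit-reversed
	// order: rbit maps the tweak into the ordering used by the
	// standard shift-and-reduce sequence, and it is reversed back
	// before being XORed with the data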
2924#ifdef __AARCH64EB__
2925	rev32	v8.16b,v8.16b
2926#endif
2927	mov	x12,v8.d[0]
2928	mov	x13,v8.d[1]
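	// each five-instruction group below derives the next tweak from
	// the previous one: shift the 128-bit value held in an
	// x_lo:x_hi pair left by one bit (extr/lsl) and fold the
	// carried-out top bit back in with the reduction constant 0x87
	// (x^128 + x^7 + x^2 + x + 1); tweaks for 8 blocks end up in
	// x12..x27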
2929	mov	w7,0x87
2930	extr	x9,x13,x13,#32
2931	extr	x15,x13,x12,#63
2932	and	w8,w7,w9,asr#31
2933	eor	x14,x8,x12,lsl#1
2934	mov	w7,0x87
2935	extr	x9,x15,x15,#32
2936	extr	x17,x15,x14,#63
2937	and	w8,w7,w9,asr#31
2938	eor	x16,x8,x14,lsl#1
2939	mov	w7,0x87
2940	extr	x9,x17,x17,#32
2941	extr	x19,x17,x16,#63
2942	and	w8,w7,w9,asr#31
2943	eor	x18,x8,x16,lsl#1
2944	mov	w7,0x87
2945	extr	x9,x19,x19,#32
2946	extr	x21,x19,x18,#63
2947	and	w8,w7,w9,asr#31
2948	eor	x20,x8,x18,lsl#1
2949	mov	w7,0x87
2950	extr	x9,x21,x21,#32
2951	extr	x23,x21,x20,#63
2952	and	w8,w7,w9,asr#31
2953	eor	x22,x8,x20,lsl#1
2954	mov	w7,0x87
2955	extr	x9,x23,x23,#32
2956	extr	x25,x23,x22,#63
2957	and	w8,w7,w9,asr#31
2958	eor	x24,x8,x22,lsl#1
2959	mov	w7,0x87
2960	extr	x9,x25,x25,#32
2961	extr	x27,x25,x24,#63
2962	and	w8,w7,w9,asr#31
2963	eor	x26,x8,x24,lsl#1
2964.Lxts_8_blocks_process_gb:
2965	cmp	x2,#8
2966	b.lt	.Lxts_4_blocks_process_gb
2967	mov	v0.d[0],x12
2968	mov	v0.d[1],x13
2969#ifdef __AARCH64EB__
2970	rev32	v0.16b,v0.16b
2971#endif
2972	mov	v1.d[0],x14
2973	mov	v1.d[1],x15
2974#ifdef __AARCH64EB__
2975	rev32	v1.16b,v1.16b
2976#endif
2977	mov	v2.d[0],x16
2978	mov	v2.d[1],x17
2979#ifdef __AARCH64EB__
2980	rev32	v2.16b,v2.16b
2981#endif
2982	mov	v3.d[0],x18
2983	mov	v3.d[1],x19
2984#ifdef __AARCH64EB__
2985	rev32	v3.16b,v3.16b
2986#endif
2987	mov	v12.d[0],x20
2988	mov	v12.d[1],x21
2989#ifdef __AARCH64EB__
2990	rev32	v12.16b,v12.16b
2991#endif
2992	mov	v13.d[0],x22
2993	mov	v13.d[1],x23
2994#ifdef __AARCH64EB__
2995	rev32	v13.16b,v13.16b
2996#endif
2997	mov	v14.d[0],x24
2998	mov	v14.d[1],x25
2999#ifdef __AARCH64EB__
3000	rev32	v14.16b,v14.16b
3001#endif
3002	mov	v15.d[0],x26
3003	mov	v15.d[1],x27
3004#ifdef __AARCH64EB__
3005	rev32	v15.16b,v15.16b
3006#endif
3007	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
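	// the tweaks are kept bit-reversed in the GPRs; rbit restores
	// their natural bit order before they are XORed with the data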
3008	rbit	v0.16b,v0.16b
3009	rbit	v1.16b,v1.16b
3010	rbit	v2.16b,v2.16b
3011	rbit	v3.16b,v3.16b
3012	eor	v4.16b, v4.16b, v0.16b
3013	eor	v5.16b, v5.16b, v1.16b
3014	eor	v6.16b, v6.16b, v2.16b
3015	eor	v7.16b, v7.16b, v3.16b
3016	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
3017	rbit	v12.16b,v12.16b
3018	rbit	v13.16b,v13.16b
3019	rbit	v14.16b,v14.16b
3020	rbit	v15.16b,v15.16b
3021	eor	v8.16b, v8.16b, v12.16b
3022	eor	v9.16b, v9.16b, v13.16b
3023	eor	v10.16b, v10.16b, v14.16b
3024	eor	v11.16b, v11.16b, v15.16b
3025#ifndef __AARCH64EB__
3026	rev32	v4.16b,v4.16b
3027#endif
3028#ifndef __AARCH64EB__
3029	rev32	v5.16b,v5.16b
3030#endif
3031#ifndef __AARCH64EB__
3032	rev32	v6.16b,v6.16b
3033#endif
3034#ifndef __AARCH64EB__
3035	rev32	v7.16b,v7.16b
3036#endif
3037#ifndef __AARCH64EB__
3038	rev32	v8.16b,v8.16b
3039#endif
3040#ifndef __AARCH64EB__
3041	rev32	v9.16b,v9.16b
3042#endif
3043#ifndef __AARCH64EB__
3044	rev32	v10.16b,v10.16b
3045#endif
3046#ifndef __AARCH64EB__
3047	rev32	v11.16b,v11.16b
3048#endif
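	// 4x4 transpose: the zip1/zip2 pairs convert four blocks held
	// one-per-register into the word-sliced layout (each register
	// holding the same 32-bit word of all four blocks) that the
	// parallel round function consumes; a second transpose after
	// the call converts the results back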
3049	zip1	v0.4s,v4.4s,v5.4s
3050	zip2	v1.4s,v4.4s,v5.4s
3051	zip1	v2.4s,v6.4s,v7.4s
3052	zip2	v3.4s,v6.4s,v7.4s
3053	zip1	v4.2d,v0.2d,v2.2d
3054	zip2	v5.2d,v0.2d,v2.2d
3055	zip1	v6.2d,v1.2d,v3.2d
3056	zip2	v7.2d,v1.2d,v3.2d
3057	zip1	v0.4s,v8.4s,v9.4s
3058	zip2	v1.4s,v8.4s,v9.4s
3059	zip1	v2.4s,v10.4s,v11.4s
3060	zip2	v3.4s,v10.4s,v11.4s
3061	zip1	v8.2d,v0.2d,v2.2d
3062	zip2	v9.2d,v0.2d,v2.2d
3063	zip1	v10.2d,v1.2d,v3.2d
3064	zip2	v11.2d,v1.2d,v3.2d
3065	bl	_vpsm4_enc_8blks
3066	zip1	v8.4s,v0.4s,v1.4s
3067	zip2	v9.4s,v0.4s,v1.4s
3068	zip1	v10.4s,v2.4s,v3.4s
3069	zip2	v11.4s,v2.4s,v3.4s
3070	zip1	v0.2d,v8.2d,v10.2d
3071	zip2	v1.2d,v8.2d,v10.2d
3072	zip1	v2.2d,v9.2d,v11.2d
3073	zip2	v3.2d,v9.2d,v11.2d
3074	zip1	v8.4s,v4.4s,v5.4s
3075	zip2	v9.4s,v4.4s,v5.4s
3076	zip1	v10.4s,v6.4s,v7.4s
3077	zip2	v11.4s,v6.4s,v7.4s
3078	zip1	v4.2d,v8.2d,v10.2d
3079	zip2	v5.2d,v8.2d,v10.2d
3080	zip1	v6.2d,v9.2d,v11.2d
3081	zip2	v7.2d,v9.2d,v11.2d
3082	mov	v12.d[0],x12
3083	mov	v12.d[1],x13
3084#ifdef __AARCH64EB__
3085	rev32	v12.16b,v12.16b
3086#endif
3087	mov	w7,0x87
3088	extr	x9,x27,x27,#32
3089	extr	x13,x27,x26,#63
3090	and	w8,w7,w9,asr#31
3091	eor	x12,x8,x26,lsl#1
3092	mov	v13.d[0],x14
3093	mov	v13.d[1],x15
3094#ifdef __AARCH64EB__
3095	rev32	v13.16b,v13.16b
3096#endif
3097	mov	w7,0x87
3098	extr	x9,x13,x13,#32
3099	extr	x15,x13,x12,#63
3100	and	w8,w7,w9,asr#31
3101	eor	x14,x8,x12,lsl#1
3102	mov	v14.d[0],x16
3103	mov	v14.d[1],x17
3104#ifdef __AARCH64EB__
3105	rev32	v14.16b,v14.16b
3106#endif
3107	mov	w7,0x87
3108	extr	x9,x15,x15,#32
3109	extr	x17,x15,x14,#63
3110	and	w8,w7,w9,asr#31
3111	eor	x16,x8,x14,lsl#1
3112	mov	v15.d[0],x18
3113	mov	v15.d[1],x19
3114#ifdef __AARCH64EB__
3115	rev32	v15.16b,v15.16b
3116#endif
3117	mov	w7,0x87
3118	extr	x9,x17,x17,#32
3119	extr	x19,x17,x16,#63
3120	and	w8,w7,w9,asr#31
3121	eor	x18,x8,x16,lsl#1
3122	mov	v8.d[0],x20
3123	mov	v8.d[1],x21
3124#ifdef __AARCH64EB__
3125	rev32	v8.16b,v8.16b
3126#endif
3127	mov	w7,0x87
3128	extr	x9,x19,x19,#32
3129	extr	x21,x19,x18,#63
3130	and	w8,w7,w9,asr#31
3131	eor	x20,x8,x18,lsl#1
3132	mov	v9.d[0],x22
3133	mov	v9.d[1],x23
3134#ifdef __AARCH64EB__
3135	rev32	v9.16b,v9.16b
3136#endif
3137	mov	w7,0x87
3138	extr	x9,x21,x21,#32
3139	extr	x23,x21,x20,#63
3140	and	w8,w7,w9,asr#31
3141	eor	x22,x8,x20,lsl#1
3142	mov	v10.d[0],x24
3143	mov	v10.d[1],x25
3144#ifdef __AARCH64EB__
3145	rev32	v10.16b,v10.16b
3146#endif
3147	mov	w7,0x87
3148	extr	x9,x23,x23,#32
3149	extr	x25,x23,x22,#63
3150	and	w8,w7,w9,asr#31
3151	eor	x24,x8,x22,lsl#1
3152	mov	v11.d[0],x26
3153	mov	v11.d[1],x27
3154#ifdef __AARCH64EB__
3155	rev32	v11.16b,v11.16b
3156#endif
3157	mov	w7,0x87
3158	extr	x9,x25,x25,#32
3159	extr	x27,x25,x24,#63
3160	and	w8,w7,w9,asr#31
3161	eor	x26,x8,x24,lsl#1
3162	eor	v0.16b, v0.16b, v12.16b
3163	eor	v1.16b, v1.16b, v13.16b
3164	eor	v2.16b, v2.16b, v14.16b
3165	eor	v3.16b, v3.16b, v15.16b
3166	eor	v4.16b, v4.16b, v8.16b
3167	eor	v5.16b, v5.16b, v9.16b
3168	eor	v6.16b, v6.16b, v10.16b
3169	eor	v7.16b, v7.16b, v11.16b
3170
3171	// save the last tweak
3172	st1	{v11.4s},[x5]
3173	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3174	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
3175	subs	x2,x2,#8
3176	b.gt	.Lxts_8_blocks_process_gb
3177	b	100f
3178.Lxts_4_blocks_process_gb:
3179	mov	v8.d[0],x12
3180	mov	v8.d[1],x13
3181#ifdef __AARCH64EB__
3182	rev32	v8.16b,v8.16b
3183#endif
3184	mov	v9.d[0],x14
3185	mov	v9.d[1],x15
3186#ifdef __AARCH64EB__
3187	rev32	v9.16b,v9.16b
3188#endif
3189	mov	v10.d[0],x16
3190	mov	v10.d[1],x17
3191#ifdef __AARCH64EB__
3192	rev32	v10.16b,v10.16b
3193#endif
3194	mov	v11.d[0],x18
3195	mov	v11.d[1],x19
3196#ifdef __AARCH64EB__
3197	rev32	v11.16b,v11.16b
3198#endif
3199	cmp	x2,#4
3200	b.lt	1f
3201	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3202	rbit	v8.16b,v8.16b
3203	rbit	v9.16b,v9.16b
3204	rbit	v10.16b,v10.16b
3205	rbit	v11.16b,v11.16b
3206	eor	v4.16b, v4.16b, v8.16b
3207	eor	v5.16b, v5.16b, v9.16b
3208	eor	v6.16b, v6.16b, v10.16b
3209	eor	v7.16b, v7.16b, v11.16b
3210#ifndef __AARCH64EB__
3211	rev32	v4.16b,v4.16b
3212#endif
3213#ifndef __AARCH64EB__
3214	rev32	v5.16b,v5.16b
3215#endif
3216#ifndef __AARCH64EB__
3217	rev32	v6.16b,v6.16b
3218#endif
3219#ifndef __AARCH64EB__
3220	rev32	v7.16b,v7.16b
3221#endif
3222	zip1	v0.4s,v4.4s,v5.4s
3223	zip2	v1.4s,v4.4s,v5.4s
3224	zip1	v2.4s,v6.4s,v7.4s
3225	zip2	v3.4s,v6.4s,v7.4s
3226	zip1	v4.2d,v0.2d,v2.2d
3227	zip2	v5.2d,v0.2d,v2.2d
3228	zip1	v6.2d,v1.2d,v3.2d
3229	zip2	v7.2d,v1.2d,v3.2d
3230	bl	_vpsm4_enc_4blks
3231	zip1	v4.4s,v0.4s,v1.4s
3232	zip2	v5.4s,v0.4s,v1.4s
3233	zip1	v6.4s,v2.4s,v3.4s
3234	zip2	v7.4s,v2.4s,v3.4s
3235	zip1	v0.2d,v4.2d,v6.2d
3236	zip2	v1.2d,v4.2d,v6.2d
3237	zip1	v2.2d,v5.2d,v7.2d
3238	zip2	v3.2d,v5.2d,v7.2d
3239	eor	v0.16b, v0.16b, v8.16b
3240	eor	v1.16b, v1.16b, v9.16b
3241	eor	v2.16b, v2.16b, v10.16b
3242	eor	v3.16b, v3.16b, v11.16b
3243	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3244	sub	x2,x2,#4
3245	mov	v8.d[0],x20
3246	mov	v8.d[1],x21
3247#ifdef __AARCH64EB__
3248	rev32	v8.16b,v8.16b
3249#endif
3250	mov	v9.d[0],x22
3251	mov	v9.d[1],x23
3252#ifdef __AARCH64EB__
3253	rev32	v9.16b,v9.16b
3254#endif
3255	mov	v10.d[0],x24
3256	mov	v10.d[1],x25
3257#ifdef __AARCH64EB__
3258	rev32	v10.16b,v10.16b
3259#endif
3260	// save the last tweak
3261	st1	{v11.4s},[x5]
32621:
3263	// process last block
3264	cmp	x2,#1
3265	b.lt	100f
3266	b.gt	1f
3267	ld1	{v4.4s},[x0],#16
3268	rbit	v8.16b,v8.16b
3269	eor	v4.16b, v4.16b, v8.16b
3270#ifndef __AARCH64EB__
3271	rev32	v4.16b,v4.16b
3272#endif
3273	mov	x10,x3
3274	mov	w11,#8
3275	mov	w12,v4.s[0]
3276	mov	w13,v4.s[1]
3277	mov	w14,v4.s[2]
3278	mov	w15,v4.s[3]
327910:
3280	ldp	w7,w8,[x10],8
3281	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3282	eor	w6,w14,w15
3283	eor	w9,w7,w13
3284	eor	w6,w6,w9
3285	movi	v1.16b,#64
3286	movi	v2.16b,#128
3287	movi	v3.16b,#192
3288	mov	v0.s[0],w6
3289
3290	sub	v1.16b,v0.16b,v1.16b
3291	sub	v2.16b,v0.16b,v2.16b
3292	sub	v3.16b,v0.16b,v3.16b
3293
3294	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3295	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3296	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3297	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3298
3299	mov	w6,v0.s[0]
3300	mov	w7,v1.s[0]
3301	mov	w9,v2.s[0]
3302	add	w7,w6,w7
3303	mov	w6,v3.s[0]
3304	add	w7,w7,w9
3305	add	w7,w7,w6
3306
3307	eor	w6,w7,w7,ror #32-2
3308	eor	w6,w6,w7,ror #32-10
3309	eor	w6,w6,w7,ror #32-18
3310	eor	w6,w6,w7,ror #32-24
3311	eor	w12,w12,w6
3312	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3313	eor	w6,w14,w15
3314	eor	w9,w12,w8
3315	eor	w6,w6,w9
3316	movi	v1.16b,#64
3317	movi	v2.16b,#128
3318	movi	v3.16b,#192
3319	mov	v0.s[0],w6
3320
3321	sub	v1.16b,v0.16b,v1.16b
3322	sub	v2.16b,v0.16b,v2.16b
3323	sub	v3.16b,v0.16b,v3.16b
3324
3325	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3326	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3327	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3328	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3329
3330	mov	w6,v0.s[0]
3331	mov	w7,v1.s[0]
3332	mov	w9,v2.s[0]
3333	add	w7,w6,w7
3334	mov	w6,v3.s[0]
3335	add	w7,w7,w9
3336	add	w7,w7,w6
3337
3338	eor	w6,w7,w7,ror #32-2
3339	eor	w6,w6,w7,ror #32-10
3340	eor	w6,w6,w7,ror #32-18
3341	eor	w6,w6,w7,ror #32-24
3342	ldp	w7,w8,[x10],8
3343	eor	w13,w13,w6
3344	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3345	eor	w6,w12,w13
3346	eor	w9,w7,w15
3347	eor	w6,w6,w9
3348	movi	v1.16b,#64
3349	movi	v2.16b,#128
3350	movi	v3.16b,#192
3351	mov	v0.s[0],w6
3352
3353	sub	v1.16b,v0.16b,v1.16b
3354	sub	v2.16b,v0.16b,v2.16b
3355	sub	v3.16b,v0.16b,v3.16b
3356
3357	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3358	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3359	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3360	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3361
3362	mov	w6,v0.s[0]
3363	mov	w7,v1.s[0]
3364	mov	w9,v2.s[0]
3365	add	w7,w6,w7
3366	mov	w6,v3.s[0]
3367	add	w7,w7,w9
3368	add	w7,w7,w6
3369
3370	eor	w6,w7,w7,ror #32-2
3371	eor	w6,w6,w7,ror #32-10
3372	eor	w6,w6,w7,ror #32-18
3373	eor	w6,w6,w7,ror #32-24
3374	eor	w14,w14,w6
3375	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3376	eor	w6,w12,w13
3377	eor	w9,w14,w8
3378	eor	w6,w6,w9
3379	movi	v1.16b,#64
3380	movi	v2.16b,#128
3381	movi	v3.16b,#192
3382	mov	v0.s[0],w6
3383
3384	sub	v1.16b,v0.16b,v1.16b
3385	sub	v2.16b,v0.16b,v2.16b
3386	sub	v3.16b,v0.16b,v3.16b
3387
3388	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3389	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3390	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3391	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3392
3393	mov	w6,v0.s[0]
3394	mov	w7,v1.s[0]
3395	mov	w9,v2.s[0]
3396	add	w7,w6,w7
3397	mov	w6,v3.s[0]
3398	add	w7,w7,w9
3399	add	w7,w7,w6
3400
3401	eor	w6,w7,w7,ror #32-2
3402	eor	w6,w6,w7,ror #32-10
3403	eor	w6,w6,w7,ror #32-18
3404	eor	w6,w6,w7,ror #32-24
3405	eor	w15,w15,w6
3406	subs	w11,w11,#1
3407	b.ne	10b
3408	mov	v4.s[0],w15
3409	mov	v4.s[1],w14
3410	mov	v4.s[2],w13
3411	mov	v4.s[3],w12
3412#ifndef __AARCH64EB__
3413	rev32	v4.16b,v4.16b
3414#endif
3415	eor	v4.16b, v4.16b, v8.16b
3416	st1	{v4.4s},[x1],#16
3417	// save the last tweak
3418	st1	{v8.4s},[x5]
3419	b	100f
34201:	//	process last 2 blocks
3421	cmp	x2,#2
3422	b.gt	1f
3423	ld1	{v4.4s,v5.4s},[x0],#32
3424	rbit	v8.16b,v8.16b
3425	rbit	v9.16b,v9.16b
3426	eor	v4.16b, v4.16b, v8.16b
3427	eor	v5.16b, v5.16b, v9.16b
3428#ifndef __AARCH64EB__
3429	rev32	v4.16b,v4.16b
3430#endif
3431#ifndef __AARCH64EB__
3432	rev32	v5.16b,v5.16b
3433#endif
3434	zip1	v0.4s,v4.4s,v5.4s
3435	zip2	v1.4s,v4.4s,v5.4s
3436	zip1	v2.4s,v6.4s,v7.4s
3437	zip2	v3.4s,v6.4s,v7.4s
3438	zip1	v4.2d,v0.2d,v2.2d
3439	zip2	v5.2d,v0.2d,v2.2d
3440	zip1	v6.2d,v1.2d,v3.2d
3441	zip2	v7.2d,v1.2d,v3.2d
3442	bl	_vpsm4_enc_4blks
3443	zip1	v4.4s,v0.4s,v1.4s
3444	zip2	v5.4s,v0.4s,v1.4s
3445	zip1	v6.4s,v2.4s,v3.4s
3446	zip2	v7.4s,v2.4s,v3.4s
3447	zip1	v0.2d,v4.2d,v6.2d
3448	zip2	v1.2d,v4.2d,v6.2d
3449	zip1	v2.2d,v5.2d,v7.2d
3450	zip2	v3.2d,v5.2d,v7.2d
3451	eor	v0.16b, v0.16b, v8.16b
3452	eor	v1.16b, v1.16b, v9.16b
3453	st1	{v0.4s,v1.4s},[x1],#32
3454	// save the last tweak
3455	st1	{v9.4s},[x5]
3456	b	100f
34571:	//	process last 3 blocks
3458	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
3459	rbit	v8.16b,v8.16b
3460	rbit	v9.16b,v9.16b
3461	rbit	v10.16b,v10.16b
3462	eor	v4.16b, v4.16b, v8.16b
3463	eor	v5.16b, v5.16b, v9.16b
3464	eor	v6.16b, v6.16b, v10.16b
3465#ifndef __AARCH64EB__
3466	rev32	v4.16b,v4.16b
3467#endif
3468#ifndef __AARCH64EB__
3469	rev32	v5.16b,v5.16b
3470#endif
3471#ifndef __AARCH64EB__
3472	rev32	v6.16b,v6.16b
3473#endif
3474	zip1	v0.4s,v4.4s,v5.4s
3475	zip2	v1.4s,v4.4s,v5.4s
3476	zip1	v2.4s,v6.4s,v7.4s
3477	zip2	v3.4s,v6.4s,v7.4s
3478	zip1	v4.2d,v0.2d,v2.2d
3479	zip2	v5.2d,v0.2d,v2.2d
3480	zip1	v6.2d,v1.2d,v3.2d
3481	zip2	v7.2d,v1.2d,v3.2d
3482	bl	_vpsm4_enc_4blks
3483	zip1	v4.4s,v0.4s,v1.4s
3484	zip2	v5.4s,v0.4s,v1.4s
3485	zip1	v6.4s,v2.4s,v3.4s
3486	zip2	v7.4s,v2.4s,v3.4s
3487	zip1	v0.2d,v4.2d,v6.2d
3488	zip2	v1.2d,v4.2d,v6.2d
3489	zip1	v2.2d,v5.2d,v7.2d
3490	zip2	v3.2d,v5.2d,v7.2d
3491	eor	v0.16b, v0.16b, v8.16b
3492	eor	v1.16b, v1.16b, v9.16b
3493	eor	v2.16b, v2.16b, v10.16b
3494	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
3495	// save the last tweak
3496	st1	{v10.4s},[x5]
3497100:
3498	cmp	x29,0
3499	b.eq	.return_gb
3500
3501// This branch calculates the last two tweaks
3502// when the encryption/decryption length is larger than 32
3503.last_2blks_tweak_gb:
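	// vectorized doubling of the (bit-reversed) tweak: shl shifts
	// every byte left by one, ext/ushr extracts the bit shifted
	// out of each preceding byte, and mul with .Lxts_magic
	// (0x87,0x01,...,0x01) multiplies the bit carried out of byte
	// 15 by the reduction constant while simply propagating the
	// other carries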
3504	ld1	{v8.4s},[x5]
3505#ifdef __AARCH64EB__
3506	rev32	v8.16b,v8.16b
3507#endif
3508	rbit	v2.16b,v8.16b
3509	adrp	x10,.Lxts_magic
3510	ldr	q0, [x10, #:lo12:.Lxts_magic]
3511	shl	v9.16b, v2.16b, #1
3512	ext	v1.16b, v2.16b, v2.16b,#15
3513	ushr	v1.16b, v1.16b, #7
3514	mul	v1.16b, v1.16b, v0.16b
3515	eor	v9.16b, v9.16b, v1.16b
3516	rbit	v9.16b,v9.16b
3517	rbit	v2.16b,v9.16b
3518	adrp	x10,.Lxts_magic
3519	ldr	q0, [x10, #:lo12:.Lxts_magic]
3520	shl	v10.16b, v2.16b, #1
3521	ext	v1.16b, v2.16b, v2.16b,#15
3522	ushr	v1.16b, v1.16b, #7
3523	mul	v1.16b, v1.16b, v0.16b
3524	eor	v10.16b, v10.16b, v1.16b
3525	rbit	v10.16b,v10.16b
3526	b	.check_dec_gb
3527
3528
3529// This branch calculates the last two tweaks
3530// when the encryption/decryption length is exactly 32, which needs only these two tweaks
3531.only_2blks_tweak_gb:
3532	mov	v9.16b,v8.16b
3533#ifdef __AARCH64EB__
3534	rev32	v9.16b,v9.16b
3535#endif
3536	rbit	v2.16b,v9.16b
3537	adrp	x10,.Lxts_magic
3538	ldr	q0, [x10, #:lo12:.Lxts_magic]
3539	shl	v10.16b, v2.16b, #1
3540	ext	v1.16b, v2.16b, v2.16b,#15
3541	ushr	v1.16b, v1.16b, #7
3542	mul	v1.16b, v1.16b, v0.16b
3543	eor	v10.16b, v10.16b, v1.16b
3544	rbit	v10.16b,v10.16b
3545	b	.check_dec_gb
3546
3547
3548// Determine whether encryption or decryption is required.
3549// The last two tweaks need to be swapped for decryption.
3550.check_dec_gb:
3551	// encryption:1 decryption:0
3552	cmp	w28,1
3553	b.eq	.process_last_2blks_gb
3554	mov	v0.16b,v9.16b
3555	mov	v9.16b,v10.16b
3556	mov	v10.16b,v0.16b
3557
3558.process_last_2blks_gb:
3559#ifdef __AARCH64EB__
3560	rev32	v9.16b,v9.16b
3561#endif
3562#ifdef __AARCH64EB__
3563	rev32	v10.16b,v10.16b
3564#endif
3565	ld1	{v4.4s},[x0],#16
3566	eor	v4.16b, v4.16b, v9.16b
3567#ifndef __AARCH64EB__
3568	rev32	v4.16b,v4.16b
3569#endif
3570	mov	x10,x3
3571	mov	w11,#8
3572	mov	w12,v4.s[0]
3573	mov	w13,v4.s[1]
3574	mov	w14,v4.s[2]
3575	mov	w15,v4.s[3]
357610:
3577	ldp	w7,w8,[x10],8
3578	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3579	eor	w6,w14,w15
3580	eor	w9,w7,w13
3581	eor	w6,w6,w9
3582	movi	v1.16b,#64
3583	movi	v2.16b,#128
3584	movi	v3.16b,#192
3585	mov	v0.s[0],w6
3586
3587	sub	v1.16b,v0.16b,v1.16b
3588	sub	v2.16b,v0.16b,v2.16b
3589	sub	v3.16b,v0.16b,v3.16b
3590
3591	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3592	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3593	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3594	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3595
3596	mov	w6,v0.s[0]
3597	mov	w7,v1.s[0]
3598	mov	w9,v2.s[0]
3599	add	w7,w6,w7
3600	mov	w6,v3.s[0]
3601	add	w7,w7,w9
3602	add	w7,w7,w6
3603
3604	eor	w6,w7,w7,ror #32-2
3605	eor	w6,w6,w7,ror #32-10
3606	eor	w6,w6,w7,ror #32-18
3607	eor	w6,w6,w7,ror #32-24
3608	eor	w12,w12,w6
3609	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3610	eor	w6,w14,w15
3611	eor	w9,w12,w8
3612	eor	w6,w6,w9
3613	movi	v1.16b,#64
3614	movi	v2.16b,#128
3615	movi	v3.16b,#192
3616	mov	v0.s[0],w6
3617
3618	sub	v1.16b,v0.16b,v1.16b
3619	sub	v2.16b,v0.16b,v2.16b
3620	sub	v3.16b,v0.16b,v3.16b
3621
3622	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3623	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3624	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3625	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3626
3627	mov	w6,v0.s[0]
3628	mov	w7,v1.s[0]
3629	mov	w9,v2.s[0]
3630	add	w7,w6,w7
3631	mov	w6,v3.s[0]
3632	add	w7,w7,w9
3633	add	w7,w7,w6
3634
3635	eor	w6,w7,w7,ror #32-2
3636	eor	w6,w6,w7,ror #32-10
3637	eor	w6,w6,w7,ror #32-18
3638	eor	w6,w6,w7,ror #32-24
3639	ldp	w7,w8,[x10],8
3640	eor	w13,w13,w6
3641	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3642	eor	w6,w12,w13
3643	eor	w9,w7,w15
3644	eor	w6,w6,w9
3645	movi	v1.16b,#64
3646	movi	v2.16b,#128
3647	movi	v3.16b,#192
3648	mov	v0.s[0],w6
3649
3650	sub	v1.16b,v0.16b,v1.16b
3651	sub	v2.16b,v0.16b,v2.16b
3652	sub	v3.16b,v0.16b,v3.16b
3653
3654	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3655	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3656	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3657	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3658
3659	mov	w6,v0.s[0]
3660	mov	w7,v1.s[0]
3661	mov	w9,v2.s[0]
3662	add	w7,w6,w7
3663	mov	w6,v3.s[0]
3664	add	w7,w7,w9
3665	add	w7,w7,w6
3666
3667	eor	w6,w7,w7,ror #32-2
3668	eor	w6,w6,w7,ror #32-10
3669	eor	w6,w6,w7,ror #32-18
3670	eor	w6,w6,w7,ror #32-24
3671	eor	w14,w14,w6
3672	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3673	eor	w6,w12,w13
3674	eor	w9,w14,w8
3675	eor	w6,w6,w9
3676	movi	v1.16b,#64
3677	movi	v2.16b,#128
3678	movi	v3.16b,#192
3679	mov	v0.s[0],w6
3680
3681	sub	v1.16b,v0.16b,v1.16b
3682	sub	v2.16b,v0.16b,v2.16b
3683	sub	v3.16b,v0.16b,v3.16b
3684
3685	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3686	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3687	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3688	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3689
3690	mov	w6,v0.s[0]
3691	mov	w7,v1.s[0]
3692	mov	w9,v2.s[0]
3693	add	w7,w6,w7
3694	mov	w6,v3.s[0]
3695	add	w7,w7,w9
3696	add	w7,w7,w6
3697
3698	eor	w6,w7,w7,ror #32-2
3699	eor	w6,w6,w7,ror #32-10
3700	eor	w6,w6,w7,ror #32-18
3701	eor	w6,w6,w7,ror #32-24
3702	eor	w15,w15,w6
3703	subs	w11,w11,#1
3704	b.ne	10b
3705	mov	v4.s[0],w15
3706	mov	v4.s[1],w14
3707	mov	v4.s[2],w13
3708	mov	v4.s[3],w12
3709#ifndef __AARCH64EB__
3710	rev32	v4.16b,v4.16b
3711#endif
3712	eor	v4.16b, v4.16b, v9.16b
3713	st1	{v4.4s},[x1],#16
3714
3715	sub	x26,x1,16
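	// ciphertext stealing: x26 points at the last full ciphertext
	// block just written.  The byte loop swaps the first x29
	// bytes: the stolen ciphertext bytes become the final partial
	// output block at x1, while the trailing input bytes are
	// patched into the block at x26, which is then re-encrypted
	// with the remaining tweak (v10) below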
3716.loop_gb:
3717	subs	x29,x29,1
3718	ldrb	w7,[x26,x29]
3719	ldrb	w8,[x0,x29]
3720	strb	w8,[x26,x29]
3721	strb	w7,[x1,x29]
3722	b.gt	.loop_gb
3723	ld1	{v4.4s}, [x26]
3724	eor	v4.16b, v4.16b, v10.16b
3725#ifndef __AARCH64EB__
3726	rev32	v4.16b,v4.16b
3727#endif
3728	mov	x10,x3
3729	mov	w11,#8
3730	mov	w12,v4.s[0]
3731	mov	w13,v4.s[1]
3732	mov	w14,v4.s[2]
3733	mov	w15,v4.s[3]
373410:
3735	ldp	w7,w8,[x10],8
3736	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3737	eor	w6,w14,w15
3738	eor	w9,w7,w13
3739	eor	w6,w6,w9
3740	movi	v1.16b,#64
3741	movi	v2.16b,#128
3742	movi	v3.16b,#192
3743	mov	v0.s[0],w6
3744
3745	sub	v1.16b,v0.16b,v1.16b
3746	sub	v2.16b,v0.16b,v2.16b
3747	sub	v3.16b,v0.16b,v3.16b
3748
3749	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3750	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3751	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3752	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3753
3754	mov	w6,v0.s[0]
3755	mov	w7,v1.s[0]
3756	mov	w9,v2.s[0]
3757	add	w7,w6,w7
3758	mov	w6,v3.s[0]
3759	add	w7,w7,w9
3760	add	w7,w7,w6
3761
3762	eor	w6,w7,w7,ror #32-2
3763	eor	w6,w6,w7,ror #32-10
3764	eor	w6,w6,w7,ror #32-18
3765	eor	w6,w6,w7,ror #32-24
3766	eor	w12,w12,w6
3767	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3768	eor	w6,w14,w15
3769	eor	w9,w12,w8
3770	eor	w6,w6,w9
3771	movi	v1.16b,#64
3772	movi	v2.16b,#128
3773	movi	v3.16b,#192
3774	mov	v0.s[0],w6
3775
3776	sub	v1.16b,v0.16b,v1.16b
3777	sub	v2.16b,v0.16b,v2.16b
3778	sub	v3.16b,v0.16b,v3.16b
3779
3780	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3781	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3782	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3783	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3784
3785	mov	w6,v0.s[0]
3786	mov	w7,v1.s[0]
3787	mov	w9,v2.s[0]
3788	add	w7,w6,w7
3789	mov	w6,v3.s[0]
3790	add	w7,w7,w9
3791	add	w7,w7,w6
3792
3793	eor	w6,w7,w7,ror #32-2
3794	eor	w6,w6,w7,ror #32-10
3795	eor	w6,w6,w7,ror #32-18
3796	eor	w6,w6,w7,ror #32-24
3797	ldp	w7,w8,[x10],8
3798	eor	w13,w13,w6
3799	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3800	eor	w6,w12,w13
3801	eor	w9,w7,w15
3802	eor	w6,w6,w9
3803	movi	v1.16b,#64
3804	movi	v2.16b,#128
3805	movi	v3.16b,#192
3806	mov	v0.s[0],w6
3807
3808	sub	v1.16b,v0.16b,v1.16b
3809	sub	v2.16b,v0.16b,v2.16b
3810	sub	v3.16b,v0.16b,v3.16b
3811
3812	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3813	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3814	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3815	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3816
3817	mov	w6,v0.s[0]
3818	mov	w7,v1.s[0]
3819	mov	w9,v2.s[0]
3820	add	w7,w6,w7
3821	mov	w6,v3.s[0]
3822	add	w7,w7,w9
3823	add	w7,w7,w6
3824
3825	eor	w6,w7,w7,ror #32-2
3826	eor	w6,w6,w7,ror #32-10
3827	eor	w6,w6,w7,ror #32-18
3828	eor	w6,w6,w7,ror #32-24
3829	eor	w14,w14,w6
3830	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3831	eor	w6,w12,w13
3832	eor	w9,w14,w8
3833	eor	w6,w6,w9
3834	movi	v1.16b,#64
3835	movi	v2.16b,#128
3836	movi	v3.16b,#192
3837	mov	v0.s[0],w6
3838
3839	sub	v1.16b,v0.16b,v1.16b
3840	sub	v2.16b,v0.16b,v2.16b
3841	sub	v3.16b,v0.16b,v3.16b
3842
3843	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3844	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3845	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3846	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3847
3848	mov	w6,v0.s[0]
3849	mov	w7,v1.s[0]
3850	mov	w9,v2.s[0]
3851	add	w7,w6,w7
3852	mov	w6,v3.s[0]
3853	add	w7,w7,w9
3854	add	w7,w7,w6
3855
3856	eor	w6,w7,w7,ror #32-2
3857	eor	w6,w6,w7,ror #32-10
3858	eor	w6,w6,w7,ror #32-18
3859	eor	w6,w6,w7,ror #32-24
3860	eor	w15,w15,w6
3861	subs	w11,w11,#1
3862	b.ne	10b
3863	mov	v4.s[0],w15
3864	mov	v4.s[1],w14
3865	mov	v4.s[2],w13
3866	mov	v4.s[3],w12
3867#ifndef __AARCH64EB__
3868	rev32	v4.16b,v4.16b
3869#endif
3870	eor	v4.16b, v4.16b, v10.16b
3871	st1	{v4.4s}, [x26]
3872.return_gb:
3873	ldp	d14, d15, [sp], #0x10
3874	ldp	d12, d13, [sp], #0x10
3875	ldp	d10, d11, [sp], #0x10
3876	ldp	d8, d9, [sp], #0x10
3877	ldp	x29, x30, [sp], #0x10
3878	ldp	x27, x28, [sp], #0x10
3879	ldp	x25, x26, [sp], #0x10
3880	ldp	x23, x24, [sp], #0x10
3881	ldp	x21, x22, [sp], #0x10
3882	ldp	x19, x20, [sp], #0x10
3883	ldp	x17, x18, [sp], #0x10
3884	ldp	x15, x16, [sp], #0x10
3885	AARCH64_VALIDATE_LINK_REGISTER
3886	ret
3887.size	vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb
3888.globl	vpsm4_xts_encrypt
3889.type	vpsm4_xts_encrypt,%function
3890.align	5
3891vpsm4_xts_encrypt:
3892	AARCH64_SIGN_LINK_REGISTER
3893	stp	x15, x16, [sp, #-0x10]!
3894	stp	x17, x18, [sp, #-0x10]!
3895	stp	x19, x20, [sp, #-0x10]!
3896	stp	x21, x22, [sp, #-0x10]!
3897	stp	x23, x24, [sp, #-0x10]!
3898	stp	x25, x26, [sp, #-0x10]!
3899	stp	x27, x28, [sp, #-0x10]!
3900	stp	x29, x30, [sp, #-0x10]!
3901	stp	d8, d9, [sp, #-0x10]!
3902	stp	d10, d11, [sp, #-0x10]!
3903	stp	d12, d13, [sp, #-0x10]!
3904	stp	d14, d15, [sp, #-0x10]!
3905	mov	x26,x3
3906	mov	x27,x4
3907	mov	w28,w6
3908	ld1	{v8.4s}, [x5]
3909	mov	x3,x27
3910	adrp	x10,.Lsbox
3911	add	x10,x10,#:lo12:.Lsbox
3912	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
3913	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
3914	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
3915	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
3916#ifndef __AARCH64EB__
3917	rev32	v8.16b,v8.16b
3918#endif
3919	mov	x10,x3
3920	mov	w11,#8
3921	mov	w12,v8.s[0]
3922	mov	w13,v8.s[1]
3923	mov	w14,v8.s[2]
3924	mov	w15,v8.s[3]
392510:
3926	ldp	w7,w8,[x10],8
3927	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3928	eor	w6,w14,w15
3929	eor	w9,w7,w13
3930	eor	w6,w6,w9
3931	movi	v1.16b,#64
3932	movi	v2.16b,#128
3933	movi	v3.16b,#192
3934	mov	v0.s[0],w6
3935
3936	sub	v1.16b,v0.16b,v1.16b
3937	sub	v2.16b,v0.16b,v2.16b
3938	sub	v3.16b,v0.16b,v3.16b
3939
3940	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3941	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3942	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3943	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3944
3945	mov	w6,v0.s[0]
3946	mov	w7,v1.s[0]
3947	mov	w9,v2.s[0]
3948	add	w7,w6,w7
3949	mov	w6,v3.s[0]
3950	add	w7,w7,w9
3951	add	w7,w7,w6
3952
3953	eor	w6,w7,w7,ror #32-2
3954	eor	w6,w6,w7,ror #32-10
3955	eor	w6,w6,w7,ror #32-18
3956	eor	w6,w6,w7,ror #32-24
3957	eor	w12,w12,w6
3958	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3959	eor	w6,w14,w15
3960	eor	w9,w12,w8
3961	eor	w6,w6,w9
3962	movi	v1.16b,#64
3963	movi	v2.16b,#128
3964	movi	v3.16b,#192
3965	mov	v0.s[0],w6
3966
3967	sub	v1.16b,v0.16b,v1.16b
3968	sub	v2.16b,v0.16b,v2.16b
3969	sub	v3.16b,v0.16b,v3.16b
3970
3971	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3972	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3973	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3974	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3975
3976	mov	w6,v0.s[0]
3977	mov	w7,v1.s[0]
3978	mov	w9,v2.s[0]
3979	add	w7,w6,w7
3980	mov	w6,v3.s[0]
3981	add	w7,w7,w9
3982	add	w7,w7,w6
3983
3984	eor	w6,w7,w7,ror #32-2
3985	eor	w6,w6,w7,ror #32-10
3986	eor	w6,w6,w7,ror #32-18
3987	eor	w6,w6,w7,ror #32-24
3988	ldp	w7,w8,[x10],8
3989	eor	w13,w13,w6
3990	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3991	eor	w6,w12,w13
3992	eor	w9,w7,w15
3993	eor	w6,w6,w9
3994	movi	v1.16b,#64
3995	movi	v2.16b,#128
3996	movi	v3.16b,#192
3997	mov	v0.s[0],w6
3998
3999	sub	v1.16b,v0.16b,v1.16b
4000	sub	v2.16b,v0.16b,v2.16b
4001	sub	v3.16b,v0.16b,v3.16b
4002
4003	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4004	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4005	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4006	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4007
4008	mov	w6,v0.s[0]
4009	mov	w7,v1.s[0]
4010	mov	w9,v2.s[0]
4011	add	w7,w6,w7
4012	mov	w6,v3.s[0]
4013	add	w7,w7,w9
4014	add	w7,w7,w6
4015
4016	eor	w6,w7,w7,ror #32-2
4017	eor	w6,w6,w7,ror #32-10
4018	eor	w6,w6,w7,ror #32-18
4019	eor	w6,w6,w7,ror #32-24
4020	eor	w14,w14,w6
4021	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
4022	eor	w6,w12,w13
4023	eor	w9,w14,w8
4024	eor	w6,w6,w9
4025	movi	v1.16b,#64
4026	movi	v2.16b,#128
4027	movi	v3.16b,#192
4028	mov	v0.s[0],w6
4029
4030	sub	v1.16b,v0.16b,v1.16b
4031	sub	v2.16b,v0.16b,v2.16b
4032	sub	v3.16b,v0.16b,v3.16b
4033
4034	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4035	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4036	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4037	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4038
4039	mov	w6,v0.s[0]
4040	mov	w7,v1.s[0]
4041	mov	w9,v2.s[0]
4042	add	w7,w6,w7
4043	mov	w6,v3.s[0]
4044	add	w7,w7,w9
4045	add	w7,w7,w6
4046
4047	eor	w6,w7,w7,ror #32-2
4048	eor	w6,w6,w7,ror #32-10
4049	eor	w6,w6,w7,ror #32-18
4050	eor	w6,w6,w7,ror #32-24
4051	eor	w15,w15,w6
4052	subs	w11,w11,#1
4053	b.ne	10b
4054	mov	v8.s[0],w15
4055	mov	v8.s[1],w14
4056	mov	v8.s[2],w13
4057	mov	v8.s[3],w12
4058#ifndef __AARCH64EB__
4059	rev32	v8.16b,v8.16b
4060#endif
4061	mov	x3,x26
4062	and	x29,x2,#0x0F
4063	// convert length into blocks
4064	lsr	x2,x2,4
4065	cmp	x2,#1
4066	b.lt	.return
4067
4068	cmp	x29,0
4069	// If the encryption/decryption length is a multiple of 16,
4070	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
4071	b.eq	.xts_encrypt_blocks
4072
4073	// If the encryption/decryption length is not a multiple of 16,
4074	// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak;
4075	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks
4076	subs	x2,x2,#1
4077	b.eq	.only_2blks_tweak
4078.xts_encrypt_blocks:
4079#ifdef __AARCH64EB__
4080	rev32	v8.16b,v8.16b
4081#endif
4082	mov	x12,v8.d[0]
4083	mov	x13,v8.d[1]
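	// same tweak schedule as the _gb path, but on the tweak in its
	// natural bit order: each group below multiplies the previous
	// tweak by x in GF(2^128) (left shift by one bit, reduced by
	// 0x87 when the top bit falls off), filling x12..x27 with
	// tweaks for 8 blocks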
4084	mov	w7,0x87
4085	extr	x9,x13,x13,#32
4086	extr	x15,x13,x12,#63
4087	and	w8,w7,w9,asr#31
4088	eor	x14,x8,x12,lsl#1
4089	mov	w7,0x87
4090	extr	x9,x15,x15,#32
4091	extr	x17,x15,x14,#63
4092	and	w8,w7,w9,asr#31
4093	eor	x16,x8,x14,lsl#1
4094	mov	w7,0x87
4095	extr	x9,x17,x17,#32
4096	extr	x19,x17,x16,#63
4097	and	w8,w7,w9,asr#31
4098	eor	x18,x8,x16,lsl#1
4099	mov	w7,0x87
4100	extr	x9,x19,x19,#32
4101	extr	x21,x19,x18,#63
4102	and	w8,w7,w9,asr#31
4103	eor	x20,x8,x18,lsl#1
4104	mov	w7,0x87
4105	extr	x9,x21,x21,#32
4106	extr	x23,x21,x20,#63
4107	and	w8,w7,w9,asr#31
4108	eor	x22,x8,x20,lsl#1
4109	mov	w7,0x87
4110	extr	x9,x23,x23,#32
4111	extr	x25,x23,x22,#63
4112	and	w8,w7,w9,asr#31
4113	eor	x24,x8,x22,lsl#1
4114	mov	w7,0x87
4115	extr	x9,x25,x25,#32
4116	extr	x27,x25,x24,#63
4117	and	w8,w7,w9,asr#31
4118	eor	x26,x8,x24,lsl#1
4119.Lxts_8_blocks_process:
4120	cmp	x2,#8
4121	b.lt	.Lxts_4_blocks_process
4122	mov	v0.d[0],x12
4123	mov	v0.d[1],x13
4124#ifdef __AARCH64EB__
4125	rev32	v0.16b,v0.16b
4126#endif
4127	mov	v1.d[0],x14
4128	mov	v1.d[1],x15
4129#ifdef __AARCH64EB__
4130	rev32	v1.16b,v1.16b
4131#endif
4132	mov	v2.d[0],x16
4133	mov	v2.d[1],x17
4134#ifdef __AARCH64EB__
4135	rev32	v2.16b,v2.16b
4136#endif
4137	mov	v3.d[0],x18
4138	mov	v3.d[1],x19
4139#ifdef __AARCH64EB__
4140	rev32	v3.16b,v3.16b
4141#endif
4142	mov	v12.d[0],x20
4143	mov	v12.d[1],x21
4144#ifdef __AARCH64EB__
4145	rev32	v12.16b,v12.16b
4146#endif
4147	mov	v13.d[0],x22
4148	mov	v13.d[1],x23
4149#ifdef __AARCH64EB__
4150	rev32	v13.16b,v13.16b
4151#endif
4152	mov	v14.d[0],x24
4153	mov	v14.d[1],x25
4154#ifdef __AARCH64EB__
4155	rev32	v14.16b,v14.16b
4156#endif
4157	mov	v15.d[0],x26
4158	mov	v15.d[1],x27
4159#ifdef __AARCH64EB__
4160	rev32	v15.16b,v15.16b
4161#endif
4162	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
4163	eor	v4.16b, v4.16b, v0.16b
4164	eor	v5.16b, v5.16b, v1.16b
4165	eor	v6.16b, v6.16b, v2.16b
4166	eor	v7.16b, v7.16b, v3.16b
4167	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
4168	eor	v8.16b, v8.16b, v12.16b
4169	eor	v9.16b, v9.16b, v13.16b
4170	eor	v10.16b, v10.16b, v14.16b
4171	eor	v11.16b, v11.16b, v15.16b
4172#ifndef __AARCH64EB__
4173	rev32	v4.16b,v4.16b
4174#endif
4175#ifndef __AARCH64EB__
4176	rev32	v5.16b,v5.16b
4177#endif
4178#ifndef __AARCH64EB__
4179	rev32	v6.16b,v6.16b
4180#endif
4181#ifndef __AARCH64EB__
4182	rev32	v7.16b,v7.16b
4183#endif
4184#ifndef __AARCH64EB__
4185	rev32	v8.16b,v8.16b
4186#endif
4187#ifndef __AARCH64EB__
4188	rev32	v9.16b,v9.16b
4189#endif
4190#ifndef __AARCH64EB__
4191	rev32	v10.16b,v10.16b
4192#endif
4193#ifndef __AARCH64EB__
4194	rev32	v11.16b,v11.16b
4195#endif
4196	zip1	v0.4s,v4.4s,v5.4s
4197	zip2	v1.4s,v4.4s,v5.4s
4198	zip1	v2.4s,v6.4s,v7.4s
4199	zip2	v3.4s,v6.4s,v7.4s
4200	zip1	v4.2d,v0.2d,v2.2d
4201	zip2	v5.2d,v0.2d,v2.2d
4202	zip1	v6.2d,v1.2d,v3.2d
4203	zip2	v7.2d,v1.2d,v3.2d
4204	zip1	v0.4s,v8.4s,v9.4s
4205	zip2	v1.4s,v8.4s,v9.4s
4206	zip1	v2.4s,v10.4s,v11.4s
4207	zip2	v3.4s,v10.4s,v11.4s
4208	zip1	v8.2d,v0.2d,v2.2d
4209	zip2	v9.2d,v0.2d,v2.2d
4210	zip1	v10.2d,v1.2d,v3.2d
4211	zip2	v11.2d,v1.2d,v3.2d
4212	bl	_vpsm4_enc_8blks
4213	zip1	v8.4s,v0.4s,v1.4s
4214	zip2	v9.4s,v0.4s,v1.4s
4215	zip1	v10.4s,v2.4s,v3.4s
4216	zip2	v11.4s,v2.4s,v3.4s
4217	zip1	v0.2d,v8.2d,v10.2d
4218	zip2	v1.2d,v8.2d,v10.2d
4219	zip1	v2.2d,v9.2d,v11.2d
4220	zip2	v3.2d,v9.2d,v11.2d
4221	zip1	v8.4s,v4.4s,v5.4s
4222	zip2	v9.4s,v4.4s,v5.4s
4223	zip1	v10.4s,v6.4s,v7.4s
4224	zip2	v11.4s,v6.4s,v7.4s
4225	zip1	v4.2d,v8.2d,v10.2d
4226	zip2	v5.2d,v8.2d,v10.2d
4227	zip1	v6.2d,v9.2d,v11.2d
4228	zip2	v7.2d,v9.2d,v11.2d
4229	mov	v12.d[0],x12
4230	mov	v12.d[1],x13
4231#ifdef __AARCH64EB__
4232	rev32	v12.16b,v12.16b
4233#endif
4234	mov	w7,0x87
4235	extr	x9,x27,x27,#32
4236	extr	x13,x27,x26,#63
4237	and	w8,w7,w9,asr#31
4238	eor	x12,x8,x26,lsl#1
4239	mov	v13.d[0],x14
4240	mov	v13.d[1],x15
4241#ifdef __AARCH64EB__
4242	rev32	v13.16b,v13.16b
4243#endif
4244	mov	w7,0x87
4245	extr	x9,x13,x13,#32
4246	extr	x15,x13,x12,#63
4247	and	w8,w7,w9,asr#31
4248	eor	x14,x8,x12,lsl#1
4249	mov	v14.d[0],x16
4250	mov	v14.d[1],x17
4251#ifdef __AARCH64EB__
4252	rev32	v14.16b,v14.16b
4253#endif
4254	mov	w7,0x87
4255	extr	x9,x15,x15,#32
4256	extr	x17,x15,x14,#63
4257	and	w8,w7,w9,asr#31
4258	eor	x16,x8,x14,lsl#1
4259	mov	v15.d[0],x18
4260	mov	v15.d[1],x19
4261#ifdef __AARCH64EB__
4262	rev32	v15.16b,v15.16b
4263#endif
4264	mov	w7,0x87
4265	extr	x9,x17,x17,#32
4266	extr	x19,x17,x16,#63
4267	and	w8,w7,w9,asr#31
4268	eor	x18,x8,x16,lsl#1
4269	mov	v8.d[0],x20
4270	mov	v8.d[1],x21
4271#ifdef __AARCH64EB__
4272	rev32	v8.16b,v8.16b
4273#endif
4274	mov	w7,0x87
4275	extr	x9,x19,x19,#32
4276	extr	x21,x19,x18,#63
4277	and	w8,w7,w9,asr#31
4278	eor	x20,x8,x18,lsl#1
4279	mov	v9.d[0],x22
4280	mov	v9.d[1],x23
4281#ifdef __AARCH64EB__
4282	rev32	v9.16b,v9.16b
4283#endif
4284	mov	w7,0x87
4285	extr	x9,x21,x21,#32
4286	extr	x23,x21,x20,#63
4287	and	w8,w7,w9,asr#31
4288	eor	x22,x8,x20,lsl#1
4289	mov	v10.d[0],x24
4290	mov	v10.d[1],x25
4291#ifdef __AARCH64EB__
4292	rev32	v10.16b,v10.16b
4293#endif
4294	mov	w7,0x87
4295	extr	x9,x23,x23,#32
4296	extr	x25,x23,x22,#63
4297	and	w8,w7,w9,asr#31
4298	eor	x24,x8,x22,lsl#1
4299	mov	v11.d[0],x26
4300	mov	v11.d[1],x27
4301#ifdef __AARCH64EB__
4302	rev32	v11.16b,v11.16b
4303#endif
4304	mov	w7,0x87
4305	extr	x9,x25,x25,#32
4306	extr	x27,x25,x24,#63
4307	and	w8,w7,w9,asr#31
4308	eor	x26,x8,x24,lsl#1
4309	eor	v0.16b, v0.16b, v12.16b
4310	eor	v1.16b, v1.16b, v13.16b
4311	eor	v2.16b, v2.16b, v14.16b
4312	eor	v3.16b, v3.16b, v15.16b
4313	eor	v4.16b, v4.16b, v8.16b
4314	eor	v5.16b, v5.16b, v9.16b
4315	eor	v6.16b, v6.16b, v10.16b
4316	eor	v7.16b, v7.16b, v11.16b
4317
4318	// save the last tweak
4319	st1	{v11.4s},[x5]
4320	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
4321	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
4322	subs	x2,x2,#8
4323	b.gt	.Lxts_8_blocks_process
4324	b	100f
4325.Lxts_4_blocks_process:
4326	mov	v8.d[0],x12
4327	mov	v8.d[1],x13
4328#ifdef __AARCH64EB__
4329	rev32	v8.16b,v8.16b
4330#endif
4331	mov	v9.d[0],x14
4332	mov	v9.d[1],x15
4333#ifdef __AARCH64EB__
4334	rev32	v9.16b,v9.16b
4335#endif
4336	mov	v10.d[0],x16
4337	mov	v10.d[1],x17
4338#ifdef __AARCH64EB__
4339	rev32	v10.16b,v10.16b
4340#endif
4341	mov	v11.d[0],x18
4342	mov	v11.d[1],x19
4343#ifdef __AARCH64EB__
4344	rev32	v11.16b,v11.16b
4345#endif
4346	cmp	x2,#4
4347	b.lt	1f
4348	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
4349	eor	v4.16b, v4.16b, v8.16b
4350	eor	v5.16b, v5.16b, v9.16b
4351	eor	v6.16b, v6.16b, v10.16b
4352	eor	v7.16b, v7.16b, v11.16b
4353#ifndef __AARCH64EB__
4354	rev32	v4.16b,v4.16b
4355#endif
4356#ifndef __AARCH64EB__
4357	rev32	v5.16b,v5.16b
4358#endif
4359#ifndef __AARCH64EB__
4360	rev32	v6.16b,v6.16b
4361#endif
4362#ifndef __AARCH64EB__
4363	rev32	v7.16b,v7.16b
4364#endif
4365	zip1	v0.4s,v4.4s,v5.4s
4366	zip2	v1.4s,v4.4s,v5.4s
4367	zip1	v2.4s,v6.4s,v7.4s
4368	zip2	v3.4s,v6.4s,v7.4s
4369	zip1	v4.2d,v0.2d,v2.2d
4370	zip2	v5.2d,v0.2d,v2.2d
4371	zip1	v6.2d,v1.2d,v3.2d
4372	zip2	v7.2d,v1.2d,v3.2d
4373	bl	_vpsm4_enc_4blks
4374	zip1	v4.4s,v0.4s,v1.4s
4375	zip2	v5.4s,v0.4s,v1.4s
4376	zip1	v6.4s,v2.4s,v3.4s
4377	zip2	v7.4s,v2.4s,v3.4s
4378	zip1	v0.2d,v4.2d,v6.2d
4379	zip2	v1.2d,v4.2d,v6.2d
4380	zip1	v2.2d,v5.2d,v7.2d
4381	zip2	v3.2d,v5.2d,v7.2d
4382	eor	v0.16b, v0.16b, v8.16b
4383	eor	v1.16b, v1.16b, v9.16b
4384	eor	v2.16b, v2.16b, v10.16b
4385	eor	v3.16b, v3.16b, v11.16b
4386	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
4387	sub	x2,x2,#4
4388	mov	v8.d[0],x20
4389	mov	v8.d[1],x21
4390#ifdef __AARCH64EB__
4391	rev32	v8.16b,v8.16b
4392#endif
4393	mov	v9.d[0],x22
4394	mov	v9.d[1],x23
4395#ifdef __AARCH64EB__
4396	rev32	v9.16b,v9.16b
4397#endif
4398	mov	v10.d[0],x24
4399	mov	v10.d[1],x25
4400#ifdef __AARCH64EB__
4401	rev32	v10.16b,v10.16b
4402#endif
4403	// save the last tweak
4404	st1	{v11.4s},[x5]
44051:
4406	// process last block
4407	cmp	x2,#1
4408	b.lt	100f
4409	b.gt	1f
4410	ld1	{v4.4s},[x0],#16
4411	eor	v4.16b, v4.16b, v8.16b
4412#ifndef __AARCH64EB__
4413	rev32	v4.16b,v4.16b
4414#endif
4415	mov	x10,x3
4416	mov	w11,#8
4417	mov	w12,v4.s[0]
4418	mov	w13,v4.s[1]
4419	mov	w14,v4.s[2]
4420	mov	w15,v4.s[3]
442110:
4422	ldp	w7,w8,[x10],8
4423	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
4424	eor	w6,w14,w15
4425	eor	w9,w7,w13
4426	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v8.16b
	st1	{v4.4s},[x1],#16
	// save the last tweak
	st1	{v8.4s},[x5]
	b	100f
1:	//	process last 2 blocks
	cmp	x2,#2
	b.gt	1f
	ld1	{v4.4s,v5.4s},[x0],#32
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
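	// Only v4/v5 carry input here; v6/v7 are don't-care lanes that
	// ride through the 4-block transpose, and the corresponding
	// outputs (v2/v3) are discarded below.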
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save the last tweak
	st1	{v9.4s},[x5]
	b	100f
1:	//	process last 3 blocks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	st1	{v10.4s},[x5]
100:
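	// x29 holds the length of the trailing partial block; if there
	// is none, ciphertext stealing is not needed and we can return.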
	cmp	x29,0
	b.eq	.return

// This branch computes the last two tweaks, used when the
// encryption/decryption length is larger than 32 bytes
.last_2blks_tweak:
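	// Each doubling below multiplies the tweak by x in GF(2^128)
	// using the little-endian XTS convention.  A rough C sketch of
	// what the shl/ext/ushr/mul/eor sequence computes (illustrative
	// only, not part of the generated code; assumes <stdint.h>):
	//
	//   void xts_double(uint8_t t[16]) {
	//       int carry = t[15] >> 7;              /* msb of the tweak */
	//       for (int i = 15; i > 0; i--)         /* shift left by 1  */
	//           t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
	//       t[0] = (uint8_t)((t[0] << 1) ^ (carry ? 0x87 : 0));
	//   }
	//
	// The per-byte carries come from rotating the tweak one byte
	// (ext #15), extracting each byte's top bit (ushr #7), and
	// scaling byte 0's carry by 0x87 via the .Lxts_magic multiplier.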
	ld1	{v8.4s},[x5]
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	v2.16b,v8.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v9.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v9.16b, v9.16b, v1.16b
	mov	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	b	.check_dec


// This branch computes the last two tweaks, used when the
// encryption/decryption length is exactly 32 bytes, in which case
// only two tweaks are needed
.only_2blks_tweak:
	mov	v9.16b,v8.16b
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	b	.check_dec


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec:
	// w28: encryption=1, decryption=0
	cmp	w28,1
	b.eq	.process_last_2blks
	mov	v0.16b,v9.16b
	mov	v9.16b,v10.16b
	mov	v10.16b,v0.16b

.process_last_2blks:
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v9.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v9.16b
	st1	{v4.4s},[x1],#16

	sub	x26,x1,16
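	// Ciphertext stealing: x26 points at the last full ciphertext
	// block just written.  The loop swaps the x29 tail bytes of the
	// input with the leading bytes of that block, so the tail of the
	// output receives the stolen ciphertext while the stolen block
	// is re-encrypted below.  Roughly, in C (illustrative only;
	// names are ours):
	//
	//   for (size_t i = taillen; i-- > 0; ) {
	//       uint8_t c = lastblk[i];   /* stolen ciphertext byte   */
	//       lastblk[i] = in[i];       /* tail plaintext moves in  */
	//       out[i] = c;               /* tail gets stolen byte    */
	//   }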
.loop:
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.loop
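	// Re-encrypt the stolen block (now carrying the tail plaintext)
	// in place with the final tweak v10.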
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v10.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v10.16b
	st1	{v4.4s}, [x26]
.return:
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_xts_encrypt,.-vpsm4_xts_encrypt
