/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	.text

	rounds		.req	x11
	bskey		.req	x12

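	/*
	 * Bit sliced representation: eight AES states are processed in
	 * parallel, with vector register n holding bit n of every byte of
	 * all eight blocks.  in_bs_ch/out_bs_ch apply the linear input and
	 * output basis changes of the S-box circuit, and the inv_ variants
	 * do the same for the InvSubBytes direction (naming and structure
	 * follow the 32-bit OpenSSL bsaes code this file is derived from).
	 */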
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b2, \b2, \b1
	eor		\b5, \b5, \b6
	eor		\b3, \b3, \b0
	eor		\b6, \b6, \b2
	eor		\b5, \b5, \b0
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b4, \b4, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b1, \b1, \b5
	.endm

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	eor		\b4, \b4, \b6
	eor		\b2, \b2, \b0
	eor		\b6, \b6, \b1
	eor		\b1, \b1, \b5
	eor		\b5, \b5, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b2, \b2, \b5
	eor		\b4, \b4, \b7
	.endm

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor		\b1, \b1, \b7
	eor		\b4, \b4, \b7
	eor		\b7, \b7, \b5
	eor		\b1, \b1, \b3
	eor		\b2, \b2, \b5
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b1
	eor		\b2, \b2, \b0
	eor		\b5, \b5, \b3
	eor		\b4, \b4, \b6
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	.endm

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor		\b1, \b1, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b4, \b4, \b5
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b5, \b5, \b0
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b2
	eor		\b2, \b2, \b1
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b0
	eor		\b5, \b5, \b6
	.endm

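	/*
	 * GF(2^2) multiplications used by the tower field construction of
	 * GF(2^8) (GF4 denotes the field with four elements, as in the bsaes
	 * naming convention): mul_gf4 multiplies the 2-bit elements x0:x1 and
	 * y0:y1, mul_gf4_n_gf4 performs two such multiplications while
	 * sharing the y0^y1 / y2^y3 subexpressions.
	 */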
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	eor		\t0, \y0, \y1
	and		\t0, \t0, \x0
	eor		\x0, \x0, \x1
	and		\t1, \x1, \y0
	and		\x0, \x0, \y1
	eor		\x1, \t1, \t0
	eor		\x0, \x0, \t1
	.endm

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor		\t0, \y0, \y1
	eor		\t1, \y2, \y3
	and		\t0, \t0, \x0
	and		\t1, \t1, \x2
	eor		\x0, \x0, \x1
	eor		\x2, \x2, \x3
	and		\x1, \x1, \y0
	and		\x3, \x3, \y2
	and		\x0, \x0, \y1
	and		\x2, \x2, \y3
	eor		\x1, \x1, \x0
	eor		\x2, \x2, \x3
	eor		\x0, \x0, \t0
	eor		\x3, \x3, \t1
	.endm

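	/*
	 * mul_gf16_2 performs two multiplications in GF(2^4) (each element
	 * held as four bit slices), built from the GF(2^2) primitives above;
	 * it is the workhorse of the GF(2^8) inverter below.
	 */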
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	eor		\t0, \x0, \x2
	eor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor		\x0, \x0, \t0
	eor		\x2, \x2, \t0
	eor		\x1, \x1, \t1
	eor		\x3, \x3, \t1
	eor		\t0, \x4, \x6
	eor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	eor		\x4, \x4, \t0
	eor		\x6, \x6, \t0
	eor		\x5, \x5, \t1
	eor		\x7, \x7, \t1
	.endm

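	/*
	 * inv_gf256 computes the multiplicative inverse in GF(2^8) on eight
	 * bit slices at once, using the tower field decomposition
	 * GF(((2^2)^2)^2).  This is the non-linear core shared by SubBytes
	 * and InvSubBytes; the surrounding basis changes select which of the
	 * two is implemented.
	 */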
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	eor		\t3, \x4, \x6
	eor		\t0, \x5, \x7
	eor		\t1, \x1, \x3
	eor		\s1, \x7, \x6
	eor		\s0, \x0, \x2
	eor		\s3, \t3, \t0
	orr		\t2, \t0, \t1
	and		\s2, \t3, \s0
	orr		\t3, \t3, \s0
	eor		\s0, \s0, \t1
	and		\t0, \t0, \t1
	eor		\t1, \x3, \x2
	and		\s3, \s3, \s0
	and		\s1, \s1, \t1
	eor		\t1, \x4, \x5
	eor		\s0, \x1, \x0
	eor		\t3, \t3, \s1
	eor		\t2, \t2, \s1
	and		\s1, \t1, \s0
	orr		\t1, \t1, \s0
	eor		\t3, \t3, \s3
	eor		\t0, \t0, \s1
	eor		\t2, \t2, \s2
	eor		\t1, \t1, \s3
	eor		\t0, \t0, \s2
	and		\s0, \x7, \x3
	eor		\t1, \t1, \s2
	and		\s1, \x6, \x2
	and		\s2, \x5, \x1
	orr		\s3, \x4, \x0
	eor		\t3, \t3, \s0
	eor		\t1, \t1, \s2
	eor		\s0, \t0, \s3
	eor		\t2, \t2, \s1
	and		\s2, \t3, \t1
	eor		\s1, \t2, \s2
	eor		\s3, \s0, \s2
	bsl		\s1, \t1, \s0
	not		\t0, \s0
	bsl		\s0, \s1, \s3
	bsl		\t0, \s1, \s3
	bsl		\s3, \t3, \t2
	eor		\t3, \t3, \t2
	and		\s2, \s0, \s3
	eor		\t1, \t1, \t0
	eor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

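	/*
	 * Full bit sliced SubBytes/InvSubBytes: input basis change, shared
	 * GF(2^8) inversion, output basis change.  The shuffled register
	 * order passed to inv_gf256 and to the output change accounts for
	 * the permutation of bit slices performed by the input change.
	 */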
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

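	/*
	 * Each bit sliced round key occupies 128 bytes (eight 16-byte bit
	 * planes, loaded into v16-v23).  Encryption walks the schedule
	 * forwards, decryption walks it backwards via the pre-decrement on
	 * the first ldp.
	 */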
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	.endm

	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	.endm

	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x0\().16b, \x0\().16b, v16.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x7\().16b, \x7\().16b, v23.16b
	.endm

	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

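	/*
	 * (Inv)MixColumns evaluated directly on the bit sliced state: byte
	 * rotations of each slice (the ext #12/#8 operations) combined with
	 * XORs implement the column mixing matrix, and the multiplication by
	 * x is absorbed into the permuted output register order used by the
	 * callers of this macro.
	 */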
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor		\x2\().16b, \x2\().16b, \t2\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor		\x3\().16b, \x3\().16b, \t3\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor		\x4\().16b, \x4\().16b, \t4\().16b
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor		\x5\().16b, \x5\().16b, \t5\().16b
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor		\x6\().16b, \x6\().16b, \t6\().16b
	eor		\t1\().16b, \t1\().16b, \x0\().16b
	eor		\x7\().16b, \x7\().16b, \t7\().16b
	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x1\().16b
	eor		\t0\().16b, \t0\().16b, \x7\().16b
	eor		\t1\().16b, \t1\().16b, \x7\().16b
	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t5\().16b, \t5\().16b, \x4\().16b
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	eor		\t6\().16b, \t6\().16b, \x5\().16b
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x3\().16b
	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x6\().16b
	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x2\().16b
	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x7\().16b
	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x7\().16b
	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor		\x7\().16b, \t1\().16b, \t5\().16b
	.ifb		\inv
	eor		\x2\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t3\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor		\t3\().16b, \t3\().16b, \x4\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x2\().16b, \x3\().16b, \t6\().16b
	eor		\x3\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x6\().16b, \t2\().16b
	mov		\x6\().16b, \t3\().16b
	.endif
	.endm

	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t0\().16b, \t0\().16b, \x0\().16b
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t6\().16b, \t6\().16b, \x6\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x7\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t1\().16b, \t1\().16b, \x1\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x2\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x3\().16b
	eor		\t4\().16b, \t4\().16b, \x4\().16b
	eor		\t5\().16b, \t5\().16b, \x5\().16b
	eor		\x0\().16b, \x0\().16b, \t6\().16b
	eor		\x1\().16b, \x1\().16b, \t6\().16b
	eor		\x2\().16b, \x2\().16b, \t0\().16b
	eor		\x4\().16b, \x4\().16b, \t2\().16b
	eor		\x3\().16b, \x3\().16b, \t1\().16b
	eor		\x1\().16b, \x1\().16b, \t7\().16b
	eor		\x2\().16b, \x2\().16b, \t7\().16b
	eor		\x4\().16b, \x4\().16b, \t6\().16b
	eor		\x5\().16b, \x5\().16b, \t3\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t7\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

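	/*
	 * swapmove_2x exchanges groups of n bits between two pairs of
	 * registers under a mask; three passes of it (n = 1, 2, 4) in the
	 * bitslice macro transpose eight 16-byte AES states into the bit
	 * sliced representation.  The transform is its own inverse, so the
	 * same macro converts back at the end of the round loops.
	 */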
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr		\t0\().2d, \b0\().2d, #\n
	ushr		\t1\().2d, \b1\().2d, #\n
	eor		\t0\().16b, \t0\().16b, \a0\().16b
	eor		\t1\().16b, \t1\().16b, \a1\().16b
	and		\t0\().16b, \t0\().16b, \mask\().16b
	and		\t1\().16b, \t1\().16b, \mask\().16b
	eor		\a0\().16b, \a0\().16b, \t0\().16b
	shl		\t0\().2d, \t0\().2d, #\n
	eor		\a1\().16b, \a1\().16b, \t1\().16b
	shl		\t1\().2d, \t1\().2d, #\n
	eor		\b0\().16b, \b0\().16b, \t0\().16b
	eor		\b1\().16b, \b1\().16b, \t1\().16b
	.endm

	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi		\t0\().16b, #0x55
	movi		\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi		\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm

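	/*
	 * tbl permutation vectors (cf. the 32-bit bsaes code): M0 is the byte
	 * interleaving applied to each block before bit slicing, SR/ISR
	 * implement ShiftRows and its inverse in that layout, and
	 * M0SR/SRM0/M0ISR/ISRM0 are the combined permutations used on entry
	 * to and exit from the round loops.
	 */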
	.align		6
M0:	.octa		0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
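	/*
	 * Expands the regular AES key schedule at rk[] into the layout used
	 * by the round loops below: a plain 16-byte round 0 key, then one
	 * 128-byte bit sliced key per inner round, and a plain 16-byte final
	 * key with the S-box affine constant 0x63 folded in.  Bit planes 0,
	 * 1, 5 and 6 of the sliced keys are complemented so that the
	 * corresponding inversions can be omitted from the S-box circuit
	 * evaluated each round.
	 */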
SYM_FUNC_START(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b,  #0x01			// bit masks
	movi		v9.16b,  #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	sub		x2, x2, #1
	str		q7, [x0], #16		// save round 0 key

.Lkey_loop:
	tbl		v7.16b, {v17.16b}, v16.16b
	ld1		{v17.4s}, [x1], #16		// load next round key

	cmtst		v0.16b, v7.16b, v8.16b
	cmtst		v1.16b, v7.16b, v9.16b
	cmtst		v2.16b, v7.16b, v10.16b
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b
	not		v1.16b, v1.16b
	not		v5.16b, v5.16b
	not		v6.16b, v6.16b

	subs		x2, x2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63			// compose .L63
	eor		v17.16b, v17.16b, v7.16b
	str		q17, [x0]
	ret
SYM_FUNC_END(aesbs_convert_key)

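	/*
	 * Encrypt eight blocks held in v0-v7 with the converted key schedule
	 * at bskey, for the number of rounds in 'rounds'.  On return the
	 * ciphertext blocks are left in v0, v1, v4, v6, v3, v7, v2, v5 (in
	 * that order), which is the register permutation the callers expect.
	 */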
	.align		4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
	ldr		q9, [bskey], #16		// round 0 key
	ldr		q8, M0SR
	ldr		q24, SR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Lenc_sbox

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Lenc_done

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	b.ne		.Lenc_loop
	ldr		q24, SRM0
	b		.Lenc_loop

.Lenc_done:
	ldr		q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_encrypt8)

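	/*
	 * Decrypt eight blocks held in v0-v7.  bskey points to the start of
	 * the converted key schedule, which is consumed from the end
	 * backwards; the decrypted blocks end up in v0, v1, v6, v4, v2, v7,
	 * v3, v5.
	 */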
	.align		4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
	lsl		x9, rounds, #7
	add		bskey, bskey, x9

	ldr		q9, [bskey, #-112]!		// round 0 key
	ldr		q8, M0ISR
	ldr		q24, ISR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Ldec_sbox

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Ldec_done

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	b.ne		.Ldec_loop
	ldr		q24, ISRM0
	b		.Ldec_loop
.Ldec_done:
	ldr		q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
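	/*
	 * The wrappers below process up to eight blocks per iteration.  x5
	 * (x6 in the CBC case) is set to 1 << blocks, so when fewer than
	 * eight blocks remain the tbnz tests skip the loads and stores for
	 * the slots that are not populated.
	 */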
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4

99:	mov		x5, #1
	lsl		x5, x5, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl
	csel		x5, x5, xzr, mi

	ld1		{v0.16b}, [x20], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x20], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x20], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x20], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x20], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x20], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x20], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x20], #16

0:	mov		bskey, x21
	mov		rounds, x22
	bl		\do8

	st1		{\o0\().16b}, [x19], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x19], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	b		99b

1:	frame_pop
	ret
	.endm

	.align		4
SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

	.align		4
SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
	.align		4
SYM_FUNC_START(aesbs_cbc_decrypt)
	frame_push	6

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4
	mov		x24, x5

99:	mov		x6, #1
	lsl		x6, x6, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl
	csel		x6, x6, xzr, mi

	ld1		{v0.16b}, [x20], #16
	mov		v25.16b, v0.16b
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x20], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x20], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x20], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x20], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x20], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x20], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x20]

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x24]		// load IV

	eor		v1.16b, v1.16b, v25.16b
	eor		v6.16b, v6.16b, v26.16b
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	st1		{v0.16b}, [x19], #16
	mov		v24.16b, v25.16b
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x19], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x19], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x19], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x19], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x19], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x19], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x20], #16
	st1		{v5.16b}, [x19], #16
1:	st1		{v24.16b}, [x24]		// store IV

	cbz		x23, 2f
	b		99b

2:	frame_pop
	ret
SYM_FUNC_END(aesbs_cbc_decrypt)

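	/*
	 * next_tweak computes the next XTS tweak: multiply the 128-bit tweak
	 * by x in GF(2^128), i.e. shift left by one bit and conditionally
	 * fold in the reduction polynomial 0x87 when the top bit was set.
	 */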
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
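	/*
	 * Helper shared by both XTS entry points: derive eight consecutive
	 * tweaks from v25, XOR them into the eight input blocks, stash the
	 * last block's tweak and the starting tweak for the next batch at
	 * [x6] (v16/v17 are clobbered by the cipher), and tail call the
	 * encrypt8/decrypt8 routine whose address the caller placed in x16.
	 */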
SYM_FUNC_START_LOCAL(__xts_crypt8)
	movi		v18.2s, #0x1
	movi		v19.2s, #0x87
	uzp1		v18.4s, v18.4s, v19.4s

	ld1		{v0.16b-v3.16b}, [x1], #64
	ld1		{v4.16b-v7.16b}, [x1], #64

	next_tweak	v26, v25, v18, v19
	next_tweak	v27, v26, v18, v19
	next_tweak	v28, v27, v18, v19
	next_tweak	v29, v28, v18, v19
	next_tweak	v30, v29, v18, v19
	next_tweak	v31, v30, v18, v19
	next_tweak	v16, v31, v18, v19
	next_tweak	v17, v16, v18, v19

	eor		v0.16b, v0.16b, v25.16b
	eor		v1.16b, v1.16b, v26.16b
	eor		v2.16b, v2.16b, v27.16b
	eor		v3.16b, v3.16b, v28.16b
	eor		v4.16b, v4.16b, v29.16b
	eor		v5.16b, v5.16b, v30.16b
	eor		v6.16b, v6.16b, v31.16b
	eor		v7.16b, v7.16b, v16.16b

	stp		q16, q17, [x6]

	mov		bskey, x2
	mov		rounds, x3
	br		x16
SYM_FUNC_END(__xts_crypt8)

	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	0, 32
	add		x6, sp, #.Lframe_local_offset

	ld1		{v25.16b}, [x5]

0:	adr		x16, \do8
	bl		__xts_crypt8

	eor		v16.16b, \o0\().16b, v25.16b
	eor		v17.16b, \o1\().16b, v26.16b
	eor		v18.16b, \o2\().16b, v27.16b
	eor		v19.16b, \o3\().16b, v28.16b

	ldp		q24, q25, [x6]

	eor		v20.16b, \o4\().16b, v29.16b
	eor		v21.16b, \o5\().16b, v30.16b
	eor		v22.16b, \o6\().16b, v31.16b
	eor		v23.16b, \o7\().16b, v24.16b

	st1		{v16.16b-v19.16b}, [x0], #64
	st1		{v20.16b-v23.16b}, [x0], #64

	subs		x4, x4, #8
	b.gt		0b

	st1		{v25.16b}, [x5]
	frame_pop
	ret
	.endm

SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

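	/*
	 * next_ctr materialises the next counter block: the 128-bit counter
	 * is kept big-endian in x7 (high half) and x8 (low half),
	 * incremented with a 64-bit carry, and byte swapped into the vector
	 * register with rev64.
	 */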
	.macro		next_ctr, v
	mov		\v\().d[1], x8
	adds		x8, x8, #1
	mov		\v\().d[0], x7
	adc		x7, x7, xzr
	rev64		\v\().16b, \v\().16b
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
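	/*
	 * Note that this routine consumes its input in chunks of eight
	 * blocks per iteration, with no partial-chunk handling, so any tail
	 * has to be dealt with by the caller.  v0 holds the counter block
	 * loaded from iv[], the following blocks are generated with
	 * next_ctr, and the incremented counter is stored back to iv[] on
	 * return.
	 */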
SYM_FUNC_START(aesbs_ctr_encrypt)
	frame_push	0
	ldp		x7, x8, [x5]
	ld1		{v0.16b}, [x5]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1
	adc		x7, x7, xzr

0:	next_ctr	v1
	next_ctr	v2
	next_ctr	v3
	next_ctr	v4
	next_ctr	v5
	next_ctr	v6
	next_ctr	v7

	mov		bskey, x2
	mov		rounds, x3
	bl		aesbs_encrypt8

	ld1		{ v8.16b-v11.16b}, [x1], #64
	ld1		{v12.16b-v15.16b}, [x1], #64

	eor		v8.16b, v0.16b, v8.16b
	eor		v9.16b, v1.16b, v9.16b
	eor		v10.16b, v4.16b, v10.16b
	eor		v11.16b, v6.16b, v11.16b
	eor		v12.16b, v3.16b, v12.16b
	eor		v13.16b, v7.16b, v13.16b
	eor		v14.16b, v2.16b, v14.16b
	eor		v15.16b, v5.16b, v15.16b

	st1		{ v8.16b-v11.16b}, [x0], #64
	st1		{v12.16b-v15.16b}, [x0], #64

	next_ctr	v0
	subs		x4, x4, #8
	b.gt		0b

	st1		{v0.16b}, [x5]
	frame_pop
	ret
SYM_FUNC_END(aesbs_ctr_encrypt)
867