xref: /linux/arch/arm64/crypto/aes-neonbs-core.S (revision 07f0148aafe8c95a3a76cd59e9e75b4d78d1d31d)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text

	/* register aliases used throughout this file */
	rounds		.req	x11		// AES round count
	bskey		.req	x12		// pointer into the bitsliced key schedule

/*
 * in_bs_ch - basis change applied to the eight bitsliced state registers
 * \b0..\b7 before the shared GF(2^8) inversion (used by the sbox macro).
 */
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b2, \b2, \b1
	eor		\b5, \b5, \b6
	eor		\b3, \b3, \b0
	eor		\b6, \b6, \b2
	eor		\b5, \b5, \b0
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b4, \b4, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b1, \b1, \b5
	.endm

/*
 * out_bs_ch - basis change applied to the eight bitsliced state registers
 * after the GF(2^8) inversion, on the encryption path (see sbox macro).
 */
	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	eor		\b4, \b4, \b6
	eor		\b2, \b2, \b0
	eor		\b6, \b6, \b1
	eor		\b1, \b1, \b5
	eor		\b5, \b5, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b2, \b2, \b5
	eor		\b4, \b4, \b7
	.endm

/*
 * inv_in_bs_ch - input basis change for the inverse S-box (decryption
 * path); note the permuted parameter order, matching the inv_sbox macro.
 */
	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor		\b1, \b1, \b7
	eor		\b4, \b4, \b7
	eor		\b7, \b7, \b5
	eor		\b1, \b1, \b3
	eor		\b2, \b2, \b5
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b1
	eor		\b2, \b2, \b0
	eor		\b5, \b5, \b3
	eor		\b4, \b4, \b6
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	.endm

/*
 * inv_out_bs_ch - output basis change for the inverse S-box (decryption
 * path); note the permuted parameter order, matching the inv_sbox macro.
 */
	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor		\b1, \b1, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b4, \b4, \b5
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b5, \b5, \b0
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b2
	eor		\b2, \b2, \b1
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b0
	eor		\b5, \b5, \b6
	.endm

/*
 * mul_gf4 - bitsliced multiplication in GF(2^4):
 * (\x0,\x1) *= (\y0,\y1), using \t0/\t1 as scratch.  \y1 is clobbered.
 */
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	eor		\t0, \y0, \y1
	and		\t0, \t0, \x0
	eor		\x0, \x0, \x1
	and		\t1, \x1, \y0
	and		\x0, \x0, \y1
	eor		\x1, \t1, \t0
	eor		\x0, \x0, \t1
	.endm

/*
 * mul_gf4_n_gf4 - two interleaved bitsliced GF(2^4) multiplications:
 * (\x0,\x1) *= (\y0,\y1) and (\x2,\x3) *= (\y2,\y3), sharing scratch
 * registers \t0/\t1 so the two sequences can be scheduled together.
 */
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor		\t0, \y0, \y1
	eor		\t1, \y2, \y3
	and		\t0, \t0, \x0
	and		\t1, \t1, \x2
	eor		\x0, \x0, \x1
	eor		\x2, \x2, \x3
	and		\x1, \x1, \y0
	and		\x3, \x3, \y2
	and		\x0, \x0, \y1
	and		\x2, \x2, \y3
	eor		\x1, \x1, \x0
	eor		\x2, \x2, \x3
	eor		\x0, \x0, \t0
	eor		\x3, \x3, \t1
	.endm

/*
 * mul_gf16_2 - bitsliced multiplication in GF(2^16), built from the
 * GF(2^4) primitives above: multiplies the two 8-register operand halves
 * (\x0..\x3 and \x4..\x7) by (\y0..\y3).  \y0..\y3 and \t0..\t3 are
 * clobbered.
 */
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	eor		\t0, \x0, \x2
	eor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor		\x0, \x0, \t0
	eor		\x2, \x2, \t0
	eor		\x1, \x1, \t1
	eor		\x3, \x3, \t1
	eor		\t0, \x4, \x6
	eor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	eor		\x4, \x4, \t0
	eor		\x6, \x6, \t0
	eor		\x5, \x5, \t1
	eor		\x7, \x7, \t1
	.endm

/*
 * inv_gf256 - bitsliced inversion in GF(2^8), the shared core of the
 * forward and inverse S-boxes.  Operates on the eight state registers
 * \x0..\x7 in place; \t0..\t3 and \s0..\s3 are scratch.
 */
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	eor		\t3, \x4, \x6
	eor		\t0, \x5, \x7
	eor		\t1, \x1, \x3
	eor		\s1, \x7, \x6
	eor		\s0, \x0, \x2
	eor		\s3, \t3, \t0
	orr		\t2, \t0, \t1
	and		\s2, \t3, \s0
	orr		\t3, \t3, \s0
	eor		\s0, \s0, \t1
	and		\t0, \t0, \t1
	eor		\t1, \x3, \x2
	and		\s3, \s3, \s0
	and		\s1, \s1, \t1
	eor		\t1, \x4, \x5
	eor		\s0, \x1, \x0
	eor		\t3, \t3, \s1
	eor		\t2, \t2, \s1
	and		\s1, \t1, \s0
	orr		\t1, \t1, \s0
	eor		\t3, \t3, \s3
	eor		\t0, \t0, \s1
	eor		\t2, \t2, \s2
	eor		\t1, \t1, \s3
	eor		\t0, \t0, \s2
	and		\s0, \x7, \x3
	eor		\t1, \t1, \s2
	and		\s1, \x6, \x2
	and		\s2, \x5, \x1
	orr		\s3, \x4, \x0
	eor		\t3, \t3, \s0
	eor		\t1, \t1, \s2
	eor		\s0, \t0, \s3
	eor		\t2, \t2, \s1
	and		\s2, \t3, \t1
	eor		\s1, \t2, \s2
	eor		\s3, \s0, \s2
	bsl		\s1, \t1, \s0
	not		\t0, \s0
	bsl		\s0, \s1, \s3
	bsl		\t0, \s1, \s3
	bsl		\s3, \t3, \t2
	eor		\t3, \t3, \t2
	and		\s2, \s0, \s3
	eor		\t1, \t1, \t0
	eor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

/*
 * sbox / inv_sbox - the full bitsliced AES (inverse) S-box: input basis
 * change, shared GF(2^8) inversion, output basis change.  The register
 * permutations applied between the steps implement the differing linear
 * layers of the forward and inverse transforms.
 */
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

/*
 * enc_next_rk / dec_next_rk - load the next 128-byte bitsliced round key
 * into v16-v23, advancing bskey forwards (encryption) or backwards
 * (decryption) through the key schedule.
 */
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	.endm

	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	.endm

/*
 * add_round_key - XOR the round key held in v16-v23 (see *_next_rk)
 * into the eight bitsliced state registers.
 */
	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x0\().16b, \x0\().16b, v16.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x7\().16b, \x7\().16b, v23.16b
	.endm

/*
 * shift_rows - apply the (inverse) ShiftRows byte permutation held in
 * \mask (one of SR/SRM0/ISR/ISRM0) to each state register via tbl.
 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

/*
 * mix_cols - bitsliced MixColumns on the eight state registers, using
 * ext-based word rotations.  The \inv argument selects the output
 * register assignment: empty for the encryption round, non-empty when
 * invoked as the tail of inv_mix_cols below.
 */
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor		\x2\().16b, \x2\().16b, \t2\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor		\x3\().16b, \x3\().16b, \t3\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor		\x4\().16b, \x4\().16b, \t4\().16b
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor		\x5\().16b, \x5\().16b, \t5\().16b
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor		\x6\().16b, \x6\().16b, \t6\().16b
	eor		\t1\().16b, \t1\().16b, \x0\().16b
	eor		\x7\().16b, \x7\().16b, \t7\().16b
	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x1\().16b
	eor		\t0\().16b, \t0\().16b, \x7\().16b
	eor		\t1\().16b, \t1\().16b, \x7\().16b
	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t5\().16b, \t5\().16b, \x4\().16b
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	eor		\t6\().16b, \t6\().16b, \x5\().16b
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x3\().16b
	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x6\().16b
	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x2\().16b
	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x7\().16b
	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x7\().16b
	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor		\x7\().16b, \t1\().16b, \t5\().16b
	.ifb		\inv
	eor		\x2\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t3\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor		\t3\().16b, \t3\().16b, \x4\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x2\().16b, \x3\().16b, \t6\().16b
	eor		\x3\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x6\().16b, \t2\().16b
	mov		\x6\().16b, \t3\().16b
	.endif
	.endm

/*
 * inv_mix_cols - bitsliced InverseMixColumns: a preprocessing pass over
 * the state followed by the forward mix_cols with \inv set.
 */
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t0\().16b, \t0\().16b, \x0\().16b
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t6\().16b, \t6\().16b, \x6\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x7\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t1\().16b, \t1\().16b, \x1\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x2\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x3\().16b
	eor		\t4\().16b, \t4\().16b, \x4\().16b
	eor		\t5\().16b, \t5\().16b, \x5\().16b
	eor		\x0\().16b, \x0\().16b, \t6\().16b
	eor		\x1\().16b, \x1\().16b, \t6\().16b
	eor		\x2\().16b, \x2\().16b, \t0\().16b
	eor		\x4\().16b, \x4\().16b, \t2\().16b
	eor		\x3\().16b, \x3\().16b, \t1\().16b
	eor		\x1\().16b, \x1\().16b, \t7\().16b
	eor		\x2\().16b, \x2\().16b, \t7\().16b
	eor		\x4\().16b, \x4\().16b, \t6\().16b
	eor		\x5\().16b, \x5\().16b, \t3\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t7\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

/*
 * swapmove_2x - two interleaved swapmove steps: exchange the bits of
 * (\a0,\b0) and (\a1,\b1) selected by \mask, with \b shifted right by
 * \n relative to \a.  Building block of the bitslice transform below.
 */
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr		\t0\().2d, \b0\().2d, #\n
	ushr		\t1\().2d, \b1\().2d, #\n
	eor		\t0\().16b, \t0\().16b, \a0\().16b
	eor		\t1\().16b, \t1\().16b, \a1\().16b
	and		\t0\().16b, \t0\().16b, \mask\().16b
	and		\t1\().16b, \t1\().16b, \mask\().16b
	eor		\a0\().16b, \a0\().16b, \t0\().16b
	shl		\t0\().2d, \t0\().2d, #\n
	eor		\a1\().16b, \a1\().16b, \t1\().16b
	shl		\t1\().2d, \t1\().2d, #\n
	eor		\b0\().16b, \b0\().16b, \t0\().16b
	eor		\b1\().16b, \b1\().16b, \t1\().16b
	.endm

/*
 * bitslice - convert eight AES blocks held in \x0..\x7 into (or back out
 * of) bitsliced representation using three rounds of masked swapmoves.
 * \t0..\t3 are scratch.
 */
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi		\t0\().16b, #0x55
	movi		\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi		\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm

/*
 * tbl permutation vectors: M0* interleave the bitslice byte order with
 * the (inverse) ShiftRows permutation; SR/ISR are the plain (inverse)
 * ShiftRows masks used by the shift_rows macro.
 */
	.align		6
M0:	.octa		0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f

380	/*
381	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
382	 */
383SYM_FUNC_START(aesbs_convert_key)
384	ld1		{v7.4s}, [x1], #16		// load round 0 key
385	ld1		{v17.4s}, [x1], #16		// load round 1 key
386
387	movi		v8.16b,  #0x01			// bit masks
388	movi		v9.16b,  #0x02
389	movi		v10.16b, #0x04
390	movi		v11.16b, #0x08
391	movi		v12.16b, #0x10
392	movi		v13.16b, #0x20
393	movi		v14.16b, #0x40
394	movi		v15.16b, #0x80
395	ldr		q16, M0
396
397	sub		x2, x2, #1
398	str		q7, [x0], #16		// save round 0 key
399
400.Lkey_loop:
401	tbl		v7.16b ,{v17.16b}, v16.16b
402	ld1		{v17.4s}, [x1], #16		// load next round key
403
404	cmtst		v0.16b, v7.16b, v8.16b
405	cmtst		v1.16b, v7.16b, v9.16b
406	cmtst		v2.16b, v7.16b, v10.16b
407	cmtst		v3.16b, v7.16b, v11.16b
408	cmtst		v4.16b, v7.16b, v12.16b
409	cmtst		v5.16b, v7.16b, v13.16b
410	cmtst		v6.16b, v7.16b, v14.16b
411	cmtst		v7.16b, v7.16b, v15.16b
412	not		v0.16b, v0.16b
413	not		v1.16b, v1.16b
414	not		v5.16b, v5.16b
415	not		v6.16b, v6.16b
416
417	subs		x2, x2, #1
418	stp		q0, q1, [x0], #128
419	stp		q2, q3, [x0, #-96]
420	stp		q4, q5, [x0, #-64]
421	stp		q6, q7, [x0, #-32]
422	b.ne		.Lkey_loop
423
424	movi		v7.16b, #0x63			// compose .L63
425	eor		v17.16b, v17.16b, v7.16b
426	str		q17, [x0]
427	ret
428SYM_FUNC_END(aesbs_convert_key)
429
430	.align		4
431SYM_FUNC_START_LOCAL(aesbs_encrypt8)
432	ldr		q9, [bskey], #16		// round 0 key
433	ldr		q8, M0SR
434	ldr		q24, SR
435
436	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
437	eor		v11.16b, v1.16b, v9.16b
438	tbl		v0.16b, {v10.16b}, v8.16b
439	eor		v12.16b, v2.16b, v9.16b
440	tbl		v1.16b, {v11.16b}, v8.16b
441	eor		v13.16b, v3.16b, v9.16b
442	tbl		v2.16b, {v12.16b}, v8.16b
443	eor		v14.16b, v4.16b, v9.16b
444	tbl		v3.16b, {v13.16b}, v8.16b
445	eor		v15.16b, v5.16b, v9.16b
446	tbl		v4.16b, {v14.16b}, v8.16b
447	eor		v10.16b, v6.16b, v9.16b
448	tbl		v5.16b, {v15.16b}, v8.16b
449	eor		v11.16b, v7.16b, v9.16b
450	tbl		v6.16b, {v10.16b}, v8.16b
451	tbl		v7.16b, {v11.16b}, v8.16b
452
453	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
454
455	sub		rounds, rounds, #1
456	b		.Lenc_sbox
457
458.Lenc_loop:
459	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
460.Lenc_sbox:
461	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
462								v13, v14, v15
463	subs		rounds, rounds, #1
464	b.cc		.Lenc_done
465
466	enc_next_rk
467
468	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
469								v13, v14, v15
470
471	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
472
473	b.ne		.Lenc_loop
474	ldr		q24, SRM0
475	b		.Lenc_loop
476
477.Lenc_done:
478	ldr		q12, [bskey]			// last round key
479
480	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
481
482	eor		v0.16b, v0.16b, v12.16b
483	eor		v1.16b, v1.16b, v12.16b
484	eor		v4.16b, v4.16b, v12.16b
485	eor		v6.16b, v6.16b, v12.16b
486	eor		v3.16b, v3.16b, v12.16b
487	eor		v7.16b, v7.16b, v12.16b
488	eor		v2.16b, v2.16b, v12.16b
489	eor		v5.16b, v5.16b, v12.16b
490	ret
491SYM_FUNC_END(aesbs_encrypt8)
492
493	.align		4
494SYM_FUNC_START_LOCAL(aesbs_decrypt8)
495	lsl		x9, rounds, #7
496	add		bskey, bskey, x9
497
498	ldr		q9, [bskey, #-112]!		// round 0 key
499	ldr		q8, M0ISR
500	ldr		q24, ISR
501
502	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
503	eor		v11.16b, v1.16b, v9.16b
504	tbl		v0.16b, {v10.16b}, v8.16b
505	eor		v12.16b, v2.16b, v9.16b
506	tbl		v1.16b, {v11.16b}, v8.16b
507	eor		v13.16b, v3.16b, v9.16b
508	tbl		v2.16b, {v12.16b}, v8.16b
509	eor		v14.16b, v4.16b, v9.16b
510	tbl		v3.16b, {v13.16b}, v8.16b
511	eor		v15.16b, v5.16b, v9.16b
512	tbl		v4.16b, {v14.16b}, v8.16b
513	eor		v10.16b, v6.16b, v9.16b
514	tbl		v5.16b, {v15.16b}, v8.16b
515	eor		v11.16b, v7.16b, v9.16b
516	tbl		v6.16b, {v10.16b}, v8.16b
517	tbl		v7.16b, {v11.16b}, v8.16b
518
519	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
520
521	sub		rounds, rounds, #1
522	b		.Ldec_sbox
523
524.Ldec_loop:
525	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
526.Ldec_sbox:
527	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
528								v13, v14, v15
529	subs		rounds, rounds, #1
530	b.cc		.Ldec_done
531
532	dec_next_rk
533
534	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
535
536	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
537								v13, v14, v15
538
539	b.ne		.Ldec_loop
540	ldr		q24, ISRM0
541	b		.Ldec_loop
542.Ldec_done:
543	ldr		q12, [bskey, #-16]		// last round key
544
545	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
546
547	eor		v0.16b, v0.16b, v12.16b
548	eor		v1.16b, v1.16b, v12.16b
549	eor		v6.16b, v6.16b, v12.16b
550	eor		v4.16b, v4.16b, v12.16b
551	eor		v2.16b, v2.16b, v12.16b
552	eor		v7.16b, v7.16b, v12.16b
553	eor		v3.16b, v3.16b, v12.16b
554	eor		v5.16b, v5.16b, v12.16b
555	ret
556SYM_FUNC_END(aesbs_decrypt8)
557
558	/*
559	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
560	 *		     int blocks)
561	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
562	 *		     int blocks)
563	 */
564	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
565	frame_push	5
566
567	mov		x19, x0
568	mov		x20, x1
569	mov		x21, x2
570	mov		x22, x3
571	mov		x23, x4
572
57399:	mov		x5, #1
574	lsl		x5, x5, x23
575	subs		w23, w23, #8
576	csel		x23, x23, xzr, pl
577	csel		x5, x5, xzr, mi
578
579	ld1		{v0.16b}, [x20], #16
580	tbnz		x5, #1, 0f
581	ld1		{v1.16b}, [x20], #16
582	tbnz		x5, #2, 0f
583	ld1		{v2.16b}, [x20], #16
584	tbnz		x5, #3, 0f
585	ld1		{v3.16b}, [x20], #16
586	tbnz		x5, #4, 0f
587	ld1		{v4.16b}, [x20], #16
588	tbnz		x5, #5, 0f
589	ld1		{v5.16b}, [x20], #16
590	tbnz		x5, #6, 0f
591	ld1		{v6.16b}, [x20], #16
592	tbnz		x5, #7, 0f
593	ld1		{v7.16b}, [x20], #16
594
5950:	mov		bskey, x21
596	mov		rounds, x22
597	bl		\do8
598
599	st1		{\o0\().16b}, [x19], #16
600	tbnz		x5, #1, 1f
601	st1		{\o1\().16b}, [x19], #16
602	tbnz		x5, #2, 1f
603	st1		{\o2\().16b}, [x19], #16
604	tbnz		x5, #3, 1f
605	st1		{\o3\().16b}, [x19], #16
606	tbnz		x5, #4, 1f
607	st1		{\o4\().16b}, [x19], #16
608	tbnz		x5, #5, 1f
609	st1		{\o5\().16b}, [x19], #16
610	tbnz		x5, #6, 1f
611	st1		{\o6\().16b}, [x19], #16
612	tbnz		x5, #7, 1f
613	st1		{\o7\().16b}, [x19], #16
614
615	cbz		x23, 1f
616	b		99b
617
6181:	frame_pop
619	ret
620	.endm
621
622	.align		4
623SYM_FUNC_START(aesbs_ecb_encrypt)
624	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
625SYM_FUNC_END(aesbs_ecb_encrypt)
626
627	.align		4
628SYM_FUNC_START(aesbs_ecb_decrypt)
629	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
630SYM_FUNC_END(aesbs_ecb_decrypt)
631
632	/*
633	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
634	 *		     int blocks, u8 iv[])
635	 */
636	.align		4
637SYM_FUNC_START(aesbs_cbc_decrypt)
638	frame_push	6
639
640	mov		x19, x0
641	mov		x20, x1
642	mov		x21, x2
643	mov		x22, x3
644	mov		x23, x4
645	mov		x24, x5
646
64799:	mov		x6, #1
648	lsl		x6, x6, x23
649	subs		w23, w23, #8
650	csel		x23, x23, xzr, pl
651	csel		x6, x6, xzr, mi
652
653	ld1		{v0.16b}, [x20], #16
654	mov		v25.16b, v0.16b
655	tbnz		x6, #1, 0f
656	ld1		{v1.16b}, [x20], #16
657	mov		v26.16b, v1.16b
658	tbnz		x6, #2, 0f
659	ld1		{v2.16b}, [x20], #16
660	mov		v27.16b, v2.16b
661	tbnz		x6, #3, 0f
662	ld1		{v3.16b}, [x20], #16
663	mov		v28.16b, v3.16b
664	tbnz		x6, #4, 0f
665	ld1		{v4.16b}, [x20], #16
666	mov		v29.16b, v4.16b
667	tbnz		x6, #5, 0f
668	ld1		{v5.16b}, [x20], #16
669	mov		v30.16b, v5.16b
670	tbnz		x6, #6, 0f
671	ld1		{v6.16b}, [x20], #16
672	mov		v31.16b, v6.16b
673	tbnz		x6, #7, 0f
674	ld1		{v7.16b}, [x20]
675
6760:	mov		bskey, x21
677	mov		rounds, x22
678	bl		aesbs_decrypt8
679
680	ld1		{v24.16b}, [x24]		// load IV
681
682	eor		v1.16b, v1.16b, v25.16b
683	eor		v6.16b, v6.16b, v26.16b
684	eor		v4.16b, v4.16b, v27.16b
685	eor		v2.16b, v2.16b, v28.16b
686	eor		v7.16b, v7.16b, v29.16b
687	eor		v0.16b, v0.16b, v24.16b
688	eor		v3.16b, v3.16b, v30.16b
689	eor		v5.16b, v5.16b, v31.16b
690
691	st1		{v0.16b}, [x19], #16
692	mov		v24.16b, v25.16b
693	tbnz		x6, #1, 1f
694	st1		{v1.16b}, [x19], #16
695	mov		v24.16b, v26.16b
696	tbnz		x6, #2, 1f
697	st1		{v6.16b}, [x19], #16
698	mov		v24.16b, v27.16b
699	tbnz		x6, #3, 1f
700	st1		{v4.16b}, [x19], #16
701	mov		v24.16b, v28.16b
702	tbnz		x6, #4, 1f
703	st1		{v2.16b}, [x19], #16
704	mov		v24.16b, v29.16b
705	tbnz		x6, #5, 1f
706	st1		{v7.16b}, [x19], #16
707	mov		v24.16b, v30.16b
708	tbnz		x6, #6, 1f
709	st1		{v3.16b}, [x19], #16
710	mov		v24.16b, v31.16b
711	tbnz		x6, #7, 1f
712	ld1		{v24.16b}, [x20], #16
713	st1		{v5.16b}, [x19], #16
7141:	st1		{v24.16b}, [x24]		// store IV
715
716	cbz		x23, 2f
717	b		99b
718
7192:	frame_pop
720	ret
721SYM_FUNC_END(aesbs_cbc_decrypt)
722
723	.macro		next_tweak, out, in, const, tmp
724	sshr		\tmp\().2d,  \in\().2d,   #63
725	and		\tmp\().16b, \tmp\().16b, \const\().16b
726	add		\out\().2d,  \in\().2d,   \in\().2d
727	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
728	eor		\out\().16b, \out\().16b, \tmp\().16b
729	.endm
730
731	/*
732	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
733	 *		     int blocks, u8 iv[])
734	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
735	 *		     int blocks, u8 iv[])
736	 */
737SYM_FUNC_START_LOCAL(__xts_crypt8)
738	movi		v18.2s, #0x1
739	movi		v19.2s, #0x87
740	uzp1		v18.4s, v18.4s, v19.4s
741
742	ld1		{v0.16b-v3.16b}, [x1], #64
743	ld1		{v4.16b-v7.16b}, [x1], #64
744
745	next_tweak	v26, v25, v18, v19
746	next_tweak	v27, v26, v18, v19
747	next_tweak	v28, v27, v18, v19
748	next_tweak	v29, v28, v18, v19
749	next_tweak	v30, v29, v18, v19
750	next_tweak	v31, v30, v18, v19
751	next_tweak	v16, v31, v18, v19
752	next_tweak	v17, v16, v18, v19
753
754	eor		v0.16b, v0.16b, v25.16b
755	eor		v1.16b, v1.16b, v26.16b
756	eor		v2.16b, v2.16b, v27.16b
757	eor		v3.16b, v3.16b, v28.16b
758	eor		v4.16b, v4.16b, v29.16b
759	eor		v5.16b, v5.16b, v30.16b
760	eor		v6.16b, v6.16b, v31.16b
761	eor		v7.16b, v7.16b, v16.16b
762
763	stp		q16, q17, [x6]
764
765	mov		bskey, x2
766	mov		rounds, x3
767	br		x16
768SYM_FUNC_END(__xts_crypt8)
769
770	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
771	frame_push	0, 32
772	add		x6, sp, #.Lframe_local_offset
773
774	ld1		{v25.16b}, [x5]
775
7760:	adr		x16, \do8
777	bl		__xts_crypt8
778
779	eor		v16.16b, \o0\().16b, v25.16b
780	eor		v17.16b, \o1\().16b, v26.16b
781	eor		v18.16b, \o2\().16b, v27.16b
782	eor		v19.16b, \o3\().16b, v28.16b
783
784	ldp		q24, q25, [x6]
785
786	eor		v20.16b, \o4\().16b, v29.16b
787	eor		v21.16b, \o5\().16b, v30.16b
788	eor		v22.16b, \o6\().16b, v31.16b
789	eor		v23.16b, \o7\().16b, v24.16b
790
791	st1		{v16.16b-v19.16b}, [x0], #64
792	st1		{v20.16b-v23.16b}, [x0], #64
793
794	subs		x4, x4, #8
795	b.gt		0b
796
797	st1		{v25.16b}, [x5]
798	frame_pop
799	ret
800	.endm
801
802SYM_FUNC_START(aesbs_xts_encrypt)
803	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
804SYM_FUNC_END(aesbs_xts_encrypt)
805
806SYM_FUNC_START(aesbs_xts_decrypt)
807	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
808SYM_FUNC_END(aesbs_xts_decrypt)
809
810	.macro		next_ctr, v
811	mov		\v\().d[1], x8
812	adds		x8, x8, #1
813	mov		\v\().d[0], x7
814	adc		x7, x7, xzr
815	rev64		\v\().16b, \v\().16b
816	.endm
817
818	/*
819	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
820	 *		     int rounds, int blocks, u8 iv[])
821	 */
822SYM_FUNC_START(aesbs_ctr_encrypt)
823	frame_push	0
824	ldp		x7, x8, [x5]
825	ld1		{v0.16b}, [x5]
826CPU_LE(	rev		x7, x7		)
827CPU_LE(	rev		x8, x8		)
828	adds		x8, x8, #1
829	adc		x7, x7, xzr
830
8310:	next_ctr	v1
832	next_ctr	v2
833	next_ctr	v3
834	next_ctr	v4
835	next_ctr	v5
836	next_ctr	v6
837	next_ctr	v7
838
839	mov		bskey, x2
840	mov		rounds, x3
841	bl		aesbs_encrypt8
842
843	ld1		{ v8.16b-v11.16b}, [x1], #64
844	ld1		{v12.16b-v15.16b}, [x1], #64
845
846	eor		v8.16b, v0.16b, v8.16b
847	eor		v9.16b, v1.16b, v9.16b
848	eor		v10.16b, v4.16b, v10.16b
849	eor		v11.16b, v6.16b, v11.16b
850	eor		v12.16b, v3.16b, v12.16b
851	eor		v13.16b, v7.16b, v13.16b
852	eor		v14.16b, v2.16b, v14.16b
853	eor		v15.16b, v5.16b, v15.16b
854
855	st1		{ v8.16b-v11.16b}, [x0], #64
856	st1		{v12.16b-v15.16b}, [x0], #64
857
858	next_ctr	v0
859	subs		x4, x4, #8
860	b.gt		0b
861
862	st1		{v0.16b}, [x5]
863	frame_pop
864	ret
865SYM_FUNC_END(aesbs_ctr_encrypt)
866