xref: /linux/arch/arm64/crypto/aes-neonbs-core.S (revision 9009b455811b0fa1f6b0adfa94db136984db5a38)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Bit sliced AES using NEON instructions
4 *
5 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
6 */
7
8/*
9 * The algorithm implemented here is described in detail by the paper
10 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
11 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
12 *
13 * This implementation is based primarily on the OpenSSL implementation
14 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
15 */
16
17#include <linux/linkage.h>
18#include <asm/assembler.h>
19
20	.text
21
22	rounds		.req	x11
23	bskey		.req	x12
24
25	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
26	eor		\b2, \b2, \b1
27	eor		\b5, \b5, \b6
28	eor		\b3, \b3, \b0
29	eor		\b6, \b6, \b2
30	eor		\b5, \b5, \b0
31	eor		\b6, \b6, \b3
32	eor		\b3, \b3, \b7
33	eor		\b7, \b7, \b5
34	eor		\b3, \b3, \b4
35	eor		\b4, \b4, \b5
36	eor		\b2, \b2, \b7
37	eor		\b3, \b3, \b1
38	eor		\b1, \b1, \b5
39	.endm
40
41	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
42	eor		\b0, \b0, \b6
43	eor		\b1, \b1, \b4
44	eor		\b4, \b4, \b6
45	eor		\b2, \b2, \b0
46	eor		\b6, \b6, \b1
47	eor		\b1, \b1, \b5
48	eor		\b5, \b5, \b3
49	eor		\b3, \b3, \b7
50	eor		\b7, \b7, \b5
51	eor		\b2, \b2, \b5
52	eor		\b4, \b4, \b7
53	.endm
54
55	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
56	eor		\b1, \b1, \b7
57	eor		\b4, \b4, \b7
58	eor		\b7, \b7, \b5
59	eor		\b1, \b1, \b3
60	eor		\b2, \b2, \b5
61	eor		\b3, \b3, \b7
62	eor		\b6, \b6, \b1
63	eor		\b2, \b2, \b0
64	eor		\b5, \b5, \b3
65	eor		\b4, \b4, \b6
66	eor		\b0, \b0, \b6
67	eor		\b1, \b1, \b4
68	.endm
69
70	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
71	eor		\b1, \b1, \b5
72	eor		\b2, \b2, \b7
73	eor		\b3, \b3, \b1
74	eor		\b4, \b4, \b5
75	eor		\b7, \b7, \b5
76	eor		\b3, \b3, \b4
77	eor 		\b5, \b5, \b0
78	eor		\b3, \b3, \b7
79	eor		\b6, \b6, \b2
80	eor		\b2, \b2, \b1
81	eor		\b6, \b6, \b3
82	eor		\b3, \b3, \b0
83	eor		\b5, \b5, \b6
84	.endm
85
86	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
87	eor 		\t0, \y0, \y1
88	and		\t0, \t0, \x0
89	eor		\x0, \x0, \x1
90	and		\t1, \x1, \y0
91	and		\x0, \x0, \y1
92	eor		\x1, \t1, \t0
93	eor		\x0, \x0, \t1
94	.endm
95
96	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
97	eor		\t0, \y0, \y1
98	eor 		\t1, \y2, \y3
99	and		\t0, \t0, \x0
100	and		\t1, \t1, \x2
101	eor		\x0, \x0, \x1
102	eor		\x2, \x2, \x3
103	and		\x1, \x1, \y0
104	and		\x3, \x3, \y2
105	and		\x0, \x0, \y1
106	and		\x2, \x2, \y3
107	eor		\x1, \x1, \x0
108	eor		\x2, \x2, \x3
109	eor		\x0, \x0, \t0
110	eor		\x3, \x3, \t1
111	.endm
112
113	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
114				    y0, y1, y2, y3, t0, t1, t2, t3
115	eor		\t0, \x0, \x2
116	eor		\t1, \x1, \x3
117	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
118	eor		\y0, \y0, \y2
119	eor		\y1, \y1, \y3
120	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
121	eor		\x0, \x0, \t0
122	eor		\x2, \x2, \t0
123	eor		\x1, \x1, \t1
124	eor		\x3, \x3, \t1
125	eor		\t0, \x4, \x6
126	eor		\t1, \x5, \x7
127	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
128	eor		\y0, \y0, \y2
129	eor		\y1, \y1, \y3
130	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
131	eor		\x4, \x4, \t0
132	eor		\x6, \x6, \t0
133	eor		\x5, \x5, \t1
134	eor		\x7, \x7, \t1
135	.endm
136
137	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
138				   t0, t1, t2, t3, s0, s1, s2, s3
139	eor		\t3, \x4, \x6
140	eor		\t0, \x5, \x7
141	eor		\t1, \x1, \x3
142	eor		\s1, \x7, \x6
143	eor		\s0, \x0, \x2
144	eor		\s3, \t3, \t0
145	orr		\t2, \t0, \t1
146	and		\s2, \t3, \s0
147	orr		\t3, \t3, \s0
148	eor		\s0, \s0, \t1
149	and		\t0, \t0, \t1
150	eor		\t1, \x3, \x2
151	and		\s3, \s3, \s0
152	and		\s1, \s1, \t1
153	eor		\t1, \x4, \x5
154	eor		\s0, \x1, \x0
155	eor		\t3, \t3, \s1
156	eor		\t2, \t2, \s1
157	and		\s1, \t1, \s0
158	orr		\t1, \t1, \s0
159	eor		\t3, \t3, \s3
160	eor		\t0, \t0, \s1
161	eor		\t2, \t2, \s2
162	eor		\t1, \t1, \s3
163	eor		\t0, \t0, \s2
164	and		\s0, \x7, \x3
165	eor		\t1, \t1, \s2
166	and		\s1, \x6, \x2
167	and		\s2, \x5, \x1
168	orr		\s3, \x4, \x0
169	eor		\t3, \t3, \s0
170	eor		\t1, \t1, \s2
171	eor		\s0, \t0, \s3
172	eor		\t2, \t2, \s1
173	and		\s2, \t3, \t1
174	eor		\s1, \t2, \s2
175	eor		\s3, \s0, \s2
176	bsl		\s1, \t1, \s0
177	not		\t0, \s0
178	bsl		\s0, \s1, \s3
179	bsl		\t0, \s1, \s3
180	bsl		\s3, \t3, \t2
181	eor		\t3, \t3, \t2
182	and		\s2, \s0, \s3
183	eor		\t1, \t1, \t0
184	eor		\s2, \s2, \t3
185	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
186			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
187	.endm
188
189	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
190			      t0, t1, t2, t3, s0, s1, s2, s3
191	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
192			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
193	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
194			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
195			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
196			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
197	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
198			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
199	.endm
200
201	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
202				  t0, t1, t2, t3, s0, s1, s2, s3
203	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
204			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
205	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
206			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
207			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
208			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
209	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
210			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
211	.endm
212
213	.macro		enc_next_rk
214	ldp		q16, q17, [bskey], #128
215	ldp		q18, q19, [bskey, #-96]
216	ldp		q20, q21, [bskey, #-64]
217	ldp		q22, q23, [bskey, #-32]
218	.endm
219
220	.macro		dec_next_rk
221	ldp		q16, q17, [bskey, #-128]!
222	ldp		q18, q19, [bskey, #32]
223	ldp		q20, q21, [bskey, #64]
224	ldp		q22, q23, [bskey, #96]
225	.endm
226
227	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
228	eor		\x0\().16b, \x0\().16b, v16.16b
229	eor		\x1\().16b, \x1\().16b, v17.16b
230	eor		\x2\().16b, \x2\().16b, v18.16b
231	eor		\x3\().16b, \x3\().16b, v19.16b
232	eor		\x4\().16b, \x4\().16b, v20.16b
233	eor		\x5\().16b, \x5\().16b, v21.16b
234	eor		\x6\().16b, \x6\().16b, v22.16b
235	eor		\x7\().16b, \x7\().16b, v23.16b
236	.endm
237
238	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
239	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
240	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
241	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
242	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
243	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
244	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
245	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
246	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
247	.endm
248
249	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
250				  t0, t1, t2, t3, t4, t5, t6, t7, inv
251	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
252	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
253	eor		\x0\().16b, \x0\().16b, \t0\().16b
254	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
255	eor		\x1\().16b, \x1\().16b, \t1\().16b
256	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
257	eor		\x2\().16b, \x2\().16b, \t2\().16b
258	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
259	eor		\x3\().16b, \x3\().16b, \t3\().16b
260	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
261	eor		\x4\().16b, \x4\().16b, \t4\().16b
262	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
263	eor		\x5\().16b, \x5\().16b, \t5\().16b
264	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
265	eor		\x6\().16b, \x6\().16b, \t6\().16b
266	eor		\t1\().16b, \t1\().16b, \x0\().16b
267	eor		\x7\().16b, \x7\().16b, \t7\().16b
268	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
269	eor		\t2\().16b, \t2\().16b, \x1\().16b
270	eor		\t0\().16b, \t0\().16b, \x7\().16b
271	eor		\t1\().16b, \t1\().16b, \x7\().16b
272	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
273	eor		\t5\().16b, \t5\().16b, \x4\().16b
274	eor		\x0\().16b, \x0\().16b, \t0\().16b
275	eor		\t6\().16b, \t6\().16b, \x5\().16b
276	eor		\x1\().16b, \x1\().16b, \t1\().16b
277	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
278	eor		\t4\().16b, \t4\().16b, \x3\().16b
279	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
280	eor		\t7\().16b, \t7\().16b, \x6\().16b
281	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
282	eor		\t3\().16b, \t3\().16b, \x2\().16b
283	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
284	eor		\t4\().16b, \t4\().16b, \x7\().16b
285	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
286	eor		\t3\().16b, \t3\().16b, \x7\().16b
287	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
288	eor		\x7\().16b, \t1\().16b, \t5\().16b
289	.ifb		\inv
290	eor		\x2\().16b, \t0\().16b, \t4\().16b
291	eor		\x4\().16b, \x4\().16b, \t3\().16b
292	eor		\x5\().16b, \x5\().16b, \t7\().16b
293	eor		\x3\().16b, \x3\().16b, \t6\().16b
294	eor		\x6\().16b, \x6\().16b, \t2\().16b
295	.else
296	eor		\t3\().16b, \t3\().16b, \x4\().16b
297	eor		\x5\().16b, \x5\().16b, \t7\().16b
298	eor		\x2\().16b, \x3\().16b, \t6\().16b
299	eor		\x3\().16b, \t0\().16b, \t4\().16b
300	eor		\x4\().16b, \x6\().16b, \t2\().16b
301	mov		\x6\().16b, \t3\().16b
302	.endif
303	.endm
304
305	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
306				      t0, t1, t2, t3, t4, t5, t6, t7
307	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
308	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
309	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
310	eor		\t0\().16b, \t0\().16b, \x0\().16b
311	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
312	eor		\t6\().16b, \t6\().16b, \x6\().16b
313	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
314	eor		\t7\().16b, \t7\().16b, \x7\().16b
315	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
316	eor		\t1\().16b, \t1\().16b, \x1\().16b
317	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
318	eor		\t2\().16b, \t2\().16b, \x2\().16b
319	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
320	eor		\t3\().16b, \t3\().16b, \x3\().16b
321	eor		\t4\().16b, \t4\().16b, \x4\().16b
322	eor		\t5\().16b, \t5\().16b, \x5\().16b
323	eor		\x0\().16b, \x0\().16b, \t6\().16b
324	eor		\x1\().16b, \x1\().16b, \t6\().16b
325	eor		\x2\().16b, \x2\().16b, \t0\().16b
326	eor		\x4\().16b, \x4\().16b, \t2\().16b
327	eor		\x3\().16b, \x3\().16b, \t1\().16b
328	eor		\x1\().16b, \x1\().16b, \t7\().16b
329	eor		\x2\().16b, \x2\().16b, \t7\().16b
330	eor		\x4\().16b, \x4\().16b, \t6\().16b
331	eor		\x5\().16b, \x5\().16b, \t3\().16b
332	eor		\x3\().16b, \x3\().16b, \t6\().16b
333	eor		\x6\().16b, \x6\().16b, \t4\().16b
334	eor		\x4\().16b, \x4\().16b, \t7\().16b
335	eor		\x5\().16b, \x5\().16b, \t7\().16b
336	eor		\x7\().16b, \x7\().16b, \t5\().16b
337	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
338			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
339	.endm
340
341	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
342	ushr		\t0\().2d, \b0\().2d, #\n
343	ushr		\t1\().2d, \b1\().2d, #\n
344	eor		\t0\().16b, \t0\().16b, \a0\().16b
345	eor		\t1\().16b, \t1\().16b, \a1\().16b
346	and		\t0\().16b, \t0\().16b, \mask\().16b
347	and		\t1\().16b, \t1\().16b, \mask\().16b
348	eor		\a0\().16b, \a0\().16b, \t0\().16b
349	shl		\t0\().2d, \t0\().2d, #\n
350	eor		\a1\().16b, \a1\().16b, \t1\().16b
351	shl		\t1\().2d, \t1\().2d, #\n
352	eor		\b0\().16b, \b0\().16b, \t0\().16b
353	eor		\b1\().16b, \b1\().16b, \t1\().16b
354	.endm
355
356	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
357	movi		\t0\().16b, #0x55
358	movi		\t1\().16b, #0x33
359	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
360	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
361	movi		\t0\().16b, #0x0f
362	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
363	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
364	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
365	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
366	.endm
367
368
369	.align		6
370M0:	.octa		0x0004080c0105090d02060a0e03070b0f
371
372M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
373SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
374SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
375
376M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
377ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
378ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
379
380	/*
381	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
382	 */
383SYM_FUNC_START(aesbs_convert_key)
384	ld1		{v7.4s}, [x1], #16		// load round 0 key
385	ld1		{v17.4s}, [x1], #16		// load round 1 key
386
387	movi		v8.16b,  #0x01			// bit masks
388	movi		v9.16b,  #0x02
389	movi		v10.16b, #0x04
390	movi		v11.16b, #0x08
391	movi		v12.16b, #0x10
392	movi		v13.16b, #0x20
393	movi		v14.16b, #0x40
394	movi		v15.16b, #0x80
395	ldr		q16, M0
396
397	sub		x2, x2, #1
398	str		q7, [x0], #16		// save round 0 key
399
400.Lkey_loop:
401	tbl		v7.16b ,{v17.16b}, v16.16b
402	ld1		{v17.4s}, [x1], #16		// load next round key
403
404	cmtst		v0.16b, v7.16b, v8.16b
405	cmtst		v1.16b, v7.16b, v9.16b
406	cmtst		v2.16b, v7.16b, v10.16b
407	cmtst		v3.16b, v7.16b, v11.16b
408	cmtst		v4.16b, v7.16b, v12.16b
409	cmtst		v5.16b, v7.16b, v13.16b
410	cmtst		v6.16b, v7.16b, v14.16b
411	cmtst		v7.16b, v7.16b, v15.16b
412	not		v0.16b, v0.16b
413	not		v1.16b, v1.16b
414	not		v5.16b, v5.16b
415	not		v6.16b, v6.16b
416
417	subs		x2, x2, #1
418	stp		q0, q1, [x0], #128
419	stp		q2, q3, [x0, #-96]
420	stp		q4, q5, [x0, #-64]
421	stp		q6, q7, [x0, #-32]
422	b.ne		.Lkey_loop
423
424	movi		v7.16b, #0x63			// compose .L63
425	eor		v17.16b, v17.16b, v7.16b
426	str		q17, [x0]
427	ret
428SYM_FUNC_END(aesbs_convert_key)
429
430	.align		4
431SYM_FUNC_START_LOCAL(aesbs_encrypt8)
432	ldr		q9, [bskey], #16		// round 0 key
433	ldr		q8, M0SR
434	ldr		q24, SR
435
436	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
437	eor		v11.16b, v1.16b, v9.16b
438	tbl		v0.16b, {v10.16b}, v8.16b
439	eor		v12.16b, v2.16b, v9.16b
440	tbl		v1.16b, {v11.16b}, v8.16b
441	eor		v13.16b, v3.16b, v9.16b
442	tbl		v2.16b, {v12.16b}, v8.16b
443	eor		v14.16b, v4.16b, v9.16b
444	tbl		v3.16b, {v13.16b}, v8.16b
445	eor		v15.16b, v5.16b, v9.16b
446	tbl		v4.16b, {v14.16b}, v8.16b
447	eor		v10.16b, v6.16b, v9.16b
448	tbl		v5.16b, {v15.16b}, v8.16b
449	eor		v11.16b, v7.16b, v9.16b
450	tbl		v6.16b, {v10.16b}, v8.16b
451	tbl		v7.16b, {v11.16b}, v8.16b
452
453	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
454
455	sub		rounds, rounds, #1
456	b		.Lenc_sbox
457
458.Lenc_loop:
459	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
460.Lenc_sbox:
461	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
462								v13, v14, v15
463	subs		rounds, rounds, #1
464	b.cc		.Lenc_done
465
466	enc_next_rk
467
468	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
469								v13, v14, v15
470
471	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
472
473	b.ne		.Lenc_loop
474	ldr		q24, SRM0
475	b		.Lenc_loop
476
477.Lenc_done:
478	ldr		q12, [bskey]			// last round key
479
480	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
481
482	eor		v0.16b, v0.16b, v12.16b
483	eor		v1.16b, v1.16b, v12.16b
484	eor		v4.16b, v4.16b, v12.16b
485	eor		v6.16b, v6.16b, v12.16b
486	eor		v3.16b, v3.16b, v12.16b
487	eor		v7.16b, v7.16b, v12.16b
488	eor		v2.16b, v2.16b, v12.16b
489	eor		v5.16b, v5.16b, v12.16b
490	ret
491SYM_FUNC_END(aesbs_encrypt8)
492
493	.align		4
494SYM_FUNC_START_LOCAL(aesbs_decrypt8)
495	lsl		x9, rounds, #7
496	add		bskey, bskey, x9
497
498	ldr		q9, [bskey, #-112]!		// round 0 key
499	ldr		q8, M0ISR
500	ldr		q24, ISR
501
502	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
503	eor		v11.16b, v1.16b, v9.16b
504	tbl		v0.16b, {v10.16b}, v8.16b
505	eor		v12.16b, v2.16b, v9.16b
506	tbl		v1.16b, {v11.16b}, v8.16b
507	eor		v13.16b, v3.16b, v9.16b
508	tbl		v2.16b, {v12.16b}, v8.16b
509	eor		v14.16b, v4.16b, v9.16b
510	tbl		v3.16b, {v13.16b}, v8.16b
511	eor		v15.16b, v5.16b, v9.16b
512	tbl		v4.16b, {v14.16b}, v8.16b
513	eor		v10.16b, v6.16b, v9.16b
514	tbl		v5.16b, {v15.16b}, v8.16b
515	eor		v11.16b, v7.16b, v9.16b
516	tbl		v6.16b, {v10.16b}, v8.16b
517	tbl		v7.16b, {v11.16b}, v8.16b
518
519	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
520
521	sub		rounds, rounds, #1
522	b		.Ldec_sbox
523
524.Ldec_loop:
525	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
526.Ldec_sbox:
527	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
528								v13, v14, v15
529	subs		rounds, rounds, #1
530	b.cc		.Ldec_done
531
532	dec_next_rk
533
534	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
535
536	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
537								v13, v14, v15
538
539	b.ne		.Ldec_loop
540	ldr		q24, ISRM0
541	b		.Ldec_loop
542.Ldec_done:
543	ldr		q12, [bskey, #-16]		// last round key
544
545	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
546
547	eor		v0.16b, v0.16b, v12.16b
548	eor		v1.16b, v1.16b, v12.16b
549	eor		v6.16b, v6.16b, v12.16b
550	eor		v4.16b, v4.16b, v12.16b
551	eor		v2.16b, v2.16b, v12.16b
552	eor		v7.16b, v7.16b, v12.16b
553	eor		v3.16b, v3.16b, v12.16b
554	eor		v5.16b, v5.16b, v12.16b
555	ret
556SYM_FUNC_END(aesbs_decrypt8)
557
558	/*
559	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
560	 *		     int blocks)
561	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
562	 *		     int blocks)
563	 */
564	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
565	frame_push	5
566
567	mov		x19, x0
568	mov		x20, x1
569	mov		x21, x2
570	mov		x22, x3
571	mov		x23, x4
572
57399:	mov		x5, #1
574	lsl		x5, x5, x23
575	subs		w23, w23, #8
576	csel		x23, x23, xzr, pl
577	csel		x5, x5, xzr, mi
578
579	ld1		{v0.16b}, [x20], #16
580	tbnz		x5, #1, 0f
581	ld1		{v1.16b}, [x20], #16
582	tbnz		x5, #2, 0f
583	ld1		{v2.16b}, [x20], #16
584	tbnz		x5, #3, 0f
585	ld1		{v3.16b}, [x20], #16
586	tbnz		x5, #4, 0f
587	ld1		{v4.16b}, [x20], #16
588	tbnz		x5, #5, 0f
589	ld1		{v5.16b}, [x20], #16
590	tbnz		x5, #6, 0f
591	ld1		{v6.16b}, [x20], #16
592	tbnz		x5, #7, 0f
593	ld1		{v7.16b}, [x20], #16
594
5950:	mov		bskey, x21
596	mov		rounds, x22
597	bl		\do8
598
599	st1		{\o0\().16b}, [x19], #16
600	tbnz		x5, #1, 1f
601	st1		{\o1\().16b}, [x19], #16
602	tbnz		x5, #2, 1f
603	st1		{\o2\().16b}, [x19], #16
604	tbnz		x5, #3, 1f
605	st1		{\o3\().16b}, [x19], #16
606	tbnz		x5, #4, 1f
607	st1		{\o4\().16b}, [x19], #16
608	tbnz		x5, #5, 1f
609	st1		{\o5\().16b}, [x19], #16
610	tbnz		x5, #6, 1f
611	st1		{\o6\().16b}, [x19], #16
612	tbnz		x5, #7, 1f
613	st1		{\o7\().16b}, [x19], #16
614
615	cbz		x23, 1f
616	b		99b
617
6181:	frame_pop
619	ret
620	.endm
621
622	.align		4
623SYM_FUNC_START(aesbs_ecb_encrypt)
624	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
625SYM_FUNC_END(aesbs_ecb_encrypt)
626
627	.align		4
628SYM_FUNC_START(aesbs_ecb_decrypt)
629	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
630SYM_FUNC_END(aesbs_ecb_decrypt)
631
632	/*
633	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
634	 *		     int blocks, u8 iv[])
635	 */
636	.align		4
637SYM_FUNC_START(aesbs_cbc_decrypt)
638	frame_push	6
639
640	mov		x19, x0
641	mov		x20, x1
642	mov		x21, x2
643	mov		x22, x3
644	mov		x23, x4
645	mov		x24, x5
646
64799:	mov		x6, #1
648	lsl		x6, x6, x23
649	subs		w23, w23, #8
650	csel		x23, x23, xzr, pl
651	csel		x6, x6, xzr, mi
652
653	ld1		{v0.16b}, [x20], #16
654	mov		v25.16b, v0.16b
655	tbnz		x6, #1, 0f
656	ld1		{v1.16b}, [x20], #16
657	mov		v26.16b, v1.16b
658	tbnz		x6, #2, 0f
659	ld1		{v2.16b}, [x20], #16
660	mov		v27.16b, v2.16b
661	tbnz		x6, #3, 0f
662	ld1		{v3.16b}, [x20], #16
663	mov		v28.16b, v3.16b
664	tbnz		x6, #4, 0f
665	ld1		{v4.16b}, [x20], #16
666	mov		v29.16b, v4.16b
667	tbnz		x6, #5, 0f
668	ld1		{v5.16b}, [x20], #16
669	mov		v30.16b, v5.16b
670	tbnz		x6, #6, 0f
671	ld1		{v6.16b}, [x20], #16
672	mov		v31.16b, v6.16b
673	tbnz		x6, #7, 0f
674	ld1		{v7.16b}, [x20]
675
6760:	mov		bskey, x21
677	mov		rounds, x22
678	bl		aesbs_decrypt8
679
680	ld1		{v24.16b}, [x24]		// load IV
681
682	eor		v1.16b, v1.16b, v25.16b
683	eor		v6.16b, v6.16b, v26.16b
684	eor		v4.16b, v4.16b, v27.16b
685	eor		v2.16b, v2.16b, v28.16b
686	eor		v7.16b, v7.16b, v29.16b
687	eor		v0.16b, v0.16b, v24.16b
688	eor		v3.16b, v3.16b, v30.16b
689	eor		v5.16b, v5.16b, v31.16b
690
691	st1		{v0.16b}, [x19], #16
692	mov		v24.16b, v25.16b
693	tbnz		x6, #1, 1f
694	st1		{v1.16b}, [x19], #16
695	mov		v24.16b, v26.16b
696	tbnz		x6, #2, 1f
697	st1		{v6.16b}, [x19], #16
698	mov		v24.16b, v27.16b
699	tbnz		x6, #3, 1f
700	st1		{v4.16b}, [x19], #16
701	mov		v24.16b, v28.16b
702	tbnz		x6, #4, 1f
703	st1		{v2.16b}, [x19], #16
704	mov		v24.16b, v29.16b
705	tbnz		x6, #5, 1f
706	st1		{v7.16b}, [x19], #16
707	mov		v24.16b, v30.16b
708	tbnz		x6, #6, 1f
709	st1		{v3.16b}, [x19], #16
710	mov		v24.16b, v31.16b
711	tbnz		x6, #7, 1f
712	ld1		{v24.16b}, [x20], #16
713	st1		{v5.16b}, [x19], #16
7141:	st1		{v24.16b}, [x24]		// store IV
715
716	cbz		x23, 2f
717	b		99b
718
7192:	frame_pop
720	ret
721SYM_FUNC_END(aesbs_cbc_decrypt)
722
723	.macro		next_tweak, out, in, const, tmp
724	sshr		\tmp\().2d,  \in\().2d,   #63
725	and		\tmp\().16b, \tmp\().16b, \const\().16b
726	add		\out\().2d,  \in\().2d,   \in\().2d
727	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
728	eor		\out\().16b, \out\().16b, \tmp\().16b
729	.endm
730
731	/*
732	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
733	 *		     int blocks, u8 iv[])
734	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
735	 *		     int blocks, u8 iv[])
736	 */
737SYM_FUNC_START_LOCAL(__xts_crypt8)
738	mov		x6, #1
739	lsl		x6, x6, x23
740	subs		w23, w23, #8
741	csel		x23, x23, xzr, pl
742	csel		x6, x6, xzr, mi
743
744	ld1		{v0.16b}, [x20], #16
745	next_tweak	v26, v25, v30, v31
746	eor		v0.16b, v0.16b, v25.16b
747	tbnz		x6, #1, 0f
748
749	ld1		{v1.16b}, [x20], #16
750	next_tweak	v27, v26, v30, v31
751	eor		v1.16b, v1.16b, v26.16b
752	tbnz		x6, #2, 0f
753
754	ld1		{v2.16b}, [x20], #16
755	next_tweak	v28, v27, v30, v31
756	eor		v2.16b, v2.16b, v27.16b
757	tbnz		x6, #3, 0f
758
759	ld1		{v3.16b}, [x20], #16
760	next_tweak	v29, v28, v30, v31
761	eor		v3.16b, v3.16b, v28.16b
762	tbnz		x6, #4, 0f
763
764	ld1		{v4.16b}, [x20], #16
765	str		q29, [sp, #.Lframe_local_offset]
766	eor		v4.16b, v4.16b, v29.16b
767	next_tweak	v29, v29, v30, v31
768	tbnz		x6, #5, 0f
769
770	ld1		{v5.16b}, [x20], #16
771	str		q29, [sp, #.Lframe_local_offset + 16]
772	eor		v5.16b, v5.16b, v29.16b
773	next_tweak	v29, v29, v30, v31
774	tbnz		x6, #6, 0f
775
776	ld1		{v6.16b}, [x20], #16
777	str		q29, [sp, #.Lframe_local_offset + 32]
778	eor		v6.16b, v6.16b, v29.16b
779	next_tweak	v29, v29, v30, v31
780	tbnz		x6, #7, 0f
781
782	ld1		{v7.16b}, [x20], #16
783	str		q29, [sp, #.Lframe_local_offset + 48]
784	eor		v7.16b, v7.16b, v29.16b
785	next_tweak	v29, v29, v30, v31
786
7870:	mov		bskey, x21
788	mov		rounds, x22
789	br		x16
790SYM_FUNC_END(__xts_crypt8)
791
792	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
793	frame_push	6, 64
794
795	mov		x19, x0
796	mov		x20, x1
797	mov		x21, x2
798	mov		x22, x3
799	mov		x23, x4
800	mov		x24, x5
801
802	movi		v30.2s, #0x1
803	movi		v25.2s, #0x87
804	uzp1		v30.4s, v30.4s, v25.4s
805	ld1		{v25.16b}, [x24]
806
80799:	adr		x16, \do8
808	bl		__xts_crypt8
809
810	ldp		q16, q17, [sp, #.Lframe_local_offset]
811	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]
812
813	eor		\o0\().16b, \o0\().16b, v25.16b
814	eor		\o1\().16b, \o1\().16b, v26.16b
815	eor		\o2\().16b, \o2\().16b, v27.16b
816	eor		\o3\().16b, \o3\().16b, v28.16b
817
818	st1		{\o0\().16b}, [x19], #16
819	mov		v25.16b, v26.16b
820	tbnz		x6, #1, 1f
821	st1		{\o1\().16b}, [x19], #16
822	mov		v25.16b, v27.16b
823	tbnz		x6, #2, 1f
824	st1		{\o2\().16b}, [x19], #16
825	mov		v25.16b, v28.16b
826	tbnz		x6, #3, 1f
827	st1		{\o3\().16b}, [x19], #16
828	mov		v25.16b, v29.16b
829	tbnz		x6, #4, 1f
830
831	eor		\o4\().16b, \o4\().16b, v16.16b
832	eor		\o5\().16b, \o5\().16b, v17.16b
833	eor		\o6\().16b, \o6\().16b, v18.16b
834	eor		\o7\().16b, \o7\().16b, v19.16b
835
836	st1		{\o4\().16b}, [x19], #16
837	tbnz		x6, #5, 1f
838	st1		{\o5\().16b}, [x19], #16
839	tbnz		x6, #6, 1f
840	st1		{\o6\().16b}, [x19], #16
841	tbnz		x6, #7, 1f
842	st1		{\o7\().16b}, [x19], #16
843
844	cbz		x23, 1f
845	st1		{v25.16b}, [x24]
846
847	b		99b
848
8491:	st1		{v25.16b}, [x24]
850	frame_pop
851	ret
852	.endm
853
854SYM_FUNC_START(aesbs_xts_encrypt)
855	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
856SYM_FUNC_END(aesbs_xts_encrypt)
857
858SYM_FUNC_START(aesbs_xts_decrypt)
859	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
860SYM_FUNC_END(aesbs_xts_decrypt)
861
862	.macro		next_ctr, v
863	mov		\v\().d[1], x8
864	adds		x8, x8, #1
865	mov		\v\().d[0], x7
866	adc		x7, x7, xzr
867	rev64		\v\().16b, \v\().16b
868	.endm
869
870	/*
871	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
872	 *		     int rounds, int blocks, u8 iv[], u8 final[])
873	 */
874SYM_FUNC_START(aesbs_ctr_encrypt)
875	frame_push	8
876
877	mov		x19, x0
878	mov		x20, x1
879	mov		x21, x2
880	mov		x22, x3
881	mov		x23, x4
882	mov		x24, x5
883	mov		x25, x6
884
885	cmp		x25, #0
886	cset		x26, ne
887	add		x23, x23, x26		// do one extra block if final
888
889	ldp		x7, x8, [x24]
890	ld1		{v0.16b}, [x24]
891CPU_LE(	rev		x7, x7		)
892CPU_LE(	rev		x8, x8		)
893	adds		x8, x8, #1
894	adc		x7, x7, xzr
895
89699:	mov		x9, #1
897	lsl		x9, x9, x23
898	subs		w23, w23, #8
899	csel		x23, x23, xzr, pl
900	csel		x9, x9, xzr, le
901
902	tbnz		x9, #1, 0f
903	next_ctr	v1
904	tbnz		x9, #2, 0f
905	next_ctr	v2
906	tbnz		x9, #3, 0f
907	next_ctr	v3
908	tbnz		x9, #4, 0f
909	next_ctr	v4
910	tbnz		x9, #5, 0f
911	next_ctr	v5
912	tbnz		x9, #6, 0f
913	next_ctr	v6
914	tbnz		x9, #7, 0f
915	next_ctr	v7
916
9170:	mov		bskey, x21
918	mov		rounds, x22
919	bl		aesbs_encrypt8
920
921	lsr		x9, x9, x26		// disregard the extra block
922	tbnz		x9, #0, 0f
923
924	ld1		{v8.16b}, [x20], #16
925	eor		v0.16b, v0.16b, v8.16b
926	st1		{v0.16b}, [x19], #16
927	tbnz		x9, #1, 1f
928
929	ld1		{v9.16b}, [x20], #16
930	eor		v1.16b, v1.16b, v9.16b
931	st1		{v1.16b}, [x19], #16
932	tbnz		x9, #2, 2f
933
934	ld1		{v10.16b}, [x20], #16
935	eor		v4.16b, v4.16b, v10.16b
936	st1		{v4.16b}, [x19], #16
937	tbnz		x9, #3, 3f
938
939	ld1		{v11.16b}, [x20], #16
940	eor		v6.16b, v6.16b, v11.16b
941	st1		{v6.16b}, [x19], #16
942	tbnz		x9, #4, 4f
943
944	ld1		{v12.16b}, [x20], #16
945	eor		v3.16b, v3.16b, v12.16b
946	st1		{v3.16b}, [x19], #16
947	tbnz		x9, #5, 5f
948
949	ld1		{v13.16b}, [x20], #16
950	eor		v7.16b, v7.16b, v13.16b
951	st1		{v7.16b}, [x19], #16
952	tbnz		x9, #6, 6f
953
954	ld1		{v14.16b}, [x20], #16
955	eor		v2.16b, v2.16b, v14.16b
956	st1		{v2.16b}, [x19], #16
957	tbnz		x9, #7, 7f
958
959	ld1		{v15.16b}, [x20], #16
960	eor		v5.16b, v5.16b, v15.16b
961	st1		{v5.16b}, [x19], #16
962
9638:	next_ctr	v0
964	st1		{v0.16b}, [x24]
965	cbz		x23, .Lctr_done
966
967	b		99b
968
969.Lctr_done:
970	frame_pop
971	ret
972
973	/*
974	 * If we are handling the tail of the input (x6 != NULL), return the
975	 * final keystream block back to the caller.
976	 */
9770:	cbz		x25, 8b
978	st1		{v0.16b}, [x25]
979	b		8b
9801:	cbz		x25, 8b
981	st1		{v1.16b}, [x25]
982	b		8b
9832:	cbz		x25, 8b
984	st1		{v4.16b}, [x25]
985	b		8b
9863:	cbz		x25, 8b
987	st1		{v6.16b}, [x25]
988	b		8b
9894:	cbz		x25, 8b
990	st1		{v3.16b}, [x25]
991	b		8b
9925:	cbz		x25, 8b
993	st1		{v7.16b}, [x25]
994	b		8b
9956:	cbz		x25, 8b
996	st1		{v2.16b}, [x25]
997	b		8b
9987:	cbz		x25, 8b
999	st1		{v5.16b}, [x25]
1000	b		8b
1001SYM_FUNC_END(aesbs_ctr_encrypt)
1002