xref: /linux/arch/arm64/crypto/aes-neonbs-core.S (revision 6fdcba32711044c35c0e1b094cbd8f3f0b4472c9)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Bit sliced AES using NEON instructions
4 *
5 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
6 */
7
8/*
9 * The algorithm implemented here is described in detail by the paper
10 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
11 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
12 *
13 * This implementation is based primarily on the OpenSSL implementation
14 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
15 */
16
17#include <linux/linkage.h>
18#include <asm/assembler.h>
19
20	.text
21
/* Aliases for the two scratch registers the 8-block cores take as inputs. */
	bskey		.req	x12		// key schedule cursor
	rounds		.req	x11		// round counter
24
25	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	/*
	 * Input-side linear layer of the forward S-box; the sbox macro runs
	 * it first (the name presumably abbreviates "input basis change" -
	 * this file does not say).
	 *
	 * The arguments are complete vector operands (sbox passes them with
	 * the .16b arrangement already attached). All eight are updated in
	 * place using XOR only; no scratch registers are needed.
	 *
	 * Most steps consume a value written by an earlier step, so the
	 * instruction order must be kept.
	 */
26	eor		\b2, \b2, \b1
27	eor		\b5, \b5, \b6
28	eor		\b3, \b3, \b0
29	eor		\b6, \b6, \b2
30	eor		\b5, \b5, \b0
31	eor		\b6, \b6, \b3
32	eor		\b3, \b3, \b7
33	eor		\b7, \b7, \b5
34	eor		\b3, \b3, \b4
35	eor		\b4, \b4, \b5
36	eor		\b2, \b2, \b7
37	eor		\b3, \b3, \b1
38	eor		\b1, \b1, \b5
39	.endm
40
41	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	/*
	 * Output-side linear layer of the forward S-box; the sbox macro runs
	 * it on the result of inv_gf256 (name presumably "output basis
	 * change").
	 *
	 * XOR only, in place on the eight operands, no scratch registers.
	 * Later steps read values produced by earlier ones, so the order
	 * must be kept.
	 */
42	eor		\b0, \b0, \b6
43	eor		\b1, \b1, \b4
44	eor		\b4, \b4, \b6
45	eor		\b2, \b2, \b0
46	eor		\b6, \b6, \b1
47	eor		\b1, \b1, \b5
48	eor		\b5, \b5, \b3
49	eor		\b3, \b3, \b7
50	eor		\b7, \b7, \b5
51	eor		\b2, \b2, \b5
52	eor		\b4, \b4, \b7
53	.endm
54
55	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	/*
	 * Input-side linear layer of the inverse S-box; first stage of
	 * inv_sbox.
	 *
	 * Mind the formal parameter list: the names are declared in the
	 * order b6, b1, b2, b4, b7, b0, b3, b5, so the n-th actual argument
	 * is generally not the one called "bn" in the body.
	 *
	 * XOR only, in place, no scratch registers; order-dependent.
	 */
56	eor		\b1, \b1, \b7
57	eor		\b4, \b4, \b7
58	eor		\b7, \b7, \b5
59	eor		\b1, \b1, \b3
60	eor		\b2, \b2, \b5
61	eor		\b3, \b3, \b7
62	eor		\b6, \b6, \b1
63	eor		\b2, \b2, \b0
64	eor		\b5, \b5, \b3
65	eor		\b4, \b4, \b6
66	eor		\b0, \b0, \b6
67	eor		\b1, \b1, \b4
68	.endm
69
70	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	/*
	 * Output-side linear layer of the inverse S-box; last stage of
	 * inv_sbox.
	 *
	 * As in inv_in_bs_ch, the formals are declared in a permuted order
	 * (b6, b5, b0, b3, b7, b1, b4, b2); read the body in terms of the
	 * names, not of argument positions.
	 *
	 * XOR only, in place, no scratch registers; order-dependent.
	 */
71	eor		\b1, \b1, \b5
72	eor		\b2, \b2, \b7
73	eor		\b3, \b3, \b1
74	eor		\b4, \b4, \b5
75	eor		\b7, \b7, \b5
76	eor		\b3, \b3, \b4
77	eor 		\b5, \b5, \b0
78	eor		\b3, \b3, \b7
79	eor		\b6, \b6, \b2
80	eor		\b2, \b2, \b1
81	eor		\b6, \b6, \b3
82	eor		\b3, \b3, \b0
83	eor		\b5, \b5, \b6
84	.endm
85
86	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	/*
	 * Two-bit by two-bit multiply on bit-sliced operands (by its name a
	 * multiplication in GF(4); the field representation is not spelled
	 * out in this file).
	 *
	 * In terms of the values on entry, the results are
	 *   x1 = (x1 & y0) ^ (x0 & (y0 ^ y1))
	 *   x0 = (x1 & y0) ^ ((x0 ^ x1) & y1)
	 *
	 * y0 and y1 are preserved; t0 and t1 are scratch.
	 */
87	eor 		\t0, \y0, \y1
88	and		\t0, \t0, \x0
89	eor		\x0, \x0, \x1
90	and		\t1, \x1, \y0
91	and		\x0, \x0, \y1
92	eor		\x1, \t1, \t0
93	eor		\x0, \x0, \t1
94	.endm
95
96	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	/*
	 * Two independent two-bit multiplies, interleaved instruction by
	 * instruction: (x0, x1) by (y0, y1) using t0, and (x2, x3) by
	 * (y2, y3) using t1.
	 *
	 * In terms of the values on entry:
	 *   x2 = (x3 & y2) ^ ((x2 ^ x3) & y3)
	 *   x3 = (x3 & y2) ^ (x2 & (y2 ^ y3))
	 * which is exactly what mul_gf4 would produce for that pair, and
	 *   x1 = (x1 & y0) ^ ((x0 ^ x1) & y1)
	 *   x0 = ((x0 ^ x1) & y1) ^ (x0 & (y0 ^ y1))
	 * i.e. if mul_gf4 would have left (p0, p1) in (x0, x1), this leaves
	 * (p0 ^ p1, p0) instead - presumably the extra factor the "_n_" in
	 * the macro name refers to.
	 *
	 * y0-y3 are preserved; t0 and t1 are scratch.
	 */
97	eor		\t0, \y0, \y1
98	eor 		\t1, \y2, \y3
99	and		\t0, \t0, \x0
100	and		\t1, \t1, \x2
101	eor		\x0, \x0, \x1
102	eor		\x2, \x2, \x3
103	and		\x1, \x1, \y0
104	and		\x3, \x3, \y2
105	and		\x0, \x0, \y1
106	and		\x2, \x2, \y3
107	eor		\x1, \x1, \x0
108	eor		\x2, \x2, \x3
109	eor		\x0, \x0, \t0
110	eor		\x3, \x3, \t1
111	.endm
112
113	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
114				    y0, y1, y2, y3, t0, t1, t2, t3
	/*
	 * Multiply two four-register operands, (x0..x3) and (x4..x7), by the
	 * common four-register operand (y0..y3), in place. By its name this
	 * is a pair of GF(16) multiplications, each built from three of the
	 * two-bit multiplies above.
	 *
	 * y0 and y1 are temporarily replaced by y0 ^ y2 and y1 ^ y3; the
	 * second pair of XORs undoes that, so y0-y3 are unchanged on exit.
	 * t0-t3 are scratch.
	 */
115	eor		\t0, \x0, \x2
116	eor		\t1, \x1, \x3
117	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
118	eor		\y0, \y0, \y2		// y0/y1 now hold y0^y2, y1^y3
119	eor		\y1, \y1, \y3
120	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
121	eor		\x0, \x0, \t0
122	eor		\x2, \x2, \t0
123	eor		\x1, \x1, \t1
124	eor		\x3, \x3, \t1
125	eor		\t0, \x4, \x6
126	eor		\t1, \x5, \x7
127	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
128	eor		\y0, \y0, \y2		// restore the original y0/y1
129	eor		\y1, \y1, \y3
130	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
131	eor		\x4, \x4, \t0
132	eor		\x6, \x6, \t0
133	eor		\x5, \x5, \t1
134	eor		\x7, \x7, \t1
135	.endm
136
137	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
138				   t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * Nonlinear core shared by sbox and inv_sbox (by its name, an
	 * inversion in GF(2^8)), computed in place on x0..x7.
	 *
	 * t0-t3 and s0-s3 are scratch. The straight-line part only reads
	 * x0..x7; it derives four intermediate values, left in s3, s2, s1
	 * and t1, which then serve as the common multiplier of the closing
	 * mul_gf16_2. That multiply is the only place x0..x7 are written.
	 *
	 * bsl is a bitwise select keyed on its own destination: each result
	 * bit is taken from the first source where the destination bit was 1
	 * and from the second source where it was 0.
	 *
	 * Registers are reused as soon as their previous value is dead, so
	 * the sequence must not be rescheduled.
	 */
139	eor		\t3, \x4, \x6
140	eor		\t0, \x5, \x7
141	eor		\t1, \x1, \x3
142	eor		\s1, \x7, \x6
143	eor		\s0, \x0, \x2
144	eor		\s3, \t3, \t0
145	orr		\t2, \t0, \t1
146	and		\s2, \t3, \s0
147	orr		\t3, \t3, \s0
148	eor		\s0, \s0, \t1
149	and		\t0, \t0, \t1
150	eor		\t1, \x3, \x2
151	and		\s3, \s3, \s0
152	and		\s1, \s1, \t1
153	eor		\t1, \x4, \x5
154	eor		\s0, \x1, \x0
155	eor		\t3, \t3, \s1
156	eor		\t2, \t2, \s1
157	and		\s1, \t1, \s0
158	orr		\t1, \t1, \s0
159	eor		\t3, \t3, \s3
160	eor		\t0, \t0, \s1
161	eor		\t2, \t2, \s2
162	eor		\t1, \t1, \s3
163	eor		\t0, \t0, \s2
164	and		\s0, \x7, \x3
165	eor		\t1, \t1, \s2
166	and		\s1, \x6, \x2
167	and		\s2, \x5, \x1
168	orr		\s3, \x4, \x0
169	eor		\t3, \t3, \s0
170	eor		\t1, \t1, \s2
171	eor		\s0, \t0, \s3
172	eor		\t2, \t2, \s1
173	and		\s2, \t3, \t1
174	eor		\s1, \t2, \s2
175	eor		\s3, \s0, \s2
176	bsl		\s1, \t1, \s0
177	not		\t0, \s0
178	bsl		\s0, \s1, \s3
179	bsl		\t0, \s1, \s3
180	bsl		\s3, \t3, \t2
181	eor		\t3, \t3, \t2
182	and		\s2, \s0, \s3
183	eor		\t1, \t1, \t0
184	eor		\s2, \s2, \t3
185	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
186			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
187	.endm
188
189	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
190			      t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * S-box step of an encryption round on the bit-sliced state b0-b7.
	 * Arguments are bare register names (e.g. v0); the .16b arrangement
	 * is appended here. t0-t3 and s0-s3 are scratch vector registers.
	 *
	 * The three stages are chained purely through argument order: each
	 * stage is handed the state registers in the permutation it expects.
	 * As a result the substituted state does not come back in b0..b7
	 * order; aesbs_encrypt8 continues with it in the order
	 * b0, b1, b4, b6, b3, b7, b2, b5.
	 */
191	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
192			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
193	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
194			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
195			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
196			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
197	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
198			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
199	.endm
200
201	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
202				  t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * S-box step of a decryption round on the bit-sliced state b0-b7.
	 * Arguments are bare register names; the .16b arrangement is
	 * appended here. t0-t3 and s0-s3 are scratch vector registers.
	 *
	 * Same three-stage structure as sbox, sharing inv_gf256 but with the
	 * inverse input/output layers. The stages are chained through
	 * argument order only, and the result comes back permuted:
	 * aesbs_decrypt8 continues with the state in the order
	 * b0, b1, b6, b4, b2, b7, b3, b5.
	 */
203	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
204			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
205	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
206			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
207			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
208			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
209	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
210			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
211	.endm
212
/*
 * enc_next_rk - load the 128-byte round key at bskey into v16-v23 and step
 * bskey forward past it. Does not touch the condition flags.
 */
	.macro		enc_next_rk
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	ldp		q16, q17, [bskey], #128		// post-index: bskey += 128
	.endm
219
/*
 * dec_next_rk - load the 128-byte round key that ends at bskey into v16-v23
 * and step bskey back to its start. Does not touch the condition flags.
 */
	.macro		dec_next_rk
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	ldp		q16, q17, [bskey, #-128]!	// pre-index: bskey -= 128
	.endm
226
/*
 * add_round_key - XOR the round key held in v16-v23 into the eight state
 * registers: the n-th argument is combined with v(16 + n).
 *
 * Arguments are bare register names. The eight XORs are independent of one
 * another (the state registers are distinct from v16-v23 at every call site
 * in this file), so they are simply issued from the last pair to the first.
 */
	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x7\().16b, \x7\().16b, v23.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x0\().16b, \x0\().16b, v16.16b
	.endm
237
/*
 * shift_rows - apply one byte permutation to each of the eight state
 * registers in place.
 *
 * mask is the register holding the 16 tbl byte indices and is left
 * unchanged; it is a separate register (v24) at every call site in this
 * file. Each tbl touches only its own state register, so the eight lookups
 * are independent and are issued here from x7 down to x0.
 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	.endm
248
249	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
250				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	/*
	 * Column-mixing step (MixColumns, going by the name) on the
	 * bit-sliced state x0-x7; t0-t7 are scratch. Arguments are bare
	 * register names.
	 *
	 * "ext Vd, Vn, Vn, #12" and "... #8" rotate the 16 bytes of a
	 * register by 12 and 8 byte positions respectively; everything else
	 * is XOR. The condition flags are not touched.
	 *
	 * inv: leave blank for the plain variant. inv_mix_cols passes a
	 * non-blank value, which selects the .else branch; that branch
	 * computes the same values as the plain one but delivers them with
	 * x2/x3 exchanged and x4/x6 exchanged.
	 */
251	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
252	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
253	eor		\x0\().16b, \x0\().16b, \t0\().16b
254	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
255	eor		\x1\().16b, \x1\().16b, \t1\().16b
256	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
257	eor		\x2\().16b, \x2\().16b, \t2\().16b
258	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
259	eor		\x3\().16b, \x3\().16b, \t3\().16b
260	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
261	eor		\x4\().16b, \x4\().16b, \t4\().16b
262	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
263	eor		\x5\().16b, \x5\().16b, \t5\().16b
264	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
265	eor		\x6\().16b, \x6\().16b, \t6\().16b
266	eor		\t1\().16b, \t1\().16b, \x0\().16b
267	eor		\x7\().16b, \x7\().16b, \t7\().16b
268	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
269	eor		\t2\().16b, \t2\().16b, \x1\().16b
270	eor		\t0\().16b, \t0\().16b, \x7\().16b
271	eor		\t1\().16b, \t1\().16b, \x7\().16b
272	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
273	eor		\t5\().16b, \t5\().16b, \x4\().16b
274	eor		\x0\().16b, \x0\().16b, \t0\().16b
275	eor		\t6\().16b, \t6\().16b, \x5\().16b
276	eor		\x1\().16b, \x1\().16b, \t1\().16b
277	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
278	eor		\t4\().16b, \t4\().16b, \x3\().16b
279	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
280	eor		\t7\().16b, \t7\().16b, \x6\().16b
281	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
282	eor		\t3\().16b, \t3\().16b, \x2\().16b
283	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
284	eor		\t4\().16b, \t4\().16b, \x7\().16b
285	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
286	eor		\t3\().16b, \t3\().16b, \x7\().16b
287	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
288	eor		\x7\().16b, \t1\().16b, \t5\().16b
289	.ifb		\inv
	// plain variant: results land in x2, x4, x5, x3, x6 as named
290	eor		\x2\().16b, \t0\().16b, \t4\().16b
291	eor		\x4\().16b, \x4\().16b, \t3\().16b
292	eor		\x5\().16b, \x5\().16b, \t7\().16b
293	eor		\x3\().16b, \x3\().16b, \t6\().16b
294	eor		\x6\().16b, \x6\().16b, \t2\().16b
295	.else
	// inv variant: same values, with x2<->x3 and x4<->x6 exchanged;
	// t3 carries the x4^t3 result until x6 is free to receive it
296	eor		\t3\().16b, \t3\().16b, \x4\().16b
297	eor		\x5\().16b, \x5\().16b, \t7\().16b
298	eor		\x2\().16b, \x3\().16b, \t6\().16b
299	eor		\x3\().16b, \t0\().16b, \t4\().16b
300	eor		\x4\().16b, \x6\().16b, \t2\().16b
301	mov		\x6\().16b, \t3\().16b
302	.endif
303	.endm
304
305	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
306				      t0, t1, t2, t3, t4, t5, t6, t7
	/*
	 * Inverse column-mixing step on the bit-sliced state x0-x7; t0-t7
	 * are scratch. Arguments are bare register names.
	 *
	 * Implemented as an XOR pre-processing pass followed by mix_cols in
	 * its "inv" variant: each tN is first set to xN ^ rot8(xN), where
	 * rot8 is a rotation by 8 bytes done with ext, and selected tN are
	 * then folded into the state before mix_cols runs.
	 *
	 * The condition flags are not touched.
	 */
307	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
308	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
309	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
310	eor		\t0\().16b, \t0\().16b, \x0\().16b
311	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
312	eor		\t6\().16b, \t6\().16b, \x6\().16b
313	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
314	eor		\t7\().16b, \t7\().16b, \x7\().16b
315	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
316	eor		\t1\().16b, \t1\().16b, \x1\().16b
317	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
318	eor		\t2\().16b, \t2\().16b, \x2\().16b
319	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
320	eor		\t3\().16b, \t3\().16b, \x3\().16b
321	eor		\t4\().16b, \t4\().16b, \x4\().16b
322	eor		\t5\().16b, \t5\().16b, \x5\().16b
323	eor		\x0\().16b, \x0\().16b, \t6\().16b
324	eor		\x1\().16b, \x1\().16b, \t6\().16b
325	eor		\x2\().16b, \x2\().16b, \t0\().16b
326	eor		\x4\().16b, \x4\().16b, \t2\().16b
327	eor		\x3\().16b, \x3\().16b, \t1\().16b
328	eor		\x1\().16b, \x1\().16b, \t7\().16b
329	eor		\x2\().16b, \x2\().16b, \t7\().16b
330	eor		\x4\().16b, \x4\().16b, \t6\().16b
331	eor		\x5\().16b, \x5\().16b, \t3\().16b
332	eor		\x3\().16b, \x3\().16b, \t6\().16b
333	eor		\x6\().16b, \x6\().16b, \t4\().16b
334	eor		\x4\().16b, \x4\().16b, \t7\().16b
335	eor		\x5\().16b, \x5\().16b, \t7\().16b
336	eor		\x7\().16b, \x7\().16b, \t5\().16b
337	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
338			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
339	.endm
340
341	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	/*
	 * Two independent swap-move steps, interleaved: one on (a0, b0)
	 * using t0 and one on (a1, b1) using t1. Each computes
	 *
	 *   t  = ((b >> n) ^ a) & mask
	 *   a ^= t
	 *   b ^= t << n
	 *
	 * which exchanges the bits of a selected by mask with the bits of b
	 * selected by (mask << n). The shifts operate on 64-bit lanes.
	 *
	 * n is an immediate shift count; mask is a register and is
	 * preserved; t0 and t1 are scratch.
	 */
342	ushr		\t0\().2d, \b0\().2d, #\n
343	ushr		\t1\().2d, \b1\().2d, #\n
344	eor		\t0\().16b, \t0\().16b, \a0\().16b
345	eor		\t1\().16b, \t1\().16b, \a1\().16b
346	and		\t0\().16b, \t0\().16b, \mask\().16b
347	and		\t1\().16b, \t1\().16b, \mask\().16b
348	eor		\a0\().16b, \a0\().16b, \t0\().16b
349	shl		\t0\().2d, \t0\().2d, #\n
350	eor		\a1\().16b, \a1\().16b, \t1\().16b
351	shl		\t1\().2d, \t1\().2d, #\n
352	eor		\b0\().16b, \b0\().16b, \t0\().16b
353	eor		\b1\().16b, \b1\().16b, \t1\().16b
354	.endm
355
356	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	/*
	 * Redistribute bits between the eight state registers with three
	 * rounds of swap-move, at bit distances 1, 2 and 4 with masks 0x55,
	 * 0x33 and 0x0f (the usual way of transposing an 8x8 bit matrix
	 * that is spread over eight registers).
	 *
	 * Note that the formals are declared in descending order: the first
	 * actual argument is x7 and the eighth is x0.
	 *
	 * t0-t3 are scratch: t0 holds 0x55 for the first round and is
	 * reloaded with 0x0f for the last one, t1 holds 0x33, and t2/t3 are
	 * the swapmove temporaries.
	 *
	 * aesbs_encrypt8 and aesbs_decrypt8 invoke this macro both before the
	 * rounds and again after them.
	 */
357	movi		\t0\().16b, #0x55
358	movi		\t1\().16b, #0x33
359	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
360	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
361	movi		\t0\().16b, #0x0f		// 0x55 is no longer needed
362	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
363	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
364	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
365	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
366	.endm
367
368
369	.align		6
	/*
	 * Byte-index vectors for tbl; each .octa is one 16-byte table.
	 * .align 6 places the group on a 64-byte boundary. The tables live
	 * in .text next to the code and are fetched with pc-relative
	 * "ldr qN, <label>" loads.
	 *
	 *   M0     permutation applied to each inner round key by
	 *          aesbs_convert_key
	 *   M0SR   permutation applied to the input blocks by aesbs_encrypt8
	 *   SR     shift_rows mask used inside the encryption round loop
	 *   SRM0   shift_rows mask loaded for the final pass of that loop
	 *   M0ISR, ISR, ISRM0
	 *          the same three roles in aesbs_decrypt8
	 *
	 * The names presumably read as SR = ShiftRows, ISR = inverse
	 * ShiftRows and M0 = the byte ordering used by the bit-sliced form,
	 * with the compound names denoting the two combined - this is
	 * inferred from how they are used, not stated in the file.
	 */
370M0:	.octa		0x0004080c0105090d02060a0e03070b0f
371
372M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
373SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
374SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
375
376M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
377ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
378ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
379
/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 *
 * Convert an expanded key schedule into the layout that aesbs_encrypt8 and
 * aesbs_decrypt8 read through bskey.
 *
 * In:	x0 = out, x1 = rk, w2 = rounds. rounds is expected to be >= 2: the
 *	loop below runs its body before it tests the counter.
 *
 * Layout written to out:
 *   16 bytes			round 0 key, copied unchanged
 *   (rounds - 1) * 128 bytes	inner round keys; each key is byte-permuted
 *				through M0 and expanded into eight 16-byte
 *				bit planes, plane n holding 0xff in every
 *				byte whose bit n was set. Planes 0, 1, 5 and
 *				6 are stored inverted, which is equivalent
 *				to XORing every key byte with 0x63.
 *   16 bytes			last round key with 0x63 XORed into every
 *				byte
 *
 * rounds + 1 round keys are read from rk.
 * Clobbers x0-x2, v0-v17 and the condition flags.
 */
ENTRY(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b,  #0x01			// bit masks
	movi		v9.16b,  #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	/*
	 * rounds is an int, so only w2 is defined on entry (AAPCS64 leaves
	 * the upper half of x2 unspecified). Count in the 32-bit view.
	 */
	sub		w2, w2, #1
	str		q7, [x0], #16			// save round 0 key

.Lkey_loop:
	tbl		v7.16b, {v17.16b}, v16.16b	// permute current key
	ld1		{v17.4s}, [x1], #16		// load next round key

	cmtst		v0.16b, v7.16b, v8.16b		// plane n: 0xff where
	cmtst		v1.16b, v7.16b, v9.16b		// bit n of the key
	cmtst		v2.16b, v7.16b, v10.16b		// byte is set
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b			// planes 0, 1, 5, 6 are
	not		v1.16b, v1.16b			// the set bits of 0x63
	not		v5.16b, v5.16b
	not		v6.16b, v6.16b

	subs		w2, w2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63			// same constant, applied
	eor		v17.16b, v17.16b, v7.16b	// to the last round key
	str		q17, [x0]
	ret
ENDPROC(aesbs_convert_key)
429
430	.align		4
	/*
	 * aesbs_encrypt8 - run the encryption rounds on eight blocks held in
	 * registers.
	 *
	 * In:	v0-v7	the eight input blocks
	 *	bskey	(x12) key material laid out as: 16-byte first key,
	 *		then 128 bytes per inner round, then a 16-byte last key
	 *	rounds	(x11) number of rounds. The whole 64-bit register is
	 *		used as the loop counter, so the caller must pass a
	 *		zero-extended value.
	 * Out:	blocks 0-7 in v0, v1, v4, v6, v3, v7, v2, v5 (in that order)
	 * Clobbers v8-v24, bskey, rounds and the condition flags. No other
	 * general purpose register is written.
	 */
431aesbs_encrypt8:
432	ldr		q9, [bskey], #16		// round 0 key
433	ldr		q8, M0SR			// initial byte permutation
434	ldr		q24, SR				// shift_rows mask for the loop
435
436	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
437	eor		v11.16b, v1.16b, v9.16b
438	tbl		v0.16b, {v10.16b}, v8.16b
439	eor		v12.16b, v2.16b, v9.16b
440	tbl		v1.16b, {v11.16b}, v8.16b
441	eor		v13.16b, v3.16b, v9.16b
442	tbl		v2.16b, {v12.16b}, v8.16b
443	eor		v14.16b, v4.16b, v9.16b
444	tbl		v3.16b, {v13.16b}, v8.16b
445	eor		v15.16b, v5.16b, v9.16b
446	tbl		v4.16b, {v14.16b}, v8.16b
447	eor		v10.16b, v6.16b, v9.16b
448	tbl		v5.16b, {v15.16b}, v8.16b
449	eor		v11.16b, v7.16b, v9.16b
450	tbl		v6.16b, {v10.16b}, v8.16b
451	tbl		v7.16b, {v11.16b}, v8.16b
452
453	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
454
455	sub		rounds, rounds, #1
456	b		.Lenc_sbox			// first pass skips shift_rows
457
458.Lenc_loop:
459	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
460.Lenc_sbox:
461	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
462								v13, v14, v15
	// b.cc is taken when the subtraction borrows, i.e. when the counter
	// was already 0: the S-box just executed belonged to the last round
463	subs		rounds, rounds, #1
464	b.cc		.Lenc_done
465
466	enc_next_rk
467
	// from here on the state is in the register order sbox left it in
468	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
469								v13, v14, v15
470
471	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
472
	// still the flags of the subs above: nothing in between writes them.
	// When the counter has just reached 0 exactly one pass remains, and
	// it uses the SRM0 mask instead of SR.
473	b.ne		.Lenc_loop
474	ldr		q24, SRM0
475	b		.Lenc_loop
476
477.Lenc_done:
478	ldr		q12, [bskey]			// last round key
479
480	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
481
482	eor		v0.16b, v0.16b, v12.16b
483	eor		v1.16b, v1.16b, v12.16b
484	eor		v4.16b, v4.16b, v12.16b
485	eor		v6.16b, v6.16b, v12.16b
486	eor		v3.16b, v3.16b, v12.16b
487	eor		v7.16b, v7.16b, v12.16b
488	eor		v2.16b, v2.16b, v12.16b
489	eor		v5.16b, v5.16b, v12.16b
490	ret
491ENDPROC(aesbs_encrypt8)
492
493	.align		4
	/*
	 * aesbs_decrypt8 - run the decryption rounds on eight blocks held in
	 * registers.
	 *
	 * In:	v0-v7	the eight input blocks
	 *	bskey	(x12) start of the key material, laid out as: 16-byte
	 *		first key, 128 bytes per inner round, 16-byte last key
	 *	rounds	(x11) number of rounds. The whole 64-bit register is
	 *		used, both as loop counter and to compute the end of
	 *		the key material, so the caller must pass a
	 *		zero-extended value.
	 * Out:	blocks 0-7 in v0, v1, v6, v4, v2, v7, v3, v5 (in that order)
	 * Clobbers x9, v8-v24, bskey, rounds and the condition flags.
	 *
	 * The key material is consumed back to front.
	 */
494aesbs_decrypt8:
	// bskey + rounds * 128 - 112 == bskey + 16 + (rounds - 1) * 128,
	// which is the offset of the 16-byte last key
495	lsl		x9, rounds, #7
496	add		bskey, bskey, x9
497
498	ldr		q9, [bskey, #-112]!		// round 0 key
499	ldr		q8, M0ISR			// initial byte permutation
500	ldr		q24, ISR			// shift_rows mask for the loop
501
502	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
503	eor		v11.16b, v1.16b, v9.16b
504	tbl		v0.16b, {v10.16b}, v8.16b
505	eor		v12.16b, v2.16b, v9.16b
506	tbl		v1.16b, {v11.16b}, v8.16b
507	eor		v13.16b, v3.16b, v9.16b
508	tbl		v2.16b, {v12.16b}, v8.16b
509	eor		v14.16b, v4.16b, v9.16b
510	tbl		v3.16b, {v13.16b}, v8.16b
511	eor		v15.16b, v5.16b, v9.16b
512	tbl		v4.16b, {v14.16b}, v8.16b
513	eor		v10.16b, v6.16b, v9.16b
514	tbl		v5.16b, {v15.16b}, v8.16b
515	eor		v11.16b, v7.16b, v9.16b
516	tbl		v6.16b, {v10.16b}, v8.16b
517	tbl		v7.16b, {v11.16b}, v8.16b
518
519	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
520
521	sub		rounds, rounds, #1
522	b		.Ldec_sbox			// first pass skips shift_rows
523
524.Ldec_loop:
525	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
526.Ldec_sbox:
527	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
528								v13, v14, v15
	// b.cc is taken when the subtraction borrows, i.e. when the counter
	// was already 0: the S-box just executed belonged to the last round
529	subs		rounds, rounds, #1
530	b.cc		.Ldec_done
531
532	dec_next_rk
533
	// from here on the state is in the register order inv_sbox left it in
534	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
535
536	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
537								v13, v14, v15
538
	// still the flags of the subs above: nothing in between writes them.
	// When the counter has just reached 0 exactly one pass remains, and
	// it uses the ISRM0 mask instead of ISR.
539	b.ne		.Ldec_loop
540	ldr		q24, ISRM0
541	b		.Ldec_loop
542.Ldec_done:
	// bskey now points just past the 16-byte first key
543	ldr		q12, [bskey, #-16]		// last round key
544
545	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
546
547	eor		v0.16b, v0.16b, v12.16b
548	eor		v1.16b, v1.16b, v12.16b
549	eor		v6.16b, v6.16b, v12.16b
550	eor		v4.16b, v4.16b, v12.16b
551	eor		v2.16b, v2.16b, v12.16b
552	eor		v7.16b, v7.16b, v12.16b
553	eor		v3.16b, v3.16b, v12.16b
554	eor		v5.16b, v5.16b, v12.16b
555	ret
556ENDPROC(aesbs_decrypt8)
557
/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 *
 * __ecb_crypt - body shared by the two ECB entry points.
 *   do8	8-block core to call (aesbs_encrypt8 or aesbs_decrypt8)
 *   o0-o7	registers in which that core returns blocks 0-7, in order
 *
 * Register use: x19 = out, x20 = in, x21 = rk, x22 = rounds,
 * x23 = blocks still to do, x5 = tail mask of the current batch.
 *
 * Each pass handles up to eight blocks. When fewer than eight remain, x5 is
 * set to (1 << remaining), otherwise to 0; the tbnz ladders test bits 1-7
 * and leave after that many blocks. Bit 0 is never tested, so blocks must be
 * at least 1 on entry: a count of 0 would be processed as a full batch of
 * eight.
 *
 * x5 is live across the call to the core; the cores write no general
 * purpose register other than x9, x11 and x12.
 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	/*
	 * rounds is an int, so only w3 is defined on entry (AAPCS64 leaves
	 * the upper half of x3 unspecified), yet the cores consume rounds as
	 * a 64-bit value. Writing w22 zero-extends into x22.
	 */
	mov		w22, w3
	mov		x23, x4

99:	mov		x5, #1
	lsl		x5, x5, x23		// shift uses x23 modulo 64
	subs		w23, w23, #8		// also clears bits 63:32 of x23
	csel		x23, x23, xzr, pl	// blocks left after this batch
	csel		x5, x5, xzr, mi		// mask only for a short batch

	ld1		{v0.16b}, [x20], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x20], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x20], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x20], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x20], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x20], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x20], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x20], #16

0:	mov		bskey, x21
	mov		rounds, x22
	bl		\do8

	st1		{\o0\().16b}, [x19], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x19], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	cond_yield_neon
	b		99b

1:	frame_pop
	ret
	.endm
622
623	.align		4
	/*
	 * ECB encryption entry point. The register list is the order in
	 * which the results of aesbs_encrypt8 are stored as blocks 0-7.
	 */
624ENTRY(aesbs_ecb_encrypt)
625	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
626ENDPROC(aesbs_ecb_encrypt)
627
628	.align		4
	/*
	 * ECB decryption entry point. The register list is the order in
	 * which the results of aesbs_decrypt8 are stored as blocks 0-7.
	 */
629ENTRY(aesbs_ecb_decrypt)
630	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
631ENDPROC(aesbs_ecb_decrypt)
632
/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 *
 * Register use: x19 = out, x20 = in, x21 = rk, x22 = rounds,
 * x23 = blocks still to do, x24 = iv, x6 = tail mask of the current batch
 * ((1 << remaining) when fewer than eight blocks remain, otherwise 0).
 * v25-v31 keep copies of ciphertext blocks 0-6 of the batch, which are the
 * chaining values for blocks 1-7.
 *
 * blocks must be at least 1 on entry: bit 0 of the tail mask is never
 * tested, so a count of 0 would be processed as a full batch of eight.
 * After every batch iv[] is updated with the last ciphertext block consumed.
 *
 * x6 and v25-v31 are live across the call; aesbs_decrypt8 only writes x9,
 * x11, x12 and v0-v24.
 */
	.align		4
ENTRY(aesbs_cbc_decrypt)
	frame_push	6

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	/*
	 * rounds is an int, so only w3 is defined on entry (AAPCS64 leaves
	 * the upper half of x3 unspecified), yet aesbs_decrypt8 consumes
	 * rounds as a 64-bit value. Writing w22 zero-extends into x22.
	 */
	mov		w22, w3
	mov		x23, x4
	mov		x24, x5

99:	mov		x6, #1
	lsl		x6, x6, x23		// shift uses x23 modulo 64
	subs		w23, w23, #8		// also clears bits 63:32 of x23
	csel		x23, x23, xzr, pl	// blocks left after this batch
	csel		x6, x6, xzr, mi		// mask only for a short batch

	ld1		{v0.16b}, [x20], #16
	mov		v25.16b, v0.16b
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x20], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x20], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x20], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x20], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x20], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x20], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x20]		// no writeback: reloaded below as next IV

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x24]		// load IV

	/*
	 * Blocks 0-7 come back in v0, v1, v6, v4, v2, v7, v3, v5. All eight
	 * XORs are done unconditionally; in a short batch the surplus results
	 * are simply never stored.
	 */
	eor		v1.16b, v1.16b, v25.16b
	eor		v6.16b, v6.16b, v26.16b
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	/* v24 tracks the ciphertext of the last block stored so far */
	st1		{v0.16b}, [x19], #16
	mov		v24.16b, v25.16b
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x19], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x19], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x19], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x19], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x19], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x19], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x20], #16	// 8th ciphertext block, now with writeback
	st1		{v5.16b}, [x19], #16
1:	st1		{v24.16b}, [x24]		// store IV

	cbz		x23, 2f
	cond_yield_neon
	b		99b

2:	frame_pop
	ret
ENDPROC(aesbs_cbc_decrypt)
724
725	.macro		next_tweak, out, in, const, tmp
	/*
	 * Derive the next XTS tweak: out = in multiplied by x in GF(2^128),
	 * given a const register holding 1 in d[0] and 0x87 in d[1] (that is
	 * how __xts_crypt sets up v30).
	 *
	 * out may be the same register as in; tmp is scratch and const is
	 * preserved.
	 */
726	sshr		\tmp\().2d,  \in\().2d,   #63	// all-ones in each half whose top bit is set
727	and		\tmp\().16b, \tmp\().16b, \const\().16b
728	add		\out\().2d,  \in\().2d,   \in\().2d	// shift each 64-bit half left by one
729	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	// swap halves: carry goes up, reduction goes down
730	eor		\out\().16b, \out\().16b, \tmp\().16b
731	.endm
732
733	/*
734	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
735	 *		     int blocks, u8 iv[])
736	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
737	 *		     int blocks, u8 iv[])
	 *
	 * __xts_crypt8 - helper for the __xts_crypt macro: load up to eight
	 * blocks, XOR each with its tweak, then tail-call the 8-block core
	 * whose address is in x7. The final "br" leaves x30 untouched, so
	 * the core returns directly to the caller of __xts_crypt8.
	 *
	 * In:	x20 = input pointer, x21 = key schedule, x22 = rounds,
	 *	x23 = blocks still to do, x7 = address of the core,
	 *	v25 = tweak for the first block,
	 *	v30 = constant operand for next_tweak
	 * Out:	x6 = tail mask ((1 << remaining) when fewer than eight blocks
	 *	remained, otherwise 0); x20 and x23 advanced;
	 *	v25-v28 = tweaks of blocks 0-3; the tweaks applied to blocks
	 *	4-7 are saved in four 16-byte slots starting at
	 *	sp + .Lframe_local_offset; plus whatever the core returns.
	 *	Tweaks are computed one block ahead, so the tweak following
	 *	the last block loaded is in v26, v27 or v28 after one, two or
	 *	three blocks, and in v29 after four or more.
	 * v31 is scratch.
	 *
	 * The stack slots belong to the frame set up by the caller
	 * (frame_push 6, 64 in __xts_crypt); sp is not modified here.
738	 */
739__xts_crypt8:
740	mov		x6, #1
741	lsl		x6, x6, x23		// shift uses x23 modulo 64
742	subs		w23, w23, #8		// also clears bits 63:32 of x23
743	csel		x23, x23, xzr, pl	// blocks left after this batch
744	csel		x6, x6, xzr, mi		// mask only for a short batch
745
746	ld1		{v0.16b}, [x20], #16
747	next_tweak	v26, v25, v30, v31
748	eor		v0.16b, v0.16b, v25.16b
749	tbnz		x6, #1, 0f
750
751	ld1		{v1.16b}, [x20], #16
752	next_tweak	v27, v26, v30, v31
753	eor		v1.16b, v1.16b, v26.16b
754	tbnz		x6, #2, 0f
755
756	ld1		{v2.16b}, [x20], #16
757	next_tweak	v28, v27, v30, v31
758	eor		v2.16b, v2.16b, v27.16b
759	tbnz		x6, #3, 0f
760
761	ld1		{v3.16b}, [x20], #16
762	next_tweak	v29, v28, v30, v31
763	eor		v3.16b, v3.16b, v28.16b
764	tbnz		x6, #4, 0f
765
	// from block 4 on, v29 is recycled for every tweak, so the value
	// applied to each block is spilled before v29 is advanced
766	ld1		{v4.16b}, [x20], #16
767	str		q29, [sp, #.Lframe_local_offset]
768	eor		v4.16b, v4.16b, v29.16b
769	next_tweak	v29, v29, v30, v31
770	tbnz		x6, #5, 0f
771
772	ld1		{v5.16b}, [x20], #16
773	str		q29, [sp, #.Lframe_local_offset + 16]
774	eor		v5.16b, v5.16b, v29.16b
775	next_tweak	v29, v29, v30, v31
776	tbnz		x6, #6, 0f
777
778	ld1		{v6.16b}, [x20], #16
779	str		q29, [sp, #.Lframe_local_offset + 32]
780	eor		v6.16b, v6.16b, v29.16b
781	next_tweak	v29, v29, v30, v31
782	tbnz		x6, #7, 0f
783
784	ld1		{v7.16b}, [x20], #16
785	str		q29, [sp, #.Lframe_local_offset + 48]
786	eor		v7.16b, v7.16b, v29.16b
787	next_tweak	v29, v29, v30, v31
788
7890:	mov		bskey, x21
790	mov		rounds, x22
791	br		x7			// tail call into the 8-block core
792ENDPROC(__xts_crypt8)
793
/*
 * __xts_crypt - body shared by aesbs_xts_encrypt and aesbs_xts_decrypt.
 *   do8	8-block core to run (aesbs_encrypt8 or aesbs_decrypt8)
 *   o0-o7	registers in which that core returns blocks 0-7, in order
 *
 * Arguments on entry: x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
 * x5 = iv.
 * Register use: x19 = out, x20 = in, x21 = rk, x22 = rounds,
 * x23 = blocks still to do, x24 = iv, x6 = tail mask returned by
 * __xts_crypt8, x7 = address of the core, v25 = current tweak,
 * v30 = next_tweak constant (1 in d[0], 0x87 in d[1]).
 *
 * frame_push reserves 64 bytes of local storage, which __xts_crypt8 fills
 * with the tweaks of blocks 4-7.
 *
 * blocks must be at least 1 on entry: bit 0 of the tail mask is never
 * tested, so a count of 0 would be processed as a full batch of eight.
 * On return iv[] holds the tweak that follows the last block processed.
 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	6, 64

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	/*
	 * rounds is an int, so only w3 is defined on entry (AAPCS64 leaves
	 * the upper half of x3 unspecified), yet the cores consume rounds as
	 * a 64-bit value. Writing w22 zero-extends into x22.
	 */
	mov		w22, w3
	mov		x23, x4
	mov		x24, x5

	/*
	 * Label 0 is also the target handed to cond_yield_neon below, so
	 * this setup is repeated after a yield: the constant is rebuilt and
	 * the tweak is reloaded from iv[].
	 */
0:	movi		v30.2s, #0x1
	movi		v25.2s, #0x87
	uzp1		v30.4s, v30.4s, v25.4s	// v30 = { 1, 0, 0x87, 0 } as .4s
	ld1		{v25.16b}, [x24]

99:	adr		x7, \do8
	bl		__xts_crypt8

	/* tweaks of blocks 4-7, as spilled by __xts_crypt8 */
	ldp		q16, q17, [sp, #.Lframe_local_offset]
	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]

	eor		\o0\().16b, \o0\().16b, v25.16b
	eor		\o1\().16b, \o1\().16b, v26.16b
	eor		\o2\().16b, \o2\().16b, v27.16b
	eor		\o3\().16b, \o3\().16b, v28.16b

	/* v25 tracks the tweak that follows the last block stored */
	st1		{\o0\().16b}, [x19], #16
	mov		v25.16b, v26.16b
	tbnz		x6, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	mov		v25.16b, v27.16b
	tbnz		x6, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	mov		v25.16b, v28.16b
	tbnz		x6, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	mov		v25.16b, v29.16b	// v29 is already the next tweak for 4-8 blocks
	tbnz		x6, #4, 1f

	eor		\o4\().16b, \o4\().16b, v16.16b
	eor		\o5\().16b, \o5\().16b, v17.16b
	eor		\o6\().16b, \o6\().16b, v18.16b
	eor		\o7\().16b, \o7\().16b, v19.16b

	st1		{\o4\().16b}, [x19], #16
	tbnz		x6, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x6, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x6, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	st1		{v25.16b}, [x24]	// save the tweak: label 0 reloads it

	cond_yield_neon	0b
	b		99b

1:	st1		{v25.16b}, [x24]
	frame_pop
	ret
	.endm
856
857ENTRY(aesbs_xts_encrypt)
	/*
	 * XTS encryption entry point. The register list is the order in
	 * which the results of aesbs_encrypt8 are stored as blocks 0-7.
	 */
858	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
859ENDPROC(aesbs_xts_encrypt)
860
861ENTRY(aesbs_xts_decrypt)
	/*
	 * XTS decryption entry point. The register list is the order in
	 * which the results of aesbs_decrypt8 are stored as blocks 0-7.
	 */
862	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
863ENDPROC(aesbs_xts_decrypt)
864
/*
 * next_ctr - place the current counter in vector register v, then step the
 * counter by one.
 *
 * The counter lives in x7:x8 as a 128-bit integer (x7 = upper half,
 * x8 = lower half). Both halves are copied into v before the increment;
 * rev64 then byte-reverses each 64-bit lane of v.
 *
 * Clobbers the condition flags (adds/adc).
 */
	.macro		next_ctr, v
	mov		\v\().d[0], x7
	mov		\v\().d[1], x8
	adds		x8, x8, #1
	adc		x7, x7, xzr			// carry into the upper half
	rev64		\v\().16b, \v\().16b
	.endm
872
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[], u8 final[])
 *
 * Register use: x19 = out, x20 = in, x21 = rk, x22 = rounds,
 * x23 = keystream blocks still to generate, x24 = iv, x25 = final,
 * x26 = 1 if final is non-NULL, else 0,
 * x7:x8 = next counter value as a 128-bit integer (x7 = upper half),
 * x9 = tail mask of the current batch.
 *
 * When final is non-NULL, one keystream block more than `blocks` is
 * generated; it is not XORed with any input but stored to final[].
 * After every batch iv[] is updated with the next unused counter value.
 *
 * x9 is live across the call; aesbs_encrypt8 writes no general purpose
 * register other than x11 and x12.
 */
ENTRY(aesbs_ctr_encrypt)
	frame_push	8

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	/*
	 * rounds is an int, so only w3 is defined on entry (AAPCS64 leaves
	 * the upper half of x3 unspecified), yet aesbs_encrypt8 consumes
	 * rounds as a 64-bit value. Writing w22 zero-extends into x22.
	 */
	mov		w22, w3
	mov		x23, x4
	mov		x24, x5
	mov		x25, x6

	cmp		x25, #0
	cset		x26, ne
	add		x23, x23, x26		// do one extra block if final

	/*
	 * Label 98 is also the target handed to cond_yield_neon below: after
	 * a yield the counter is picked up again from iv[].
	 */
98:	ldp		x7, x8, [x24]
	ld1		{v0.16b}, [x24]		// v0 = counter block for block 0
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1		// x7:x8 = counter for block 1
	adc		x7, x7, xzr

	/*
	 * Tail mask: (1 << remaining) when at most eight blocks remain,
	 * otherwise 0. "le" rather than "mi" keeps bit 8 for a batch of
	 * exactly eight, so that after the shift by x26 below bit 7 can route
	 * the eighth keystream block to final[].
	 */
99:	mov		x9, #1
	lsl		x9, x9, x23		// shift uses x23 modulo 64
	subs		w23, w23, #8		// also clears bits 63:32 of x23
	csel		x23, x23, xzr, pl	// blocks left after this batch
	csel		x9, x9, xzr, le

	tbnz		x9, #1, 0f
	next_ctr	v1
	tbnz		x9, #2, 0f
	next_ctr	v2
	tbnz		x9, #3, 0f
	next_ctr	v3
	tbnz		x9, #4, 0f
	next_ctr	v4
	tbnz		x9, #5, 0f
	next_ctr	v5
	tbnz		x9, #6, 0f
	next_ctr	v6
	tbnz		x9, #7, 0f
	next_ctr	v7

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_encrypt8

	/*
	 * Keystream blocks 0-7 are now in v0, v1, v4, v6, v3, v7, v2, v5.
	 * After the shift, bit n of x9 set means: n blocks of input are
	 * XORed and stored, and keystream block n is a candidate for final[].
	 */
	lsr		x9, x9, x26		// disregard the extra block
	tbnz		x9, #0, 0f

	ld1		{v8.16b}, [x20], #16
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x19], #16
	tbnz		x9, #1, 1f

	ld1		{v9.16b}, [x20], #16
	eor		v1.16b, v1.16b, v9.16b
	st1		{v1.16b}, [x19], #16
	tbnz		x9, #2, 2f

	ld1		{v10.16b}, [x20], #16
	eor		v4.16b, v4.16b, v10.16b
	st1		{v4.16b}, [x19], #16
	tbnz		x9, #3, 3f

	ld1		{v11.16b}, [x20], #16
	eor		v6.16b, v6.16b, v11.16b
	st1		{v6.16b}, [x19], #16
	tbnz		x9, #4, 4f

	ld1		{v12.16b}, [x20], #16
	eor		v3.16b, v3.16b, v12.16b
	st1		{v3.16b}, [x19], #16
	tbnz		x9, #5, 5f

	ld1		{v13.16b}, [x20], #16
	eor		v7.16b, v7.16b, v13.16b
	st1		{v7.16b}, [x19], #16
	tbnz		x9, #6, 6f

	ld1		{v14.16b}, [x20], #16
	eor		v2.16b, v2.16b, v14.16b
	st1		{v2.16b}, [x19], #16
	tbnz		x9, #7, 7f

	ld1		{v15.16b}, [x20], #16
	eor		v5.16b, v5.16b, v15.16b
	st1		{v5.16b}, [x19], #16

8:	next_ctr	v0			// v0 = next unused counter block
	st1		{v0.16b}, [x24]
	cbz		x23, .Lctr_done

	cond_yield_neon	98b
	b		99b

.Lctr_done:
	frame_pop
	ret

	/*
	 * If we are handling the tail of the input (final != NULL, kept in
	 * x25), return the final keystream block back to the caller. Each
	 * stub below stores the first keystream block that was not consumed
	 * by the XOR ladder above, then rejoins the common path at 8.
	 */
0:	cbz		x25, 8b
	st1		{v0.16b}, [x25]
	b		8b
1:	cbz		x25, 8b
	st1		{v1.16b}, [x25]
	b		8b
2:	cbz		x25, 8b
	st1		{v4.16b}, [x25]
	b		8b
3:	cbz		x25, 8b
	st1		{v6.16b}, [x25]
	b		8b
4:	cbz		x25, 8b
	st1		{v3.16b}, [x25]
	b		8b
5:	cbz		x25, 8b
	st1		{v7.16b}, [x25]
	b		8b
6:	cbz		x25, 8b
	st1		{v2.16b}, [x25]
	b		8b
7:	cbz		x25, 8b
	st1		{v5.16b}, [x25]
	b		8b
ENDPROC(aesbs_ctr_encrypt)
1006