xref: /linux/arch/arm64/crypto/aes-neonbs-core.S (revision 2b64b2ed277ff23e785fbdb65098ee7e1252d64f)
1/*
2 * Bit sliced AES using NEON instructions
3 *
4 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/*
12 * The algorithm implemented here is described in detail by the paper
13 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
14 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
15 *
16 * This implementation is based primarily on the OpenSSL implementation
17 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
18 */
19
20#include <linux/linkage.h>
21#include <asm/assembler.h>
22
23	.text
24
	/*
	 * Register aliases shared by the 8-block core routines and the mode
	 * wrappers: every wrapper in this file loads both right before it
	 * branches to aesbs_encrypt8 or aesbs_decrypt8, and the core routines
	 * consume and modify them.
	 */
25	rounds		.req	x11		// AES round count, counted down by the core routines
26	bskey		.req	x12		// cursor into the converted (bit sliced) key schedule
27
28	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	/*
	 * XOR-only transform over eight operands; the sbox macro invokes it
	 * immediately before inv_gf256.
	 *
	 * Operands are used verbatim, so callers pass complete vector
	 * operands (e.g. v0.16b). Every operand except b0 is rewritten in
	 * place; b0 is only read. Each step consumes results of earlier
	 * steps, so the instruction order is significant.
	 *
	 * NOTE(review): going by the name this is the input basis change of
	 * the S-box circuit from the paper cited in the file header - confirm.
	 */
29	eor		\b2, \b2, \b1
30	eor		\b5, \b5, \b6
31	eor		\b3, \b3, \b0
32	eor		\b6, \b6, \b2
33	eor		\b5, \b5, \b0
34	eor		\b6, \b6, \b3
35	eor		\b3, \b3, \b7
36	eor		\b7, \b7, \b5
37	eor		\b3, \b3, \b4
38	eor		\b4, \b4, \b5
39	eor		\b2, \b2, \b7
40	eor		\b3, \b3, \b1
41	eor		\b1, \b1, \b5
42	.endm
43
44	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	/*
	 * XOR-only transform over eight operands; the sbox macro invokes it
	 * right after inv_gf256, passing the registers in a permuted order.
	 *
	 * Operands are used verbatim (e.g. v0.16b) and all eight are
	 * rewritten in place. Later steps read the results of earlier ones,
	 * so the instruction order is significant.
	 *
	 * NOTE(review): going by the name this is the output basis change of
	 * the S-box circuit - confirm against the cited paper.
	 */
45	eor		\b0, \b0, \b6
46	eor		\b1, \b1, \b4
47	eor		\b4, \b4, \b6
48	eor		\b2, \b2, \b0
49	eor		\b6, \b6, \b1
50	eor		\b1, \b1, \b5
51	eor		\b5, \b5, \b3
52	eor		\b3, \b3, \b7
53	eor		\b7, \b7, \b5
54	eor		\b2, \b2, \b5
55	eor		\b4, \b4, \b7
56	.endm
57
58	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	/*
	 * XOR-only transform over eight operands; the inv_sbox macro invokes
	 * it immediately before inv_gf256.
	 *
	 * Mind the formal parameter list: the names are deliberately out of
	 * numeric order, i.e. the first actual argument is referred to as b6,
	 * the second as b1, and so on. Operands are used verbatim (e.g.
	 * v0.16b) and all eight are rewritten in place. Later steps read the
	 * results of earlier ones, so the instruction order is significant.
	 *
	 * NOTE(review): presumably the input basis change of the inverse
	 * S-box circuit - confirm.
	 */
59	eor		\b1, \b1, \b7
60	eor		\b4, \b4, \b7
61	eor		\b7, \b7, \b5
62	eor		\b1, \b1, \b3
63	eor		\b2, \b2, \b5
64	eor		\b3, \b3, \b7
65	eor		\b6, \b6, \b1
66	eor		\b2, \b2, \b0
67	eor		\b5, \b5, \b3
68	eor		\b4, \b4, \b6
69	eor		\b0, \b0, \b6
70	eor		\b1, \b1, \b4
71	.endm
72
73	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	/*
	 * XOR-only transform over eight operands; the inv_sbox macro invokes
	 * it right after inv_gf256, passing the registers in a permuted
	 * order.
	 *
	 * The formal parameter names are out of numeric order (the first
	 * actual argument is referred to as b6, the second as b5, ...).
	 * Operands are used verbatim (e.g. v0.16b). Every operand except b0
	 * is rewritten in place; b0 is only read. The instruction order is
	 * significant because later steps read earlier results.
	 *
	 * NOTE(review): presumably the output basis change of the inverse
	 * S-box circuit - confirm.
	 */
74	eor		\b1, \b1, \b5
75	eor		\b2, \b2, \b7
76	eor		\b3, \b3, \b1
77	eor		\b4, \b4, \b5
78	eor		\b7, \b7, \b5
79	eor		\b3, \b3, \b4
80	eor 		\b5, \b5, \b0
81	eor		\b3, \b3, \b7
82	eor		\b6, \b6, \b2
83	eor		\b2, \b2, \b1
84	eor		\b6, \b6, \b3
85	eor		\b3, \b3, \b0
86	eor		\b5, \b5, \b6
87	.endm
88
89	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	/*
	 * Bitwise two-operand product, computed in place in x0/x1:
	 *
	 *   x0' = ((x0 ^ x1) & y1) ^ (x1 & y0)
	 *   x1' = (x1 & y0) ^ ((y0 ^ y1) & x0)
	 *
	 * (right-hand sides use the values on entry). y0/y1 are only read,
	 * t0/t1 are scratch. Operands are used verbatim (e.g. v8.16b).
	 *
	 * NOTE(review): per the macro name this is a multiplication in
	 * GF(2^2) on bit sliced operands - confirm against the cited paper.
	 */
90	eor 		\t0, \y0, \y1
91	and		\t0, \t0, \x0
92	eor		\x0, \x0, \x1
93	and		\t1, \x1, \y0
94	and		\x0, \x0, \y1
95	eor		\x1, \t1, \t0
96	eor		\x0, \x0, \t1
97	.endm
98
99	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	/*
	 * Two independent two-operand products, interleaved instruction by
	 * instruction. In terms of the values on entry:
	 *
	 *   x0' = ((x0 ^ x1) & y1) ^ ((y0 ^ y1) & x0)
	 *   x1' = (x1 & y0) ^ ((x0 ^ x1) & y1)
	 *
	 *   x2' = ((x2 ^ x3) & y3) ^ (x3 & y2)
	 *   x3' = (x3 & y2) ^ ((y2 ^ y3) & x2)
	 *
	 * The second pair is the same function as mul_gf4; the first pair is
	 * a different one. y0..y3 are only read; t0 and t1 are scratch.
	 * Note the parameter order: t0 sits between the first operand group
	 * and the second, t1 comes last.
	 *
	 * NOTE(review): the name suggests "GF(2^2) multiply followed by the
	 * constant N" for the first pair - confirm against the cited paper.
	 */
100	eor		\t0, \y0, \y1
101	eor 		\t1, \y2, \y3
102	and		\t0, \t0, \x0
103	and		\t1, \t1, \x2
104	eor		\x0, \x0, \x1
105	eor		\x2, \x2, \x3
106	and		\x1, \x1, \y0
107	and		\x3, \x3, \y2
108	and		\x0, \x0, \y1
109	and		\x2, \x2, \y3
110	eor		\x1, \x1, \x0
111	eor		\x2, \x2, \x3
112	eor		\x0, \x0, \t0
113	eor		\x3, \x3, \t1
114	.endm
115
116	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
117				    y0, y1, y2, y3, t0, t1, t2, t3
	/*
	 * Combine each of the two four-operand values x0..x3 and x4..x7 with
	 * the common four-operand value y0..y3, in place, using mul_gf4 and
	 * mul_gf4_n_gf4 as building blocks.
	 *
	 * y0/y1 are temporarily replaced by y0 ^ y2 and y1 ^ y3 for the middle
	 * part of the sequence; the second pair of XORs undoes that, so
	 * y0..y3 hold their entry values again on exit. t0..t3 are scratch.
	 *
	 * NOTE(review): per the name, two GF(2^4) multiplications by the same
	 * factor - confirm against the cited paper.
	 */
118	eor		\t0, \x0, \x2
119	eor		\t1, \x1, \x3
120	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
121	eor		\y0, \y0, \y2				// y0/y1 now hold y0 ^ y2, y1 ^ y3
122	eor		\y1, \y1, \y3
123	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
124	eor		\x0, \x0, \t0
125	eor		\x2, \x2, \t0
126	eor		\x1, \x1, \t1
127	eor		\x3, \x3, \t1
	/* second half: same scheme on x4..x7, still with the combined y0/y1 */
128	eor		\t0, \x4, \x6
129	eor		\t1, \x5, \x7
130	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
131	eor		\y0, \y0, \y2				// restore the original y0/y1
132	eor		\y1, \y1, \y3
133	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
134	eor		\x4, \x4, \t0
135	eor		\x6, \x6, \t0
136	eor		\x5, \x5, \t1
137	eor		\x7, \x7, \t1
138	.endm
139
140	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
141				   t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * Core of the S-box macros: x0..x7 are replaced in place, t0..t3 and
	 * s0..s3 are scratch and hold nothing meaningful on exit. Operands
	 * are used verbatim (callers pass e.g. v8.16b).
	 *
	 * Everything up to the mul_gf16_2 invocation only reads x0..x7 and
	 * builds four intermediate values in s3, s2, s1 and t1. These are
	 * then handed to mul_gf16_2 as the common second factor for both
	 * halves x0..x3 and x4..x7; the remaining scratch operands become its
	 * temporaries.
	 *
	 * The schedule reuses scratch operands aggressively, so instructions
	 * must not be reordered without re-deriving the data flow.
	 *
	 * NOTE(review): per the macro name this is the inversion in GF(2^8)
	 * evaluated on bit sliced data - confirm against the cited paper.
	 */
142	eor		\t3, \x4, \x6
143	eor		\t0, \x5, \x7
144	eor		\t1, \x1, \x3
145	eor		\s1, \x7, \x6
146	eor		\s0, \x0, \x2
147	eor		\s3, \t3, \t0
148	orr		\t2, \t0, \t1
149	and		\s2, \t3, \s0
150	orr		\t3, \t3, \s0
151	eor		\s0, \s0, \t1
152	and		\t0, \t0, \t1
153	eor		\t1, \x3, \x2
154	and		\s3, \s3, \s0
155	and		\s1, \s1, \t1
156	eor		\t1, \x4, \x5
157	eor		\s0, \x1, \x0
158	eor		\t3, \t3, \s1
159	eor		\t2, \t2, \s1
160	and		\s1, \t1, \s0
161	orr		\t1, \t1, \s0
162	eor		\t3, \t3, \s3
163	eor		\t0, \t0, \s1
164	eor		\t2, \t2, \s2
165	eor		\t1, \t1, \s3
166	eor		\t0, \t0, \s2
167	and		\s0, \x7, \x3
168	eor		\t1, \t1, \s2
169	and		\s1, \x6, \x2
170	and		\s2, \x5, \x1
171	orr		\s3, \x4, \x0
172	eor		\t3, \t3, \s0
173	eor		\t1, \t1, \s2
174	eor		\s0, \t0, \s3
175	eor		\t2, \t2, \s1
176	and		\s2, \t3, \t1
177	eor		\s1, \t2, \s2
178	eor		\s3, \s0, \s2
	/*
	 * bsl uses its destination as the selector: each result bit is taken
	 * from the first source where the old destination bit is 1 and from
	 * the second source where it is 0.
	 */
179	bsl		\s1, \t1, \s0
180	not		\t0, \s0
181	bsl		\s0, \s1, \s3
182	bsl		\t0, \s1, \s3
183	bsl		\s3, \t3, \t2
184	eor		\t3, \t3, \t2
185	and		\s2, \s0, \s3
186	eor		\t1, \t1, \t0
187	eor		\s2, \s2, \t3
188	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
189			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
190	.endm
191
192	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
193			      t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * Forward S-box layer on the eight state registers b0..b7, using
	 * t0..t3 and s0..s3 as scratch. Arguments are bare register names
	 * (v0, v8, ...); the .16b arrangement is appended here.
	 *
	 * The three stages hand the registers to one another in permuted
	 * order and no final reshuffle is performed, so on exit the result
	 * lives in the argument registers in a permuted order. The caller
	 * must account for that: aesbs_encrypt8 continues with the order
	 * b0, b1, b4, b6, b3, b7, b2, b5.
	 */
194	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
195			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
196	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
197			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
198			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
199			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
200	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
201			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
202	.endm
203
204	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
205				  t0, t1, t2, t3, s0, s1, s2, s3
	/*
	 * Inverse S-box layer on the eight state registers b0..b7, using
	 * t0..t3 and s0..s3 as scratch. Arguments are bare register names
	 * (v0, v8, ...); the .16b arrangement is appended here.
	 *
	 * It shares inv_gf256 with the forward S-box and differs only in the
	 * surrounding XOR networks and in the register permutation. As with
	 * sbox, the result is left in permuted order: aesbs_decrypt8
	 * continues with the order b0, b1, b6, b4, b2, b7, b3, b5.
	 */
206	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
207			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
208	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
209			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
210			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
211			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
212	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
213			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
214	.endm
215
/*
 * enc_next_rk - fetch the next 128-byte bit sliced round key
 *
 * Loads q16-q23 from the location bskey points at and leaves bskey
 * pointing just past the key (forward walk through the schedule).
 * The writeback load has to come first; the remaining three address
 * the same key relative to the already advanced pointer and are
 * independent of one another.
 */
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128		// planes 0-1, then bskey += 128
	ldp		q22, q23, [bskey, #-32]		// planes 6-7
	ldp		q20, q21, [bskey, #-64]		// planes 4-5
	ldp		q18, q19, [bskey, #-96]		// planes 2-3
	.endm
222
/*
 * dec_next_rk - fetch the previous 128-byte bit sliced round key
 *
 * Steps bskey back by 128 bytes and loads q16-q23 from the new
 * position (backward walk through the schedule). The pre-indexed load
 * has to come first; the remaining three are independent of one
 * another.
 */
	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!	// bskey -= 128, then planes 0-1
	ldp		q22, q23, [bskey, #96]		// planes 6-7
	ldp		q20, q21, [bskey, #64]		// planes 4-5
	ldp		q18, q19, [bskey, #32]		// planes 2-3
	.endm
229
/*
 * add_round_key - XOR the round key held in v16-v23 into the state
 *
 * The n-th argument is combined with v(16 + n), so the caller chooses
 * the pairing through the order in which it lists the state registers.
 * Arguments are bare register names (v0, v1, ...).
 */
	.macro		__bs_xor_rk, state, rk
	eor		\state\().16b, \state\().16b, \rk\().16b
	.endm

	.macro		add_round_key, s0, s1, s2, s3, s4, s5, s6, s7
	__bs_xor_rk	\s0, v16
	__bs_xor_rk	\s1, v17
	__bs_xor_rk	\s2, v18
	__bs_xor_rk	\s3, v19
	__bs_xor_rk	\s4, v20
	__bs_xor_rk	\s5, v21
	__bs_xor_rk	\s6, v22
	__bs_xor_rk	\s7, v23
	.endm
240
/*
 * shift_rows - apply one byte permutation to all eight state registers
 *
 * Each register is run through tbl with itself as the single-register
 * table and 'mask' as the index vector, i.e. its 16 bytes are
 * rearranged in place. Arguments are bare register names.
 */
	.macro		__bs_permute, reg, idx
	tbl		\reg\().16b, {\reg\().16b}, \idx\().16b
	.endm

	.macro		shift_rows, s0, s1, s2, s3, s4, s5, s6, s7, mask
	__bs_permute	\s0, \mask
	__bs_permute	\s1, \mask
	__bs_permute	\s2, \mask
	__bs_permute	\s3, \mask
	__bs_permute	\s4, \mask
	__bs_permute	\s5, \mask
	__bs_permute	\s6, \mask
	__bs_permute	\s7, \mask
	.endm
251
252	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
253				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	/*
	 * Column mixing on the eight state registers x0..x7, using t0..t7 as
	 * scratch (all eight are clobbered). Arguments are bare register
	 * names; the .16b arrangement is appended here.
	 *
	 * The only primitives are XOR and "ext reg, reg, #n" with both
	 * sources equal, which rotates the 16 bytes of a register by n
	 * positions (n = 12 and n = 8 are used).
	 *
	 * The optional last argument selects the tail of the macro: when it
	 * is left blank the results are written back for the forward
	 * direction; when it is non-blank (inv_mix_cols passes 1) a different
	 * register assignment is used for x2, x3, x4 and x6. x0, x1, x5 and x7
	 * are produced identically in both cases.
	 *
	 * Loads of temporaries and their consumers are interleaved, and
	 * several x registers are overwritten while others still need their
	 * old value, so the instruction order must be preserved.
	 */
254	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
255	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
256	eor		\x0\().16b, \x0\().16b, \t0\().16b
257	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
258	eor		\x1\().16b, \x1\().16b, \t1\().16b
259	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
260	eor		\x2\().16b, \x2\().16b, \t2\().16b
261	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
262	eor		\x3\().16b, \x3\().16b, \t3\().16b
263	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
264	eor		\x4\().16b, \x4\().16b, \t4\().16b
265	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
266	eor		\x5\().16b, \x5\().16b, \t5\().16b
267	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
268	eor		\x6\().16b, \x6\().16b, \t6\().16b
	/* from here on: ti = rotated input, xi = input ^ rotated input */
269	eor		\t1\().16b, \t1\().16b, \x0\().16b
270	eor		\x7\().16b, \x7\().16b, \t7\().16b
271	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
272	eor		\t2\().16b, \t2\().16b, \x1\().16b
273	eor		\t0\().16b, \t0\().16b, \x7\().16b
274	eor		\t1\().16b, \t1\().16b, \x7\().16b
275	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
276	eor		\t5\().16b, \t5\().16b, \x4\().16b
277	eor		\x0\().16b, \x0\().16b, \t0\().16b
278	eor		\t6\().16b, \t6\().16b, \x5\().16b
279	eor		\x1\().16b, \x1\().16b, \t1\().16b
280	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
281	eor		\t4\().16b, \t4\().16b, \x3\().16b
282	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
283	eor		\t7\().16b, \t7\().16b, \x6\().16b
284	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
285	eor		\t3\().16b, \t3\().16b, \x2\().16b
286	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
287	eor		\t4\().16b, \t4\().16b, \x7\().16b
288	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
289	eor		\t3\().16b, \t3\().16b, \x7\().16b
290	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
291	eor		\x7\().16b, \t1\().16b, \t5\().16b
292	.ifb		\inv
	/* forward direction: 'inv' argument omitted */
293	eor		\x2\().16b, \t0\().16b, \t4\().16b
294	eor		\x4\().16b, \x4\().16b, \t3\().16b
295	eor		\x5\().16b, \x5\().16b, \t7\().16b
296	eor		\x3\().16b, \x3\().16b, \t6\().16b
297	eor		\x6\().16b, \x6\().16b, \t2\().16b
298	.else
	/* invoked from inv_mix_cols: same terms, different destination registers */
299	eor		\t3\().16b, \t3\().16b, \x4\().16b
300	eor		\x5\().16b, \x5\().16b, \t7\().16b
301	eor		\x2\().16b, \x3\().16b, \t6\().16b
302	eor		\x3\().16b, \t0\().16b, \t4\().16b
303	eor		\x4\().16b, \x6\().16b, \t2\().16b
304	mov		\x6\().16b, \t3\().16b
305	.endif
306	.endm
307
308	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
309				      t0, t1, t2, t3, t4, t5, t6, t7
	/*
	 * Inverse column mixing on x0..x7, with t0..t7 as scratch. Arguments
	 * are bare register names.
	 *
	 * Implemented as a pre-processing step followed by the forward
	 * mix_cols macro (invoked with its 'inv' argument set):
	 *
	 *  1. ti = xi ^ rot8(xi) for every i, where rot8 is the 8-byte
	 *     rotation done by "ext reg, reg, #8";
	 *  2. selected ti are XORed into x0..x7;
	 *  3. mix_cols ..., 1 finishes the job and redefines all of t0..t7.
	 *
	 * NOTE(review): this matches the usual factoring of InvMixColumns
	 * into a cheap multiplier times MixColumns - confirm against the
	 * referenced OpenSSL implementation.
	 */
310	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
311	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
312	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
313	eor		\t0\().16b, \t0\().16b, \x0\().16b
314	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
315	eor		\t6\().16b, \t6\().16b, \x6\().16b
316	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
317	eor		\t7\().16b, \t7\().16b, \x7\().16b
318	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
319	eor		\t1\().16b, \t1\().16b, \x1\().16b
320	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
321	eor		\t2\().16b, \t2\().16b, \x2\().16b
322	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
323	eor		\t3\().16b, \t3\().16b, \x3\().16b
324	eor		\t4\().16b, \t4\().16b, \x4\().16b
325	eor		\t5\().16b, \t5\().16b, \x5\().16b
	/* step 2: fold the ti terms into the state */
326	eor		\x0\().16b, \x0\().16b, \t6\().16b
327	eor		\x1\().16b, \x1\().16b, \t6\().16b
328	eor		\x2\().16b, \x2\().16b, \t0\().16b
329	eor		\x4\().16b, \x4\().16b, \t2\().16b
330	eor		\x3\().16b, \x3\().16b, \t1\().16b
331	eor		\x1\().16b, \x1\().16b, \t7\().16b
332	eor		\x2\().16b, \x2\().16b, \t7\().16b
333	eor		\x4\().16b, \x4\().16b, \t6\().16b
334	eor		\x5\().16b, \x5\().16b, \t3\().16b
335	eor		\x3\().16b, \x3\().16b, \t6\().16b
336	eor		\x6\().16b, \x6\().16b, \t4\().16b
337	eor		\x4\().16b, \x4\().16b, \t7\().16b
338	eor		\x5\().16b, \x5\().16b, \t7\().16b
339	eor		\x7\().16b, \x7\().16b, \t5\().16b
	/* step 3: forward mix with the alternate ("inv") write-back */
340	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
341			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
342	.endm
343
344	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	/*
	 * Exchange bit groups between two register pairs (a0, b0) and
	 * (a1, b1), both handled with the same shift and mask. For each pair:
	 *
	 *   t  = ((b >> n) ^ a) & mask
	 *   a ^= t
	 *   b ^= t << n
	 *
	 * i.e. the bits of a selected by mask trade places with the bits of b
	 * selected by (mask << n). t0/t1 are scratch, mask is only read.
	 * Arguments are bare register names; n is an immediate.
	 *
	 * The shifts work on 64-bit lanes. With the byte-replicated masks
	 * that bitslice passes (0x55 for n = 1, 0x33 for n = 2, 0x0f for
	 * n = 4) every exchanged bit stays inside its own byte.
	 */
345	ushr		\t0\().2d, \b0\().2d, #\n
346	ushr		\t1\().2d, \b1\().2d, #\n
347	eor		\t0\().16b, \t0\().16b, \a0\().16b
348	eor		\t1\().16b, \t1\().16b, \a1\().16b
349	and		\t0\().16b, \t0\().16b, \mask\().16b
350	and		\t1\().16b, \t1\().16b, \mask\().16b
351	eor		\a0\().16b, \a0\().16b, \t0\().16b
352	shl		\t0\().2d, \t0\().2d, #\n
353	eor		\a1\().16b, \a1\().16b, \t1\().16b
354	shl		\t1\().2d, \t1\().2d, #\n
355	eor		\b0\().16b, \b0\().16b, \t0\().16b
356	eor		\b1\().16b, \b1\().16b, \t1\().16b
357	.endm
358
359	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	/*
	 * Transpose bits across eight registers with three rounds of
	 * swapmove_2x (distances 1, 2 and 4, masks 0x55, 0x33 and 0x0f).
	 * The core routines use it both to enter and to leave the bit sliced
	 * representation.
	 *
	 * Note that the formal parameters are listed in descending order:
	 * the first actual argument is referred to as x7, the eighth as x0.
	 *
	 * Scratch usage: t0 holds the 0x55 mask and is later reloaded with
	 * 0x0f, t1 holds 0x33, t2/t3 are the swapmove temporaries. All four
	 * are clobbered. Arguments are bare register names.
	 */
360	movi		\t0\().16b, #0x55
361	movi		\t1\().16b, #0x33
362	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
363	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
364	movi		\t0\().16b, #0x0f			// 0x55 no longer needed
365	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
366	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
367	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
368	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
369	.endm
370
371
372	.align		6
	/*
	 * 16-byte index vectors for tbl. They live in .text and are fetched
	 * with PC-relative "ldr qN, <label>" by the routines below.
	 *
	 *   M0     - applied to every round key in aesbs_convert_key
	 *   M0SR   - applied to the input blocks in aesbs_encrypt8
	 *   SR     - shift_rows mask for the inner rounds of aesbs_encrypt8
	 *   SRM0   - shift_rows mask swapped in for the last encryption round
	 *   M0ISR, ISR, ISRM0 - the same three roles in aesbs_decrypt8
	 *
	 * NOTE(review): the names suggest M0 = byte layout used by the bit
	 * sliced state, (I)SR = (inverse) ShiftRows, and the concatenations
	 * their compositions - confirm against the cited paper.
	 */
373M0:	.octa		0x0004080c0105090d02060a0e03070b0f
374
375M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
376SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
377SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
378
379M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
380ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
381ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
382
/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 *
 * Convert an expanded AES key schedule into the layout consumed by
 * aesbs_encrypt8 and aesbs_decrypt8:
 *
 *   out + 0				round 0 key, copied verbatim
 *   out + 16 + 128 * (i - 1)		round i key for 0 < i < rounds,
 *					expanded to eight 16-byte bit planes
 *   out + 16 + 128 * (rounds - 1)	last round key, every byte XORed
 *					with 0x63
 *
 * For the inner round keys, the key bytes are first permuted with M0.
 * Plane n then holds 0xff in every byte position whose key byte has
 * bit n set and 0x00 elsewhere. Planes 0, 1, 5 and 6 are stored
 * inverted; these are exactly the bit positions that are set in 0x63,
 * so the inner keys carry the same 0x63 adjustment as the last one.
 * NOTE(review): 0x63 is the additive constant of the AES S-box, which
 * the sbox macro presumably leaves out - confirm.
 *
 * In:	x0 = out, x1 = rk, w2 = rounds (at least 2; AES uses 10/12/14)
 * Clobbers: x0-x2, v0-v17, flags
 *
 * 'rounds' is a C int. AAPCS64 only defines the low 32 bits of the
 * argument register, so the loop counter is kept in w2 rather than x2.
 */
ENTRY(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b,  #0x01			// bit masks
	movi		v9.16b,  #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	sub		w2, w2, #1			// inner round keys to convert
	str		q7, [x0], #16			// save round 0 key

.Lkey_loop:
	tbl		v7.16b ,{v17.16b}, v16.16b	// permute the current key
	ld1		{v17.4s}, [x1], #16		// load next round key

	cmtst		v0.16b, v7.16b, v8.16b		// plane n: 0xff where bit n is set
	cmtst		v1.16b, v7.16b, v9.16b
	cmtst		v2.16b, v7.16b, v10.16b
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b			// invert planes 0, 1, 5, 6 (0x63)
	not		v1.16b, v1.16b
	not		v5.16b, v5.16b
	not		v6.16b, v6.16b

	subs		w2, w2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63			// last key: plain bytes ^ 0x63
	eor		v17.16b, v17.16b, v7.16b
	str		q17, [x0]
	ret
ENDPROC(aesbs_convert_key)
432
433	.align		4
	/*
	 * aesbs_encrypt8 - run eight blocks through the AES encryption rounds
	 *
	 * In:	v0-v7	the eight input blocks
	 *	bskey	start of a schedule laid out by aesbs_convert_key
	 *	rounds	number of rounds; used in 64-bit arithmetic, so the
	 *		whole register must hold the value
	 * Out:	the results for blocks 0..7 are left in
	 *	v0, v1, v4, v6, v3, v7, v2, v5 (in that order)
	 * Clobbers: v8-v24, bskey, rounds, flags
	 *
	 * Local routine without ENTRY(): reached with bl from the mode
	 * wrappers and with br from __xts_crypt8.
	 */
434aesbs_encrypt8:
435	ldr		q9, [bskey], #16		// round 0 key
436	ldr		q8, M0SR
437	ldr		q24, SR
438
439	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
440	eor		v11.16b, v1.16b, v9.16b
441	tbl		v0.16b, {v10.16b}, v8.16b
442	eor		v12.16b, v2.16b, v9.16b
443	tbl		v1.16b, {v11.16b}, v8.16b
444	eor		v13.16b, v3.16b, v9.16b
445	tbl		v2.16b, {v12.16b}, v8.16b
446	eor		v14.16b, v4.16b, v9.16b
447	tbl		v3.16b, {v13.16b}, v8.16b
448	eor		v15.16b, v5.16b, v9.16b
449	tbl		v4.16b, {v14.16b}, v8.16b
450	eor		v10.16b, v6.16b, v9.16b
451	tbl		v5.16b, {v15.16b}, v8.16b
452	eor		v11.16b, v7.16b, v9.16b
453	tbl		v6.16b, {v10.16b}, v8.16b
454	tbl		v7.16b, {v11.16b}, v8.16b
455
456	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
457
458	sub		rounds, rounds, #1
459	b		.Lenc_sbox			// first round: M0SR above already permuted the bytes
460
461.Lenc_loop:
462	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
463.Lenc_sbox:
464	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
465								v13, v14, v15
466	subs		rounds, rounds, #1
467	b.cc		.Lenc_done			// counter went below zero: that was the last S-box layer
468
469	enc_next_rk
470
	/* register order below follows the permuted output of sbox */
471	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
472								v13, v14, v15
473
474	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
475
	/* flags still come from the subs above: nothing in between sets them */
476	b.ne		.Lenc_loop
477	ldr		q24, SRM0			// counter hit zero: use SRM0 for the final round
478	b		.Lenc_loop
479
480.Lenc_done:
481	ldr		q12, [bskey]			// last round key
482
	/* leave the bit sliced representation, then add the last key */
483	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
484
485	eor		v0.16b, v0.16b, v12.16b
486	eor		v1.16b, v1.16b, v12.16b
487	eor		v4.16b, v4.16b, v12.16b
488	eor		v6.16b, v6.16b, v12.16b
489	eor		v3.16b, v3.16b, v12.16b
490	eor		v7.16b, v7.16b, v12.16b
491	eor		v2.16b, v2.16b, v12.16b
492	eor		v5.16b, v5.16b, v12.16b
493	ret
494ENDPROC(aesbs_encrypt8)
495
496	.align		4
	/*
	 * aesbs_decrypt8 - run eight blocks through the AES decryption rounds
	 *
	 * In:	v0-v7	the eight input blocks
	 *	bskey	start of a schedule laid out by aesbs_convert_key
	 *	rounds	number of rounds; shifted and added to bskey as a
	 *		64-bit value, so the whole register must hold it
	 * Out:	the results for blocks 0..7 are left in
	 *	v0, v1, v6, v4, v2, v7, v3, v5 (in that order)
	 * Clobbers: x9, v8-v24, bskey, rounds, flags
	 *
	 * The schedule is walked backwards: bskey is first moved to the final
	 * 16-byte key (start + 16 + 128 * (rounds - 1)) and dec_next_rk then
	 * steps down through the 128-byte inner keys.
	 *
	 * Local routine without ENTRY(): reached with bl from the mode
	 * wrappers and with br from __xts_crypt8.
	 */
497aesbs_decrypt8:
498	lsl		x9, rounds, #7
499	add		bskey, bskey, x9
500
501	ldr		q9, [bskey, #-112]!		// first key to apply: the schedule's final round key
502	ldr		q8, M0ISR
503	ldr		q24, ISR
504
505	eor		v10.16b, v0.16b, v9.16b		// xor with that key
506	eor		v11.16b, v1.16b, v9.16b
507	tbl		v0.16b, {v10.16b}, v8.16b
508	eor		v12.16b, v2.16b, v9.16b
509	tbl		v1.16b, {v11.16b}, v8.16b
510	eor		v13.16b, v3.16b, v9.16b
511	tbl		v2.16b, {v12.16b}, v8.16b
512	eor		v14.16b, v4.16b, v9.16b
513	tbl		v3.16b, {v13.16b}, v8.16b
514	eor		v15.16b, v5.16b, v9.16b
515	tbl		v4.16b, {v14.16b}, v8.16b
516	eor		v10.16b, v6.16b, v9.16b
517	tbl		v5.16b, {v15.16b}, v8.16b
518	eor		v11.16b, v7.16b, v9.16b
519	tbl		v6.16b, {v10.16b}, v8.16b
520	tbl		v7.16b, {v11.16b}, v8.16b
521
522	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
523
524	sub		rounds, rounds, #1
525	b		.Ldec_sbox			// first round: M0ISR above already permuted the bytes
526
527.Ldec_loop:
528	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
529.Ldec_sbox:
530	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
531								v13, v14, v15
532	subs		rounds, rounds, #1
533	b.cc		.Ldec_done			// counter went below zero: that was the last S-box layer
534
535	dec_next_rk
536
	/* register order below follows the permuted output of inv_sbox */
537	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
538
539	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
540								v13, v14, v15
541
	/* flags still come from the subs above: nothing in between sets them */
542	b.ne		.Ldec_loop
543	ldr		q24, ISRM0			// counter hit zero: use ISRM0 for the final round
544	b		.Ldec_loop
545.Ldec_done:
546	ldr		q12, [bskey, #-16]		// last key to apply: the schedule's round 0 key
547
	/* leave the bit sliced representation, then add that key */
548	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
549
550	eor		v0.16b, v0.16b, v12.16b
551	eor		v1.16b, v1.16b, v12.16b
552	eor		v6.16b, v6.16b, v12.16b
553	eor		v4.16b, v4.16b, v12.16b
554	eor		v2.16b, v2.16b, v12.16b
555	eor		v7.16b, v7.16b, v12.16b
556	eor		v3.16b, v3.16b, v12.16b
557	eor		v5.16b, v5.16b, v12.16b
558	ret
559ENDPROC(aesbs_decrypt8)
560
/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 *
 * Both entry points expand __ecb_crypt, which processes the input in
 * batches of up to eight blocks:
 *
 *   do8	  core routine (aesbs_encrypt8 or aesbs_decrypt8)
 *   o0 .. o7	  registers in which that routine returns blocks 0..7
 *
 * Register use inside the macro:
 *   x19 = out, x20 = in, x21 = rk, x22 = rounds, x23 = blocks left
 *   x5  = batch mask: 0 for a full batch of eight, else 1 << n when
 *	   only n blocks (1..7) remain. "tbnz x5, #k" after the k-th
 *	   load/store therefore stops after exactly n blocks. Neither core
 *	   routine touches x5, so it survives the call.
 *
 * 'rounds' and 'blocks' are C ints: AAPCS64 only defines the low 32 bits
 * of w3/w4, so they are copied with 32-bit moves (which zero-extend)
 * before the core routines use 'rounds' in 64-bit arithmetic.
 *
 * frame_push/frame_pop and cond_yield_neon come from <asm/assembler.h>.
 * NOTE(review): frame_push 5 is expected to preserve x19-x23 next to
 * the frame record - confirm against that header.
 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		w22, w3				// rounds (zero-extended)
	mov		w23, w4				// blocks (zero-extended)

99:	mov		x5, #1
	lsl		x5, x5, x23			// 1 << blocks left (shift is mod 64)
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl		// blocks left after this batch
	csel		x5, x5, xzr, mi			// keep the mask for a short batch only

	ld1		{v0.16b}, [x20], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x20], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x20], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x20], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x20], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x20], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x20], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x20], #16

0:	mov		bskey, x21
	mov		rounds, x22
	bl		\do8

	st1		{\o0\().16b}, [x19], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x19], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f				// nothing left
	cond_yield_neon
	b		99b

1:	frame_pop
	ret
	.endm

	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)
635
/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 *
 * CBC decryption in batches of up to eight blocks. iv[] is read at the
 * start of every batch and updated with the last ciphertext block of
 * the batch, so it is valid on return and across a NEON yield.
 *
 * Register use:
 *   x19 = out, x20 = in, x21 = rk, x22 = rounds, x23 = blocks left,
 *   x24 = iv
 *   x6      = batch mask: 0 for a full batch, else 1 << n when only n
 *	       blocks (1..7) remain (tested with tbnz after each block)
 *   v25-v31 = copies of ciphertext blocks 0..6 of the batch. They are
 *	       XORed into the following block's output and are above the
 *	       registers aesbs_decrypt8 clobbers (v8-v24).
 *   v24     = IV for block 0, loaded after the call because the core
 *	       routine uses v24 itself; afterwards the next IV.
 *
 * aesbs_decrypt8 returns blocks 0..7 in v0, v1, v6, v4, v2, v7, v3, v5.
 *
 * 'rounds' and 'blocks' are C ints: AAPCS64 only defines the low 32 bits
 * of w3/w4, so they are copied with 32-bit moves (which zero-extend)
 * before aesbs_decrypt8 uses 'rounds' in 64-bit address arithmetic.
 *
 * frame_push/frame_pop and cond_yield_neon come from <asm/assembler.h>.
 * NOTE(review): frame_push 6 is expected to preserve x19-x24 - confirm.
 */
	.align		4
ENTRY(aesbs_cbc_decrypt)
	frame_push	6

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		w22, w3				// rounds (zero-extended)
	mov		w23, w4				// blocks (zero-extended)
	mov		x24, x5				// iv

99:	mov		x6, #1
	lsl		x6, x6, x23			// 1 << blocks left (shift is mod 64)
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl		// blocks left after this batch
	csel		x6, x6, xzr, mi			// keep the mask for a short batch only

	ld1		{v0.16b}, [x20], #16
	mov		v25.16b, v0.16b
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x20], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x20], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x20], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x20], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x20], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x20], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x20]			// no writeback: re-read below as next IV

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x24]		// load IV

	eor		v1.16b, v1.16b, v25.16b		// block n ^= ciphertext n - 1
	eor		v6.16b, v6.16b, v26.16b
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b		// block 0 ^= IV
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	/* after each store, v24 tracks the ciphertext of the block just written */
	st1		{v0.16b}, [x19], #16
	mov		v24.16b, v25.16b
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x19], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x19], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x19], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x19], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x19], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x19], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x20], #16		// 8th ciphertext block, read before out block 7 is stored
	st1		{v5.16b}, [x19], #16
1:	st1		{v24.16b}, [x24]		// store IV

	cbz		x23, 2f				// nothing left
	cond_yield_neon
	b		99b

2:	frame_pop
	ret
ENDPROC(aesbs_cbc_decrypt)
727
/*
 * next_tweak - derive the following XTS tweak
 *
 * Doubles the 128-bit value in 'cur' and stores the result in 'next'
 * ('next' may name the same register as 'cur'):
 *
 *   - each 64-bit lane is shifted left by one (cur + cur);
 *   - the bit shifted out of a lane is turned into a lane-wide mask,
 *     ANDed with 'poly' and, after the two lanes have been swapped,
 *     XORed into the other lane.
 *
 * With 'poly' loaded from .Lxts_mul_x this carries the low lane's top
 * bit into bit 0 of the high lane and folds the high lane's top bit
 * back into the low lane as 0x87. 'scratch' is clobbered; arguments
 * are bare register names.
 */
	.macro		next_tweak, next, cur, poly, scratch
	sshr		\scratch\().2d,  \cur\().2d,      #63	// lane mask of the outgoing bits
	and		\scratch\().16b, \scratch\().16b, \poly\().16b
	ext		\scratch\().16b, \scratch\().16b, \scratch\().16b, #8	// swap lanes
	add		\next\().2d,     \cur\().2d,      \cur\().2d	// shift left by one
	eor		\next\().16b,    \next\().16b,    \scratch\().16b
	.endm
735
736	.align		4
	/*
	 * Constant for next_tweak (loaded into q30 by __xts_crypt): one
	 * 64-bit lane holds 1, the other 0x87. The two definitions list the
	 * quads in opposite order; CPU_LE()/CPU_BE() are defined outside
	 * this file.
	 * NOTE(review): presumably exactly one of the two lines is emitted,
	 * depending on the kernel's endianness - confirm.
	 */
737.Lxts_mul_x:
738CPU_LE(	.quad		1, 0x87		)
739CPU_BE(	.quad		0x87, 1		)
740
741	/*
742	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
743	 *		     int blocks, u8 iv[])
744	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
745	 *		     int blocks, u8 iv[])
746	 */
	/*
	 * __xts_crypt8 - shared front half of one XTS batch (up to 8 blocks)
	 *
	 * Loads the input blocks, XORs each with its tweak, then jumps to the
	 * core routine whose address is in x7. Because that is a plain br,
	 * the core routine returns straight to whoever called __xts_crypt8.
	 *
	 * In:	x20 = in, x21 = rk, x22 = rounds, x23 = blocks left,
	 *	x7  = address of aesbs_encrypt8 or aesbs_decrypt8,
	 *	v25 = tweak for block 0, v30 = .Lxts_mul_x constant,
	 *	sp  = the caller's frame, with 64 bytes of local space at
	 *	      .Lframe_local_offset
	 * Out (in addition to what the core routine returns):
	 *	x6      = batch mask: 0 for a full batch, else 1 << n when
	 *		  only n blocks (1..7) remain
	 *	x20/x23 = advanced input pointer / blocks left afterwards
	 *	v25-v28 = tweaks of blocks 0..3
	 *	stack   = tweaks of blocks 4..7 (only those actually used)
	 *	v29     = tweak following the last one computed in this batch
	 * Clobbers: v31 (next_tweak scratch), bskey, rounds, flags
	 */
747__xts_crypt8:
748	mov		x6, #1
749	lsl		x6, x6, x23
750	subs		w23, w23, #8
751	csel		x23, x23, xzr, pl
752	csel		x6, x6, xzr, mi
753
754	ld1		{v0.16b}, [x20], #16
755	next_tweak	v26, v25, v30, v31
756	eor		v0.16b, v0.16b, v25.16b
757	tbnz		x6, #1, 0f
758
759	ld1		{v1.16b}, [x20], #16
760	next_tweak	v27, v26, v30, v31
761	eor		v1.16b, v1.16b, v26.16b
762	tbnz		x6, #2, 0f
763
764	ld1		{v2.16b}, [x20], #16
765	next_tweak	v28, v27, v30, v31
766	eor		v2.16b, v2.16b, v27.16b
767	tbnz		x6, #3, 0f
768
769	ld1		{v3.16b}, [x20], #16
770	next_tweak	v29, v28, v30, v31
771	eor		v3.16b, v3.16b, v28.16b
772	tbnz		x6, #4, 0f
773
	/* from block 4 on the tweak is parked on the stack and v29 is recycled */
774	ld1		{v4.16b}, [x20], #16
775	str		q29, [sp, #.Lframe_local_offset]
776	eor		v4.16b, v4.16b, v29.16b
777	next_tweak	v29, v29, v30, v31
778	tbnz		x6, #5, 0f
779
780	ld1		{v5.16b}, [x20], #16
781	str		q29, [sp, #.Lframe_local_offset + 16]
782	eor		v5.16b, v5.16b, v29.16b
783	next_tweak	v29, v29, v30, v31
784	tbnz		x6, #6, 0f
785
786	ld1		{v6.16b}, [x20], #16
787	str		q29, [sp, #.Lframe_local_offset + 32]
788	eor		v6.16b, v6.16b, v29.16b
789	next_tweak	v29, v29, v30, v31
790	tbnz		x6, #7, 0f
791
792	ld1		{v7.16b}, [x20], #16
793	str		q29, [sp, #.Lframe_local_offset + 48]
794	eor		v7.16b, v7.16b, v29.16b
795	next_tweak	v29, v29, v30, v31
796
7970:	mov		bskey, x21
798	mov		rounds, x22
799	br		x7				// tail call: the core routine returns to our caller
800ENDPROC(__xts_crypt8)
801
/*
 * __xts_crypt - body of aesbs_xts_encrypt() and aesbs_xts_decrypt()
 *
 * C arguments of both entry points:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv
 *
 * Macro arguments:
 *   do8	  core routine (aesbs_encrypt8 or aesbs_decrypt8)
 *   o0 .. o7	  registers in which that routine returns blocks 0..7
 *
 * Each batch is handed to __xts_crypt8, which applies the input tweaks,
 * runs the core routine and returns with:
 *   x6 = batch mask (0 for eight blocks, else 1 << n for n = 1..7),
 *   v25-v28 = tweaks 0..3, stack locals = tweaks 4..7, v29 = next tweak.
 * The tweaks are XORed into the result again before it is stored. v25
 * is kept up to date with the tweak following the last stored block and
 * is written back to iv[] before a possible yield and on exit.
 *
 * Tweaks 4..7 are reloaded into v16-v19 only after the core routine has
 * returned, since it uses v16-v23 for round keys itself.
 *
 * 'rounds' and 'blocks' are C ints: AAPCS64 only defines the low 32 bits
 * of w3/w4, so they are copied with 32-bit moves (which zero-extend)
 * before the core routines use 'rounds' in 64-bit arithmetic.
 *
 * frame_push/frame_pop, cond_yield_neon and .Lframe_local_offset come
 * from <asm/assembler.h>.
 * NOTE(review): frame_push 6, 64 is expected to preserve x19-x24 and to
 * reserve 64 bytes of locals at .Lframe_local_offset - confirm.
 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	6, 64

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		w22, w3				// rounds (zero-extended)
	mov		w23, w4				// blocks (zero-extended)
	mov		x24, x5				// iv

	/* (re)load point: also the resume target after a NEON yield */
0:	ldr		q30, .Lxts_mul_x
	ld1		{v25.16b}, [x24]

99:	adr		x7, \do8
	bl		__xts_crypt8

	ldp		q16, q17, [sp, #.Lframe_local_offset]		// tweaks 4, 5
	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]	// tweaks 6, 7

	eor		\o0\().16b, \o0\().16b, v25.16b
	eor		\o1\().16b, \o1\().16b, v26.16b
	eor		\o2\().16b, \o2\().16b, v27.16b
	eor		\o3\().16b, \o3\().16b, v28.16b

	st1		{\o0\().16b}, [x19], #16
	mov		v25.16b, v26.16b
	tbnz		x6, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	mov		v25.16b, v27.16b
	tbnz		x6, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	mov		v25.16b, v28.16b
	tbnz		x6, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	mov		v25.16b, v29.16b		// for 4+ blocks v29 already is the next tweak
	tbnz		x6, #4, 1f

	eor		\o4\().16b, \o4\().16b, v16.16b
	eor		\o5\().16b, \o5\().16b, v17.16b
	eor		\o6\().16b, \o6\().16b, v18.16b
	eor		\o7\().16b, \o7\().16b, v19.16b

	st1		{\o4\().16b}, [x19], #16
	tbnz		x6, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x6, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x6, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f				// nothing left
	st1		{v25.16b}, [x24]		// make the tweak survive a yield

	cond_yield_neon	0b
	b		99b

1:	st1		{v25.16b}, [x24]
	frame_pop
	ret
	.endm

ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)
870
/*
 * next_ctr - emit the current counter block and advance the counter
 *
 * x7:x8 hold the 128-bit counter as two native integers (x7 = upper,
 * x8 = lower half). The current value is placed in 'blk' (x7 into lane
 * 0, x8 into lane 1) and each 64-bit lane is then byte-reversed; after
 * that the pair is incremented by one with carry from x8 into x7.
 * Clobbers the flags. 'blk' is a bare register name.
 */
	.macro		next_ctr, blk
	mov		\blk\().d[0], x7		// upper half (before the increment)
	mov		\blk\().d[1], x8		// lower half
	adds		x8, x8, #1
	adc		x7, x7, xzr			// propagate the carry
	rev64		\blk\().16b, \blk\().16b
	.endm
878
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[], u8 final[])
 *
 * CTR mode in batches of up to eight keystream blocks. When final is
 * not NULL one additional keystream block is generated and stored in
 * final[] instead of being XORed with input, so that the caller can
 * handle a partial trailing block itself.
 *
 * Register use:
 *   x19 = out, x20 = in, x21 = rk, x22 = rounds, x23 = blocks left
 *   (including the extra one), x24 = iv, x25 = final,
 *   x26 = 1 if final != NULL, else 0
 *   x7:x8 = counter following the one held in v0 (see next_ctr)
 *   x9    = batch mask: 0 while more than eight blocks remain, else
 *	     1 << n for the n (1..8) blocks still to generate. After the
 *	     core routine it is shifted right by x26, so that bit k set
 *	     means "k blocks go to out[], block k (if any) goes to
 *	     final[]". aesbs_encrypt8 does not touch x9.
 *
 * aesbs_encrypt8 returns blocks 0..7 in v0, v1, v4, v6, v3, v7, v2, v5.
 *
 * On exit from every batch iv[] holds the next unused counter block;
 * it is reloaded from there after a NEON yield.
 *
 * 'rounds' and 'blocks' are C ints: AAPCS64 only defines the low 32 bits
 * of w3/w4, so they are copied with 32-bit moves (which zero-extend)
 * before aesbs_encrypt8 uses 'rounds' in 64-bit arithmetic.
 *
 * frame_push/frame_pop and cond_yield_neon come from <asm/assembler.h>.
 * NOTE(review): frame_push 8 is expected to preserve x19-x26 - confirm.
 */
ENTRY(aesbs_ctr_encrypt)
	frame_push	8

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		w22, w3				// rounds (zero-extended)
	mov		w23, w4				// blocks (zero-extended)
	mov		x24, x5				// iv
	mov		x25, x6				// final

	cmp		x25, #0
	cset		x26, ne
	add		x23, x23, x26		// do one extra block if final

	/* (re)load point: also the resume target after a NEON yield */
98:	ldp		x7, x8, [x24]
	ld1		{v0.16b}, [x24]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1			// x7:x8 = counter after the one in v0
	adc		x7, x7, xzr

99:	mov		x9, #1
	lsl		x9, x9, x23			// 1 << blocks left (shift is mod 64)
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl		// blocks left after this batch
	csel		x9, x9, xzr, le			// keep the mask for the final batch only

	tbnz		x9, #1, 0f
	next_ctr	v1
	tbnz		x9, #2, 0f
	next_ctr	v2
	tbnz		x9, #3, 0f
	next_ctr	v3
	tbnz		x9, #4, 0f
	next_ctr	v4
	tbnz		x9, #5, 0f
	next_ctr	v5
	tbnz		x9, #6, 0f
	next_ctr	v6
	tbnz		x9, #7, 0f
	next_ctr	v7

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_encrypt8

	lsr		x9, x9, x26		// disregard the extra block
	tbnz		x9, #0, 0f

	ld1		{v8.16b}, [x20], #16
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x19], #16
	tbnz		x9, #1, 1f

	ld1		{v9.16b}, [x20], #16
	eor		v1.16b, v1.16b, v9.16b
	st1		{v1.16b}, [x19], #16
	tbnz		x9, #2, 2f

	ld1		{v10.16b}, [x20], #16
	eor		v4.16b, v4.16b, v10.16b
	st1		{v4.16b}, [x19], #16
	tbnz		x9, #3, 3f

	ld1		{v11.16b}, [x20], #16
	eor		v6.16b, v6.16b, v11.16b
	st1		{v6.16b}, [x19], #16
	tbnz		x9, #4, 4f

	ld1		{v12.16b}, [x20], #16
	eor		v3.16b, v3.16b, v12.16b
	st1		{v3.16b}, [x19], #16
	tbnz		x9, #5, 5f

	ld1		{v13.16b}, [x20], #16
	eor		v7.16b, v7.16b, v13.16b
	st1		{v7.16b}, [x19], #16
	tbnz		x9, #6, 6f

	ld1		{v14.16b}, [x20], #16
	eor		v2.16b, v2.16b, v14.16b
	st1		{v2.16b}, [x19], #16
	tbnz		x9, #7, 7f

	ld1		{v15.16b}, [x20], #16
	eor		v5.16b, v5.16b, v15.16b
	st1		{v5.16b}, [x19], #16

8:	next_ctr	v0				// next unused counter block
	st1		{v0.16b}, [x24]
	cbz		x23, .Lctr_done

	cond_yield_neon	98b
	b		99b

.Lctr_done:
	frame_pop
	ret

	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller. Label k is reached after
	 * k blocks have been written to out[]; the keystream block that
	 * follows them is the one handed back.
	 */
0:	cbz		x25, 8b
	st1		{v0.16b}, [x25]
	b		8b
1:	cbz		x25, 8b
	st1		{v1.16b}, [x25]
	b		8b
2:	cbz		x25, 8b
	st1		{v4.16b}, [x25]
	b		8b
3:	cbz		x25, 8b
	st1		{v6.16b}, [x25]
	b		8b
4:	cbz		x25, 8b
	st1		{v3.16b}, [x25]
	b		8b
5:	cbz		x25, 8b
	st1		{v7.16b}, [x25]
	b		8b
6:	cbz		x25, 8b
	st1		{v2.16b}, [x25]
	b		8b
7:	cbz		x25, 8b
	st1		{v5.16b}, [x25]
	b		8b
ENDPROC(aesbs_ctr_encrypt)
1012