xref: /linux/arch/arm64/crypto/aes-neon.S (revision b7019ac550eb3916f34d79db583e9b7ea2524afa)
1/*
2 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
3 *
4 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14#define AES_ENTRY(func)		ENTRY(neon_ ## func)	/* ENTRY() on func with a neon_ prefix pasted on */
15#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)	/* ENDPROC() on the same neon_-prefixed name */
16
17	xtsmask		.req	v7	/* register alias: xtsmask names v7 */
18
19	.macro		xts_reload_mask, tmp
	/*
	 * Expands to xts_load_mask with the same scratch argument.
	 * xts_load_mask is not defined in this file; presumably it is provided
	 * by aes-modes.S, which is included further down -- confirm there.
	 */
20	xts_load_mask	\tmp
21	.endm
22
23	/*
	 * multiply by polynomial 'x' in GF(2^8)
	 *
	 * Computes out = (in << 1) ^ ((in >> 7) & const) on every lane, where
	 * the right shift is arithmetic. With byte lanes that shift yields 0xff
	 * for lanes whose top bit is set and 0x00 otherwise, so const is folded
	 * in only where the left shift dropped a bit.
	 *
	 * All four operands are used exactly as written by the caller, including
	 * any arrangement suffix; mix_columns passes .16b registers and v12,
	 * which prepare fills with 0x1b bytes, as const.
	 *
	 * Clobbers temp. temp is written before in is read for the left shift,
	 * so temp must not be the same register as in.
	 */
24	.macro		mul_by_x, out, in, temp, const
25	sshr		\temp, \in, #7
26	shl		\out, \in, #1
27	and		\temp, \temp, \const
28	eor		\out, \out, \temp
29	.endm
30
31	/*
	 * multiply by polynomial 'x^2' in GF(2^8)
	 *
	 * Computes out = (in << 2) ^ pmul(in >> 6, const) on every lane, where
	 * the right shift is logical and pmul is the polynomial (carry-less)
	 * multiply. The two bits shifted out on the left are thereby multiplied
	 * by const and folded back in.
	 *
	 * Operands are used exactly as written by the caller, including any
	 * arrangement suffix; mix_columns passes .16b registers and v12 (0x1b
	 * bytes, set up by prepare) as const.
	 *
	 * Clobbers temp. temp is written before in is read for the left shift,
	 * so temp must not be the same register as in.
	 */
32	.macro		mul_by_x2, out, in, temp, const
33	ushr		\temp, \in, #6
34	shl		\out, \in, #2
35	pmul		\temp, \temp, \const
36	eor		\out, \out, \temp
37	.endm
38
39	/*
	 * preload the entire Sbox
	 *
	 * sbox      - symbol of a 256-byte lookup table
	 * shiftrows - symbol of a 16-byte tbl index vector
	 * temp      - general purpose scratch register, clobbered
	 *
	 * Register state left behind for the other macros in this file:
	 *   v12      0x1b in every byte (const operand of the mul_by_x family)
	 *   v13      the vector at shiftrows (ShiftRows permutation in do_block*)
	 *   v14      the vector at .Lror32by8 (permutation used by mix_columns*)
	 *   v16-v31  the 256 table bytes, 64 per group of four registers
	 *
	 * ldr_l and adr_l are not defined in this file (presumably they come
	 * from <asm/assembler.h>); they are used here to load from, and take
	 * the address of, a symbol with temp as scratch.
	 */
40	.macro		prepare, sbox, shiftrows, temp
41	movi		v12.16b, #0x1b
42	ldr_l		q13, \shiftrows, \temp
43	ldr_l		q14, .Lror32by8, \temp
44	adr_l		\temp, \sbox
	/* three post-incremented loads of 64 bytes, then the last 64 bytes */
45	ld1		{v16.16b-v19.16b}, [\temp], #64
46	ld1		{v20.16b-v23.16b}, [\temp], #64
47	ld1		{v24.16b-v27.16b}, [\temp], #64
48	ld1		{v28.16b-v31.16b}, [\temp]
49	.endm
50
51	/*
	 * do preload for encryption
	 *
	 * Runs prepare with the forward Sbox and forward ShiftRows vector.
	 * The first two arguments are accepted but not used; temp is the
	 * scratch register handed on to prepare.
	 */
52	.macro		enc_prepare, ignore0, ignore1, temp
53	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
54	.endm
55
56	.macro		enc_switch_key, ignore0, ignore1, temp
	/*
	 * Expands to no instructions; none of the three arguments is used.
	 * NOTE(review): presumably kept so that code shared with other AES
	 * back ends can invoke it unconditionally -- confirm in aes-modes.S.
	 */
57	/* do nothing */
58	.endm
59
60	/*
	 * do preload for decryption
	 *
	 * Runs prepare with the reverse Sbox and reverse ShiftRows vector.
	 * The first two arguments are accepted but not used; temp is the
	 * scratch register handed on to prepare.
	 */
61	.macro		dec_prepare, ignore0, ignore1, temp
62	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
63	.endm
64
65	/*
	 * apply SubBytes transformation using the preloaded Sbox
	 *
	 * in - bare vector register name (the .16b suffix is appended here);
	 *      every byte is replaced by its table entry.
	 *
	 * The 256-byte table sits in v16-v31 as four 64-byte quarters (loaded
	 * by prepare). v15 must hold 0x40 in every byte; do_block sets that up
	 * right before invoking this macro. tbl looks up the first quarter and
	 * writes zero for any index outside 0..63. Each following tbx uses the
	 * index reduced by a further 0x40 and leaves a lane untouched when the
	 * reduced index is out of range, so exactly one of the four lookups
	 * supplies each byte.
	 *
	 * Clobbers v9, v10 and v11.
	 */
66	.macro		sub_bytes, in
67	sub		v9.16b, \in\().16b, v15.16b
68	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
69	sub		v10.16b, v9.16b, v15.16b
70	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
71	sub		v11.16b, v10.16b, v15.16b
72	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
73	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
74	.endm
75
76	/*
	 * apply MixColumns transformation
	 *
	 * in  - bare vector register name (suffixes are appended here);
	 *       transformed in place.
	 * enc - assemble-time constant. When it is 0 the extra pre-multiply
	 *       step below is assembled in front of the common code, which the
	 *       original comment identifies as turning the result into Inverse
	 *       MixColumns; any other value assembles the common code only.
	 *
	 * Expects v12 = 0x1b in every byte and v14 = the .Lror32by8 index
	 * vector, both loaded by prepare. Clobbers v8 and v9.
	 */
77	.macro		mix_columns, in, enc
78	.if		\enc == 0
79	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
80	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
81	eor		\in\().16b, \in\().16b, v8.16b
	/* rev32 on halfwords swaps the two 16-bit halves of every 32-bit word */
82	rev32		v8.8h, v8.8h
83	eor		\in\().16b, \in\().16b, v8.16b
84	.endif
85
	/* v9 = in times x, with v8 as scratch; v8 is then rebuilt from in */
86	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
87	rev32		v8.8h, \in\().8h
88	eor		v8.16b, v8.16b, v9.16b
89	eor		\in\().16b, \in\().16b, v8.16b
	/* byte permutation through v14: each 32-bit word is rotated by 8 bits */
90	tbl		\in\().16b, {\in\().16b}, v14.16b
91	eor		\in\().16b, \in\().16b, v8.16b
92	.endm
93
94	.macro		do_block, enc, in, rounds, rk, rkp, i
	/*
	 * Run one AES state through all rounds.
	 *
	 * enc    - passed on to mix_columns (encrypt_block passes 1,
	 *          decrypt_block passes 0)
	 * in     - bare vector register name holding the state; updated in place
	 * rounds - number of loop iterations; must be non-zero because the
	 *          counter is decremented before it is tested
	 * rk     - address of the first 16-byte round key; not modified
	 * rkp    - scratch pointer, walks the remaining round keys
	 * i      - scratch loop counter
	 *
	 * rounds + 1 round keys are read in total: one up front and one per
	 * iteration. The last iteration skips mix_columns and goes straight to
	 * the final key addition.
	 *
	 * Expects the register state set up by prepare. Clobbers v15, the
	 * condition flags, rkp, i and whatever sub_bytes and mix_columns
	 * clobber (v8-v11).
	 */
95	ld1		{v15.4s}, [\rk]
96	add		\rkp, \rk, #16
97	mov		\i, \rounds
981111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	/* v15 doubles as the 0x40 step constant that sub_bytes needs */
99	movi		v15.16b, #0x40
100	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
101	sub_bytes	\in
102	subs		\i, \i, #1
	/* next round key; the load leaves the flags from subs intact for beq */
103	ld1		{v15.4s}, [\rkp], #16
104	beq		2222f
105	mix_columns	\in, \enc
106	b		1111b
1072222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
108	.endm
109
110	.macro		encrypt_block, in, rounds, rk, rkp, i
	/* do_block with enc = 1; all other arguments are passed through */
111	do_block	1, \in, \rounds, \rk, \rkp, \i
112	.endm
113
114	.macro		decrypt_block, in, rounds, rk, rkp, i
	/* do_block with enc = 0; all other arguments are passed through */
115	do_block	0, \in, \rounds, \rk, \rkp, \i
116	.endm
117
118	/*
119	 * Interleaved versions: functionally equivalent to the
120	 * ones above, but applied to 2 or 4 AES states in parallel.
121	 */
122
123	.macro		sub_bytes_2x, in0, in1
	/*
	 * Table substitution for two states at once, with the instructions of
	 * the two states interleaved.
	 *
	 * in0, in1 - bare vector register names, updated in place.
	 *
	 * Same scheme as sub_bytes: v16-v31 hold the 256-byte table in four
	 * 64-byte quarters, v15 must hold 0x40 in every byte, tbl handles the
	 * first quarter and each tbx handles the next one using the index
	 * reduced by a further 0x40.
	 *
	 * Clobbers v8, v9, v10 and v11.
	 */
124	sub		v8.16b, \in0\().16b, v15.16b
125	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
126	sub		v9.16b, \in1\().16b, v15.16b
127	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
128	sub		v10.16b, v8.16b, v15.16b
129	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
130	sub		v11.16b, v9.16b, v15.16b
131	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	/* v8 and v9 are free again and are reused for the last quarter */
132	sub		v8.16b, v10.16b, v15.16b
133	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
134	sub		v9.16b, v11.16b, v15.16b
135	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
136	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
137	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
138	.endm
139
140	.macro		sub_bytes_4x, in0, in1, in2, in3
	/*
	 * Table substitution for four states at once, with the instructions of
	 * the four states interleaved.
	 *
	 * in0..in3 - bare vector register names, updated in place.
	 *
	 * Same scheme as sub_bytes: v16-v31 hold the 256-byte table in four
	 * 64-byte quarters and v15 must hold 0x40 in every byte. v8..v11 carry
	 * the running index of in0..in3 respectively; each is reduced by 0x40
	 * in place between one quarter's lookup and the next.
	 *
	 * Clobbers v8, v9, v10 and v11.
	 */
141	sub		v8.16b, \in0\().16b, v15.16b
142	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
143	sub		v9.16b, \in1\().16b, v15.16b
144	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
145	sub		v10.16b, \in2\().16b, v15.16b
146	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
147	sub		v11.16b, \in3\().16b, v15.16b
148	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	/* second quarter, indices already reduced once */
149	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
150	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
151	sub		v8.16b, v8.16b, v15.16b
152	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
153	sub		v9.16b, v9.16b, v15.16b
154	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
155	sub		v10.16b, v10.16b, v15.16b
	/* third quarter */
156	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
157	sub		v11.16b, v11.16b, v15.16b
158	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
159	sub		v8.16b, v8.16b, v15.16b
160	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
161	sub		v9.16b, v9.16b, v15.16b
162	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
163	sub		v10.16b, v10.16b, v15.16b
	/* fourth quarter */
164	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
165	sub		v11.16b, v11.16b, v15.16b
166	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
167	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
168	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
169	.endm
170
171	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	/*
	 * Two interleaved copies of the mul_by_x computation on byte lanes:
	 *   outN = (inN << 1) ^ ((inN >> 7, arithmetic) & const)   for N = 0, 1
	 *
	 * Unlike mul_by_x, every argument is a bare vector register name; the
	 * .16b suffix is appended here. Clobbers tmp0 and tmp1. Each tmpN is
	 * written before inN is read for the left shift, so tmpN must not be
	 * the same register as inN.
	 */
172	sshr		\tmp0\().16b, \in0\().16b, #7
173	shl		\out0\().16b, \in0\().16b, #1
174	sshr		\tmp1\().16b, \in1\().16b, #7
175	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
176	shl		\out1\().16b, \in1\().16b, #1
177	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
178	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
179	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
180	.endm
181
182	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	/*
	 * Two interleaved copies of the mul_by_x2 computation on byte lanes:
	 *   outN = (inN << 2) ^ pmul(inN >> 6, const)   for N = 0, 1
	 * with a logical right shift and a polynomial multiply.
	 *
	 * Every argument is a bare vector register name; the .16b suffix is
	 * appended here. Clobbers tmp0 and tmp1. Each tmpN is written before
	 * inN is read for the left shift, so tmpN must not be the same
	 * register as inN.
	 */
183	ushr		\tmp0\().16b, \in0\().16b, #6
184	shl		\out0\().16b, \in0\().16b, #2
185	ushr		\tmp1\().16b, \in1\().16b, #6
186	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
187	shl		\out1\().16b, \in1\().16b, #2
188	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
189	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
190	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
191	.endm
192
193	.macro		mix_columns_2x, in0, in1, enc
	/*
	 * Two-state version of mix_columns with the instructions of the two
	 * states interleaved.
	 *
	 * in0, in1 - bare vector register names, transformed in place
	 * enc      - assemble-time constant; when it is 0 the pre-multiply
	 *            step is assembled in front of the common code
	 *
	 * Expects v12 = 0x1b in every byte and v14 = the .Lror32by8 index
	 * vector, both loaded by prepare. Clobbers v8, v9, v10 and v11.
	 */
194	.if		\enc == 0
195	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
196	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
197	eor		\in0\().16b, \in0\().16b, v8.16b
	/* rev32 on halfwords swaps the two 16-bit halves of every 32-bit word */
198	rev32		v8.8h, v8.8h
199	eor		\in1\().16b, \in1\().16b, v9.16b
200	rev32		v9.8h, v9.8h
201	eor		\in0\().16b, \in0\().16b, v8.16b
202	eor		\in1\().16b, \in1\().16b, v9.16b
203	.endif
204
	/* v8/v9 = in0/in1 times x; v10/v11 are scratch and then rebuilt */
205	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
206	rev32		v10.8h, \in0\().8h
207	rev32		v11.8h, \in1\().8h
208	eor		v10.16b, v10.16b, v8.16b
209	eor		v11.16b, v11.16b, v9.16b
210	eor		\in0\().16b, \in0\().16b, v10.16b
211	eor		\in1\().16b, \in1\().16b, v11.16b
	/* byte permutation through v14: each 32-bit word is rotated by 8 bits */
212	tbl		\in0\().16b, {\in0\().16b}, v14.16b
213	tbl		\in1\().16b, {\in1\().16b}, v14.16b
214	eor		\in0\().16b, \in0\().16b, v10.16b
215	eor		\in1\().16b, \in1\().16b, v11.16b
216	.endm
217
218	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
	/*
	 * Run two AES states through all rounds with the same round keys.
	 *
	 * enc      - passed on to mix_columns_2x
	 * in0, in1 - bare vector register names holding the states; updated
	 *            in place
	 * rounds   - number of loop iterations; must be non-zero because the
	 *            counter is decremented before it is tested
	 * rk       - address of the first 16-byte round key; not modified
	 * rkp      - scratch pointer, walks the remaining round keys
	 * i        - scratch loop counter
	 *
	 * rounds + 1 round keys are read in total. The last iteration skips
	 * mix_columns_2x and goes straight to the final key addition.
	 *
	 * Expects the register state set up by prepare. Clobbers v8-v11, v15,
	 * the condition flags, rkp and i.
	 */
219	ld1		{v15.4s}, [\rk]
220	add		\rkp, \rk, #16
221	mov		\i, \rounds
2221111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
223	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	/* v15 doubles as the 0x40 step constant that sub_bytes_2x needs */
224	movi		v15.16b, #0x40
225	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
226	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
227	sub_bytes_2x	\in0, \in1
228	subs		\i, \i, #1
	/* next round key; the load leaves the flags from subs intact for beq */
229	ld1		{v15.4s}, [\rkp], #16
230	beq		2222f
231	mix_columns_2x	\in0, \in1, \enc
232	b		1111b
2332222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
234	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
235	.endm
236
237	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	/*
	 * Run four AES states through all rounds with the same round keys.
	 *
	 * enc      - passed on to mix_columns_2x
	 * in0..in3 - bare vector register names holding the states; updated
	 *            in place
	 * rounds   - number of loop iterations; must be non-zero because the
	 *            counter is decremented before it is tested
	 * rk       - address of the first 16-byte round key; not modified
	 * rkp      - scratch pointer, walks the remaining round keys
	 * i        - scratch loop counter
	 *
	 * rounds + 1 round keys are read in total. The last iteration skips
	 * the column mixing and goes straight to the final key addition.
	 *
	 * Expects the register state set up by prepare. Clobbers v8-v11, v15,
	 * the condition flags, rkp and i.
	 */
238	ld1		{v15.4s}, [\rk]
239	add		\rkp, \rk, #16
240	mov		\i, \rounds
2411111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
242	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
243	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
244	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	/* v15 doubles as the 0x40 step constant that sub_bytes_4x needs */
245	movi		v15.16b, #0x40
246	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
247	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
248	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
249	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
250	sub_bytes_4x	\in0, \in1, \in2, \in3
251	subs		\i, \i, #1
	/* next round key; the load leaves the flags from subs intact for beq */
252	ld1		{v15.4s}, [\rkp], #16
253	beq		2222f
	/* column mixing is done pairwise with the two-state macro */
254	mix_columns_2x	\in0, \in1, \enc
255	mix_columns_2x	\in2, \in3, \enc
256	b		1111b
2572222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
258	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
259	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
260	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
261	.endm
262
263	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
	/* do_block_2x with enc = 1; all other arguments are passed through */
264	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
265	.endm
266
267	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
	/* do_block_2x with enc = 0; all other arguments are passed through */
268	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
269	.endm
270
271	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	/* do_block_4x with enc = 1; all other arguments are passed through */
272	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
273	.endm
274
275	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	/* do_block_4x with enc = 0; all other arguments are passed through */
276	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
277	.endm
278
279#include "aes-modes.S"
280
281	.section	".rodata", "a"
	/* .align takes a power-of-two exponent on this target: 64-byte alignment */
282	.align		6
283.LForward_Sbox:
	/*
	 * AES S-box: 256 entries, one per input byte value, eight per row.
	 * enc_prepare hands this symbol to prepare, which loads the whole
	 * table into v16-v31.
	 */
284	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
285	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
286	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
287	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
288	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
289	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
290	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
291	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
292	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
293	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
294	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
295	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
296	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
297	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
298	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
299	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
300	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
301	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
302	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
303	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
304	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
305	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
306	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
307	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
308	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
309	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
310	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
311	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
312	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
313	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
314	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
315	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
316
317.LReverse_Sbox:
	/*
	 * AES inverse S-box: 256 entries, one per input byte value, eight per
	 * row. dec_prepare hands this symbol to prepare, which loads the whole
	 * table into v16-v31.
	 */
318	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
319	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
320	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
321	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
322	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
323	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
324	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
325	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
326	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
327	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
328	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
329	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
330	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
331	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
332	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
333	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
334	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
335	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
336	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
337	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
338	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
339	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
340	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
341	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
342	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
343	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
344	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
345	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
346	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
347	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
348	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
349	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
350
351.LForward_ShiftRows:
	/*
	 * The three constants below are 16-byte index vectors for tbl. Once
	 * loaded into a vector register (prepare uses ldr_l on a q register),
	 * the least significant byte of the constant is the index for result
	 * byte 0, i.e. the hex digits read right to left.
	 *
	 * Forward ShiftRows: selected by enc_prepare, ends up in v13.
	 */
352	.octa		0x0b06010c07020d08030e09040f0a0500
353
354.LReverse_ShiftRows:
	/* Reverse ShiftRows: selected by dec_prepare, ends up in v13. */
355	.octa		0x0306090c0f0205080b0e0104070a0d00
356
357.Lror32by8:
	/*
	 * Result bytes 0..3 come from source bytes 1, 2, 3, 0 and likewise for
	 * the other three words, which rotates each 32-bit word right by 8
	 * bits. Always loaded into v14 by prepare.
	 */
358	.octa		0x0c0f0e0d080b0a090407060500030201
359