xref: /linux/arch/arm64/crypto/aes-neon.S (revision add452d09a38c7a7c44aea55c1015392cebf9fa7)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
4 *
5 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
11#define AES_FUNC_START(func)		SYM_FUNC_START(neon_ ## func)
12#define AES_FUNC_END(func)		SYM_FUNC_END(neon_ ## func)
13
/*
 * NEON register aliases. None of the macros in this file use these names;
 * presumably they are meant for the mode code pulled in from aes-modes.S
 * (TODO: confirm there before changing an assignment). Note that xtsmask
 * and cbciv are two names for the same register, v7.
 */
	vctr		.req	v4
	cbciv		.req	v7
	xtsmask		.req	v7
17
/*
 * xts_reload_mask - expands to a single xts_load_mask invocation and hands
 * it tmp unchanged. xts_load_mask is not defined in this file (presumably
 * it is provided by aes-modes.S - TODO confirm).
 */
18	.macro		xts_reload_mask, tmp
19	xts_load_mask	\tmp
20	.endm
21
22	/*
	 * xts_cts_skip_tw - branch to lbl if bit 1 of reg is set.
	 *
	 * Special case for the neon-bs driver calling into this one for CTS.
	 * Which code the branch jumps over is decided at the invoking site,
	 * which is not part of this file.
	 */
23	.macro		xts_cts_skip_tw, reg, lbl
24	tbnz		\reg, #1, \lbl		/* taken when (reg & 2) != 0 */
25	.endm
26
27	/*
	 * mul_by_x - multiply by polynomial 'x' in GF(2^8)
	 *
	 * out:   result vector; written only after temp has been derived from in
	 * in:    source vector
	 * temp:  scratch vector, overwritten
	 * const: value ANDed into the correction term; the callers in this file
	 *        pass v12.16b, which the prepare macro fills with 0x1b
	 *
	 * Operands arrive with their arrangement already attached (the callers
	 * in this file use .16b, which the line comments below assume).
	 */
28	.macro		mul_by_x, out, in, temp, const
29	sshr		\temp, \in, #7		/* replicate each lane's top bit across the lane */
30	shl		\out, \in, #1		/* shift left by one; the top bit falls off */
31	and		\temp, \temp, \const	/* const where the top bit was set, 0 elsewhere */
32	eor		\out, \out, \temp	/* apply the correction */
33	.endm
34
35	/*
	 * mul_by_x2 - multiply by polynomial 'x^2' in GF(2^8)
	 *
	 * out:   result vector; written only after temp has been derived from in
	 * in:    source vector
	 * temp:  scratch vector, overwritten
	 * const: multiplier for the correction term; the callers in this file
	 *        pass v12.16b, which the prepare macro fills with 0x1b
	 *
	 * Operands arrive with their arrangement already attached (the callers
	 * in this file use .16b, which the line comments below assume).
	 */
36	.macro		mul_by_x2, out, in, temp, const
37	ushr		\temp, \in, #6		/* the two top bits that shl #2 discards */
38	shl		\out, \in, #2
39	pmul		\temp, \temp, \const	/* polynomial (carry-less) multiply */
40	eor		\out, \out, \temp	/* apply the correction */
41	.endm
42
/*
 * prepare - preload the constants and the entire Sbox into NEON registers.
 *
 *   sbox:      symbol of the table to load (256 bytes are read from it)
 *   shiftrows: symbol of the 16-byte permutation to place in v13
 *   temp:      general purpose scratch register used for addressing; it is
 *              left at sbox + 192 on exit
 *
 * Register state afterwards:
 *   v12      0x1b in every byte lane
 *   v13      the shiftrows permutation
 *   v14      .Lror32by8
 *   v16-v31  table bytes 0..255, 64 bytes per group of four registers
 *
 * ldr_l and adr_l are not defined in this file (presumably they come from
 * <asm/assembler.h> - confirm there).
 */
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b, v17.16b, v18.16b, v19.16b}, [\temp], #64
	ld1		{v20.16b, v21.16b, v22.16b, v23.16b}, [\temp], #64
	ld1		{v24.16b, v25.16b, v26.16b, v27.16b}, [\temp], #64
	ld1		{v28.16b, v29.16b, v30.16b, v31.16b}, [\temp]
	.endm
54
/*
 * enc_prepare - do preload for encryption.
 *
 * Invokes prepare with crypto_aes_sbox and .LForward_ShiftRows. The first
 * two arguments are accepted and discarded; temp is passed on as the
 * scratch register.
 */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		sbox=crypto_aes_sbox, shiftrows=.LForward_ShiftRows, temp=\temp
	.endm
59
/*
 * enc_switch_key - intentionally empty in this implementation: all three
 * arguments are ignored and no instructions are emitted.
 */
60	.macro		enc_switch_key, ignore0, ignore1, temp
61	/* do nothing */
62	.endm
63
/*
 * dec_prepare - do preload for decryption.
 *
 * Invokes prepare with crypto_aes_inv_sbox and .LReverse_ShiftRows. The
 * first two arguments are accepted and discarded; temp is passed on as the
 * scratch register.
 */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		sbox=crypto_aes_inv_sbox, shiftrows=.LReverse_ShiftRows, temp=\temp
	.endm
68
69	/*
	 * sub_bytes - apply SubBytes transformation using the preloaded Sbox
	 *
	 * in: bare register name (no arrangement suffix); transformed in place
	 *
	 * Relies on v16-v31 holding the table and on v15 holding the index step
	 * (do_block sets it to 0x40 before invoking this macro). Overwrites v9,
	 * v10 and v11.
	 *
	 * Each group of four registers covers 64 table entries. tbl yields zero
	 * for an index outside its group and tbx leaves the destination lane
	 * untouched in that case, so with a step of 0x40 every lane is filled by
	 * exactly one of the four lookups.
	 */
70	.macro		sub_bytes, in
71	sub		v9.16b, \in\().16b, v15.16b	/* index relative to the 2nd group */
72	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
73	sub		v10.16b, v9.16b, v15.16b	/* index relative to the 3rd group */
74	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
75	sub		v11.16b, v10.16b, v15.16b	/* index relative to the 4th group */
76	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
77	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
78	.endm
79
/*
 * mix_columns - apply the MixColumns transformation, or its inverse, in place.
 *
 *   in:  bare register name of the state (no arrangement suffix)
 *   enc: 0 emits an additional pre-multiplication step in front of the
 *        common sequence (inverse transformation); any other value emits
 *        the common sequence only
 *
 * Expects v12 and v14 as loaded by prepare. Overwrites v8 and v9.
 */
	.macro		mix_columns, in, enc
	.ifeq		\enc
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h			/* swap the halfwords of each word */
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b	/* v9 = state times x */
	rev32		v8.8h, \in\().8h		/* v8 = state, halfwords swapped */
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b	/* permute bytes via .Lror32by8 */
	eor		\in\().16b, \in\().16b, v8.16b
	.endm
97
/*
 * do_block - run one AES state through all rounds, in place.
 *
 *   enc:    handed to mix_columns (0 selects the inverse MixColumns)
 *   in:     bare register name of the state
 *   rounds: number of rounds; the loop below runs this many times
 *   rk:     address of the first round key; not modified
 *   rkp:    scratch, walks the remaining round keys in 16-byte steps
 *   i:      scratch, counts the rounds still to go
 *
 * For rounds >= 1 a total of rounds + 1 round keys is read. v15 alternates
 * between holding the current round key and the 0x40 index step needed by
 * sub_bytes, which is why it is reloaded on every pass. Requires the
 * register setup done by prepare; overwrites v8-v11 and v15.
 */
	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
.Lnext_round\@:
	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi		v15.16b, #0x40				/* index step for sub_bytes */
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	sub		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16			/* next round key */
	cbz		\i, .Llast_round\@			/* no MixColumns after the final round */
	mix_columns	\in, \enc
	b		.Lnext_round\@
.Llast_round\@:
	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm
113
/*
 * encrypt_block - do_block with enc fixed to 1; every other argument is
 * passed through unchanged.
 */
	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	enc=1, in=\in, rounds=\rounds, rk=\rk, rkp=\rkp, i=\i
	.endm
117
/*
 * decrypt_block - do_block with enc fixed to 0; every other argument is
 * passed through unchanged.
 */
	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	enc=0, in=\in, rounds=\rounds, rk=\rk, rkp=\rkp, i=\i
	.endm
121
122	/*
123	 * Interleaved versions: functionally equivalent to the
124	 * ones above, but applied to AES states in parallel.
125	 */
126
/*
 * sub_bytes_4x - SubBytes on four states at once, using the preloaded table.
 *
 * in0-in3: bare register names (no arrangement suffix); each is transformed
 *          in place
 *
 * Same lookup scheme as sub_bytes, with the instructions of the four states
 * interleaved. Needs v16-v31 to hold the table and v15 the index step
 * (do_block_4x sets it to 0x40 beforehand). Overwrites v8-v11, which carry
 * the progressively rebased indices of in0-in3 respectively.
 */
127	.macro		sub_bytes_4x, in0, in1, in2, in3
	/* 1st group (entries 0-63); indices for the 2nd group are derived alongside */
128	sub		v8.16b, \in0\().16b, v15.16b
129	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
130	sub		v9.16b, \in1\().16b, v15.16b
131	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
132	sub		v10.16b, \in2\().16b, v15.16b
133	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
134	sub		v11.16b, \in3\().16b, v15.16b
135	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	/* 2nd group (entries 64-127); each index is rebased once its lookup is done */
136	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
137	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
138	sub		v8.16b, v8.16b, v15.16b
139	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
140	sub		v9.16b, v9.16b, v15.16b
141	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
142	sub		v10.16b, v10.16b, v15.16b
	/* 3rd group (entries 128-191) */
143	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
144	sub		v11.16b, v11.16b, v15.16b
145	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
146	sub		v8.16b, v8.16b, v15.16b
147	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
148	sub		v9.16b, v9.16b, v15.16b
149	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
150	sub		v10.16b, v10.16b, v15.16b
	/* 4th group (entries 192-255) */
151	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
152	sub		v11.16b, v11.16b, v15.16b
153	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
154	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
155	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
156	.endm
157
/*
 * mul_by_x_2x - two-vector form of mul_by_x, instructions interleaved.
 *
 * out0, out1: result registers
 * in0, in1:   source registers
 * tmp0, tmp1: scratch registers, overwritten
 * const:      register ANDed into the correction terms; mix_columns_2x
 *             passes v12, which prepare fills with 0x1b
 *
 * All operands are bare register names; the .16b arrangement is appended
 * here. Each tmp is derived from its in before the matching out is written.
 */
158	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
159	sshr		\tmp0\().16b, \in0\().16b, #7		/* top bit replicated across each byte */
160	shl		\out0\().16b, \in0\().16b, #1
161	sshr		\tmp1\().16b, \in1\().16b, #7
162	and		\tmp0\().16b, \tmp0\().16b, \const\().16b	/* const where the top bit was set */
163	shl		\out1\().16b, \in1\().16b, #1
164	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
165	eor		\out0\().16b, \out0\().16b, \tmp0\().16b	/* apply the correction */
166	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
167	.endm
168
/*
 * mul_by_x2_2x - two-vector form of mul_by_x2, instructions interleaved.
 *
 * out0, out1: result registers
 * in0, in1:   source registers
 * tmp0, tmp1: scratch registers, overwritten
 * const:      multiplier for the correction terms; mix_columns_2x passes
 *             v12, which prepare fills with 0x1b
 *
 * All operands are bare register names; the .16b arrangement is appended
 * here. Each tmp is derived from its in before the matching out is written.
 */
169	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
170	ushr		\tmp0\().16b, \in0\().16b, #6		/* the two top bits that shl #2 discards */
171	shl		\out0\().16b, \in0\().16b, #2
172	ushr		\tmp1\().16b, \in1\().16b, #6
173	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b	/* polynomial (carry-less) multiply */
174	shl		\out1\().16b, \in1\().16b, #2
175	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
176	eor		\out0\().16b, \out0\().16b, \tmp0\().16b	/* apply the correction */
177	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
178	.endm
179
/*
 * mix_columns_2x - MixColumns (or its inverse) on two states, interleaved.
 *
 *   in0, in1: bare register names of the states; transformed in place
 *   enc:      0 emits an additional pre-multiplication step in front of the
 *             common sequence (inverse transformation); any other value
 *             emits the common sequence only
 *
 * Expects v12 and v14 as loaded by prepare. Overwrites v8-v11.
 */
	.macro		mix_columns_2x, in0, in1, enc
	.ifeq		\enc
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h			/* swap the halfwords of each word */
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12	/* v8/v9 = states times x */
	rev32		v10.8h, \in0\().8h		/* v10/v11 = states, halfwords swapped */
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b	/* permute bytes via .Lror32by8 */
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm
204
/*
 * do_block_4x - run four AES states through all rounds in parallel, in place.
 *
 *   enc:     handed to mix_columns_2x (0 selects the inverse MixColumns)
 *   in0-in3: bare register names of the four states
 *   rounds:  number of rounds; the loop below runs this many times
 *   rk:      address of the first round key; not modified
 *   rkp:     scratch, walks the remaining round keys in 16-byte steps
 *   i:       scratch, counts the rounds still to go
 *
 * All four states use the same round keys. For rounds >= 1 a total of
 * rounds + 1 round keys is read. v15 alternates between holding the current
 * round key and the 0x40 index step needed by sub_bytes_4x, which is why it
 * is reloaded on every pass. Requires the register setup done by prepare;
 * overwrites v8-v11 and v15.
 */
	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
.Lnext_round\@:
	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40				/* index step for sub_bytes_4x */
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	sub		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16			/* next round key */
	cbz		\i, .Llast_round\@			/* no MixColumns after the final round */
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		.Lnext_round\@
.Llast_round\@:
	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm
230
/*
 * encrypt_block4x - do_block_4x with enc fixed to 1; every other argument
 * is passed through unchanged.
 */
	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	enc=1, in0=\in0, in1=\in1, in2=\in2, in3=\in3, rounds=\rounds, rk=\rk, rkp=\rkp, i=\i
	.endm
234
/*
 * decrypt_block4x - do_block_4x with enc fixed to 0; every other argument
 * is passed through unchanged.
 */
	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	enc=0, in0=\in0, in1=\in1, in2=\in2, in3=\in3, rounds=\rounds, rk=\rk, rkp=\rkp, i=\i
	.endm
238
239#include "aes-modes.S"
240
/*
 * Permutation vectors used as tbl indices by the macros in this file.
 * Reading each constant from its least significant byte upwards, entry i is:
 *
 *   .LForward_ShiftRows  (5 * i) mod 16           - put in v13 by enc_prepare
 *   .LReverse_ShiftRows  (13 * i) mod 16          - put in v13 by dec_prepare
 *   .Lror32by8           (i & ~3) | ((i + 1) & 3) - put in v14 by prepare
 *                        i.e. 1,2,3,0, 5,6,7,4, ...
 *
 * Each vector is emitted with .octa as one 128-bit quantity, matching the
 * q-register loads in prepare.
 */
	.section	".rodata", "a"
	.p2align	4			/* 16-byte alignment */
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201
251