xref: /linux/arch/arm/crypto/aes-ce-core.S (revision e5c86679d5e864947a52fb31e45a425dea3e7fa9)
1/*
2 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14	.text
15	.fpu		crypto-neon-fp-armv8
16	.align		3
17
18	.macro		enc_round, state, key
19	aese.8		\state, \key
20	aesmc.8		\state, \state
21	.endm
22
23	.macro		dec_round, state, key
24	aesd.8		\state, \key
25	aesimc.8	\state, \state
26	.endm
27
28	.macro		enc_dround, key1, key2
29	enc_round	q0, \key1
30	enc_round	q0, \key2
31	.endm
32
33	.macro		dec_dround, key1, key2
34	dec_round	q0, \key1
35	dec_round	q0, \key2
36	.endm
37
38	.macro		enc_fround, key1, key2, key3
39	enc_round	q0, \key1
40	aese.8		q0, \key2
41	veor		q0, q0, \key3
42	.endm
43
44	.macro		dec_fround, key1, key2, key3
45	dec_round	q0, \key1
46	aesd.8		q0, \key2
47	veor		q0, q0, \key3
48	.endm
49
50	.macro		enc_dround_3x, key1, key2
51	enc_round	q0, \key1
52	enc_round	q1, \key1
53	enc_round	q2, \key1
54	enc_round	q0, \key2
55	enc_round	q1, \key2
56	enc_round	q2, \key2
57	.endm
58
59	.macro		dec_dround_3x, key1, key2
60	dec_round	q0, \key1
61	dec_round	q1, \key1
62	dec_round	q2, \key1
63	dec_round	q0, \key2
64	dec_round	q1, \key2
65	dec_round	q2, \key2
66	.endm
67
68	.macro		enc_fround_3x, key1, key2, key3
69	enc_round	q0, \key1
70	enc_round	q1, \key1
71	enc_round	q2, \key1
72	aese.8		q0, \key2
73	aese.8		q1, \key2
74	aese.8		q2, \key2
75	veor		q0, q0, \key3
76	veor		q1, q1, \key3
77	veor		q2, q2, \key3
78	.endm
79
80	.macro		dec_fround_3x, key1, key2, key3
81	dec_round	q0, \key1
82	dec_round	q1, \key1
83	dec_round	q2, \key1
84	aesd.8		q0, \key2
85	aesd.8		q1, \key2
86	aesd.8		q2, \key2
87	veor		q0, q0, \key3
88	veor		q1, q1, \key3
89	veor		q2, q2, \key3
90	.endm
91
92	.macro		do_block, dround, fround
93	cmp		r3, #12			@ which key size?
94	vld1.8		{q10-q11}, [ip]!
95	\dround		q8, q9
96	vld1.8		{q12-q13}, [ip]!
97	\dround		q10, q11
98	vld1.8		{q10-q11}, [ip]!
99	\dround		q12, q13
100	vld1.8		{q12-q13}, [ip]!
101	\dround		q10, q11
102	blo		0f			@ AES-128: 10 rounds
103	vld1.8		{q10-q11}, [ip]!
104	\dround		q12, q13
105	beq		1f			@ AES-192: 12 rounds
106	vld1.8		{q12-q13}, [ip]
107	\dround		q10, q11
1080:	\fround		q12, q13, q14
109	bx		lr
110
1111:	\fround		q10, q11, q14
112	bx		lr
113	.endm
114
115	/*
116	 * Internal, non-AAPCS compliant functions that implement the core AES
117	 * transforms. These should preserve all registers except q0 - q2 and ip
118	 * Arguments:
119	 *   q0        : first in/output block
120	 *   q1        : second in/output block (_3x version only)
121	 *   q2        : third in/output block (_3x version only)
122	 *   q8        : first round key
123	 *   q9        : secound round key
124	 *   q14       : final round key
125	 *   r2        : address of round key array
126	 *   r3        : number of rounds
127	 */
128	.align		6
129aes_encrypt:
130	add		ip, r2, #32		@ 3rd round key
131.Laes_encrypt_tweak:
132	do_block	enc_dround, enc_fround
133ENDPROC(aes_encrypt)
134
135	.align		6
136aes_decrypt:
137	add		ip, r2, #32		@ 3rd round key
138	do_block	dec_dround, dec_fround
139ENDPROC(aes_decrypt)
140
141	.align		6
142aes_encrypt_3x:
143	add		ip, r2, #32		@ 3rd round key
144	do_block	enc_dround_3x, enc_fround_3x
145ENDPROC(aes_encrypt_3x)
146
147	.align		6
148aes_decrypt_3x:
149	add		ip, r2, #32		@ 3rd round key
150	do_block	dec_dround_3x, dec_fround_3x
151ENDPROC(aes_decrypt_3x)
152
153	.macro		prepare_key, rk, rounds
154	add		ip, \rk, \rounds, lsl #4
155	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
156	vld1.8		{q14}, [ip]		@ load last round key
157	.endm
158
159	/*
160	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
161	 *		   int blocks)
162	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
163	 *		   int blocks)
164	 */
165ENTRY(ce_aes_ecb_encrypt)
166	push		{r4, lr}
167	ldr		r4, [sp, #8]
168	prepare_key	r2, r3
169.Lecbencloop3x:
170	subs		r4, r4, #3
171	bmi		.Lecbenc1x
172	vld1.8		{q0-q1}, [r1]!
173	vld1.8		{q2}, [r1]!
174	bl		aes_encrypt_3x
175	vst1.8		{q0-q1}, [r0]!
176	vst1.8		{q2}, [r0]!
177	b		.Lecbencloop3x
178.Lecbenc1x:
179	adds		r4, r4, #3
180	beq		.Lecbencout
181.Lecbencloop:
182	vld1.8		{q0}, [r1]!
183	bl		aes_encrypt
184	vst1.8		{q0}, [r0]!
185	subs		r4, r4, #1
186	bne		.Lecbencloop
187.Lecbencout:
188	pop		{r4, pc}
189ENDPROC(ce_aes_ecb_encrypt)
190
191ENTRY(ce_aes_ecb_decrypt)
192	push		{r4, lr}
193	ldr		r4, [sp, #8]
194	prepare_key	r2, r3
195.Lecbdecloop3x:
196	subs		r4, r4, #3
197	bmi		.Lecbdec1x
198	vld1.8		{q0-q1}, [r1]!
199	vld1.8		{q2}, [r1]!
200	bl		aes_decrypt_3x
201	vst1.8		{q0-q1}, [r0]!
202	vst1.8		{q2}, [r0]!
203	b		.Lecbdecloop3x
204.Lecbdec1x:
205	adds		r4, r4, #3
206	beq		.Lecbdecout
207.Lecbdecloop:
208	vld1.8		{q0}, [r1]!
209	bl		aes_decrypt
210	vst1.8		{q0}, [r0]!
211	subs		r4, r4, #1
212	bne		.Lecbdecloop
213.Lecbdecout:
214	pop		{r4, pc}
215ENDPROC(ce_aes_ecb_decrypt)
216
217	/*
218	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
219	 *		   int blocks, u8 iv[])
220	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
221	 *		   int blocks, u8 iv[])
222	 */
223ENTRY(ce_aes_cbc_encrypt)
224	push		{r4-r6, lr}
225	ldrd		r4, r5, [sp, #16]
226	vld1.8		{q0}, [r5]
227	prepare_key	r2, r3
228.Lcbcencloop:
229	vld1.8		{q1}, [r1]!		@ get next pt block
230	veor		q0, q0, q1		@ ..and xor with iv
231	bl		aes_encrypt
232	vst1.8		{q0}, [r0]!
233	subs		r4, r4, #1
234	bne		.Lcbcencloop
235	vst1.8		{q0}, [r5]
236	pop		{r4-r6, pc}
237ENDPROC(ce_aes_cbc_encrypt)
238
239ENTRY(ce_aes_cbc_decrypt)
240	push		{r4-r6, lr}
241	ldrd		r4, r5, [sp, #16]
242	vld1.8		{q6}, [r5]		@ keep iv in q6
243	prepare_key	r2, r3
244.Lcbcdecloop3x:
245	subs		r4, r4, #3
246	bmi		.Lcbcdec1x
247	vld1.8		{q0-q1}, [r1]!
248	vld1.8		{q2}, [r1]!
249	vmov		q3, q0
250	vmov		q4, q1
251	vmov		q5, q2
252	bl		aes_decrypt_3x
253	veor		q0, q0, q6
254	veor		q1, q1, q3
255	veor		q2, q2, q4
256	vmov		q6, q5
257	vst1.8		{q0-q1}, [r0]!
258	vst1.8		{q2}, [r0]!
259	b		.Lcbcdecloop3x
260.Lcbcdec1x:
261	adds		r4, r4, #3
262	beq		.Lcbcdecout
263	vmov		q15, q14		@ preserve last round key
264.Lcbcdecloop:
265	vld1.8		{q0}, [r1]!		@ get next ct block
266	veor		q14, q15, q6		@ combine prev ct with last key
267	vmov		q6, q0
268	bl		aes_decrypt
269	vst1.8		{q0}, [r0]!
270	subs		r4, r4, #1
271	bne		.Lcbcdecloop
272.Lcbcdecout:
273	vst1.8		{q6}, [r5]		@ keep iv in q6
274	pop		{r4-r6, pc}
275ENDPROC(ce_aes_cbc_decrypt)
276
277	/*
278	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
279	 *		   int blocks, u8 ctr[])
280	 */
281ENTRY(ce_aes_ctr_encrypt)
282	push		{r4-r6, lr}
283	ldrd		r4, r5, [sp, #16]
284	vld1.8		{q6}, [r5]		@ load ctr
285	prepare_key	r2, r3
286	vmov		r6, s27			@ keep swabbed ctr in r6
287	rev		r6, r6
288	cmn		r6, r4			@ 32 bit overflow?
289	bcs		.Lctrloop
290.Lctrloop3x:
291	subs		r4, r4, #3
292	bmi		.Lctr1x
293	add		r6, r6, #1
294	vmov		q0, q6
295	vmov		q1, q6
296	rev		ip, r6
297	add		r6, r6, #1
298	vmov		q2, q6
299	vmov		s7, ip
300	rev		ip, r6
301	add		r6, r6, #1
302	vmov		s11, ip
303	vld1.8		{q3-q4}, [r1]!
304	vld1.8		{q5}, [r1]!
305	bl		aes_encrypt_3x
306	veor		q0, q0, q3
307	veor		q1, q1, q4
308	veor		q2, q2, q5
309	rev		ip, r6
310	vst1.8		{q0-q1}, [r0]!
311	vst1.8		{q2}, [r0]!
312	vmov		s27, ip
313	b		.Lctrloop3x
314.Lctr1x:
315	adds		r4, r4, #3
316	beq		.Lctrout
317.Lctrloop:
318	vmov		q0, q6
319	bl		aes_encrypt
320	subs		r4, r4, #1
321	bmi		.Lctrtailblock		@ blocks < 0 means tail block
322	vld1.8		{q3}, [r1]!
323	veor		q3, q0, q3
324	vst1.8		{q3}, [r0]!
325
326	adds		r6, r6, #1		@ increment BE ctr
327	rev		ip, r6
328	vmov		s27, ip
329	bcs		.Lctrcarry
330	teq		r4, #0
331	bne		.Lctrloop
332.Lctrout:
333	vst1.8		{q6}, [r5]
334	pop		{r4-r6, pc}
335
336.Lctrtailblock:
337	vst1.8		{q0}, [r0, :64]		@ return just the key stream
338	pop		{r4-r6, pc}
339
340.Lctrcarry:
341	.irp		sreg, s26, s25, s24
342	vmov		ip, \sreg		@ load next word of ctr
343	rev		ip, ip			@ ... to handle the carry
344	adds		ip, ip, #1
345	rev		ip, ip
346	vmov		\sreg, ip
347	bcc		0f
348	.endr
3490:	teq		r4, #0
350	beq		.Lctrout
351	b		.Lctrloop
352ENDPROC(ce_aes_ctr_encrypt)
353
354	/*
355	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
356	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
357	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
358	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
359	 */
360
361	.macro		next_tweak, out, in, const, tmp
362	vshr.s64	\tmp, \in, #63
363	vand		\tmp, \tmp, \const
364	vadd.u64	\out, \in, \in
365	vext.8		\tmp, \tmp, \tmp, #8
366	veor		\out, \out, \tmp
367	.endm
368
369	.align		3
370.Lxts_mul_x:
371	.quad		1, 0x87
372
373ce_aes_xts_init:
374	vldr		d14, .Lxts_mul_x
375	vldr		d15, .Lxts_mul_x + 8
376
377	ldrd		r4, r5, [sp, #16]	@ load args
378	ldr		r6, [sp, #28]
379	vld1.8		{q0}, [r5]		@ load iv
380	teq		r6, #1			@ start of a block?
381	bxne		lr
382
383	@ Encrypt the IV in q0 with the second AES key. This should only
384	@ be done at the start of a block.
385	ldr		r6, [sp, #24]		@ load AES key 2
386	prepare_key	r6, r3
387	add		ip, r6, #32		@ 3rd round key of key 2
388	b		.Laes_encrypt_tweak	@ tail call
389ENDPROC(ce_aes_xts_init)
390
391ENTRY(ce_aes_xts_encrypt)
392	push		{r4-r6, lr}
393
394	bl		ce_aes_xts_init		@ run shared prologue
395	prepare_key	r2, r3
396	vmov		q3, q0
397
398	teq		r6, #0			@ start of a block?
399	bne		.Lxtsenc3x
400
401.Lxtsencloop3x:
402	next_tweak	q3, q3, q7, q6
403.Lxtsenc3x:
404	subs		r4, r4, #3
405	bmi		.Lxtsenc1x
406	vld1.8		{q0-q1}, [r1]!		@ get 3 pt blocks
407	vld1.8		{q2}, [r1]!
408	next_tweak	q4, q3, q7, q6
409	veor		q0, q0, q3
410	next_tweak	q5, q4, q7, q6
411	veor		q1, q1, q4
412	veor		q2, q2, q5
413	bl		aes_encrypt_3x
414	veor		q0, q0, q3
415	veor		q1, q1, q4
416	veor		q2, q2, q5
417	vst1.8		{q0-q1}, [r0]!		@ write 3 ct blocks
418	vst1.8		{q2}, [r0]!
419	vmov		q3, q5
420	teq		r4, #0
421	beq		.Lxtsencout
422	b		.Lxtsencloop3x
423.Lxtsenc1x:
424	adds		r4, r4, #3
425	beq		.Lxtsencout
426.Lxtsencloop:
427	vld1.8		{q0}, [r1]!
428	veor		q0, q0, q3
429	bl		aes_encrypt
430	veor		q0, q0, q3
431	vst1.8		{q0}, [r0]!
432	subs		r4, r4, #1
433	beq		.Lxtsencout
434	next_tweak	q3, q3, q7, q6
435	b		.Lxtsencloop
436.Lxtsencout:
437	vst1.8		{q3}, [r5]
438	pop		{r4-r6, pc}
439ENDPROC(ce_aes_xts_encrypt)
440
441
442ENTRY(ce_aes_xts_decrypt)
443	push		{r4-r6, lr}
444
445	bl		ce_aes_xts_init		@ run shared prologue
446	prepare_key	r2, r3
447	vmov		q3, q0
448
449	teq		r6, #0			@ start of a block?
450	bne		.Lxtsdec3x
451
452.Lxtsdecloop3x:
453	next_tweak	q3, q3, q7, q6
454.Lxtsdec3x:
455	subs		r4, r4, #3
456	bmi		.Lxtsdec1x
457	vld1.8		{q0-q1}, [r1]!		@ get 3 ct blocks
458	vld1.8		{q2}, [r1]!
459	next_tweak	q4, q3, q7, q6
460	veor		q0, q0, q3
461	next_tweak	q5, q4, q7, q6
462	veor		q1, q1, q4
463	veor		q2, q2, q5
464	bl		aes_decrypt_3x
465	veor		q0, q0, q3
466	veor		q1, q1, q4
467	veor		q2, q2, q5
468	vst1.8		{q0-q1}, [r0]!		@ write 3 pt blocks
469	vst1.8		{q2}, [r0]!
470	vmov		q3, q5
471	teq		r4, #0
472	beq		.Lxtsdecout
473	b		.Lxtsdecloop3x
474.Lxtsdec1x:
475	adds		r4, r4, #3
476	beq		.Lxtsdecout
477.Lxtsdecloop:
478	vld1.8		{q0}, [r1]!
479	veor		q0, q0, q3
480	add		ip, r2, #32		@ 3rd round key
481	bl		aes_decrypt
482	veor		q0, q0, q3
483	vst1.8		{q0}, [r0]!
484	subs		r4, r4, #1
485	beq		.Lxtsdecout
486	next_tweak	q3, q3, q7, q6
487	b		.Lxtsdecloop
488.Lxtsdecout:
489	vst1.8		{q3}, [r5]
490	pop		{r4-r6, pc}
491ENDPROC(ce_aes_xts_decrypt)
492
493	/*
494	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
495	 *                             AES sbox substitution on each byte in
496	 *                             'input'
497	 */
498ENTRY(ce_aes_sub)
499	vdup.32		q1, r0
500	veor		q0, q0, q0
501	aese.8		q0, q1
502	vmov		r0, s0
503	bx		lr
504ENDPROC(ce_aes_sub)
505
506	/*
507	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
508	 *                                        operation on round key *src
509	 */
510ENTRY(ce_aes_invert)
511	vld1.8		{q0}, [r1]
512	aesimc.8	q0, q0
513	vst1.8		{q0}, [r0]
514	bx		lr
515ENDPROC(ce_aes_invert)
516