xref: /linux/arch/arm/crypto/aes-ce-core.S (revision 9cfc5c90ad38c8fc11bfd39de42a107da00871ba)
1/*
2 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14	.text
15	.fpu		crypto-neon-fp-armv8
16	.align		3
17
18	.macro		enc_round, state, key
19	aese.8		\state, \key
20	aesmc.8		\state, \state
21	.endm
22
23	.macro		dec_round, state, key
24	aesd.8		\state, \key
25	aesimc.8	\state, \state
26	.endm
27
28	.macro		enc_dround, key1, key2
29	enc_round	q0, \key1
30	enc_round	q0, \key2
31	.endm
32
33	.macro		dec_dround, key1, key2
34	dec_round	q0, \key1
35	dec_round	q0, \key2
36	.endm
37
38	.macro		enc_fround, key1, key2, key3
39	enc_round	q0, \key1
40	aese.8		q0, \key2
41	veor		q0, q0, \key3
42	.endm
43
44	.macro		dec_fround, key1, key2, key3
45	dec_round	q0, \key1
46	aesd.8		q0, \key2
47	veor		q0, q0, \key3
48	.endm
49
50	.macro		enc_dround_3x, key1, key2
51	enc_round	q0, \key1
52	enc_round	q1, \key1
53	enc_round	q2, \key1
54	enc_round	q0, \key2
55	enc_round	q1, \key2
56	enc_round	q2, \key2
57	.endm
58
59	.macro		dec_dround_3x, key1, key2
60	dec_round	q0, \key1
61	dec_round	q1, \key1
62	dec_round	q2, \key1
63	dec_round	q0, \key2
64	dec_round	q1, \key2
65	dec_round	q2, \key2
66	.endm
67
68	.macro		enc_fround_3x, key1, key2, key3
69	enc_round	q0, \key1
70	enc_round	q1, \key1
71	enc_round	q2, \key1
72	aese.8		q0, \key2
73	aese.8		q1, \key2
74	aese.8		q2, \key2
75	veor		q0, q0, \key3
76	veor		q1, q1, \key3
77	veor		q2, q2, \key3
78	.endm
79
80	.macro		dec_fround_3x, key1, key2, key3
81	dec_round	q0, \key1
82	dec_round	q1, \key1
83	dec_round	q2, \key1
84	aesd.8		q0, \key2
85	aesd.8		q1, \key2
86	aesd.8		q2, \key2
87	veor		q0, q0, \key3
88	veor		q1, q1, \key3
89	veor		q2, q2, \key3
90	.endm
91
92	.macro		do_block, dround, fround
93	cmp		r3, #12			@ which key size?
94	vld1.8		{q10-q11}, [ip]!
95	\dround		q8, q9
96	vld1.8		{q12-q13}, [ip]!
97	\dround		q10, q11
98	vld1.8		{q10-q11}, [ip]!
99	\dround		q12, q13
100	vld1.8		{q12-q13}, [ip]!
101	\dround		q10, q11
102	blo		0f			@ AES-128: 10 rounds
103	vld1.8		{q10-q11}, [ip]!
104	\dround		q12, q13
105	beq		1f			@ AES-192: 12 rounds
106	vld1.8		{q12-q13}, [ip]
107	\dround		q10, q11
1080:	\fround		q12, q13, q14
109	bx		lr
110
1111:	\fround		q10, q11, q14
112	bx		lr
113	.endm
114
	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These preserve all registers except q0 - q2, q10 - q13,
	 * ip and the condition flags.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_3x version only)
	 *   q2        : third in/output block (_3x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
128	.align		6
129aes_encrypt:
130	add		ip, r2, #32		@ 3rd round key
131.Laes_encrypt_tweak:
132	do_block	enc_dround, enc_fround
133ENDPROC(aes_encrypt)
134
135	.align		6
136aes_decrypt:
137	add		ip, r2, #32		@ 3rd round key
138	do_block	dec_dround, dec_fround
139ENDPROC(aes_decrypt)
140
141	.align		6
142aes_encrypt_3x:
143	add		ip, r2, #32		@ 3rd round key
144	do_block	enc_dround_3x, enc_fround_3x
145ENDPROC(aes_encrypt_3x)
146
147	.align		6
148aes_decrypt_3x:
149	add		ip, r2, #32		@ 3rd round key
150	do_block	dec_dround_3x, dec_fround_3x
151ENDPROC(aes_decrypt_3x)
152
153	.macro		prepare_key, rk, rounds
154	add		ip, \rk, \rounds, lsl #4
155	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
156	vld1.8		{q14}, [ip]		@ load last round key
157	.endm
158
	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks)
	 */
165ENTRY(ce_aes_ecb_encrypt)
166	push		{r4, lr}
167	ldr		r4, [sp, #8]
168	prepare_key	r2, r3
169.Lecbencloop3x:
170	subs		r4, r4, #3
171	bmi		.Lecbenc1x
172	vld1.8		{q0-q1}, [r1, :64]!
173	vld1.8		{q2}, [r1, :64]!
174	bl		aes_encrypt_3x
175	vst1.8		{q0-q1}, [r0, :64]!
176	vst1.8		{q2}, [r0, :64]!
177	b		.Lecbencloop3x
178.Lecbenc1x:
179	adds		r4, r4, #3
180	beq		.Lecbencout
181.Lecbencloop:
182	vld1.8		{q0}, [r1, :64]!
183	bl		aes_encrypt
184	vst1.8		{q0}, [r0, :64]!
185	subs		r4, r4, #1
186	bne		.Lecbencloop
187.Lecbencout:
188	pop		{r4, pc}
189ENDPROC(ce_aes_ecb_encrypt)
190
191ENTRY(ce_aes_ecb_decrypt)
192	push		{r4, lr}
193	ldr		r4, [sp, #8]
194	prepare_key	r2, r3
195.Lecbdecloop3x:
196	subs		r4, r4, #3
197	bmi		.Lecbdec1x
198	vld1.8		{q0-q1}, [r1, :64]!
199	vld1.8		{q2}, [r1, :64]!
200	bl		aes_decrypt_3x
201	vst1.8		{q0-q1}, [r0, :64]!
202	vst1.8		{q2}, [r0, :64]!
203	b		.Lecbdecloop3x
204.Lecbdec1x:
205	adds		r4, r4, #3
206	beq		.Lecbdecout
207.Lecbdecloop:
208	vld1.8		{q0}, [r1, :64]!
209	bl		aes_decrypt
210	vst1.8		{q0}, [r0, :64]!
211	subs		r4, r4, #1
212	bne		.Lecbdecloop
213.Lecbdecout:
214	pop		{r4, pc}
215ENDPROC(ce_aes_ecb_decrypt)
216
	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 */
223ENTRY(ce_aes_cbc_encrypt)
224	push		{r4-r6, lr}
225	ldrd		r4, r5, [sp, #16]
226	vld1.8		{q0}, [r5]
227	prepare_key	r2, r3
228.Lcbcencloop:
229	vld1.8		{q1}, [r1, :64]!	@ get next pt block
230	veor		q0, q0, q1		@ ..and xor with iv
231	bl		aes_encrypt
232	vst1.8		{q0}, [r0, :64]!
233	subs		r4, r4, #1
234	bne		.Lcbcencloop
235	vst1.8		{q0}, [r5]
236	pop		{r4-r6, pc}
237ENDPROC(ce_aes_cbc_encrypt)
238
239ENTRY(ce_aes_cbc_decrypt)
240	push		{r4-r6, lr}
241	ldrd		r4, r5, [sp, #16]
242	vld1.8		{q6}, [r5]		@ keep iv in q6
243	prepare_key	r2, r3
244.Lcbcdecloop3x:
245	subs		r4, r4, #3
246	bmi		.Lcbcdec1x
247	vld1.8		{q0-q1}, [r1, :64]!
248	vld1.8		{q2}, [r1, :64]!
249	vmov		q3, q0
250	vmov		q4, q1
251	vmov		q5, q2
252	bl		aes_decrypt_3x
253	veor		q0, q0, q6
254	veor		q1, q1, q3
255	veor		q2, q2, q4
256	vmov		q6, q5
257	vst1.8		{q0-q1}, [r0, :64]!
258	vst1.8		{q2}, [r0, :64]!
259	b		.Lcbcdecloop3x
260.Lcbcdec1x:
261	adds		r4, r4, #3
262	beq		.Lcbcdecout
263	vmov		q15, q14		@ preserve last round key
264.Lcbcdecloop:
265	vld1.8		{q0}, [r1, :64]!	@ get next ct block
266	veor		q14, q15, q6		@ combine prev ct with last key
267	vmov		q6, q0
268	bl		aes_decrypt
269	vst1.8		{q0}, [r0, :64]!
270	subs		r4, r4, #1
271	bne		.Lcbcdecloop
272.Lcbcdecout:
273	vst1.8		{q6}, [r5]		@ keep iv in q6
274	pop		{r4-r6, pc}
275ENDPROC(ce_aes_cbc_decrypt)
276
	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 ctr[])
	 */
281ENTRY(ce_aes_ctr_encrypt)
282	push		{r4-r6, lr}
283	ldrd		r4, r5, [sp, #16]
284	vld1.8		{q6}, [r5]		@ load ctr
285	prepare_key	r2, r3
286	vmov		r6, s27			@ keep swabbed ctr in r6
287	rev		r6, r6
288	cmn		r6, r4			@ 32 bit overflow?
289	bcs		.Lctrloop
290.Lctrloop3x:
291	subs		r4, r4, #3
292	bmi		.Lctr1x
293	add		r6, r6, #1
294	vmov		q0, q6
295	vmov		q1, q6
296	rev		ip, r6
297	add		r6, r6, #1
298	vmov		q2, q6
299	vmov		s7, ip
300	rev		ip, r6
301	add		r6, r6, #1
302	vmov		s11, ip
303	vld1.8		{q3-q4}, [r1, :64]!
304	vld1.8		{q5}, [r1, :64]!
305	bl		aes_encrypt_3x
306	veor		q0, q0, q3
307	veor		q1, q1, q4
308	veor		q2, q2, q5
309	rev		ip, r6
310	vst1.8		{q0-q1}, [r0, :64]!
311	vst1.8		{q2}, [r0, :64]!
312	vmov		s27, ip
313	b		.Lctrloop3x
314.Lctr1x:
315	adds		r4, r4, #3
316	beq		.Lctrout
317.Lctrloop:
318	vmov		q0, q6
319	bl		aes_encrypt
320	subs		r4, r4, #1
321	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
322	vld1.8		{q3}, [r1, :64]!
323	veor		q3, q0, q3
324	vst1.8		{q3}, [r0, :64]!
325
326	adds		r6, r6, #1		@ increment BE ctr
327	rev		ip, r6
328	vmov		s27, ip
329	bcs		.Lctrcarry
330	teq		r4, #0
331	bne		.Lctrloop
332.Lctrout:
333	vst1.8		{q6}, [r5]
334	pop		{r4-r6, pc}
335
336.Lctrhalfblock:
337	vld1.8		{d1}, [r1, :64]
338	veor		d0, d0, d1
339	vst1.8		{d0}, [r0, :64]
340	pop		{r4-r6, pc}
341
342.Lctrcarry:
343	.irp		sreg, s26, s25, s24
344	vmov		ip, \sreg		@ load next word of ctr
345	rev		ip, ip			@ ... to handle the carry
346	adds		ip, ip, #1
347	rev		ip, ip
348	vmov		\sreg, ip
349	bcc		0f
350	.endr
3510:	teq		r4, #0
352	beq		.Lctrout
353	b		.Lctrloop
354ENDPROC(ce_aes_ctr_encrypt)
355
	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		      int blocks, u8 iv[], u8 const rk2[], int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		      int blocks, u8 iv[], u8 const rk2[], int first)
	 */
362
363	.macro		next_tweak, out, in, const, tmp
364	vshr.s64	\tmp, \in, #63
365	vand		\tmp, \tmp, \const
366	vadd.u64	\out, \in, \in
367	vext.8		\tmp, \tmp, \tmp, #8
368	veor		\out, \out, \tmp
369	.endm
370
371	.align		3
372.Lxts_mul_x:
373	.quad		1, 0x87
374
375ce_aes_xts_init:
376	vldr		d14, .Lxts_mul_x
377	vldr		d15, .Lxts_mul_x + 8
378
379	ldrd		r4, r5, [sp, #16]	@ load args
380	ldr		r6, [sp, #28]
381	vld1.8		{q0}, [r5]		@ load iv
382	teq		r6, #1			@ start of a block?
383	bxne		lr
384
385	@ Encrypt the IV in q0 with the second AES key. This should only
386	@ be done at the start of a block.
387	ldr		r6, [sp, #24]		@ load AES key 2
388	prepare_key	r6, r3
389	add		ip, r6, #32		@ 3rd round key of key 2
390	b		.Laes_encrypt_tweak	@ tail call
391ENDPROC(ce_aes_xts_init)
392
393ENTRY(ce_aes_xts_encrypt)
394	push		{r4-r6, lr}
395
396	bl		ce_aes_xts_init		@ run shared prologue
397	prepare_key	r2, r3
398	vmov		q3, q0
399
400	teq		r6, #0			@ start of a block?
401	bne		.Lxtsenc3x
402
403.Lxtsencloop3x:
404	next_tweak	q3, q3, q7, q6
405.Lxtsenc3x:
406	subs		r4, r4, #3
407	bmi		.Lxtsenc1x
408	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
409	vld1.8		{q2}, [r1, :64]!
410	next_tweak	q4, q3, q7, q6
411	veor		q0, q0, q3
412	next_tweak	q5, q4, q7, q6
413	veor		q1, q1, q4
414	veor		q2, q2, q5
415	bl		aes_encrypt_3x
416	veor		q0, q0, q3
417	veor		q1, q1, q4
418	veor		q2, q2, q5
419	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
420	vst1.8		{q2}, [r0, :64]!
421	vmov		q3, q5
422	teq		r4, #0
423	beq		.Lxtsencout
424	b		.Lxtsencloop3x
425.Lxtsenc1x:
426	adds		r4, r4, #3
427	beq		.Lxtsencout
428.Lxtsencloop:
429	vld1.8		{q0}, [r1, :64]!
430	veor		q0, q0, q3
431	bl		aes_encrypt
432	veor		q0, q0, q3
433	vst1.8		{q0}, [r0, :64]!
434	subs		r4, r4, #1
435	beq		.Lxtsencout
436	next_tweak	q3, q3, q7, q6
437	b		.Lxtsencloop
438.Lxtsencout:
439	vst1.8		{q3}, [r5]
440	pop		{r4-r6, pc}
441ENDPROC(ce_aes_xts_encrypt)
442
443
444ENTRY(ce_aes_xts_decrypt)
445	push		{r4-r6, lr}
446
447	bl		ce_aes_xts_init		@ run shared prologue
448	prepare_key	r2, r3
449	vmov		q3, q0
450
451	teq		r6, #0			@ start of a block?
452	bne		.Lxtsdec3x
453
454.Lxtsdecloop3x:
455	next_tweak	q3, q3, q7, q6
456.Lxtsdec3x:
457	subs		r4, r4, #3
458	bmi		.Lxtsdec1x
459	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
460	vld1.8		{q2}, [r1, :64]!
461	next_tweak	q4, q3, q7, q6
462	veor		q0, q0, q3
463	next_tweak	q5, q4, q7, q6
464	veor		q1, q1, q4
465	veor		q2, q2, q5
466	bl		aes_decrypt_3x
467	veor		q0, q0, q3
468	veor		q1, q1, q4
469	veor		q2, q2, q5
470	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
471	vst1.8		{q2}, [r0, :64]!
472	vmov		q3, q5
473	teq		r4, #0
474	beq		.Lxtsdecout
475	b		.Lxtsdecloop3x
476.Lxtsdec1x:
477	adds		r4, r4, #3
478	beq		.Lxtsdecout
479.Lxtsdecloop:
480	vld1.8		{q0}, [r1, :64]!
481	veor		q0, q0, q3
482	add		ip, r2, #32		@ 3rd round key
483	bl		aes_decrypt
484	veor		q0, q0, q3
485	vst1.8		{q0}, [r0, :64]!
486	subs		r4, r4, #1
487	beq		.Lxtsdecout
488	next_tweak	q3, q3, q7, q6
489	b		.Lxtsdecloop
490.Lxtsdecout:
491	vst1.8		{q3}, [r5]
492	pop		{r4-r6, pc}
493ENDPROC(ce_aes_xts_decrypt)
494
495	/*
496	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
497	 *                             AES sbox substitution on each byte in
498	 *                             'input'
499	 */
500ENTRY(ce_aes_sub)
501	vdup.32		q1, r0
502	veor		q0, q0, q0
503	aese.8		q0, q1
504	vmov		r0, s0
505	bx		lr
506ENDPROC(ce_aes_sub)
507
508	/*
509	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
510	 *                                        operation on round key *src
511	 */
512ENTRY(ce_aes_invert)
513	vld1.8		{q0}, [r1]
514	aesimc.8	q0, q0
515	vst1.8		{q0}, [r0]
516	bx		lr
517ENDPROC(ce_aes_invert)
518