/* xref: /linux/arch/arm64/crypto/aes-modes.S (revision 32786fdc9506aeba98278c1844d4bfb766863832) */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* included by aes-ce.S and aes-neon.S */

13	.text
14	.align		4
15
16/*
17 * There are several ways to instantiate this code:
18 * - no interleave, all inline
19 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23 *
24 * Macros imported by this code:
25 * - enc_prepare	- setup NEON registers for encryption
26 * - dec_prepare	- setup NEON registers for decryption
27 * - enc_switch_key	- change to new key after having prepared for encryption
28 * - encrypt_block	- encrypt a single block
29 * - decrypt block	- decrypt a single block
30 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34 */
35
36#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
37#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
38#define FRAME_POP	ldp x29, x30, [sp],#16
39
40#if INTERLEAVE == 2
41
42aes_encrypt_block2x:
43	encrypt_block2x	v0, v1, w3, x2, x6, w7
44	ret
45ENDPROC(aes_encrypt_block2x)
46
47aes_decrypt_block2x:
48	decrypt_block2x	v0, v1, w3, x2, x6, w7
49	ret
50ENDPROC(aes_decrypt_block2x)
51
52#elif INTERLEAVE == 4
53
54aes_encrypt_block4x:
55	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
56	ret
57ENDPROC(aes_encrypt_block4x)
58
59aes_decrypt_block4x:
60	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
61	ret
62ENDPROC(aes_decrypt_block4x)
63
64#else
65#error INTERLEAVE should equal 2 or 4
66#endif
67
68	.macro		do_encrypt_block2x
69	bl		aes_encrypt_block2x
70	.endm
71
72	.macro		do_decrypt_block2x
73	bl		aes_decrypt_block2x
74	.endm
75
76	.macro		do_encrypt_block4x
77	bl		aes_encrypt_block4x
78	.endm
79
80	.macro		do_decrypt_block4x
81	bl		aes_decrypt_block4x
82	.endm
83
84#else
85#define FRAME_PUSH
86#define FRAME_POP
87
88	.macro		do_encrypt_block2x
89	encrypt_block2x	v0, v1, w3, x2, x6, w7
90	.endm
91
92	.macro		do_decrypt_block2x
93	decrypt_block2x	v0, v1, w3, x2, x6, w7
94	.endm
95
96	.macro		do_encrypt_block4x
97	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
98	.endm
99
100	.macro		do_decrypt_block4x
101	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
102	.endm
103
104#endif
105
106	/*
107	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108	 *		   int blocks, int first)
109	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110	 *		   int blocks, int first)
111	 */
112
113AES_ENTRY(aes_ecb_encrypt)
114	FRAME_PUSH
115	cbz		w5, .LecbencloopNx
116
117	enc_prepare	w3, x2, x5
118
119.LecbencloopNx:
120#if INTERLEAVE >= 2
121	subs		w4, w4, #INTERLEAVE
122	bmi		.Lecbenc1x
123#if INTERLEAVE == 2
124	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
125	do_encrypt_block2x
126	st1		{v0.16b-v1.16b}, [x0], #32
127#else
128	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
129	do_encrypt_block4x
130	st1		{v0.16b-v3.16b}, [x0], #64
131#endif
132	b		.LecbencloopNx
133.Lecbenc1x:
134	adds		w4, w4, #INTERLEAVE
135	beq		.Lecbencout
136#endif
137.Lecbencloop:
138	ld1		{v0.16b}, [x1], #16		/* get next pt block */
139	encrypt_block	v0, w3, x2, x5, w6
140	st1		{v0.16b}, [x0], #16
141	subs		w4, w4, #1
142	bne		.Lecbencloop
143.Lecbencout:
144	FRAME_POP
145	ret
146AES_ENDPROC(aes_ecb_encrypt)


/* ECB decryption: same structure as aes_ecb_encrypt, using the inverse
 * cipher (see prototype comment above aes_ecb_encrypt). */
AES_ENTRY(aes_ecb_decrypt)
	FRAME_PUSH
	cbz		w5, .LecbdecloopNx	/* keys already set up? */

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbdec1x		/* fewer than INTERLEAVE left */
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #INTERLEAVE	/* undo subs; w4 = remainder */
	beq		.Lecbdecout
#endif
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)
183
184
185	/*
186	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187	 *		   int blocks, u8 iv[], int first)
188	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189	 *		   int blocks, u8 iv[], int first)
190	 */
191
192AES_ENTRY(aes_cbc_encrypt)
193	cbz		w6, .Lcbcencloop
194
195	ld1		{v0.16b}, [x5]			/* get iv */
196	enc_prepare	w3, x2, x5
197
198.Lcbcencloop:
199	ld1		{v1.16b}, [x1], #16		/* get next pt block */
200	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
201	encrypt_block	v0, w3, x2, x5, w6
202	st1		{v0.16b}, [x0], #16
203	subs		w4, w4, #1
204	bne		.Lcbcencloop
205	ret
206AES_ENDPROC(aes_cbc_encrypt)


/*
 * CBC decryption (see prototype comment above aes_cbc_encrypt).  Unlike
 * encryption this parallelizes: each pt block is dec(ct[i]) ^ ct[i-1],
 * so INTERLEAVE blocks can be decrypted at once.  v7 carries the running
 * "iv" (previous ciphertext block) across iterations.
 */
AES_ENTRY(aes_cbc_decrypt)
	FRAME_PUSH
	cbz		w6, .LcbcdecloopNx	/* iv/keys already loaded? */

	ld1		{v7.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x5

.LcbcdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lcbcdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	mov		v2.16b, v0.16b			/* stash ct copies: */
	mov		v3.16b, v1.16b			/* ..needed as next ivs */
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b		/* xor with prev iv */
	eor		v1.16b, v1.16b, v2.16b		/* xor with ct[0] */
	mov		v7.16b, v3.16b			/* ct[1] is next iv */
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	mov		v4.16b, v0.16b			/* stash ct copies: */
	mov		v5.16b, v1.16b			/* ..needed as next ivs */
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	sub		x1, x1, #16			/* rewind to last ct */
	eor		v0.16b, v0.16b, v7.16b		/* xor with prev iv */
	eor		v1.16b, v1.16b, v4.16b
	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #INTERLEAVE	/* undo subs; w4 = remainder */
	beq		.Lcbcdecout
#endif
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x5, w6
	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
	mov		v7.16b, v1.16b			/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_cbc_decrypt)
261
262
263	/*
264	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
265	 *		   int blocks, u8 ctr[], int first)
266	 */
267
268AES_ENTRY(aes_ctr_encrypt)
269	FRAME_PUSH
270	cbnz		w6, .Lctrfirst		/* 1st time around? */
271	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
272	rev		x5, x5
273#if INTERLEAVE >= 2
274	cmn		w5, w4			/* 32 bit overflow? */
275	bcs		.Lctrinc
276	add		x5, x5, #1		/* increment BE ctr */
277	b		.LctrincNx
278#else
279	b		.Lctrinc
280#endif
281.Lctrfirst:
282	enc_prepare	w3, x2, x6
283	ld1		{v4.16b}, [x5]
284	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
285	rev		x5, x5
286#if INTERLEAVE >= 2
287	cmn		w5, w4			/* 32 bit overflow? */
288	bcs		.Lctrloop
289.LctrloopNx:
290	subs		w4, w4, #INTERLEAVE
291	bmi		.Lctr1x
292#if INTERLEAVE == 2
293	mov		v0.8b, v4.8b
294	mov		v1.8b, v4.8b
295	rev		x7, x5
296	add		x5, x5, #1
297	ins		v0.d[1], x7
298	rev		x7, x5
299	add		x5, x5, #1
300	ins		v1.d[1], x7
301	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
302	do_encrypt_block2x
303	eor		v0.16b, v0.16b, v2.16b
304	eor		v1.16b, v1.16b, v3.16b
305	st1		{v0.16b-v1.16b}, [x0], #32
306#else
307	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
308	dup		v7.4s, w5
309	mov		v0.16b, v4.16b
310	add		v7.4s, v7.4s, v8.4s
311	mov		v1.16b, v4.16b
312	rev32		v8.16b, v7.16b
313	mov		v2.16b, v4.16b
314	mov		v3.16b, v4.16b
315	mov		v1.s[3], v8.s[0]
316	mov		v2.s[3], v8.s[1]
317	mov		v3.s[3], v8.s[2]
318	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
319	do_encrypt_block4x
320	eor		v0.16b, v5.16b, v0.16b
321	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
322	eor		v1.16b, v6.16b, v1.16b
323	eor		v2.16b, v7.16b, v2.16b
324	eor		v3.16b, v5.16b, v3.16b
325	st1		{v0.16b-v3.16b}, [x0], #64
326	add		x5, x5, #INTERLEAVE
327#endif
328	cbz		w4, .LctroutNx
329.LctrincNx:
330	rev		x7, x5
331	ins		v4.d[1], x7
332	b		.LctrloopNx
333.LctroutNx:
334	sub		x5, x5, #1
335	rev		x7, x5
336	ins		v4.d[1], x7
337	b		.Lctrout
338.Lctr1x:
339	adds		w4, w4, #INTERLEAVE
340	beq		.Lctrout
341#endif
342.Lctrloop:
343	mov		v0.16b, v4.16b
344	encrypt_block	v0, w3, x2, x6, w7
345	subs		w4, w4, #1
346	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
347	ld1		{v3.16b}, [x1], #16
348	eor		v3.16b, v0.16b, v3.16b
349	st1		{v3.16b}, [x0], #16
350	beq		.Lctrout
351.Lctrinc:
352	adds		x5, x5, #1		/* increment BE ctr */
353	rev		x7, x5
354	ins		v4.d[1], x7
355	bcc		.Lctrloop		/* no overflow? */
356	umov		x7, v4.d[0]		/* load upper word of ctr  */
357	rev		x7, x7			/* ... to handle the carry */
358	add		x7, x7, #1
359	rev		x7, x7
360	ins		v4.d[0], x7
361	b		.Lctrloop
362.Lctrhalfblock:
363	ld1		{v3.8b}, [x1]
364	eor		v3.8b, v0.8b, v3.8b
365	st1		{v3.8b}, [x0]
366.Lctrout:
367	FRAME_POP
368	ret
369AES_ENDPROC(aes_ctr_encrypt)
370	.ltorg
371
372
373	/*
374	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
375	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
376	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
377	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
378	 */
379
380	.macro		next_tweak, out, in, const, tmp
381	sshr		\tmp\().2d,  \in\().2d,   #63
382	and		\tmp\().16b, \tmp\().16b, \const\().16b
383	add		\out\().2d,  \in\().2d,   \in\().2d
384	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
385	eor		\out\().16b, \out\().16b, \tmp\().16b
386	.endm
387
388.Lxts_mul_x:
389CPU_LE(	.quad		1, 0x87		)
390CPU_BE(	.quad		0x87, 1		)

/*
 * XTS encryption (prototype above).  x2 = data key rk1, x5 = tweak key rk2,
 * x6 = iv.  v4 holds the current tweak; on the first call it is computed by
 * encrypting the iv with rk2.
 */
AES_ENTRY(aes_xts_encrypt)
	FRAME_PUSH
	cbz		w7, .LxtsencloopNx	/* first call? */

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	enc_switch_key	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsencNx

.LxtsencloopNx:
	ldr		q7, .Lxts_mul_x		/* v7 may have been clobbered */
	next_tweak	v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b		/* pre-whiten */
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b		/* post-whiten */
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsencoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsencNx
.LxtsencoutNx:
	mov		v4.16b, v5.16b		/* carry last tweak out */
	b		.Lxtsencout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b		/* pre-whiten */
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* NB: overwrites mul const */
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b		/* post-whiten */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* carry last tweak out */
	cbz		w4, .Lxtsencout
	b		.LxtsencloopNx
#endif
.Lxtsenc1x:
	adds		w4, w4, #INTERLEAVE	/* undo subs; w4 = remainder */
	beq		.Lxtsencout
#endif
.Lxtsencloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b		/* pre-whiten */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b		/* post-whiten */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsencout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsencloop
.Lxtsencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)

/*
 * XTS decryption: identical tweak schedule to aes_xts_encrypt (the tweak is
 * always *encrypted* from the iv with rk2), but the data blocks go through
 * the inverse cipher with rk1.
 */
AES_ENTRY(aes_xts_decrypt)
	FRAME_PUSH
	cbz		w7, .LxtsdecloopNx	/* first call? */

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	dec_prepare	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsdecNx

.LxtsdecloopNx:
	ldr		q7, .Lxts_mul_x		/* v7 may have been clobbered */
	next_tweak	v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b		/* pre-whiten */
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b		/* post-whiten */
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsdecoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsdecNx
.LxtsdecoutNx:
	mov		v4.16b, v5.16b		/* carry last tweak out */
	b		.Lxtsdecout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b		/* pre-whiten */
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* NB: overwrites mul const */
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b		/* post-whiten */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* carry last tweak out */
	cbz		w4, .Lxtsdecout
	b		.LxtsdecloopNx
#endif
.Lxtsdec1x:
	adds		w4, w4, #INTERLEAVE	/* undo subs; w4 = remainder */
	beq		.Lxtsdecout
#endif
.Lxtsdecloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b		/* pre-whiten */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b		/* post-whiten */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsdecout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)
