xref: /linux/arch/arm64/crypto/aes-modes.S (revision 2b64b2ed277ff23e785fbdb65098ee7e1252d64f)
1/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13	.text
14	.align		4
15
16aes_encrypt_block4x:
	/*
	 * Out-of-line instance of the encrypt_block4x macro, reached with
	 * "bl" from the mode routines in this file so that the 4-way code
	 * is emitted only once.
	 *
	 * In/out: v0-v3 - the four blocks, transformed in place
	 * In:     w3    - number of rounds, x2 - round key pointer
	 * x8 and w7 are handed to the macro as its remaining operands
	 * (presumably scratch registers - the macro is defined outside this
	 * file, confirm there). Callers must have saved x30 before the "bl".
	 */
17	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
18	ret
19ENDPROC(aes_encrypt_block4x)
20
21aes_decrypt_block4x:
	/*
	 * Out-of-line instance of the decrypt_block4x macro; decryption
	 * counterpart of aes_encrypt_block4x with the same register usage:
	 * v0-v3 are transformed in place, w3 = rounds, x2 = round keys,
	 * x8/w7 are passed through to the macro (presumably scratch - the
	 * macro is defined outside this file).
	 */
22	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
23	ret
24ENDPROC(aes_decrypt_block4x)
25
26	/*
27	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
28	 *		   int blocks)
29	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
30	 *		   int blocks)
31	 */
32
33AES_ENTRY(aes_ecb_encrypt)
	/*
	 * ECB encryption.
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks.
	 * Processes four blocks per iteration while at least four remain,
	 * then finishes the remaining 0-3 blocks one at a time.
	 */
34	stp		x29, x30, [sp, #-16]!	/* "bl" below overwrites x30 */
35	mov		x29, sp
36
37	enc_prepare	w3, x2, x5
38
39.LecbencloopNx:
40	subs		w4, w4, #4		/* fewer than 4 blocks left? */
41	bmi		.Lecbenc1x
42	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
43	bl		aes_encrypt_block4x
44	st1		{v0.16b-v3.16b}, [x0], #64
45	b		.LecbencloopNx
46.Lecbenc1x:
47	adds		w4, w4, #4		/* undo the bias; w4 = 0..3 left */
48	beq		.Lecbencout
49.Lecbencloop:
50	ld1		{v0.16b}, [x1], #16		/* get next pt block */
51	encrypt_block	v0, w3, x2, x5, w6
52	st1		{v0.16b}, [x0], #16
53	subs		w4, w4, #1
54	bne		.Lecbencloop
55.Lecbencout:
56	ldp		x29, x30, [sp], #16
57	ret
58AES_ENDPROC(aes_ecb_encrypt)
59
60
61AES_ENTRY(aes_ecb_decrypt)
	/*
	 * ECB decryption.
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks.
	 * Same structure as aes_ecb_encrypt: a 4-blocks-per-iteration loop
	 * followed by a single-block loop for the remaining 0-3 blocks.
	 */
62	stp		x29, x30, [sp, #-16]!	/* "bl" below overwrites x30 */
63	mov		x29, sp
64
65	dec_prepare	w3, x2, x5
66
67.LecbdecloopNx:
68	subs		w4, w4, #4		/* fewer than 4 blocks left? */
69	bmi		.Lecbdec1x
70	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
71	bl		aes_decrypt_block4x
72	st1		{v0.16b-v3.16b}, [x0], #64
73	b		.LecbdecloopNx
74.Lecbdec1x:
75	adds		w4, w4, #4		/* undo the bias; w4 = 0..3 left */
76	beq		.Lecbdecout
77.Lecbdecloop:
78	ld1		{v0.16b}, [x1], #16		/* get next ct block */
79	decrypt_block	v0, w3, x2, x5, w6
80	st1		{v0.16b}, [x0], #16
81	subs		w4, w4, #1
82	bne		.Lecbdecloop
83.Lecbdecout:
84	ldp		x29, x30, [sp], #16
85	ret
86AES_ENDPROC(aes_ecb_decrypt)
87
88
89	/*
90	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
91	 *		   int blocks, u8 iv[])
92	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
93	 *		   int blocks, u8 iv[])
94	 */
95
96AES_ENTRY(aes_cbc_encrypt)
	/*
	 * CBC encryption.
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv.
	 * Each block depends on the previous ciphertext block, so the
	 * "4x" loop only batches the loads/stores; the four encryptions
	 * are still chained one after another. No "bl" is issued here, so
	 * no stack frame is set up.
	 * v4 carries the chaining value and is written back to [x5] on exit.
	 */
97	ld1		{v4.16b}, [x5]			/* get iv */
98	enc_prepare	w3, x2, x6
99
100.Lcbcencloop4x:
101	subs		w4, w4, #4		/* fewer than 4 blocks left? */
102	bmi		.Lcbcenc1x
103	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
104	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
105	encrypt_block	v0, w3, x2, x6, w7
106	eor		v1.16b, v1.16b, v0.16b		/* chain: xor with previous ct */
107	encrypt_block	v1, w3, x2, x6, w7
108	eor		v2.16b, v2.16b, v1.16b
109	encrypt_block	v2, w3, x2, x6, w7
110	eor		v3.16b, v3.16b, v2.16b
111	encrypt_block	v3, w3, x2, x6, w7
112	st1		{v0.16b-v3.16b}, [x0], #64
113	mov		v4.16b, v3.16b			/* last ct is the next iv */
114	b		.Lcbcencloop4x
115.Lcbcenc1x:
116	adds		w4, w4, #4		/* undo the bias; w4 = 0..3 left */
117	beq		.Lcbcencout
118.Lcbcencloop:
119	ld1		{v0.16b}, [x1], #16		/* get next pt block */
120	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
121	encrypt_block	v4, w3, x2, x6, w7	/* v4 = ct = next iv */
122	st1		{v4.16b}, [x0], #16
123	subs		w4, w4, #1
124	bne		.Lcbcencloop
125.Lcbcencout:
126	st1		{v4.16b}, [x5]			/* return iv */
127	ret
128AES_ENDPROC(aes_cbc_encrypt)
129
130
131AES_ENTRY(aes_cbc_decrypt)
	/*
	 * CBC decryption.
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv.
	 * Four blocks are decrypted per iteration via aes_decrypt_block4x.
	 * v7 holds the chaining value (iv, then the last ciphertext block
	 * consumed) and is written back to [x5] on exit.
	 */
132	stp		x29, x30, [sp, #-16]!	/* "bl" below overwrites x30 */
133	mov		x29, sp
134
135	ld1		{v7.16b}, [x5]			/* get iv */
136	dec_prepare	w3, x2, x6
137
138.LcbcdecloopNx:
139	subs		w4, w4, #4		/* fewer than 4 blocks left? */
140	bmi		.Lcbcdec1x
141	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
142	mov		v4.16b, v0.16b		/* keep ct 0..2: v0-v3 are */
143	mov		v5.16b, v1.16b		/* overwritten by the decryption */
144	mov		v6.16b, v2.16b
145	bl		aes_decrypt_block4x
146	sub		x1, x1, #16		/* step back to the 4th ct block */
147	eor		v0.16b, v0.16b, v7.16b	/* pt0 = D(ct0) ^ iv */
148	eor		v1.16b, v1.16b, v4.16b	/* pt1 = D(ct1) ^ ct0 */
149	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
	/* the reload above happens before the store below, so the 4th */
	/* ct block is read before [x0] is written */
150	eor		v2.16b, v2.16b, v5.16b	/* pt2 = D(ct2) ^ ct1 */
151	eor		v3.16b, v3.16b, v6.16b	/* pt3 = D(ct3) ^ ct2 */
152	st1		{v0.16b-v3.16b}, [x0], #64
153	b		.LcbcdecloopNx
154.Lcbcdec1x:
155	adds		w4, w4, #4		/* undo the bias; w4 = 0..3 left */
156	beq		.Lcbcdecout
157.Lcbcdecloop:
158	ld1		{v1.16b}, [x1], #16		/* get next ct block */
159	mov		v0.16b, v1.16b			/* ...and copy to v0 */
160	decrypt_block	v0, w3, x2, x6, w7
161	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
162	mov		v7.16b, v1.16b			/* ct is next iv */
163	st1		{v0.16b}, [x0], #16
164	subs		w4, w4, #1
165	bne		.Lcbcdecloop
166.Lcbcdecout:
167	st1		{v7.16b}, [x5]			/* return iv */
168	ldp		x29, x30, [sp], #16
169	ret
170AES_ENDPROC(aes_cbc_decrypt)
171
172
173	/*
174	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
175	 *		       int rounds, int bytes, u8 const iv[])
176	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
177	 *		       int rounds, int bytes, u8 const iv[])
178	 */
179
180AES_ENTRY(aes_cbc_cts_encrypt)
	/*
	 * CBC encryption of the final two blocks with ciphertext stealing.
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv.
	 * The code reads in[0..bytes) as one full block followed by a tail
	 * of n = bytes - 16 bytes; the permute-table indexing below is only
	 * meaningful for 1 <= n <= 16 (NOTE(review): callers are expected
	 * to guarantee 16 < bytes <= 32 - confirm in the glue code).
	 * The iv is only read; nothing is written back to [x5].
	 */
181	adr_l		x8, .Lcts_permute_table
	/*
	 * "bytes" is an int, so only w4 is defined on entry (AAPCS64 leaves
	 * the upper half of x4 unspecified). Compute n with a 32-bit
	 * subtract, which zero-extends into x4, before x4 is used for the
	 * 64-bit address arithmetic below.
	 */
182	sub		w4, w4, #16
183	add		x9, x8, #32
184	add		x8, x8, x4
185	sub		x9, x9, x4
186	ld1		{v3.16b}, [x8]		/* table[n..]: move low n bytes to the top */
187	ld1		{v4.16b}, [x9]		/* table[32-n..]: move top n bytes to the bottom, zero the rest */
188
189	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
190	ld1		{v1.16b}, [x1]		/* last 16 input bytes; tail is its top n bytes */
191
192	ld1		{v5.16b}, [x5]			/* get iv */
193	enc_prepare	w3, x2, x6
194
195	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
196	tbl		v1.16b, {v1.16b}, v4.16b	/* v1 = tail, zero padded */
197	encrypt_block	v0, w3, x2, x6, w7
198
199	eor		v1.16b, v1.16b, v0.16b	/* chain padded tail with first ct */
200	tbl		v0.16b, {v0.16b}, v3.16b	/* first n ct bytes -> top of v0 */
201	encrypt_block	v1, w3, x2, x6, w7
202
203	add		x4, x0, x4
204	st1		{v0.16b}, [x4]			/* overlapping stores */
205	st1		{v1.16b}, [x0]		/* full block last: overwrites the low part of the store above */
206	ret
207AES_ENDPROC(aes_cbc_cts_encrypt)
208
209AES_ENTRY(aes_cbc_cts_decrypt)
	/*
	 * CBC decryption of the final two blocks with ciphertext stealing.
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv.
	 * Input is one full ciphertext block followed by n = bytes - 16
	 * tail bytes; the permute-table indexing is only meaningful for
	 * 1 <= n <= 16 (NOTE(review): callers are expected to guarantee
	 * 16 < bytes <= 32 - confirm in the glue code).
	 * The iv is only read; nothing is written back to [x5].
	 */
210	adr_l		x8, .Lcts_permute_table
	/*
	 * "bytes" is an int, so only w4 is defined on entry (AAPCS64 leaves
	 * the upper half of x4 unspecified). Compute n with a 32-bit
	 * subtract, which zero-extends into x4, before x4 is used for the
	 * 64-bit address arithmetic below.
	 */
211	sub		w4, w4, #16
212	add		x9, x8, #32
213	add		x8, x8, x4
214	sub		x9, x9, x4
215	ld1		{v3.16b}, [x8]		/* table[n..]: move low n bytes to the top */
216	ld1		{v4.16b}, [x9]		/* table[32-n..]: move top n bytes to the bottom */
217
218	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
219	ld1		{v1.16b}, [x1]		/* last 16 input bytes; tail is its top n bytes */
220
221	ld1		{v5.16b}, [x5]			/* get iv */
222	dec_prepare	w3, x2, x6
223
224	tbl		v2.16b, {v1.16b}, v4.16b	/* v2 = ct tail, zero padded */
225	decrypt_block	v0, w3, x2, x6, w7
226	eor		v2.16b, v2.16b, v0.16b	/* low n bytes = final pt tail */
227
228	tbx		v0.16b, {v1.16b}, v4.16b	/* ct tail over low n bytes; upper bytes of v0 are kept */
229	tbl		v2.16b, {v2.16b}, v3.16b	/* pt tail -> top of v2 for the store */
230	decrypt_block	v0, w3, x2, x6, w7
231	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
232
233	add		x4, x0, x4
234	st1		{v2.16b}, [x4]			/* overlapping stores */
235	st1		{v0.16b}, [x0]		/* full block last: overwrites the low part of the store above */
236	ret
237AES_ENDPROC(aes_cbc_cts_decrypt)
238
239	.section	".rodata", "a"
240	.align		6
	/*
	 * 48-byte index table for the tbl/tbx instructions in the CBC-CTS
	 * routines: 16 x 0xff, the identity indices 0..15, 16 x 0xff.
	 * Those routines load 16-byte windows starting at offset n and at
	 * offset 32 - n (n = bytes - 16). An index of 0xff is out of range
	 * for a one-register table, so tbl writes zero for it and tbx
	 * leaves the destination byte unchanged.
	 */
241.Lcts_permute_table:
242	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
243	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
244	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
245	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
246	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
247	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
248	.previous
249
250
251	/*
252	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
253	 *		   int blocks, u8 ctr[])
254	 */
255
256AES_ENTRY(aes_ctr_encrypt)
	/*
	 * CTR mode (encryption and decryption are the same operation).
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr.
	 * v4 holds the counter block as stored in memory; x6 holds the
	 * byte-reversed copy of its second doubleword so it can be
	 * incremented with ordinary integer adds.
	 * The 4-way loop only patches the last 32-bit word of the counter,
	 * so it is skipped entirely when adding "blocks" to that word would
	 * carry; the one-block loop propagates carries in full.
	 * A negative block count in the one-block loop stores one raw
	 * keystream block to [x0] without reading [x1] (the "tail block").
	 * The next counter value is written back to [x5] on exit.
	 */
257	stp		x29, x30, [sp, #-16]!	/* "bl" below overwrites x30 */
258	mov		x29, sp
259
260	enc_prepare	w3, x2, x6
261	ld1		{v4.16b}, [x5]
262
263	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
264	rev		x6, x6
265	cmn		w6, w4			/* 32 bit overflow? */
266	bcs		.Lctrloop
267.LctrloopNx:
268	subs		w4, w4, #4		/* fewer than 4 blocks left? */
269	bmi		.Lctr1x
	/* v0 = ctr, v1..v3 = ctr+1..ctr+3 (last 32-bit word replaced) */
270	add		w7, w6, #1
271	mov		v0.16b, v4.16b
272	add		w8, w6, #2
273	mov		v1.16b, v4.16b
274	add		w9, w6, #3
275	mov		v2.16b, v4.16b
276	rev		w7, w7
277	mov		v3.16b, v4.16b
278	rev		w8, w8
279	mov		v1.s[3], w7
280	rev		w9, w9
281	mov		v2.s[3], w8
282	mov		v3.s[3], w9
283	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
284	bl		aes_encrypt_block4x
285	eor		v0.16b, v5.16b, v0.16b
286	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
287	eor		v1.16b, v6.16b, v1.16b
288	eor		v2.16b, v7.16b, v2.16b
289	eor		v3.16b, v5.16b, v3.16b
290	st1		{v0.16b-v3.16b}, [x0], #64
291	add		x6, x6, #4		/* advance ctr by the 4 blocks used */
292	rev		x7, x6
293	ins		v4.d[1], x7
294	cbz		w4, .Lctrout		/* nothing left: done */
295	b		.LctrloopNx
296.Lctr1x:
297	adds		w4, w4, #4		/* undo the bias */
298	beq		.Lctrout
299.Lctrloop:
300	mov		v0.16b, v4.16b
301	encrypt_block	v0, w3, x2, x8, w7	/* v0 = keystream block */
302
303	adds		x6, x6, #1		/* increment BE ctr */
304	rev		x7, x6
305	ins		v4.d[1], x7
306	bcs		.Lctrcarry		/* overflow? */
307
308.Lctrcarrydone:
309	subs		w4, w4, #1
310	bmi		.Lctrtailblock		/* blocks <0 means tail block */
311	ld1		{v3.16b}, [x1], #16
312	eor		v3.16b, v0.16b, v3.16b
313	st1		{v3.16b}, [x0], #16
314	bne		.Lctrloop		/* flags still from the subs above */
315
316.Lctrout:
317	st1		{v4.16b}, [x5]		/* return next CTR value */
318	ldp		x29, x30, [sp], #16
319	ret
320
321.Lctrtailblock:
322	st1		{v0.16b}, [x0]		/* keystream only, no input is read */
323	b		.Lctrout
324
325.Lctrcarry:
326	umov		x7, v4.d[0]		/* load upper word of ctr  */
327	rev		x7, x7			/* ... to handle the carry */
328	add		x7, x7, #1
329	rev		x7, x7
330	ins		v4.d[0], x7
331	b		.Lctrcarrydone
332AES_ENDPROC(aes_ctr_encrypt)
333
334
335	/*
336	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
337	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
338	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
339	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
340	 */
341
	/*
	 * next_tweak out, in, tmp
	 * Derive the next XTS tweak from \in into \out: the 128-bit value is
	 * doubled (each 64-bit lane is added to itself), then the bit
	 * shifted out of each lane is folded back in via xtsmask - 1 into
	 * the other lane for the low half's top bit, 0x87 into the low lane
	 * for the high half's top bit. \tmp is clobbered. \out may be the
	 * same register as \in. Requires xtsmask to hold the constant built
	 * by xts_load_mask.
	 */
342	.macro		next_tweak, out, in, tmp
343	sshr		\tmp\().2d,  \in\().2d,   #63	/* all-ones in lanes whose top bit is set */
344	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
345	add		\out\().2d,  \in\().2d,   \in\().2d	/* shift each lane left by 1 */
346	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	/* swap the two 64-bit halves */
347	eor		\out\().16b, \out\().16b, \tmp\().16b
348	.endm
349
	/*
	 * xts_load_mask tmp
	 * Build the constant used by next_tweak in xtsmask: 32-bit lanes
	 * { 1, 0, 0x87, 0 }, i.e. 1 in the low doubleword and 0x87 in the
	 * high doubleword. \tmp is clobbered. xtsmask is a register alias
	 * defined outside this file.
	 */
350	.macro		xts_load_mask, tmp
351	movi		xtsmask.2s, #0x1	/* 64-bit form: upper half is zeroed */
352	movi		\tmp\().2s, #0x87
353	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s	/* keep even lanes of both */
354	.endm
355
356AES_ENTRY(aes_xts_encrypt)
	/*
	 * XTS encryption.
	 * x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2,
	 * x6 = iv, w7 = first.
	 * v4 holds the current tweak. When "first" is non-zero the iv is
	 * first encrypted with rk2 to form the initial tweak; otherwise
	 * [x6] is taken to be the tweak of the last block already processed
	 * and is advanced before use. On exit the tweak of the last block
	 * processed is stored back to [x6].
	 * v8 is used as the next_tweak temporary.
	 */
357	stp		x29, x30, [sp, #-16]!	/* "bl" below overwrites x30 */
358	mov		x29, sp
359
360	ld1		{v4.16b}, [x6]
361	xts_load_mask	v8
362	cbz		w7, .Lxtsencnotfirst
363
364	enc_prepare	w3, x5, x8
365	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
366	enc_switch_key	w3, x2, x8		/* continue with the data key rk1 */
367	b		.LxtsencNx
368
369.Lxtsencnotfirst:
370	enc_prepare	w3, x2, x8
371.LxtsencloopNx:
372	next_tweak	v4, v4, v8
373.LxtsencNx:
374	subs		w4, w4, #4		/* fewer than 4 blocks left? */
375	bmi		.Lxtsenc1x
376	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	/* tweaks for the four blocks: v4, v5, v6, v7 */
377	next_tweak	v5, v4, v8
378	eor		v0.16b, v0.16b, v4.16b
379	next_tweak	v6, v5, v8
380	eor		v1.16b, v1.16b, v5.16b
381	eor		v2.16b, v2.16b, v6.16b
382	next_tweak	v7, v6, v8
383	eor		v3.16b, v3.16b, v7.16b
384	bl		aes_encrypt_block4x
	/* xor each result with its tweak again */
385	eor		v3.16b, v3.16b, v7.16b
386	eor		v0.16b, v0.16b, v4.16b
387	eor		v1.16b, v1.16b, v5.16b
388	eor		v2.16b, v2.16b, v6.16b
389	st1		{v0.16b-v3.16b}, [x0], #64
390	mov		v4.16b, v7.16b		/* last tweak used */
391	cbz		w4, .Lxtsencout
392	xts_reload_mask	v8		/* macro defined outside this file */
393	b		.LxtsencloopNx
394.Lxtsenc1x:
395	adds		w4, w4, #4		/* undo the bias; w4 = 0..3 left */
396	beq		.Lxtsencout
397.Lxtsencloop:
398	ld1		{v1.16b}, [x1], #16
399	eor		v0.16b, v1.16b, v4.16b
400	encrypt_block	v0, w3, x2, x8, w7
401	eor		v0.16b, v0.16b, v4.16b
402	st1		{v0.16b}, [x0], #16
403	subs		w4, w4, #1
404	beq		.Lxtsencout		/* leave v4 = tweak of the last block */
405	next_tweak	v4, v4, v8
406	b		.Lxtsencloop
407.Lxtsencout:
408	st1		{v4.16b}, [x6]
409	ldp		x29, x30, [sp], #16
410	ret
411AES_ENDPROC(aes_xts_encrypt)
412
413
414AES_ENTRY(aes_xts_decrypt)
	/*
	 * XTS decryption.
	 * x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2,
	 * x6 = iv, w7 = first.
	 * Mirrors aes_xts_encrypt. Note that the initial tweak is still
	 * produced by *encrypting* the iv with rk2; only the data blocks
	 * use the decryption key schedule (dec_prepare with rk1).
	 * On exit the tweak of the last block processed is stored to [x6].
	 * v8 is used as the next_tweak temporary.
	 */
415	stp		x29, x30, [sp, #-16]!	/* "bl" below overwrites x30 */
416	mov		x29, sp
417
418	ld1		{v4.16b}, [x6]
419	xts_load_mask	v8
420	cbz		w7, .Lxtsdecnotfirst
421
422	enc_prepare	w3, x5, x8
423	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
424	dec_prepare	w3, x2, x8		/* data blocks use rk1 for decryption */
425	b		.LxtsdecNx
426
427.Lxtsdecnotfirst:
428	dec_prepare	w3, x2, x8
429.LxtsdecloopNx:
430	next_tweak	v4, v4, v8
431.LxtsdecNx:
432	subs		w4, w4, #4		/* fewer than 4 blocks left? */
433	bmi		.Lxtsdec1x
434	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	/* tweaks for the four blocks: v4, v5, v6, v7 */
435	next_tweak	v5, v4, v8
436	eor		v0.16b, v0.16b, v4.16b
437	next_tweak	v6, v5, v8
438	eor		v1.16b, v1.16b, v5.16b
439	eor		v2.16b, v2.16b, v6.16b
440	next_tweak	v7, v6, v8
441	eor		v3.16b, v3.16b, v7.16b
442	bl		aes_decrypt_block4x
	/* xor each result with its tweak again */
443	eor		v3.16b, v3.16b, v7.16b
444	eor		v0.16b, v0.16b, v4.16b
445	eor		v1.16b, v1.16b, v5.16b
446	eor		v2.16b, v2.16b, v6.16b
447	st1		{v0.16b-v3.16b}, [x0], #64
448	mov		v4.16b, v7.16b		/* last tweak used */
449	cbz		w4, .Lxtsdecout
450	xts_reload_mask	v8		/* macro defined outside this file */
451	b		.LxtsdecloopNx
452.Lxtsdec1x:
453	adds		w4, w4, #4		/* undo the bias; w4 = 0..3 left */
454	beq		.Lxtsdecout
455.Lxtsdecloop:
456	ld1		{v1.16b}, [x1], #16
457	eor		v0.16b, v1.16b, v4.16b
458	decrypt_block	v0, w3, x2, x8, w7
459	eor		v0.16b, v0.16b, v4.16b
460	st1		{v0.16b}, [x0], #16
461	subs		w4, w4, #1
462	beq		.Lxtsdecout		/* leave v4 = tweak of the last block */
463	next_tweak	v4, v4, v8
464	b		.Lxtsdecloop
465.Lxtsdecout:
466	st1		{v4.16b}, [x6]
467	ldp		x29, x30, [sp], #16
468	ret
469AES_ENDPROC(aes_xts_decrypt)
470
471	/*
472	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
473	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
474	 */
475AES_ENTRY(aes_mac_update)
	/*
	 * CBC-MAC style digest update.
	 * x0 = in, x1 = rk, w2 = rounds, w3 = blocks, x4 = dg,
	 * w5 = enc_before, w6 = enc_after.
	 * v0 holds the running digest. Each input block is xor-ed into it
	 * and the result is encrypted, except that the encryption after
	 * the very last block is only performed when enc_after is non-zero.
	 * The arguments are copied to x19-x24 (frame_push is presumably what
	 * preserves those callee-saved registers - macro defined outside
	 * this file) so they remain available after cond_yield_neon.
	 */
476	frame_push	6
477
478	mov		x19, x0			/* in */
479	mov		x20, x1			/* rk */
480	mov		x21, x2			/* rounds */
481	mov		x22, x3			/* blocks */
482	mov		x23, x4			/* dg */
483	mov		x24, x6			/* enc_after */
484
485	ld1		{v0.16b}, [x23]			/* get dg */
486	enc_prepare	w2, x1, x7
487	cbz		w5, .Lmacloop4x		/* enc_before == 0: skip */
488
489	encrypt_block	v0, w2, x1, x7, w8
490
491.Lmacloop4x:
492	subs		w22, w22, #4		/* fewer than 4 blocks left? */
493	bmi		.Lmac1x
494	ld1		{v1.16b-v4.16b}, [x19], #64	/* get next 4 pt blocks */
495	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
496	encrypt_block	v0, w21, x20, x7, w8
497	eor		v0.16b, v0.16b, v2.16b
498	encrypt_block	v0, w21, x20, x7, w8
499	eor		v0.16b, v0.16b, v3.16b
500	encrypt_block	v0, w21, x20, x7, w8
501	eor		v0.16b, v0.16b, v4.16b
502	cmp		w22, wzr
503	csinv		x5, x24, xzr, eq	/* x5 = enc_after if no blocks left, else all-ones */
504	cbz		w5, .Lmacout		/* last block and !enc_after: done */
505	encrypt_block	v0, w21, x20, x7, w8
506	st1		{v0.16b}, [x23]			/* return dg */
507	cond_yield_neon	.Lmacrestart	/* macro defined outside this file */
508	b		.Lmacloop4x
509.Lmac1x:
510	add		w22, w22, #4		/* undo the bias; w22 = 0..3 left */
511.Lmacloop:
512	cbz		w22, .Lmacout
513	ld1		{v1.16b}, [x19], #16		/* get next pt block */
514	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
515
516	subs		w22, w22, #1
517	csinv		x5, x24, xzr, eq	/* x5 = enc_after if that was the last block, else all-ones */
518	cbz		w5, .Lmacout
519
520.Lmacenc:
521	encrypt_block	v0, w21, x20, x7, w8
522	b		.Lmacloop
523
524.Lmacout:
525	st1		{v0.16b}, [x23]			/* return dg */
526	frame_pop
527	ret
528
529.Lmacrestart:
	/* target handed to cond_yield_neon above: reload the digest saved */
	/* just before it and set up the round keys again */
530	ld1		{v0.16b}, [x23]			/* get dg */
531	enc_prepare	w21, x20, x0
532	b		.Lmacloop4x
533AES_ENDPROC(aes_mac_update)
534