xref: /linux/arch/arm/crypto/aes-ce-core.S (revision cdd38c5f1ce4398ec58fec95904b75824daab7b5)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
4 *
5 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
11	.text
	/*
	 * Select ARMv8-A and the crypto-capable NEON FPU so that the assembler
	 * accepts the aese/aesd/aesmc/aesimc mnemonics used throughout this file.
	 */
12	.arch		armv8-a
13	.fpu		crypto-neon-fp-armv8
14	.align		3
15
16	.macro		enc_round, state, key
	/*
	 * One full AES encryption round on register \state using round key
	 * \key: aese combines the key with the state and applies the byte
	 * substitution/row shift, aesmc then mixes the columns in place.
	 */
17	aese.8		\state, \key
18	aesmc.8		\state, \state
19	.endm
20
21	.macro		dec_round, state, key
	/*
	 * One full AES decryption round on register \state using round key
	 * \key: aesd followed by the inverse column mix (aesimc), in place.
	 */
22	aesd.8		\state, \key
23	aesimc.8	\state, \state
24	.endm
25
26	.macro		enc_dround, key1, key2
	/* Two consecutive encryption rounds on the single block held in q0. */
27	enc_round	q0, \key1
28	enc_round	q0, \key2
29	.endm
30
31	.macro		dec_dround, key1, key2
	/* Two consecutive decryption rounds on the single block held in q0. */
32	dec_round	q0, \key1
33	dec_round	q0, \key2
34	.endm
35
36	.macro		enc_fround, key1, key2, key3
	/*
	 * Tail of the encryption of q0: one full round with \key1, then the
	 * last round, which has no column mix (bare aese with \key2), and
	 * finally the xor with the last round key \key3.
	 */
37	enc_round	q0, \key1
38	aese.8		q0, \key2
39	veor		q0, q0, \key3
40	.endm
41
42	.macro		dec_fround, key1, key2, key3
	/*
	 * Tail of the decryption of q0: one full round with \key1, then the
	 * last round, which has no inverse column mix (bare aesd with \key2),
	 * and finally the xor with the last round key \key3.
	 */
43	dec_round	q0, \key1
44	aesd.8		q0, \key2
45	veor		q0, q0, \key3
46	.endm
47
48	.macro		enc_dround_4x, key1, key2
	/*
	 * Two encryption rounds on the four blocks in q0-q3. All four blocks
	 * go through the round with \key1 before any of them starts the round
	 * with \key2 (presumably so that independent blocks can overlap in
	 * the pipeline).
	 */
49	enc_round	q0, \key1
50	enc_round	q1, \key1
51	enc_round	q2, \key1
52	enc_round	q3, \key1
53	enc_round	q0, \key2
54	enc_round	q1, \key2
55	enc_round	q2, \key2
56	enc_round	q3, \key2
57	.endm
58
59	.macro		dec_dround_4x, key1, key2
	/*
	 * Two decryption rounds on the four blocks in q0-q3. All four blocks
	 * go through the round with \key1 before any of them starts the round
	 * with \key2.
	 */
60	dec_round	q0, \key1
61	dec_round	q1, \key1
62	dec_round	q2, \key1
63	dec_round	q3, \key1
64	dec_round	q0, \key2
65	dec_round	q1, \key2
66	dec_round	q2, \key2
67	dec_round	q3, \key2
68	.endm
69
70	.macro		enc_fround_4x, key1, key2, key3
	/*
	 * Tail of the encryption of the four blocks in q0-q3: a full round
	 * with \key1, the last round without column mix (\key2), and the xor
	 * with the last round key \key3, each step applied to all four blocks
	 * before the next step starts.
	 */
71	enc_round	q0, \key1
72	enc_round	q1, \key1
73	enc_round	q2, \key1
74	enc_round	q3, \key1
75	aese.8		q0, \key2
76	aese.8		q1, \key2
77	aese.8		q2, \key2
78	aese.8		q3, \key2
79	veor		q0, q0, \key3
80	veor		q1, q1, \key3
81	veor		q2, q2, \key3
82	veor		q3, q3, \key3
83	.endm
84
85	.macro		dec_fround_4x, key1, key2, key3
	/*
	 * Tail of the decryption of the four blocks in q0-q3: a full round
	 * with \key1, the last round without inverse column mix (\key2), and
	 * the xor with the last round key \key3, each step applied to all four
	 * blocks before the next step starts.
	 */
86	dec_round	q0, \key1
87	dec_round	q1, \key1
88	dec_round	q2, \key1
89	dec_round	q3, \key1
90	aesd.8		q0, \key2
91	aesd.8		q1, \key2
92	aesd.8		q2, \key2
93	aesd.8		q3, \key2
94	veor		q0, q0, \key3
95	veor		q1, q1, \key3
96	veor		q2, q2, \key3
97	veor		q3, q3, \key3
98	.endm
99
100	.macro		do_block, dround, fround
	/*
	 * Body of the core AES transforms: runs every round and returns to
	 * the caller with bx lr. \dround applies two middle rounds, \fround
	 * the last two rounds plus the final key xor.
	 *
	 * Expects r3 = number of rounds, ip = address of the third round key,
	 * q8/q9 = first two round keys and q14 = last round key. The round
	 * keys in between are streamed through q10-q13 two at a time; each
	 * load is issued one \dround before the keys it fetches are used.
	 * Clobbers q10-q13, ip and the condition flags.
	 */
	/* The flags set here feed blo/beq further down; the NEON loads and AES rounds in between do not write them. */
101	cmp		r3, #12			@ which key size?
102	vld1.32		{q10-q11}, [ip]!
103	\dround		q8, q9
104	vld1.32		{q12-q13}, [ip]!
105	\dround		q10, q11
106	vld1.32		{q10-q11}, [ip]!
107	\dround		q12, q13
108	vld1.32		{q12-q13}, [ip]!
109	\dround		q10, q11
110	blo		0f			@ AES-128: 10 rounds
111	vld1.32		{q10-q11}, [ip]!
112	\dround		q12, q13
113	beq		1f			@ AES-192: 12 rounds
	/* More than 12 rounds: fetch the last key pair (no writeback needed) and run two extra rounds. */
114	vld1.32		{q12-q13}, [ip]
115	\dround		q10, q11
1160:	\fround		q12, q13, q14
117	bx		lr
118
	/* 12-round exit: the final key pair is in q10/q11 rather than q12/q13 */
1191:	\fround		q10, q11, q14
120	bx		lr
121	.endm
122
123	/*
124	 * Internal, non-AAPCS compliant functions that implement the core AES
125	 * transforms. They clobber q0 (q0 - q3 in the _4x versions), q10 - q13, ip
	 * and the condition flags; all other registers are preserved.
126	 * Arguments:
127	 *   q0        : first in/output block
128	 *   q1        : second in/output block (_4x version only)
129	 *   q2        : third in/output block (_4x version only)
130	 *   q3        : fourth in/output block (_4x version only)
131	 *   q8        : first round key
132	 *   q9        : second round key
133	 *   q14       : final round key
134	 *   r2        : address of round key array
135	 *   r3        : number of rounds
136	 */
137	.align		6
138aes_encrypt:
	/*
	 * Encrypt the single block in q0 in place. The first two round keys
	 * and the last one are expected in q8, q9 and q14; the others are
	 * read from memory starting at r2 + 32.
	 */
139	add		ip, r2, #32		@ 3rd round key
	/*
	 * Secondary entry point for ce_aes_xts_init, which sets up ip from a
	 * different key schedule before branching here.
	 */
140.Laes_encrypt_tweak:
141	do_block	enc_dround, enc_fround
142ENDPROC(aes_encrypt)
143
144	.align		6
145aes_decrypt:
	/*
	 * Decrypt the single block in q0 in place. The first two round keys
	 * and the last one are expected in q8, q9 and q14; the others are
	 * read from memory starting at r2 + 32.
	 */
146	add		ip, r2, #32		@ 3rd round key
147	do_block	dec_dround, dec_fround
148ENDPROC(aes_decrypt)
149
150	.align		6
151aes_encrypt_4x:
	/*
	 * Encrypt the four blocks in q0-q3 in place with the same key
	 * schedule (q8, q9, q14 preloaded, remaining keys read from r2 + 32).
	 */
152	add		ip, r2, #32		@ 3rd round key
153	do_block	enc_dround_4x, enc_fround_4x
154ENDPROC(aes_encrypt_4x)
155
156	.align		6
157aes_decrypt_4x:
	/*
	 * Decrypt the four blocks in q0-q3 in place with the same key
	 * schedule (q8, q9, q14 preloaded, remaining keys read from r2 + 32).
	 */
158	add		ip, r2, #32		@ 3rd round key
159	do_block	dec_dround_4x, dec_fround_4x
160ENDPROC(aes_decrypt_4x)
161
162	.macro		prepare_key, rk, rounds
	/*
	 * Preload the round keys that the core transforms keep in registers:
	 * q8/q9 from the start of the array at \rk and q14 from
	 * \rk + 16 * \rounds (round keys are 16 bytes each). Clobbers ip.
	 */
163	add		ip, \rk, \rounds, lsl #4
164	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
165	vld1.32		{q14}, [ip]		@ load last round key
166	.endm
167
168	/*
169	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
170	 *		      int blocks)
171	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
172	 *		      int blocks)
173	 */
174ENTRY(ce_aes_ecb_encrypt)
	/*
	 * ECB encryption of whole blocks.
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds; blocks is read from the
	 * stack into r4. Blocks are processed four at a time while at least
	 * four remain, then one at a time.
	 */
175	push		{r4, lr}
	/* two registers were pushed above, so the first stack argument sits at sp + 8 */
176	ldr		r4, [sp, #8]
177	prepare_key	r2, r3
178.Lecbencloop4x:
	/* r4 goes negative when fewer than four blocks are left */
179	subs		r4, r4, #4
180	bmi		.Lecbenc1x
181	vld1.8		{q0-q1}, [r1]!
182	vld1.8		{q2-q3}, [r1]!
183	bl		aes_encrypt_4x
184	vst1.8		{q0-q1}, [r0]!
185	vst1.8		{q2-q3}, [r0]!
186	b		.Lecbencloop4x
187.Lecbenc1x:
	/* undo the last subtraction; r4 is now the number of leftover blocks (0-3 for a non-negative count) */
188	adds		r4, r4, #4
189	beq		.Lecbencout
190.Lecbencloop:
191	vld1.8		{q0}, [r1]!
192	bl		aes_encrypt
193	vst1.8		{q0}, [r0]!
194	subs		r4, r4, #1
195	bne		.Lecbencloop
196.Lecbencout:
197	pop		{r4, pc}
198ENDPROC(ce_aes_ecb_encrypt)
199
200ENTRY(ce_aes_ecb_decrypt)
	/*
	 * ECB decryption of whole blocks.
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds; blocks is read from the
	 * stack into r4. Blocks are processed four at a time while at least
	 * four remain, then one at a time.
	 */
201	push		{r4, lr}
	/* two registers were pushed above, so the first stack argument sits at sp + 8 */
202	ldr		r4, [sp, #8]
203	prepare_key	r2, r3
204.Lecbdecloop4x:
	/* r4 goes negative when fewer than four blocks are left */
205	subs		r4, r4, #4
206	bmi		.Lecbdec1x
207	vld1.8		{q0-q1}, [r1]!
208	vld1.8		{q2-q3}, [r1]!
209	bl		aes_decrypt_4x
210	vst1.8		{q0-q1}, [r0]!
211	vst1.8		{q2-q3}, [r0]!
212	b		.Lecbdecloop4x
213.Lecbdec1x:
	/* undo the last subtraction; r4 is now the number of leftover blocks (0-3 for a non-negative count) */
214	adds		r4, r4, #4
215	beq		.Lecbdecout
216.Lecbdecloop:
217	vld1.8		{q0}, [r1]!
218	bl		aes_decrypt
219	vst1.8		{q0}, [r0]!
220	subs		r4, r4, #1
221	bne		.Lecbdecloop
222.Lecbdecout:
223	pop		{r4, pc}
224ENDPROC(ce_aes_ecb_decrypt)
225
226	/*
227	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
228	 *		      int blocks, u8 iv[])
229	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
230	 *		      int blocks, u8 iv[])
231	 */
ENTRY(ce_aes_cbc_encrypt)
	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		      int rounds, int blocks, u8 iv[])
	 *
	 * CBC encryption of 'blocks' whole blocks. r0 = out, r1 = in, r2 = rk,
	 * r3 = rounds; blocks and iv are read from the stack into r4 and r5.
	 * q0 carries the chaining value: it starts out as iv[], and after each
	 * block it holds the ciphertext just produced. The last ciphertext
	 * block is written back to iv[] so that a later call can continue the
	 * chain.
	 *
	 * CBC encryption is inherently serial, so there is no 4x path here.
	 * A block count of zero (or less) returns without touching out[] or
	 * iv[], like the ECB and CBC decrypt routines; without this guard the
	 * loop would process a block and then wrap the counter.
	 */
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	cmp		r4, #0
	ble		.Lcbcencout		@ nothing to encrypt
	vld1.8		{q0}, [r5]		@ q0 = initial chaining value
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ return last ct block as next iv
.Lcbcencout:
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
247
248ENTRY(ce_aes_cbc_decrypt)
	/*
	 * CBC decryption of whole blocks.
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds; blocks and iv are read from
	 * the stack into r4 and r5. q15 always holds the ciphertext block (or
	 * IV) that precedes the next block to be decrypted.
	 * NOTE(review): q4-q7 are used as scratch and not saved here;
	 * presumably the caller owns the whole NEON register file - confirm.
	 */
249	push		{r4-r6, lr}
250	ldrd		r4, r5, [sp, #16]
251	vld1.8		{q15}, [r5]		@ keep iv in q15
252	prepare_key	r2, r3
253.Lcbcdecloop4x:
254	subs		r4, r4, #4
255	bmi		.Lcbcdec1x
256	vld1.8		{q0-q1}, [r1]!
257	vld1.8		{q2-q3}, [r1]!
	/* keep copies of the ciphertext: each one is the xor mask for the following block */
258	vmov		q4, q0
259	vmov		q5, q1
260	vmov		q6, q2
261	vmov		q7, q3
262	bl		aes_decrypt_4x
263	veor		q0, q0, q15
264	veor		q1, q1, q4
265	veor		q2, q2, q5
266	veor		q3, q3, q6
	/* the fourth ciphertext block chains into the next iteration */
267	vmov		q15, q7
268	vst1.8		{q0-q1}, [r0]!
269	vst1.8		{q2-q3}, [r0]!
270	b		.Lcbcdecloop4x
271.Lcbcdec1x:
272	adds		r4, r4, #4
273	beq		.Lcbcdecout
274	vmov		q6, q14			@ preserve last round key
275.Lcbcdecloop:
276	vld1.8		{q0}, [r1]!		@ get next ct block
	/*
	 * Folding the previous ciphertext into q14 lets the final key xor at
	 * the end of aes_decrypt perform the CBC xor as well.
	 */
277	veor		q14, q15, q6		@ combine prev ct with last key
278	vmov		q15, q0
279	bl		aes_decrypt
280	vst1.8		{q0}, [r0]!
281	subs		r4, r4, #1
282	bne		.Lcbcdecloop
283.Lcbcdecout:
284	vst1.8		{q15}, [r5]		@ return last ct block as next iv
285	pop		{r4-r6, pc}
286ENDPROC(ce_aes_cbc_decrypt)
287
288
289	/*
290	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
291	 *			  int rounds, int bytes, u8 const iv[])
292	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
293	 *			  int rounds, int bytes, u8 const iv[])
294	 */
295
296ENTRY(ce_aes_cbc_cts_encrypt)
	/*
	 * CBC encryption with ciphertext stealing of one full block followed
	 * by a final block of (bytes - 16) bytes.
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds; bytes and iv are read from
	 * the stack into r4 and r5. Both blocks are loaded and stored with
	 * overlapping 16-byte accesses, so exactly 'bytes' bytes are touched
	 * on either side (this assumes 16 <= bytes <= 32 - TODO confirm with
	 * callers).
	 */
297	push		{r4-r6, lr}
298	ldrd		r4, r5, [sp, #16]
299
300	movw		ip, :lower16:.Lcts_permute_table
301	movt		ip, :upper16:.Lcts_permute_table
	/* from here on r4 = length of the final block */
302	sub		r4, r4, #16
303	add		lr, ip, #32
304	add		ip, ip, r4
305	sub		lr, lr, r4
	/*
	 * q5: index vector that moves the first r4 bytes of a block to its
	 *     end (the other positions read as out-of-range indexes)
	 * q6: index vector that moves the last r4 bytes of a block to its
	 *     start
	 */
306	vld1.8		{q5}, [ip]
307	vld1.8		{q6}, [lr]
308
309	add		ip, r1, r4
310	vld1.8		{q0}, [r1]			@ overlapping loads
311	vld1.8		{q3}, [ip]
312
313	vld1.8		{q1}, [r5]			@ get iv
314	prepare_key	r2, r3
315
316	veor		q0, q0, q1			@ xor with iv
317	bl		aes_encrypt
318
	/* q2 = leading r4 bytes of the first ciphertext, positioned for the tail store below */
319	vtbl.8		d4, {d0-d1}, d10
320	vtbl.8		d5, {d0-d1}, d11
	/* q1 = final plaintext bytes moved to the front, rest zero */
321	vtbl.8		d2, {d6-d7}, d12
322	vtbl.8		d3, {d6-d7}, d13
323
324	veor		q0, q0, q1
325	bl		aes_encrypt
326
	/* The tail is stored first; the full-block store then overwrites the bytes it shares with it. */
327	add		r4, r0, r4
328	vst1.8		{q2}, [r4]			@ overlapping stores
329	vst1.8		{q0}, [r0]
330
331	pop		{r4-r6, pc}
332ENDPROC(ce_aes_cbc_cts_encrypt)
333
334ENTRY(ce_aes_cbc_cts_decrypt)
	/*
	 * CBC decryption with ciphertext stealing of one full block followed
	 * by a final block of (bytes - 16) bytes.
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds; bytes and iv are read from
	 * the stack into r4 and r5. Loads and stores of the two blocks overlap
	 * (this assumes 16 <= bytes <= 32 - TODO confirm with callers).
	 */
335	push		{r4-r6, lr}
336	ldrd		r4, r5, [sp, #16]
337
338	movw		ip, :lower16:.Lcts_permute_table
339	movt		ip, :upper16:.Lcts_permute_table
	/* from here on r4 = length of the final block */
340	sub		r4, r4, #16
341	add		lr, ip, #32
342	add		ip, ip, r4
343	sub		lr, lr, r4
	/*
	 * q5: index vector that moves the first r4 bytes of a block to its end
	 * q6: index vector that moves the last r4 bytes of a block to its start
	 */
344	vld1.8		{q5}, [ip]
345	vld1.8		{q6}, [lr]
346
347	add		ip, r1, r4
348	vld1.8		{q0}, [r1]			@ overlapping loads
349	vld1.8		{q1}, [ip]
350
351	vld1.8		{q3}, [r5]			@ get iv
352	prepare_key	r2, r3
353
354	bl		aes_decrypt
355
	/* q2 = leading r4 bytes of the decrypted block moved to the end, rest zero */
356	vtbl.8		d4, {d0-d1}, d10
357	vtbl.8		d5, {d0-d1}, d11
	/* vtbx leaves unselected bytes alone: only the leading r4 bytes of q0 are replaced by the final input bytes */
358	vtbx.8		d0, {d2-d3}, d12
359	vtbx.8		d1, {d2-d3}, d13
360
	/* after this xor the trailing r4 bytes of q1 hold the final output bytes */
361	veor		q1, q1, q2
362	bl		aes_decrypt
363	veor		q0, q0, q3			@ xor with iv
364
	/* The tail is stored first; the full-block store then overwrites the bytes it shares with it. */
365	add		r4, r0, r4
366	vst1.8		{q1}, [r4]			@ overlapping stores
367	vst1.8		{q0}, [r0]
368
369	pop		{r4-r6, pc}
370ENDPROC(ce_aes_cbc_cts_decrypt)
371
372
373	/*
374	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
375	 *		      int blocks, u8 ctr[])
376	 */
377ENTRY(ce_aes_ctr_encrypt)
	/*
	 * CTR mode encryption.
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds; blocks and ctr are read
	 * from the stack into r4 and r5. q7 holds the counter block for the
	 * whole function and r6 a byte-reversed copy of its last 32-bit word,
	 * so the big-endian counter can be incremented with integer adds.
	 * On return ctr[] holds the next counter value.
	 */
378	push		{r4-r6, lr}
379	ldrd		r4, r5, [sp, #16]
380	vld1.8		{q7}, [r5]		@ load ctr
381	prepare_key	r2, r3
382	vmov		r6, s31			@ keep swabbed ctr in r6
383	rev		r6, r6
	/*
	 * If the low counter word would wrap while processing this request,
	 * skip the 4x loop (which only updates that word) and use the
	 * one-block loop, which propagates the carry.
	 */
384	cmn		r6, r4			@ 32 bit overflow?
385	bcs		.Lctrloop
386.Lctrloop4x:
387	subs		r4, r4, #4
388	bmi		.Lctr1x
389
390	/*
391	 * NOTE: the sequence below has been carefully tweaked to avoid
392	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
393	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
394	 * may produce an incorrect result if they take their input from a
395	 * register of which a single 32-bit lane has been updated the last
396	 * time it was modified. To work around this, the lanes of registers
397	 * q0-q3 below are not manipulated individually, and the different
398	 * counter values are prepared by successive manipulations of q7.
399	 */
400	add		ip, r6, #1
401	vmov		q0, q7
402	rev		ip, ip
403	add		lr, r6, #2
404	vmov		s31, ip			@ set lane 3 of q1 via q7
405	add		ip, r6, #3
406	rev		lr, lr
407	vmov		q1, q7
408	vmov		s31, lr			@ set lane 3 of q2 via q7
409	rev		ip, ip
410	vmov		q2, q7
411	vmov		s31, ip			@ set lane 3 of q3 via q7
412	add		r6, r6, #4
413	vmov		q3, q7
414
	/* the input goes to q4-q6 and q15 because q0-q3 carry the counters and q7 must stay intact */
415	vld1.8		{q4-q5}, [r1]!
416	vld1.8		{q6}, [r1]!
417	vld1.8		{q15}, [r1]!
418	bl		aes_encrypt_4x
419	veor		q0, q0, q4
420	veor		q1, q1, q5
421	veor		q2, q2, q6
422	veor		q3, q3, q15
423	rev		ip, r6
424	vst1.8		{q0-q1}, [r0]!
425	vst1.8		{q2-q3}, [r0]!
	/* q7 now holds the counter for the next block */
426	vmov		s31, ip
427	b		.Lctrloop4x
428.Lctr1x:
429	adds		r4, r4, #4
430	beq		.Lctrout
431.Lctrloop:
432	vmov		q0, q7
433	bl		aes_encrypt
434
435	adds		r6, r6, #1		@ increment BE ctr
436	rev		ip, r6
437	vmov		s31, ip
	/* carry flag still comes from the adds above; rev and vmov leave the flags alone */
438	bcs		.Lctrcarry
439
440.Lctrcarrydone:
441	subs		r4, r4, #1
442	bmi		.Lctrtailblock		@ blocks < 0 means tail block
443	vld1.8		{q3}, [r1]!
444	veor		q3, q0, q3
445	vst1.8		{q3}, [r0]!
	/* tests the result of the subs above; the NEON instructions in between do not alter the flags */
446	bne		.Lctrloop
447
448.Lctrout:
449	vst1.8		{q7}, [r5]		@ return next CTR value
450	pop		{r4-r6, pc}
451
452.Lctrtailblock:
	/* no input is consumed here: the raw key stream block is handed back in out[], which must satisfy the :64 alignment hint */
453	vst1.8		{q0}, [r0, :64]		@ return the key stream
454	b		.Lctrout
455
456.Lctrcarry:
	/* Ripple the carry through the remaining words of q7, from the second-lowest counter word upwards, stopping as soon as a word does not wrap. */
457	.irp		sreg, s30, s29, s28
458	vmov		ip, \sreg		@ load next word of ctr
459	rev		ip, ip			@ ... to handle the carry
460	adds		ip, ip, #1
461	rev		ip, ip
462	vmov		\sreg, ip
463	bcc		.Lctrcarrydone
464	.endr
465	b		.Lctrcarrydone
466ENDPROC(ce_aes_ctr_encrypt)
467
468	/*
469	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
470	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
471	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
472	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
473	 */
474
475	.macro		next_tweak, out, in, const, tmp
	/*
	 * Derive the next XTS tweak: \out = \in doubled as a 128-bit value,
	 * with reduction. \const is the mask built by ce_aes_xts_init
	 * (1 in the low 64-bit lane, 0x87 in the high one). \tmp is scratch.
	 * \out may be the same register as \in.
	 */
	/* each 64-bit lane of \tmp becomes all-ones if the top bit of that lane of \in is set */
476	vshr.s64	\tmp, \in, #63
477	vand		\tmp, \tmp, \const
	/* double both halves independently; the bits shifted out are re-injected below */
478	vadd.u64	\out, \in, \in
	/* swap halves: the low half carries into the high half (1), the high half folds back into the low half (0x87) */
479	vext.8		\tmp, \tmp, \tmp, #8
480	veor		\out, \out, \tmp
481	.endm
482
483ce_aes_xts_init:
	/*
	 * Shared prologue of ce_aes_xts_encrypt/decrypt, entered with bl
	 * right after the caller pushed {r4-r6, lr}, so the stack arguments
	 * are still addressed relative to the caller's frame.
	 * On return: q15 = tweak mask, r4 = bytes, r5 = iv, q0 = IV (encrypted
	 * with key 2 if first == 1), and r6 = 'first' when it was not 1,
	 * otherwise the address of key 2.
	 */
	/* the three instructions below leave 1 in d30 and 0x87 in d31 */
484	vmov.i32	d30, #0x87		@ compose tweak mask vector
485	vmovl.u32	q15, d30
486	vshr.u64	d30, d31, #7
487
488	ldrd		r4, r5, [sp, #16]	@ load args
	/* sp + 28 is the 'first' argument */
489	ldr		r6, [sp, #28]
490	vld1.8		{q0}, [r5]		@ load iv
491	teq		r6, #1			@ start of a block?
492	bxne		lr
493
494	@ Encrypt the IV in q0 with the second AES key. This should only
495	@ be done at the start of a block.
496	ldr		r6, [sp, #24]		@ load AES key 2
	/* this loads q8/q9/q14 from key 2; the callers reload them from key 1 afterwards */
497	prepare_key	r6, r3
498	add		ip, r6, #32		@ 3rd round key of key 2
	/* lr is untouched, so the bx lr at the end of the AES core returns straight to our caller */
499	b		.Laes_encrypt_tweak	@ tail call
500ENDPROC(ce_aes_xts_init)
501
502ENTRY(ce_aes_xts_encrypt)
	/*
	 * XTS encryption with ciphertext stealing for a trailing partial
	 * block. r0 = out, r1 = in, r2 = rk1, r3 = rounds; the remaining
	 * arguments are fetched from the stack by ce_aes_xts_init.
	 * r4 = bytes left, r5 = iv, q4 = current tweak, q15 = tweak mask.
	 * The last tweak used is written back to iv[] on exit.
	 */
503	push		{r4-r6, lr}
504
505	bl		ce_aes_xts_init		@ run shared prologue
506	prepare_key	r2, r3
507	vmov		q4, q0
508
	/*
	 * r6 is non-zero when ce_aes_xts_init just encrypted the IV (it then
	 * holds the key 2 pointer): that value is used as the first tweak
	 * as is. Otherwise the stored tweak is advanced first.
	 */
509	teq		r6, #0			@ start of a block?
510	bne		.Lxtsenc4x
511
512.Lxtsencloop4x:
513	next_tweak	q4, q4, q15, q10
514.Lxtsenc4x:
515	subs		r4, r4, #64
516	bmi		.Lxtsenc1x
517	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
518	vld1.8		{q2-q3}, [r1]!
	/* tweaks for blocks 1-3 are derived in q5-q7 while the xors proceed */
519	next_tweak	q5, q4, q15, q10
520	veor		q0, q0, q4
521	next_tweak	q6, q5, q15, q10
522	veor		q1, q1, q5
523	next_tweak	q7, q6, q15, q10
524	veor		q2, q2, q6
525	veor		q3, q3, q7
526	bl		aes_encrypt_4x
527	veor		q0, q0, q4
528	veor		q1, q1, q5
529	veor		q2, q2, q6
530	veor		q3, q3, q7
531	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
532	vst1.8		{q2-q3}, [r0]!
533	vmov		q4, q7
534	teq		r4, #0
535	beq		.Lxtsencret
536	b		.Lxtsencloop4x
537.Lxtsenc1x:
538	adds		r4, r4, #64
	/* NOTE(review): r4 == 0 is only reachable here when bytes was 0 on entry; .Lxtsencout then stores q0 to out[] - presumably callers never pass 0, confirm */
539	beq		.Lxtsencout
540	subs		r4, r4, #16
	/* fewer than 16 bytes remain after a 4x pass: steal from the last block of that pass */
541	bmi		.LxtsencctsNx
542.Lxtsencloop:
543	vld1.8		{q0}, [r1]!
544.Lxtsencctsout:
545	veor		q0, q0, q4
546	bl		aes_encrypt
547	veor		q0, q0, q4
548	teq		r4, #0
549	beq		.Lxtsencout
550	subs		r4, r4, #16
551	next_tweak	q4, q4, q15, q6
	/* branches on the subs above; next_tweak only uses NEON instructions and leaves the flags alone */
552	bmi		.Lxtsenccts
553	vst1.8		{q0}, [r0]!
554	b		.Lxtsencloop
555.Lxtsencout:
556	vst1.8		{q0}, [r0]
557.Lxtsencret:
558	vst1.8		{q4}, [r5]
559	pop		{r4-r6, pc}
560
561.LxtsencctsNx:
	/* q3 still holds the last ciphertext block of the 4x pass; step r0 back so it can be rewritten */
562	vmov		q0, q3
563	sub		r0, r0, #16
564.Lxtsenccts:
	/* here q0 = last full ciphertext block and r4 = (bytes in the partial block) - 16 */
565	movw		ip, :lower16:.Lcts_permute_table
566	movt		ip, :upper16:.Lcts_permute_table
567
568	add		r1, r1, r4		@ rewind input pointer
569	add		r4, r4, #16		@ # bytes in final block
570	add		lr, ip, #32
571	add		ip, ip, r4
572	sub		lr, lr, r4
573	add		r4, r0, r4		@ output address of final block
574
575	vld1.8		{q1}, [r1]		@ load final partial block
576	vld1.8		{q2}, [ip]
577	vld1.8		{q3}, [lr]
578
	/* q2 = leading bytes of q0, positioned for the final partial output */
579	vtbl.8		d4, {d0-d1}, d4
580	vtbl.8		d5, {d0-d1}, d5
	/* replace the leading bytes of q0 with the final input bytes, keeping the rest (the stolen ciphertext) */
581	vtbx.8		d0, {d2-d3}, d6
582	vtbx.8		d1, {d2-d3}, d7
583
584	vst1.8		{q2}, [r4]		@ overlapping stores
	/* r4 = 0 makes the shared single-block path finish after encrypting the combined block */
585	mov		r4, #0
586	b		.Lxtsencctsout
587ENDPROC(ce_aes_xts_encrypt)
588
589
590ENTRY(ce_aes_xts_decrypt)
	/*
	 * XTS decryption with ciphertext stealing for a trailing partial
	 * block. r0 = out, r1 = in, r2 = rk1, r3 = rounds; the remaining
	 * arguments are fetched from the stack by ce_aes_xts_init.
	 * r4 = bytes left, r5 = iv, q4 = current tweak, q15 = tweak mask.
	 * The last tweak held in q4 is written back to iv[] on exit.
	 */
591	push		{r4-r6, lr}
592
593	bl		ce_aes_xts_init		@ run shared prologue
594	prepare_key	r2, r3
595	vmov		q4, q0
596
	/* This keeps the last full block out of the regular loops so that the CTS path can handle it together with the partial block. */
597	/* subtract 16 bytes if we are doing CTS */
598	tst		r4, #0xf
599	subne		r4, r4, #0x10
600
	/*
	 * r6 is non-zero when ce_aes_xts_init just encrypted the IV (it then
	 * holds the key 2 pointer): that value is used as the first tweak
	 * as is. Otherwise the stored tweak is advanced first.
	 */
601	teq		r6, #0			@ start of a block?
602	bne		.Lxtsdec4x
603
604.Lxtsdecloop4x:
605	next_tweak	q4, q4, q15, q10
606.Lxtsdec4x:
607	subs		r4, r4, #64
608	bmi		.Lxtsdec1x
609	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
610	vld1.8		{q2-q3}, [r1]!
	/* tweaks for blocks 1-3 are derived in q5-q7 while the xors proceed */
611	next_tweak	q5, q4, q15, q10
612	veor		q0, q0, q4
613	next_tweak	q6, q5, q15, q10
614	veor		q1, q1, q5
615	next_tweak	q7, q6, q15, q10
616	veor		q2, q2, q6
617	veor		q3, q3, q7
618	bl		aes_decrypt_4x
619	veor		q0, q0, q4
620	veor		q1, q1, q5
621	veor		q2, q2, q6
622	veor		q3, q3, q7
623	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
624	vst1.8		{q2-q3}, [r0]!
625	vmov		q4, q7
626	teq		r4, #0
627	beq		.Lxtsdecout
628	b		.Lxtsdecloop4x
629.Lxtsdec1x:
630	adds		r4, r4, #64
631	beq		.Lxtsdecout
632	subs		r4, r4, #16
633.Lxtsdecloop:
634	vld1.8		{q0}, [r1]!
	/* tests the most recent subs (before the loop or at its bottom); the load and next_tweak leave the flags alone */
635	bmi		.Lxtsdeccts
636.Lxtsdecctsout:
637	veor		q0, q0, q4
638	bl		aes_decrypt
639	veor		q0, q0, q4
640	vst1.8		{q0}, [r0]!
641	teq		r4, #0
642	beq		.Lxtsdecout
643	subs		r4, r4, #16
644	next_tweak	q4, q4, q15, q6
645	b		.Lxtsdecloop
646.Lxtsdecout:
647	vst1.8		{q4}, [r5]
648	pop		{r4-r6, pc}
649
650.Lxtsdeccts:
	/* here q0 = last full ciphertext block and r4 = (bytes in the partial block) - 16 */
651	movw		ip, :lower16:.Lcts_permute_table
652	movt		ip, :upper16:.Lcts_permute_table
653
654	add		r1, r1, r4		@ rewind input pointer
655	add		r4, r4, #16		@ # bytes in final block
656	add		lr, ip, #32
657	add		ip, ip, r4
658	sub		lr, lr, r4
659	add		r4, r0, r4		@ output address of final block
660
	/* the last full block is decrypted with the tweak after q4; q4 itself is kept for the combined block handled at .Lxtsdecctsout */
661	next_tweak	q5, q4, q15, q6
662
663	vld1.8		{q1}, [r1]		@ load final partial block
664	vld1.8		{q2}, [ip]
665	vld1.8		{q3}, [lr]
666
667	veor		q0, q0, q5
668	bl		aes_decrypt
669	veor		q0, q0, q5
670
	/* q2 = leading bytes of the decrypted block, positioned for the final partial output */
671	vtbl.8		d4, {d0-d1}, d4
672	vtbl.8		d5, {d0-d1}, d5
	/* replace the leading bytes of q0 with the final input bytes, keeping the rest */
673	vtbx.8		d0, {d2-d3}, d6
674	vtbx.8		d1, {d2-d3}, d7
675
676	vst1.8		{q2}, [r4]		@ overlapping stores
	/* r4 = 0 makes the shared single-block path finish after decrypting the combined block */
677	mov		r4, #0
678	b		.Lxtsdecctsout
679ENDPROC(ce_aes_xts_decrypt)
680
681	/*
682	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
683	 *                             AES sbox substitution on each byte in
684	 *                             'input'
685	 */
686ENTRY(ce_aes_sub)
	/*
	 * r0 = input word; returns the substituted word in r0.
	 * The word is replicated into all four lanes of q1 and pushed through
	 * aese with an all-zero state, so the key xor just passes the input
	 * through. With four identical columns the row shift performed by
	 * aese does not change the result, which is read back from lane 0.
	 */
687	vdup.32		q1, r0
688	veor		q0, q0, q0
689	aese.8		q0, q1
690	vmov		r0, s0
691	bx		lr
692ENDPROC(ce_aes_sub)
693
694	/*
695	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
696	 *                                        operation on round key *src
697	 */
698ENTRY(ce_aes_invert)
	/*
	 * r0 = dst, r1 = src. Loads one 16-byte round key from src, applies
	 * aesimc to it and stores the result to dst.
	 */
699	vld1.32		{q0}, [r1]
700	aesimc.8	q0, q0
701	vst1.32		{q0}, [r0]
702	bx		lr
703ENDPROC(ce_aes_invert)
704
705	.section	".rodata", "a"
	/* 64-byte alignment for the table that follows */
706	.align		6
.Lcts_permute_table:
	/*
	 * Index vectors for the ciphertext stealing code: 16 identity indexes
	 * framed by 16 out-of-range entries on either side. A 16-byte load at
	 * offset n, or at offset 32 - n, yields a vtbl/vtbx index vector that
	 * shifts a block by n bytes in one direction or the other. For the
	 * 0xff entries vtbl produces zero and vtbx leaves the destination byte
	 * unchanged.
	 */
	.fill		16, 1, 0xff
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.fill		16, 1, 0xff
713	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
714