/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
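
/*
 * ST4() and ST5() expand to their argument only when MAX_STRIDE is 4 or
 * 5, respectively, so the interleaved code below can carry a 4-way and a
 * 5-way variant in a single sequence, with the including file (aes-ce.S
 * or aes-neon.S) selecting the stride via MAX_STRIDE.
 */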

aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
ENDPROC(aes_decrypt_block4x)

#if MAX_STRIDE == 5
aes_encrypt_block5x:
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
ENDPROC(aes_encrypt_block5x)

aes_decrypt_block5x:
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
ENDPROC(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
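
	/*
	 * For orientation, a hedged sketch of the C-level call (AAPCS64
	 * places the arguments in x0-x4).  The declaration mirrors the
	 * prototype above; the context and field names are illustrative
	 * assumptions, not the glue code verbatim:
	 *
	 *	asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[],
	 *					u8 const rk[], int rounds,
	 *					int blocks);
	 *
	 *	// encrypt 'blocks' full 16-byte blocks in one call
	 *	aes_ecb_encrypt(dst, src, ctx->key_enc, rounds, blocks);
	 */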

AES_ENTRY(aes_ecb_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbencout
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	ldp		x29, x30, [sp], #16
	ret
AES_ENDPROC(aes_ecb_encrypt)


AES_ENTRY(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbdecout
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	ldp		x29, x30, [sp], #16
	ret
AES_ENDPROC(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
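
	/*
	 * Chaining relations implemented below, with E()/D() denoting one
	 * AES block operation under rk (a hedged C model with illustrative
	 * notation):
	 *
	 *	// encrypt: c[0] = E(p[0] ^ iv), c[i] = E(p[i] ^ c[i-1])
	 *	// decrypt: p[0] = D(c[0]) ^ iv, p[i] = D(c[i]) ^ c[i-1]
	 *
	 * The ESSIV entry points differ only in IV derivation: the caller's
	 * iv is first encrypted with the second key schedule rk2 before use,
	 * which is why the code below hard-codes 14 rounds (rk2 is always an
	 * AES-256 schedule).
	 */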

AES_ENTRY(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_ENTRY(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_ENDPROC(aes_cbc_encrypt)
AES_ENDPROC(aes_essiv_cbc_encrypt)

AES_ENTRY(aes_essiv_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_ENTRY(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_ENDPROC(aes_cbc_decrypt)
AES_ENDPROC(aes_essiv_cbc_decrypt)
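
	/*
	 * Note on the interleaved path above: aes_decrypt_block4x/5x return
	 * the plaintext in v0-v3 (and v4), clobbering the ciphertext, yet
	 * CBC decryption still needs every input block as the XOR operand
	 * for the one that follows it.  The mov copies before the call keep
	 * those values alive; in the 5-way case there are not enough spare
	 * registers, so two ciphertext blocks are reloaded from the source
	 * buffer instead (the sub/ld1 pair after the call).
	 */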


	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
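
	/*
	 * 'bytes' is expected to cover the final two blocks of the message
	 * (16 < bytes <= 32): one full block plus a partial block of
	 * n = bytes - 16 bytes.  A hedged reference model of the encrypt
	 * side (CS3 ordering, illustrative notation):
	 *
	 *	cc     = E(P1 ^ iv);             // penultimate ciphertext
	 *	c_last = E((P2 || zeros) ^ cc);  // final full block
	 *	out    = c_last || first n bytes of cc;
	 *
	 * i.e. the last two blocks swap places and the truncated block
	 * "steals" its missing tail from the penultimate ciphertext.  The
	 * tbl/tbx shuffles and the overlapping loads/stores below realise
	 * this without byte loops, using .Lcts_permute_table.
	 */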

AES_ENTRY(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_ENDPROC(aes_cbc_cts_encrypt)

AES_ENTRY(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_ENDPROC(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
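
	/*
	 * How the table is used, for a final block of n bytes (the CTS code
	 * computes x8 = table + n and x9 = table + 32 - n):
	 *
	 *   - a tbl via [x8] shifts a vector up by 16 - n positions, so its
	 *     first n bytes land at the end, with zeroes below (out-of-range
	 *     0xff indices read as zero);
	 *   - a tbl via [x9] shifts a vector down by 16 - n positions, so
	 *     its last n bytes land at offset 0; used as tbx, it leaves the
	 *     remaining destination bytes intact instead of zeroing them.
	 */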


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
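
	/*
	 * The counter is one 128-bit big-endian integer.  Its low 64 bits
	 * are kept byte-swapped in x6 so they can be bumped with plain adds;
	 * a carry out of bit 63 takes the .Lctrcarry path to propagate into
	 * the high half.  The interleaved fast path below only patches the
	 * low 32 bits into each block copy, so the cmn/bcs check falls back
	 * to the single-block loop whenever those 32 bits would wrap within
	 * this call.  Hedged C model of one increment (be128 as in
	 * crypto/b128ops.h):
	 *
	 *	be128 ctr;                        // loaded from/stored to x5
	 *	u64 lo = be64_to_cpu(ctr.b) + 1;  // swabbed copy lives in x6
	 *	ctr.b = cpu_to_be64(lo);
	 *	if (lo == 0)                      // 64-bit carry
	 *		ctr.a = cpu_to_be64(be64_to_cpu(ctr.a) + 1);
	 */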

AES_ENTRY(aes_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x6
	ld1		{vctr.16b}, [x5]

	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x6, x6
	cmn		w6, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
.LctrloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lctr1x
	add		w7, w6, #1
	mov		v0.16b, vctr.16b
	add		w8, w6, #2
	mov		v1.16b, vctr.16b
	add		w9, w6, #3
	mov		v2.16b, vctr.16b
	rev		w7, w7
	mov		v3.16b, vctr.16b
	rev		w8, w8
ST5(	mov		v4.16b, vctr.16b		)
	mov		v1.s[3], w7
	rev		w9, w9
ST5(	add		w10, w6, #4			)
	mov		v2.s[3], w8
ST5(	rev		w10, w10			)
	mov		v3.s[3], w9
ST5(	mov		v4.s[3], w10			)
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	add		x6, x6, #MAX_STRIDE
	rev		x7, x6
	ins		vctr.d[1], x7
	cbz		w4, .Lctrout
	b		.LctrloopNx
.Lctr1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lctrout
.Lctrloop:
	mov		v0.16b, vctr.16b
	encrypt_block	v0, w3, x2, x8, w7

	adds		x6, x6, #1		/* increment BE ctr */
	rev		x7, x6
	ins		vctr.d[1], x7
	bcs		.Lctrcarry		/* overflow? */

.Lctrcarrydone:
	subs		w4, w4, #1
	bmi		.Lctrtailblock		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctrloop

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtailblock:
	st1		{v0.16b}, [x0]
	b		.Lctrout

.Lctrcarry:
	umov		x7, vctr.d[0]		/* load upper word of ctr  */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		vctr.d[0], x7
	b		.Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
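
	/*
	 * XTS reference relations (a hedged model; T is the tweak, E_rk1 and
	 * E_rk2 one AES block operation under the respective schedule):
	 *
	 *	T(0)   = E_rk2(iv)              // only when 'first' is set
	 *	T(j+1) = T(j) * x in GF(2^128)  // next_tweak, below
	 *	C(j)   = E_rk1(P(j) ^ T(j)) ^ T(j)
	 *
	 * 'first' marks the start of a message, in which case the incoming
	 * iv still has to be turned into the initial tweak; on continuation
	 * calls the iv already is the current tweak.
	 */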

	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
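
	/*
	 * next_tweak multiplies the tweak by x in GF(2^128), reducing by
	 * x^128 + x^7 + x^2 + x + 1.  xts_load_mask builds { 0x1, 0x87 } in
	 * the two 64-bit lanes of xtsmask; after the ext swaps the lanes of
	 * the masked sign bits, the carry out of the low half feeds bit 0
	 * of the high half and the carry out of the high half folds back in
	 * as 0x87.  Equivalent C (a hedged sketch, little-endian limbs):
	 *
	 *	void next_tweak(u64 t[2])
	 *	{
	 *		u64 carry_lo = t[0] >> 63, carry_hi = t[1] >> 63;
	 *
	 *		t[1] = (t[1] << 1) | carry_lo;
	 *		t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);
	 *	}
	 */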

AES_ENTRY(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_ENDPROC(aes_xts_encrypt)

AES_ENTRY(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsdecctsout
AES_ENDPROC(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
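
	/*
	 * CBC-MAC core (used by the CMAC/XCBC/CBC-MAC glue): the digest is
	 * folded as dg = E(dg ^ block) over the input.  enc_before requests
	 * one extra encryption of dg on entry; enc_after decides whether the
	 * final block's encryption happens now or is left to a later call.
	 * Hedged C model with illustrative notation:
	 *
	 *	if (enc_before)
	 *		dg = E(dg);
	 *	while (blocks--) {
	 *		dg ^= get_block(in); in += 16;
	 *		if (blocks || enc_after)
	 *			dg = E(dg);
	 *	}
	 */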
AES_ENTRY(aes_mac_update)
	frame_push	6

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4
	mov		x24, x6

	ld1		{v0.16b}, [x23]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w22, w22, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x19], #64	/* get next 4 pt blocks */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w21, x20, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w21, x20, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w21, x20, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w22, wzr
	csinv		x5, x24, xzr, eq
	cbz		w5, .Lmacout
	encrypt_block	v0, w21, x20, x7, w8
	st1		{v0.16b}, [x23]			/* return dg */
	cond_yield_neon	.Lmacrestart
	b		.Lmacloop4x
.Lmac1x:
	add		w22, w22, #4
.Lmacloop:
	cbz		w22, .Lmacout
	ld1		{v1.16b}, [x19], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w22, w22, #1
	csinv		x5, x24, xzr, eq
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w21, x20, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x23]			/* return dg */
	frame_pop
	ret

.Lmacrestart:
	ld1		{v0.16b}, [x23]			/* get dg */
	enc_prepare	w21, x20, x0
	b		.Lmacloop4x
AES_ENDPROC(aes_mac_update)