xref: /linux/arch/arm64/crypto/aes-modes.S (revision 4b132aacb0768ac1e652cf517097ea6f237214b9)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
4 *
5 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
6 */
7
8/* included by aes-ce.S and aes-neon.S */
9
10	.text
11	.align		4
12
13#ifndef MAX_STRIDE
14#define MAX_STRIDE	4
15#endif
16
17#if MAX_STRIDE == 4
18#define ST4(x...) x
19#define ST5(x...)
20#else
21#define ST4(x...)
22#define ST5(x...) x
23#endif
24
25SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
26	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
27	ret
28SYM_FUNC_END(aes_encrypt_block4x)
29
30SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
31	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
32	ret
33SYM_FUNC_END(aes_decrypt_block4x)
34
35#if MAX_STRIDE == 5
36SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
37	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
38	ret
39SYM_FUNC_END(aes_encrypt_block5x)
40
41SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
42	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
43	ret
44SYM_FUNC_END(aes_decrypt_block5x)
45#endif
46
47	/*
48	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
49	 *		   int blocks)
50	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
51	 *		   int blocks)
52	 */
53
54AES_FUNC_START(aes_ecb_encrypt)
55	frame_push	0
56
57	enc_prepare	w3, x2, x5
58
59.LecbencloopNx:
60	subs		w4, w4, #MAX_STRIDE
61	bmi		.Lecbenc1x
62	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
63ST4(	bl		aes_encrypt_block4x		)
64ST5(	ld1		{v4.16b}, [x1], #16		)
65ST5(	bl		aes_encrypt_block5x		)
66	st1		{v0.16b-v3.16b}, [x0], #64
67ST5(	st1		{v4.16b}, [x0], #16		)
68	b		.LecbencloopNx
69.Lecbenc1x:
70	adds		w4, w4, #MAX_STRIDE
71	beq		.Lecbencout
72.Lecbencloop:
73	ld1		{v0.16b}, [x1], #16		/* get next pt block */
74	encrypt_block	v0, w3, x2, x5, w6
75	st1		{v0.16b}, [x0], #16
76	subs		w4, w4, #1
77	bne		.Lecbencloop
78.Lecbencout:
79	frame_pop
80	ret
81AES_FUNC_END(aes_ecb_encrypt)
82
83
84AES_FUNC_START(aes_ecb_decrypt)
85	frame_push	0
86
87	dec_prepare	w3, x2, x5
88
89.LecbdecloopNx:
90	subs		w4, w4, #MAX_STRIDE
91	bmi		.Lecbdec1x
92	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
93ST4(	bl		aes_decrypt_block4x		)
94ST5(	ld1		{v4.16b}, [x1], #16		)
95ST5(	bl		aes_decrypt_block5x		)
96	st1		{v0.16b-v3.16b}, [x0], #64
97ST5(	st1		{v4.16b}, [x0], #16		)
98	b		.LecbdecloopNx
99.Lecbdec1x:
100	adds		w4, w4, #MAX_STRIDE
101	beq		.Lecbdecout
102.Lecbdecloop:
103	ld1		{v0.16b}, [x1], #16		/* get next ct block */
104	decrypt_block	v0, w3, x2, x5, w6
105	st1		{v0.16b}, [x0], #16
106	subs		w4, w4, #1
107	bne		.Lecbdecloop
108.Lecbdecout:
109	frame_pop
110	ret
111AES_FUNC_END(aes_ecb_decrypt)
112
113
114	/*
115	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
116	 *		   int blocks, u8 iv[])
117	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
118	 *		   int blocks, u8 iv[])
119	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
120	 *			 int rounds, int blocks, u8 iv[],
121	 *			 u32 const rk2[]);
122	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
123	 *			 int rounds, int blocks, u8 iv[],
124	 *			 u32 const rk2[]);
125	 */
126
127AES_FUNC_START(aes_essiv_cbc_encrypt)
128	ld1		{v4.16b}, [x5]			/* get iv */
129
130	mov		w8, #14				/* AES-256: 14 rounds */
131	enc_prepare	w8, x6, x7
132	encrypt_block	v4, w8, x6, x7, w9
133	enc_switch_key	w3, x2, x6
134	b		.Lcbcencloop4x
135
136AES_FUNC_START(aes_cbc_encrypt)
137	ld1		{v4.16b}, [x5]			/* get iv */
138	enc_prepare	w3, x2, x6
139
140.Lcbcencloop4x:
141	subs		w4, w4, #4
142	bmi		.Lcbcenc1x
143	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
144	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
145	encrypt_block	v0, w3, x2, x6, w7
146	eor		v1.16b, v1.16b, v0.16b
147	encrypt_block	v1, w3, x2, x6, w7
148	eor		v2.16b, v2.16b, v1.16b
149	encrypt_block	v2, w3, x2, x6, w7
150	eor		v3.16b, v3.16b, v2.16b
151	encrypt_block	v3, w3, x2, x6, w7
152	st1		{v0.16b-v3.16b}, [x0], #64
153	mov		v4.16b, v3.16b
154	b		.Lcbcencloop4x
155.Lcbcenc1x:
156	adds		w4, w4, #4
157	beq		.Lcbcencout
158.Lcbcencloop:
159	ld1		{v0.16b}, [x1], #16		/* get next pt block */
160	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
161	encrypt_block	v4, w3, x2, x6, w7
162	st1		{v4.16b}, [x0], #16
163	subs		w4, w4, #1
164	bne		.Lcbcencloop
165.Lcbcencout:
166	st1		{v4.16b}, [x5]			/* return iv */
167	ret
168AES_FUNC_END(aes_cbc_encrypt)
169AES_FUNC_END(aes_essiv_cbc_encrypt)
170
171AES_FUNC_START(aes_essiv_cbc_decrypt)
172	ld1		{cbciv.16b}, [x5]		/* get iv */
173
174	mov		w8, #14				/* AES-256: 14 rounds */
175	enc_prepare	w8, x6, x7
176	encrypt_block	cbciv, w8, x6, x7, w9
177	b		.Lessivcbcdecstart
178
179AES_FUNC_START(aes_cbc_decrypt)
180	ld1		{cbciv.16b}, [x5]		/* get iv */
181.Lessivcbcdecstart:
182	frame_push	0
183	dec_prepare	w3, x2, x6
184
185.LcbcdecloopNx:
186	subs		w4, w4, #MAX_STRIDE
187	bmi		.Lcbcdec1x
188	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
189#if MAX_STRIDE == 5
190	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
191	mov		v5.16b, v0.16b
192	mov		v6.16b, v1.16b
193	mov		v7.16b, v2.16b
194	bl		aes_decrypt_block5x
195	sub		x1, x1, #32
196	eor		v0.16b, v0.16b, cbciv.16b
197	eor		v1.16b, v1.16b, v5.16b
198	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
199	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
200	eor		v2.16b, v2.16b, v6.16b
201	eor		v3.16b, v3.16b, v7.16b
202	eor		v4.16b, v4.16b, v5.16b
203#else
204	mov		v4.16b, v0.16b
205	mov		v5.16b, v1.16b
206	mov		v6.16b, v2.16b
207	bl		aes_decrypt_block4x
208	sub		x1, x1, #16
209	eor		v0.16b, v0.16b, cbciv.16b
210	eor		v1.16b, v1.16b, v4.16b
211	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
212	eor		v2.16b, v2.16b, v5.16b
213	eor		v3.16b, v3.16b, v6.16b
214#endif
215	st1		{v0.16b-v3.16b}, [x0], #64
216ST5(	st1		{v4.16b}, [x0], #16		)
217	b		.LcbcdecloopNx
218.Lcbcdec1x:
219	adds		w4, w4, #MAX_STRIDE
220	beq		.Lcbcdecout
221.Lcbcdecloop:
222	ld1		{v1.16b}, [x1], #16		/* get next ct block */
223	mov		v0.16b, v1.16b			/* ...and copy to v0 */
224	decrypt_block	v0, w3, x2, x6, w7
225	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
226	mov		cbciv.16b, v1.16b		/* ct is next iv */
227	st1		{v0.16b}, [x0], #16
228	subs		w4, w4, #1
229	bne		.Lcbcdecloop
230.Lcbcdecout:
231	st1		{cbciv.16b}, [x5]		/* return iv */
232	frame_pop
233	ret
234AES_FUNC_END(aes_cbc_decrypt)
235AES_FUNC_END(aes_essiv_cbc_decrypt)
236
237
238	/*
239	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
240	 *		       int rounds, int bytes, u8 const iv[])
241	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
242	 *		       int rounds, int bytes, u8 const iv[])
243	 */
244
245AES_FUNC_START(aes_cbc_cts_encrypt)
246	adr_l		x8, .Lcts_permute_table
247	sub		x4, x4, #16
248	add		x9, x8, #32
249	add		x8, x8, x4
250	sub		x9, x9, x4
251	ld1		{v3.16b}, [x8]
252	ld1		{v4.16b}, [x9]
253
254	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
255	ld1		{v1.16b}, [x1]
256
257	ld1		{v5.16b}, [x5]			/* get iv */
258	enc_prepare	w3, x2, x6
259
260	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
261	tbl		v1.16b, {v1.16b}, v4.16b
262	encrypt_block	v0, w3, x2, x6, w7
263
264	eor		v1.16b, v1.16b, v0.16b
265	tbl		v0.16b, {v0.16b}, v3.16b
266	encrypt_block	v1, w3, x2, x6, w7
267
268	add		x4, x0, x4
269	st1		{v0.16b}, [x4]			/* overlapping stores */
270	st1		{v1.16b}, [x0]
271	ret
272AES_FUNC_END(aes_cbc_cts_encrypt)
273
274AES_FUNC_START(aes_cbc_cts_decrypt)
275	adr_l		x8, .Lcts_permute_table
276	sub		x4, x4, #16
277	add		x9, x8, #32
278	add		x8, x8, x4
279	sub		x9, x9, x4
280	ld1		{v3.16b}, [x8]
281	ld1		{v4.16b}, [x9]
282
283	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
284	ld1		{v1.16b}, [x1]
285
286	ld1		{v5.16b}, [x5]			/* get iv */
287	dec_prepare	w3, x2, x6
288
289	decrypt_block	v0, w3, x2, x6, w7
290	tbl		v2.16b, {v0.16b}, v3.16b
291	eor		v2.16b, v2.16b, v1.16b
292
293	tbx		v0.16b, {v1.16b}, v4.16b
294	decrypt_block	v0, w3, x2, x6, w7
295	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
296
297	add		x4, x0, x4
298	st1		{v2.16b}, [x4]			/* overlapping stores */
299	st1		{v0.16b}, [x0]
300	ret
301AES_FUNC_END(aes_cbc_cts_decrypt)
302
303	.section	".rodata", "a"
304	.align		6
305.Lcts_permute_table:
306	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
307	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
308	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
309	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
310	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
311	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
312	.previous
313
314	/*
315	 * This macro generates the code for CTR and XCTR mode.
316	 */
317.macro ctr_encrypt xctr
318	// Arguments
319	OUT		.req x0
320	IN		.req x1
321	KEY		.req x2
322	ROUNDS_W	.req w3
323	BYTES_W		.req w4
324	IV		.req x5
325	BYTE_CTR_W 	.req w6		// XCTR only
326	// Intermediate values
327	CTR_W		.req w11	// XCTR only
328	CTR		.req x11	// XCTR only
329	IV_PART		.req x12
330	BLOCKS		.req x13
331	BLOCKS_W	.req w13
332
333	frame_push	0
334
335	enc_prepare	ROUNDS_W, KEY, IV_PART
336	ld1		{vctr.16b}, [IV]
337
338	/*
339	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
340	 * easily increment the IV.  For XCTR mode this lets us efficiently XOR
341	 * the 64-bit counter with the IV.
342	 */
343	.if \xctr
344		umov		IV_PART, vctr.d[0]
345		lsr		CTR_W, BYTE_CTR_W, #4
346	.else
347		umov		IV_PART, vctr.d[1]
348		rev		IV_PART, IV_PART
349	.endif
350
351.LctrloopNx\xctr:
352	add		BLOCKS_W, BYTES_W, #15
353	sub		BYTES_W, BYTES_W, #MAX_STRIDE << 4
354	lsr		BLOCKS_W, BLOCKS_W, #4
355	mov		w8, #MAX_STRIDE
356	cmp		BLOCKS_W, w8
357	csel		BLOCKS_W, BLOCKS_W, w8, lt
358
359	/*
360	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
361	 *
362	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
363	 * handling code expects the last keystream block to be in
364	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
365	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
366	 */
367	.if \xctr
368		add		CTR, CTR, BLOCKS
369	.else
370		adds		IV_PART, IV_PART, BLOCKS
371	.endif
372	mov		v0.16b, vctr.16b
373	mov		v1.16b, vctr.16b
374	mov		v2.16b, vctr.16b
375	mov		v3.16b, vctr.16b
376ST5(	mov		v4.16b, vctr.16b		)
377	.if \xctr
378		sub		x6, CTR, #MAX_STRIDE - 1
379		sub		x7, CTR, #MAX_STRIDE - 2
380		sub		x8, CTR, #MAX_STRIDE - 3
381		sub		x9, CTR, #MAX_STRIDE - 4
382ST5(		sub		x10, CTR, #MAX_STRIDE - 5	)
383		eor		x6, x6, IV_PART
384		eor		x7, x7, IV_PART
385		eor		x8, x8, IV_PART
386		eor		x9, x9, IV_PART
387ST5(		eor		x10, x10, IV_PART		)
388		mov		v0.d[0], x6
389		mov		v1.d[0], x7
390		mov		v2.d[0], x8
391		mov		v3.d[0], x9
392ST5(		mov		v4.d[0], x10			)
393	.else
394		bcs		0f
395		.subsection	1
396		/*
397		 * This subsection handles carries.
398		 *
399		 * Conditional branching here is allowed with respect to time
400		 * invariance since the branches are dependent on the IV instead
401		 * of the plaintext or key.  This code is rarely executed in
402		 * practice anyway.
403		 */
404
405		/* Apply carry to outgoing counter. */
4060:		umov		x8, vctr.d[0]
407		rev		x8, x8
408		add		x8, x8, #1
409		rev		x8, x8
410		ins		vctr.d[0], x8
411
412		/*
413		 * Apply carry to counter blocks if needed.
414		 *
415		 * Since the carry flag was set, we know 0 <= IV_PART <
416		 * MAX_STRIDE.  Using the value of IV_PART we can determine how
417		 * many counter blocks need to be updated.
418		 */
419		cbz		IV_PART, 2f
420		adr		x16, 1f
421		sub		x16, x16, IV_PART, lsl #3
422		br		x16
423		bti		c
424		mov		v0.d[0], vctr.d[0]
425		bti		c
426		mov		v1.d[0], vctr.d[0]
427		bti		c
428		mov		v2.d[0], vctr.d[0]
429		bti		c
430		mov		v3.d[0], vctr.d[0]
431ST5(		bti		c				)
432ST5(		mov		v4.d[0], vctr.d[0]		)
4331:		b		2f
434		.previous
435
4362:		rev		x7, IV_PART
437		ins		vctr.d[1], x7
438		sub		x7, IV_PART, #MAX_STRIDE - 1
439		sub		x8, IV_PART, #MAX_STRIDE - 2
440		sub		x9, IV_PART, #MAX_STRIDE - 3
441		rev		x7, x7
442		rev		x8, x8
443		mov		v1.d[1], x7
444		rev		x9, x9
445ST5(		sub		x10, IV_PART, #MAX_STRIDE - 4	)
446		mov		v2.d[1], x8
447ST5(		rev		x10, x10			)
448		mov		v3.d[1], x9
449ST5(		mov		v4.d[1], x10			)
450	.endif
451
452	/*
453	 * If there are at least MAX_STRIDE blocks left, XOR the data with
454	 * keystream and store.  Otherwise jump to tail handling.
455	 */
456	tbnz		BYTES_W, #31, .Lctrtail\xctr
457	ld1		{v5.16b-v7.16b}, [IN], #48
458ST4(	bl		aes_encrypt_block4x		)
459ST5(	bl		aes_encrypt_block5x		)
460	eor		v0.16b, v5.16b, v0.16b
461ST4(	ld1		{v5.16b}, [IN], #16		)
462	eor		v1.16b, v6.16b, v1.16b
463ST5(	ld1		{v5.16b-v6.16b}, [IN], #32	)
464	eor		v2.16b, v7.16b, v2.16b
465	eor		v3.16b, v5.16b, v3.16b
466ST5(	eor		v4.16b, v6.16b, v4.16b		)
467	st1		{v0.16b-v3.16b}, [OUT], #64
468ST5(	st1		{v4.16b}, [OUT], #16		)
469	cbz		BYTES_W, .Lctrout\xctr
470	b		.LctrloopNx\xctr
471
472.Lctrout\xctr:
473	.if !\xctr
474		st1		{vctr.16b}, [IV] /* return next CTR value */
475	.endif
476	frame_pop
477	ret
478
479.Lctrtail\xctr:
480	/*
481	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
482	 *
483	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
484	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
485	 * v4 should have the next two counter blocks.
486	 *
487	 * This allows us to store the ciphertext by writing to overlapping
488	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
489	 * correctly computed blocks.  This approach greatly simplifies the
490	 * logic for storing the ciphertext.
491	 */
492	mov		x16, #16
493	ands		w7, BYTES_W, #0xf
494	csel		x13, x7, x16, ne
495
496ST5(	cmp		BYTES_W, #64 - (MAX_STRIDE << 4))
497ST5(	csel		x14, x16, xzr, gt		)
498	cmp		BYTES_W, #48 - (MAX_STRIDE << 4)
499	csel		x15, x16, xzr, gt
500	cmp		BYTES_W, #32 - (MAX_STRIDE << 4)
501	csel		x16, x16, xzr, gt
502	cmp		BYTES_W, #16 - (MAX_STRIDE << 4)
503
504	adr_l		x9, .Lcts_permute_table
505	add		x9, x9, x13
506	ble		.Lctrtail1x\xctr
507
508ST5(	ld1		{v5.16b}, [IN], x14		)
509	ld1		{v6.16b}, [IN], x15
510	ld1		{v7.16b}, [IN], x16
511
512ST4(	bl		aes_encrypt_block4x		)
513ST5(	bl		aes_encrypt_block5x		)
514
515	ld1		{v8.16b}, [IN], x13
516	ld1		{v9.16b}, [IN]
517	ld1		{v10.16b}, [x9]
518
519ST4(	eor		v6.16b, v6.16b, v0.16b		)
520ST4(	eor		v7.16b, v7.16b, v1.16b		)
521ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
522ST4(	eor		v8.16b, v8.16b, v2.16b		)
523ST4(	eor		v9.16b, v9.16b, v3.16b		)
524
525ST5(	eor		v5.16b, v5.16b, v0.16b		)
526ST5(	eor		v6.16b, v6.16b, v1.16b		)
527ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
528ST5(	eor		v7.16b, v7.16b, v2.16b		)
529ST5(	eor		v8.16b, v8.16b, v3.16b		)
530ST5(	eor		v9.16b, v9.16b, v4.16b		)
531
532ST5(	st1		{v5.16b}, [OUT], x14		)
533	st1		{v6.16b}, [OUT], x15
534	st1		{v7.16b}, [OUT], x16
535	add		x13, x13, OUT
536	st1		{v9.16b}, [x13]		// overlapping stores
537	st1		{v8.16b}, [OUT]
538	b		.Lctrout\xctr
539
540.Lctrtail1x\xctr:
541	/*
542	 * Handle <= 16 bytes of plaintext
543	 *
544	 * This code always reads and writes 16 bytes.  To avoid out of bounds
545	 * accesses, XCTR and CTR modes must use a temporary buffer when
546	 * encrypting/decrypting less than 16 bytes.
547	 *
548	 * This code is unusual in that it loads the input and stores the output
549	 * relative to the end of the buffers rather than relative to the start.
550	 * This causes unusual behaviour when encrypting/decrypting less than 16
551	 * bytes; the end of the data is expected to be at the end of the
552	 * temporary buffer rather than the start of the data being at the start
553	 * of the temporary buffer.
554	 */
555	sub		x8, x7, #16
556	csel		x7, x7, x8, eq
557	add		IN, IN, x7
558	add		OUT, OUT, x7
559	ld1		{v5.16b}, [IN]
560	ld1		{v6.16b}, [OUT]
561ST5(	mov		v3.16b, v4.16b			)
562	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
563	ld1		{v10.16b-v11.16b}, [x9]
564	tbl		v3.16b, {v3.16b}, v10.16b
565	sshr		v11.16b, v11.16b, #7
566	eor		v5.16b, v5.16b, v3.16b
567	bif		v5.16b, v6.16b, v11.16b
568	st1		{v5.16b}, [OUT]
569	b		.Lctrout\xctr
570
571	// Arguments
572	.unreq OUT
573	.unreq IN
574	.unreq KEY
575	.unreq ROUNDS_W
576	.unreq BYTES_W
577	.unreq IV
578	.unreq BYTE_CTR_W	// XCTR only
579	// Intermediate values
580	.unreq CTR_W		// XCTR only
581	.unreq CTR		// XCTR only
582	.unreq IV_PART
583	.unreq BLOCKS
584	.unreq BLOCKS_W
585.endm
586
587	/*
588	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
589	 *		   int bytes, u8 ctr[])
590	 *
591	 * The input and output buffers must always be at least 16 bytes even if
592	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
593	 * accesses will occur.  The data to be encrypted/decrypted is expected
594	 * to be at the end of this 16-byte temporary buffer rather than the
595	 * start.
596	 */
597
598AES_FUNC_START(aes_ctr_encrypt)
599	ctr_encrypt 0
600AES_FUNC_END(aes_ctr_encrypt)
601
602	/*
603	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
604	 *		   int bytes, u8 const iv[], int byte_ctr)
605	 *
606	 * The input and output buffers must always be at least 16 bytes even if
607	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
608	 * accesses will occur.  The data to be encrypted/decrypted is expected
609	 * to be at the end of this 16-byte temporary buffer rather than the
610	 * start.
611	 */
612
613AES_FUNC_START(aes_xctr_encrypt)
614	ctr_encrypt 1
615AES_FUNC_END(aes_xctr_encrypt)
616
617
618	/*
619	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
620	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
621	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
622	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
623	 */
624
625	.macro		next_tweak, out, in, tmp
626	sshr		\tmp\().2d,  \in\().2d,   #63
627	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
628	add		\out\().2d,  \in\().2d,   \in\().2d
629	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
630	eor		\out\().16b, \out\().16b, \tmp\().16b
631	.endm
632
633	.macro		xts_load_mask, tmp
634	movi		xtsmask.2s, #0x1
635	movi		\tmp\().2s, #0x87
636	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
637	.endm
638
639AES_FUNC_START(aes_xts_encrypt)
640	frame_push	0
641
642	ld1		{v4.16b}, [x6]
643	xts_load_mask	v8
644	cbz		w7, .Lxtsencnotfirst
645
646	enc_prepare	w3, x5, x8
647	xts_cts_skip_tw	w7, .LxtsencNx
648	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
649	enc_switch_key	w3, x2, x8
650	b		.LxtsencNx
651
652.Lxtsencnotfirst:
653	enc_prepare	w3, x2, x8
654.LxtsencloopNx:
655	next_tweak	v4, v4, v8
656.LxtsencNx:
657	subs		w4, w4, #64
658	bmi		.Lxtsenc1x
659	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
660	next_tweak	v5, v4, v8
661	eor		v0.16b, v0.16b, v4.16b
662	next_tweak	v6, v5, v8
663	eor		v1.16b, v1.16b, v5.16b
664	eor		v2.16b, v2.16b, v6.16b
665	next_tweak	v7, v6, v8
666	eor		v3.16b, v3.16b, v7.16b
667	bl		aes_encrypt_block4x
668	eor		v3.16b, v3.16b, v7.16b
669	eor		v0.16b, v0.16b, v4.16b
670	eor		v1.16b, v1.16b, v5.16b
671	eor		v2.16b, v2.16b, v6.16b
672	st1		{v0.16b-v3.16b}, [x0], #64
673	mov		v4.16b, v7.16b
674	cbz		w4, .Lxtsencret
675	xts_reload_mask	v8
676	b		.LxtsencloopNx
677.Lxtsenc1x:
678	adds		w4, w4, #64
679	beq		.Lxtsencout
680	subs		w4, w4, #16
681	bmi		.LxtsencctsNx
682.Lxtsencloop:
683	ld1		{v0.16b}, [x1], #16
684.Lxtsencctsout:
685	eor		v0.16b, v0.16b, v4.16b
686	encrypt_block	v0, w3, x2, x8, w7
687	eor		v0.16b, v0.16b, v4.16b
688	cbz		w4, .Lxtsencout
689	subs		w4, w4, #16
690	next_tweak	v4, v4, v8
691	bmi		.Lxtsenccts
692	st1		{v0.16b}, [x0], #16
693	b		.Lxtsencloop
694.Lxtsencout:
695	st1		{v0.16b}, [x0]
696.Lxtsencret:
697	st1		{v4.16b}, [x6]
698	frame_pop
699	ret
700
701.LxtsencctsNx:
702	mov		v0.16b, v3.16b
703	sub		x0, x0, #16
704.Lxtsenccts:
705	adr_l		x8, .Lcts_permute_table
706
707	add		x1, x1, w4, sxtw	/* rewind input pointer */
708	add		w4, w4, #16		/* # bytes in final block */
709	add		x9, x8, #32
710	add		x8, x8, x4
711	sub		x9, x9, x4
712	add		x4, x0, x4		/* output address of final block */
713
714	ld1		{v1.16b}, [x1]		/* load final block */
715	ld1		{v2.16b}, [x8]
716	ld1		{v3.16b}, [x9]
717
718	tbl		v2.16b, {v0.16b}, v2.16b
719	tbx		v0.16b, {v1.16b}, v3.16b
720	st1		{v2.16b}, [x4]			/* overlapping stores */
721	mov		w4, wzr
722	b		.Lxtsencctsout
723AES_FUNC_END(aes_xts_encrypt)
724
725AES_FUNC_START(aes_xts_decrypt)
726	frame_push	0
727
728	/* subtract 16 bytes if we are doing CTS */
729	sub		w8, w4, #0x10
730	tst		w4, #0xf
731	csel		w4, w4, w8, eq
732
733	ld1		{v4.16b}, [x6]
734	xts_load_mask	v8
735	xts_cts_skip_tw	w7, .Lxtsdecskiptw
736	cbz		w7, .Lxtsdecnotfirst
737
738	enc_prepare	w3, x5, x8
739	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
740.Lxtsdecskiptw:
741	dec_prepare	w3, x2, x8
742	b		.LxtsdecNx
743
744.Lxtsdecnotfirst:
745	dec_prepare	w3, x2, x8
746.LxtsdecloopNx:
747	next_tweak	v4, v4, v8
748.LxtsdecNx:
749	subs		w4, w4, #64
750	bmi		.Lxtsdec1x
751	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
752	next_tweak	v5, v4, v8
753	eor		v0.16b, v0.16b, v4.16b
754	next_tweak	v6, v5, v8
755	eor		v1.16b, v1.16b, v5.16b
756	eor		v2.16b, v2.16b, v6.16b
757	next_tweak	v7, v6, v8
758	eor		v3.16b, v3.16b, v7.16b
759	bl		aes_decrypt_block4x
760	eor		v3.16b, v3.16b, v7.16b
761	eor		v0.16b, v0.16b, v4.16b
762	eor		v1.16b, v1.16b, v5.16b
763	eor		v2.16b, v2.16b, v6.16b
764	st1		{v0.16b-v3.16b}, [x0], #64
765	mov		v4.16b, v7.16b
766	cbz		w4, .Lxtsdecout
767	xts_reload_mask	v8
768	b		.LxtsdecloopNx
769.Lxtsdec1x:
770	adds		w4, w4, #64
771	beq		.Lxtsdecout
772	subs		w4, w4, #16
773.Lxtsdecloop:
774	ld1		{v0.16b}, [x1], #16
775	bmi		.Lxtsdeccts
776.Lxtsdecctsout:
777	eor		v0.16b, v0.16b, v4.16b
778	decrypt_block	v0, w3, x2, x8, w7
779	eor		v0.16b, v0.16b, v4.16b
780	st1		{v0.16b}, [x0], #16
781	cbz		w4, .Lxtsdecout
782	subs		w4, w4, #16
783	next_tweak	v4, v4, v8
784	b		.Lxtsdecloop
785.Lxtsdecout:
786	st1		{v4.16b}, [x6]
787	frame_pop
788	ret
789
790.Lxtsdeccts:
791	adr_l		x8, .Lcts_permute_table
792
793	add		x1, x1, w4, sxtw	/* rewind input pointer */
794	add		w4, w4, #16		/* # bytes in final block */
795	add		x9, x8, #32
796	add		x8, x8, x4
797	sub		x9, x9, x4
798	add		x4, x0, x4		/* output address of final block */
799
800	next_tweak	v5, v4, v8
801
802	ld1		{v1.16b}, [x1]		/* load final block */
803	ld1		{v2.16b}, [x8]
804	ld1		{v3.16b}, [x9]
805
806	eor		v0.16b, v0.16b, v5.16b
807	decrypt_block	v0, w3, x2, x8, w7
808	eor		v0.16b, v0.16b, v5.16b
809
810	tbl		v2.16b, {v0.16b}, v2.16b
811	tbx		v0.16b, {v1.16b}, v3.16b
812
813	st1		{v2.16b}, [x4]			/* overlapping stores */
814	mov		w4, wzr
815	b		.Lxtsdecctsout
816AES_FUNC_END(aes_xts_decrypt)
817
818	/*
819	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
820	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
821	 */
822AES_FUNC_START(aes_mac_update)
823	ld1		{v0.16b}, [x4]			/* get dg */
824	enc_prepare	w2, x1, x7
825	cbz		w5, .Lmacloop4x
826
827	encrypt_block	v0, w2, x1, x7, w8
828
829.Lmacloop4x:
830	subs		w3, w3, #4
831	bmi		.Lmac1x
832	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
833	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
834	encrypt_block	v0, w2, x1, x7, w8
835	eor		v0.16b, v0.16b, v2.16b
836	encrypt_block	v0, w2, x1, x7, w8
837	eor		v0.16b, v0.16b, v3.16b
838	encrypt_block	v0, w2, x1, x7, w8
839	eor		v0.16b, v0.16b, v4.16b
840	cmp		w3, wzr
841	csinv		x5, x6, xzr, eq
842	cbz		w5, .Lmacout
843	encrypt_block	v0, w2, x1, x7, w8
844	st1		{v0.16b}, [x4]			/* return dg */
845	cond_yield	.Lmacout, x7, x8
846	b		.Lmacloop4x
847.Lmac1x:
848	add		w3, w3, #4
849.Lmacloop:
850	cbz		w3, .Lmacout
851	ld1		{v1.16b}, [x0], #16		/* get next pt block */
852	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
853
854	subs		w3, w3, #1
855	csinv		x5, x6, xzr, eq
856	cbz		w5, .Lmacout
857
858.Lmacenc:
859	encrypt_block	v0, w2, x1, x7, w8
860	b		.Lmacloop
861
862.Lmacout:
863	st1		{v0.16b}, [x4]			/* return dg */
864	mov		w0, w3
865	ret
866AES_FUNC_END(aes_mac_update)
867