xref: /linux/arch/arm64/crypto/aes-modes.S (revision 3f8ae9fe0409698799e173f698b714f34570b64b)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

10	.text
11	.align		4
12
13#ifndef MAX_STRIDE
14#define MAX_STRIDE	4
15#endif
16
17#if MAX_STRIDE == 4
18#define ST4(x...) x
19#define ST5(x...)
20#else
21#define ST4(x...)
22#define ST5(x...) x
23#endif
24
/*
 * File-local helper: expands the encrypt_block4x macro on v0-v3 and
 * returns.  It is reached with "bl" from the mode routines in this file,
 * so callers must have saved x30.  In those callers w3 holds the round
 * count and x2 the round key pointer; x8 and w7 are handed to the macro as
 * well (presumably as scratch -- the macro is defined by the including
 * file, confirm there).
 */
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

/*
 * File-local helper: expands the decrypt_block4x macro on v0-v3 and
 * returns.  It is reached with "bl" from the mode routines in this file,
 * so callers must have saved x30.  In those callers w3 holds the round
 * count and x2 the round key pointer; x8 and w7 are handed to the macro as
 * well (presumably as scratch -- the macro is defined by the including
 * file, confirm there).
 */
SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
/*
 * 5-way counterparts of aes_encrypt_block4x / aes_decrypt_block4x,
 * operating on v0-v4.  Only assembled when MAX_STRIDE == 5.  Same register
 * arguments as the 4-way helpers: w3, x2, x8, w7.
 */
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

47	/*
48	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
49	 *		   int blocks)
50	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
51	 *		   int blocks)
52	 */
53
54AES_FUNC_START(aes_ecb_encrypt)
55	stp		x29, x30, [sp, #-16]!
56	mov		x29, sp
57
58	enc_prepare	w3, x2, x5
59
60.LecbencloopNx:
61	subs		w4, w4, #MAX_STRIDE
62	bmi		.Lecbenc1x
63	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
64ST4(	bl		aes_encrypt_block4x		)
65ST5(	ld1		{v4.16b}, [x1], #16		)
66ST5(	bl		aes_encrypt_block5x		)
67	st1		{v0.16b-v3.16b}, [x0], #64
68ST5(	st1		{v4.16b}, [x0], #16		)
69	b		.LecbencloopNx
70.Lecbenc1x:
71	adds		w4, w4, #MAX_STRIDE
72	beq		.Lecbencout
73.Lecbencloop:
74	ld1		{v0.16b}, [x1], #16		/* get next pt block */
75	encrypt_block	v0, w3, x2, x5, w6
76	st1		{v0.16b}, [x0], #16
77	subs		w4, w4, #1
78	bne		.Lecbencloop
79.Lecbencout:
80	ldp		x29, x30, [sp], #16
81	ret
82AES_FUNC_END(aes_ecb_encrypt)
83
84
/*
 * aes_ecb_decrypt: x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks.
 * Same structure as aes_ecb_encrypt: a MAX_STRIDE-blocks-per-iteration
 * loop followed by a one-block-per-iteration tail loop.
 */
AES_FUNC_START(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!		/* x30 is clobbered by bl below */
	mov		x29, sp

	dec_prepare	w3, x2, x5

	/* Bulk loop: MAX_STRIDE blocks per iteration. */
.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x			/* fewer than MAX_STRIDE left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the over-subtraction */
	beq		.Lecbdecout			/* nothing left */
	/* Tail loop: one block per iteration. */
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)


116	/*
117	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
118	 *		   int blocks, u8 iv[])
119	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
120	 *		   int blocks, u8 iv[])
121	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
122	 *			 int rounds, int blocks, u8 iv[],
123	 *			 u32 const rk2[]);
124	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
125	 *			 int rounds, int blocks, u8 iv[],
126	 *			 u32 const rk2[]);
127	 */
128
129AES_FUNC_START(aes_essiv_cbc_encrypt)
130	ld1		{v4.16b}, [x5]			/* get iv */
131
132	mov		w8, #14				/* AES-256: 14 rounds */
133	enc_prepare	w8, x6, x7
134	encrypt_block	v4, w8, x6, x7, w9
135	enc_switch_key	w3, x2, x6
136	b		.Lcbcencloop4x
137
138AES_FUNC_START(aes_cbc_encrypt)
139	ld1		{v4.16b}, [x5]			/* get iv */
140	enc_prepare	w3, x2, x6
141
142.Lcbcencloop4x:
143	subs		w4, w4, #4
144	bmi		.Lcbcenc1x
145	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
146	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
147	encrypt_block	v0, w3, x2, x6, w7
148	eor		v1.16b, v1.16b, v0.16b
149	encrypt_block	v1, w3, x2, x6, w7
150	eor		v2.16b, v2.16b, v1.16b
151	encrypt_block	v2, w3, x2, x6, w7
152	eor		v3.16b, v3.16b, v2.16b
153	encrypt_block	v3, w3, x2, x6, w7
154	st1		{v0.16b-v3.16b}, [x0], #64
155	mov		v4.16b, v3.16b
156	b		.Lcbcencloop4x
157.Lcbcenc1x:
158	adds		w4, w4, #4
159	beq		.Lcbcencout
160.Lcbcencloop:
161	ld1		{v0.16b}, [x1], #16		/* get next pt block */
162	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
163	encrypt_block	v4, w3, x2, x6, w7
164	st1		{v4.16b}, [x0], #16
165	subs		w4, w4, #1
166	bne		.Lcbcencloop
167.Lcbcencout:
168	st1		{v4.16b}, [x5]			/* return iv */
169	ret
170AES_FUNC_END(aes_cbc_encrypt)
171AES_FUNC_END(aes_essiv_cbc_encrypt)
172
/*
 * aes_essiv_cbc_decrypt / aes_cbc_decrypt
 *
 * Register usage: x0 = out, x1 = in, x2 = rk (rk1), w3 = rounds,
 * w4 = blocks, x5 = iv, x6 = rk2 (ESSIV variant only).
 * cbciv is a vector register alias that is not defined in this file; it
 * holds the chaining value and is written back to iv[] on return.
 *
 * The ESSIV entry point encrypts the IV with the rk2 key schedule using a
 * fixed 14 rounds and then joins the plain CBC decryption path.
 */
AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

	/*
	 * Bulk loop: MAX_STRIDE blocks per iteration.  Copies of the leading
	 * ciphertext blocks are kept in spare registers for the XOR after
	 * decryption; the trailing ciphertext block(s) are reloaded from
	 * memory by rewinding x1.
	 */
.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32			/* rewind to 4th ct block */
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16			/* rewind to 4th ct block */
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE		/* undo the over-subtraction */
	beq		.Lcbcdecout
	/* Tail loop: one block per iteration. */
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


245	/*
246	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
247	 *		       int rounds, int bytes, u8 const iv[])
248	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
249	 *		       int rounds, int bytes, u8 const iv[])
250	 */
251
252AES_FUNC_START(aes_cbc_cts_encrypt)
253	adr_l		x8, .Lcts_permute_table
254	sub		x4, x4, #16
255	add		x9, x8, #32
256	add		x8, x8, x4
257	sub		x9, x9, x4
258	ld1		{v3.16b}, [x8]
259	ld1		{v4.16b}, [x9]
260
261	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
262	ld1		{v1.16b}, [x1]
263
264	ld1		{v5.16b}, [x5]			/* get iv */
265	enc_prepare	w3, x2, x6
266
267	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
268	tbl		v1.16b, {v1.16b}, v4.16b
269	encrypt_block	v0, w3, x2, x6, w7
270
271	eor		v1.16b, v1.16b, v0.16b
272	tbl		v0.16b, {v0.16b}, v3.16b
273	encrypt_block	v1, w3, x2, x6, w7
274
275	add		x4, x0, x4
276	st1		{v0.16b}, [x4]			/* overlapping stores */
277	st1		{v1.16b}, [x0]
278	ret
279AES_FUNC_END(aes_cbc_cts_encrypt)
280
/*
 * aes_cbc_cts_decrypt: x0 = out, x1 = in, x2 = rk, w3 = rounds,
 * w4 = bytes, x5 = iv.
 *
 * Two 16-byte vectors are loaded from in and in + (bytes - 16), and two
 * are stored at out and out + (bytes - 16), so the accesses overlap.
 * This presumably expects 16 < bytes <= 32 -- confirm against the caller.
 */
AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	/*
	 * bytes is a 32-bit argument, so bits 63:32 of x4 are unspecified
	 * on entry.  Do the subtraction on w4 so that the result is
	 * zero-extended before x4 is used as a 64-bit offset below.
	 */
	sub		w4, w4, #16			/* x4 = bytes - 16 */
	add		x9, x8, #32
	add		x8, x8, x4			/* table + (bytes - 16) */
	sub		x9, x9, x4			/* table + 32 - (bytes - 16) */
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* tbx leaves the bytes of v0 untouched where the index is 0xff */
	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4			/* out + (bytes - 16) */
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

310	.section	".rodata", "a"
311	.align		6
312.Lcts_permute_table:
313	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
314	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
315	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
316	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
317	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
318	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
319	.previous
320
321	/*
322	 * This macro generates the code for CTR and XCTR mode.
323	 */
324.macro ctr_encrypt xctr
325	// Arguments
326	OUT		.req x0
327	IN		.req x1
328	KEY		.req x2
329	ROUNDS_W	.req w3
330	BYTES_W		.req w4
331	IV		.req x5
332	BYTE_CTR_W 	.req w6		// XCTR only
333	// Intermediate values
334	CTR_W		.req w11	// XCTR only
335	CTR		.req x11	// XCTR only
336	IV_PART		.req x12
337	BLOCKS		.req x13
338	BLOCKS_W	.req w13
339
340	stp		x29, x30, [sp, #-16]!
341	mov		x29, sp
342
343	enc_prepare	ROUNDS_W, KEY, IV_PART
344	ld1		{vctr.16b}, [IV]
345
346	/*
347	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
348	 * easily increment the IV.  For XCTR mode this lets us efficiently XOR
349	 * the 64-bit counter with the IV.
350	 */
351	.if \xctr
352		umov		IV_PART, vctr.d[0]
353		lsr		CTR_W, BYTE_CTR_W, #4
354	.else
355		umov		IV_PART, vctr.d[1]
356		rev		IV_PART, IV_PART
357	.endif
358
359.LctrloopNx\xctr:
360	add		BLOCKS_W, BYTES_W, #15
361	sub		BYTES_W, BYTES_W, #MAX_STRIDE << 4
362	lsr		BLOCKS_W, BLOCKS_W, #4
363	mov		w8, #MAX_STRIDE
364	cmp		BLOCKS_W, w8
365	csel		BLOCKS_W, BLOCKS_W, w8, lt
366
367	/*
368	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
369	 *
370	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
371	 * handling code expects the last keystream block to be in
372	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
373	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
374	 */
375	.if \xctr
376		add		CTR, CTR, BLOCKS
377	.else
378		adds		IV_PART, IV_PART, BLOCKS
379	.endif
380	mov		v0.16b, vctr.16b
381	mov		v1.16b, vctr.16b
382	mov		v2.16b, vctr.16b
383	mov		v3.16b, vctr.16b
384ST5(	mov		v4.16b, vctr.16b		)
385	.if \xctr
386		sub		x6, CTR, #MAX_STRIDE - 1
387		sub		x7, CTR, #MAX_STRIDE - 2
388		sub		x8, CTR, #MAX_STRIDE - 3
389		sub		x9, CTR, #MAX_STRIDE - 4
390ST5(		sub		x10, CTR, #MAX_STRIDE - 5	)
391		eor		x6, x6, IV_PART
392		eor		x7, x7, IV_PART
393		eor		x8, x8, IV_PART
394		eor		x9, x9, IV_PART
395ST5(		eor		x10, x10, IV_PART		)
396		mov		v0.d[0], x6
397		mov		v1.d[0], x7
398		mov		v2.d[0], x8
399		mov		v3.d[0], x9
400ST5(		mov		v4.d[0], x10			)
401	.else
402		bcs		0f
403		.subsection	1
404		/*
405		 * This subsection handles carries.
406		 *
407		 * Conditional branching here is allowed with respect to time
408		 * invariance since the branches are dependent on the IV instead
409		 * of the plaintext or key.  This code is rarely executed in
410		 * practice anyway.
411		 */
412
413		/* Apply carry to outgoing counter. */
4140:		umov		x8, vctr.d[0]
415		rev		x8, x8
416		add		x8, x8, #1
417		rev		x8, x8
418		ins		vctr.d[0], x8
419
420		/*
421		 * Apply carry to counter blocks if needed.
422		 *
423		 * Since the carry flag was set, we know 0 <= IV_PART <
424		 * MAX_STRIDE.  Using the value of IV_PART we can determine how
425		 * many counter blocks need to be updated.
426		 */
427		cbz		IV_PART, 2f
428		adr		x16, 1f
429		sub		x16, x16, IV_PART, lsl #3
430		br		x16
431		bti		c
432		mov		v0.d[0], vctr.d[0]
433		bti		c
434		mov		v1.d[0], vctr.d[0]
435		bti		c
436		mov		v2.d[0], vctr.d[0]
437		bti		c
438		mov		v3.d[0], vctr.d[0]
439ST5(		bti		c				)
440ST5(		mov		v4.d[0], vctr.d[0]		)
4411:		b		2f
442		.previous
443
4442:		rev		x7, IV_PART
445		ins		vctr.d[1], x7
446		sub		x7, IV_PART, #MAX_STRIDE - 1
447		sub		x8, IV_PART, #MAX_STRIDE - 2
448		sub		x9, IV_PART, #MAX_STRIDE - 3
449		rev		x7, x7
450		rev		x8, x8
451		mov		v1.d[1], x7
452		rev		x9, x9
453ST5(		sub		x10, IV_PART, #MAX_STRIDE - 4	)
454		mov		v2.d[1], x8
455ST5(		rev		x10, x10			)
456		mov		v3.d[1], x9
457ST5(		mov		v4.d[1], x10			)
458	.endif
459
460	/*
461	 * If there are at least MAX_STRIDE blocks left, XOR the data with
462	 * keystream and store.  Otherwise jump to tail handling.
463	 */
464	tbnz		BYTES_W, #31, .Lctrtail\xctr
465	ld1		{v5.16b-v7.16b}, [IN], #48
466ST4(	bl		aes_encrypt_block4x		)
467ST5(	bl		aes_encrypt_block5x		)
468	eor		v0.16b, v5.16b, v0.16b
469ST4(	ld1		{v5.16b}, [IN], #16		)
470	eor		v1.16b, v6.16b, v1.16b
471ST5(	ld1		{v5.16b-v6.16b}, [IN], #32	)
472	eor		v2.16b, v7.16b, v2.16b
473	eor		v3.16b, v5.16b, v3.16b
474ST5(	eor		v4.16b, v6.16b, v4.16b		)
475	st1		{v0.16b-v3.16b}, [OUT], #64
476ST5(	st1		{v4.16b}, [OUT], #16		)
477	cbz		BYTES_W, .Lctrout\xctr
478	b		.LctrloopNx\xctr
479
480.Lctrout\xctr:
481	.if !\xctr
482		st1		{vctr.16b}, [IV] /* return next CTR value */
483	.endif
484	ldp		x29, x30, [sp], #16
485	ret
486
487.Lctrtail\xctr:
488	/*
489	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
490	 *
491	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
492	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
493	 * v4 should have the next two counter blocks.
494	 *
495	 * This allows us to store the ciphertext by writing to overlapping
496	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
497	 * correctly computed blocks.  This approach greatly simplifies the
498	 * logic for storing the ciphertext.
499	 */
500	mov		x16, #16
501	ands		w7, BYTES_W, #0xf
502	csel		x13, x7, x16, ne
503
504ST5(	cmp		BYTES_W, #64 - (MAX_STRIDE << 4))
505ST5(	csel		x14, x16, xzr, gt		)
506	cmp		BYTES_W, #48 - (MAX_STRIDE << 4)
507	csel		x15, x16, xzr, gt
508	cmp		BYTES_W, #32 - (MAX_STRIDE << 4)
509	csel		x16, x16, xzr, gt
510	cmp		BYTES_W, #16 - (MAX_STRIDE << 4)
511
512	adr_l		x9, .Lcts_permute_table
513	add		x9, x9, x13
514	ble		.Lctrtail1x\xctr
515
516ST5(	ld1		{v5.16b}, [IN], x14		)
517	ld1		{v6.16b}, [IN], x15
518	ld1		{v7.16b}, [IN], x16
519
520ST4(	bl		aes_encrypt_block4x		)
521ST5(	bl		aes_encrypt_block5x		)
522
523	ld1		{v8.16b}, [IN], x13
524	ld1		{v9.16b}, [IN]
525	ld1		{v10.16b}, [x9]
526
527ST4(	eor		v6.16b, v6.16b, v0.16b		)
528ST4(	eor		v7.16b, v7.16b, v1.16b		)
529ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
530ST4(	eor		v8.16b, v8.16b, v2.16b		)
531ST4(	eor		v9.16b, v9.16b, v3.16b		)
532
533ST5(	eor		v5.16b, v5.16b, v0.16b		)
534ST5(	eor		v6.16b, v6.16b, v1.16b		)
535ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
536ST5(	eor		v7.16b, v7.16b, v2.16b		)
537ST5(	eor		v8.16b, v8.16b, v3.16b		)
538ST5(	eor		v9.16b, v9.16b, v4.16b		)
539
540ST5(	st1		{v5.16b}, [OUT], x14		)
541	st1		{v6.16b}, [OUT], x15
542	st1		{v7.16b}, [OUT], x16
543	add		x13, x13, OUT
544	st1		{v9.16b}, [x13]		// overlapping stores
545	st1		{v8.16b}, [OUT]
546	b		.Lctrout\xctr
547
548.Lctrtail1x\xctr:
549	/*
550	 * Handle <= 16 bytes of plaintext
551	 *
552	 * This code always reads and writes 16 bytes.  To avoid out of bounds
553	 * accesses, XCTR and CTR modes must use a temporary buffer when
554	 * encrypting/decrypting less than 16 bytes.
555	 *
556	 * This code is unusual in that it loads the input and stores the output
557	 * relative to the end of the buffers rather than relative to the start.
558	 * This causes unusual behaviour when encrypting/decrypting less than 16
559	 * bytes; the end of the data is expected to be at the end of the
560	 * temporary buffer rather than the start of the data being at the start
561	 * of the temporary buffer.
562	 */
563	sub		x8, x7, #16
564	csel		x7, x7, x8, eq
565	add		IN, IN, x7
566	add		OUT, OUT, x7
567	ld1		{v5.16b}, [IN]
568	ld1		{v6.16b}, [OUT]
569ST5(	mov		v3.16b, v4.16b			)
570	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
571	ld1		{v10.16b-v11.16b}, [x9]
572	tbl		v3.16b, {v3.16b}, v10.16b
573	sshr		v11.16b, v11.16b, #7
574	eor		v5.16b, v5.16b, v3.16b
575	bif		v5.16b, v6.16b, v11.16b
576	st1		{v5.16b}, [OUT]
577	b		.Lctrout\xctr
578
579	// Arguments
580	.unreq OUT
581	.unreq IN
582	.unreq KEY
583	.unreq ROUNDS_W
584	.unreq BYTES_W
585	.unreq IV
586	.unreq BYTE_CTR_W	// XCTR only
587	// Intermediate values
588	.unreq CTR_W		// XCTR only
589	.unreq CTR		// XCTR only
590	.unreq IV_PART
591	.unreq BLOCKS
592	.unreq BLOCKS_W
593.endm
594
595	/*
596	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
597	 *		   int bytes, u8 ctr[])
598	 *
599	 * The input and output buffers must always be at least 16 bytes even if
600	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
601	 * accesses will occur.  The data to be encrypted/decrypted is expected
602	 * to be at the end of this 16-byte temporary buffer rather than the
603	 * start.
604	 */
605
606AES_FUNC_START(aes_ctr_encrypt)
607	ctr_encrypt 0
608AES_FUNC_END(aes_ctr_encrypt)
609
610	/*
611	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
612	 *		   int bytes, u8 const iv[], int byte_ctr)
613	 *
614	 * The input and output buffers must always be at least 16 bytes even if
615	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
616	 * accesses will occur.  The data to be encrypted/decrypted is expected
617	 * to be at the end of this 16-byte temporary buffer rather than the
618	 * start.
619	 */
620
621AES_FUNC_START(aes_xctr_encrypt)
622	ctr_encrypt 1
623AES_FUNC_END(aes_xctr_encrypt)
624
625
626	/*
627	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
628	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
629	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
630	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
631	 */
632
633	.macro		next_tweak, out, in, tmp
634	sshr		\tmp\().2d,  \in\().2d,   #63
635	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
636	add		\out\().2d,  \in\().2d,   \in\().2d
637	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
638	eor		\out\().16b, \out\().16b, \tmp\().16b
639	.endm
640
641	.macro		xts_load_mask, tmp
642	movi		xtsmask.2s, #0x1
643	movi		\tmp\().2s, #0x87
644	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
645	.endm
646
/*
 * aes_xts_encrypt: x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes,
 * x5 = rk2, x6 = iv, w7 = first.
 *
 * v4 holds the current tweak and is written back to iv[] on return.  When
 * 'first' is non-zero the tweak loaded from iv[] is first encrypted with
 * the rk2 key schedule (unless the xts_cts_skip_tw macro, which is defined
 * elsewhere, branches past that step).  Data is processed 64 bytes at a
 * time, then 16 bytes at a time; a trailing partial block is handled with
 * ciphertext stealing at .Lxtsenccts.
 */
AES_FUNC_START(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x			/* fewer than 64 bytes left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64			/* undo the over-subtraction */
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx			/* only a partial block left */
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]			/* return tweak */
	ldp		x29, x30, [sp], #16
	ret

	/*
	 * Ciphertext stealing.  When entered via .LxtsencctsNx, the block
	 * stolen from is the last one produced by the 4-way loop (still in
	 * v3) and x0 is moved back to where it was stored.
	 */
.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* nothing left after this block */
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

/*
 * aes_xts_decrypt: x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes,
 * x5 = rk2, x6 = iv, w7 = first.
 *
 * v4 holds the current tweak and is written back to iv[] on return.  When
 * 'first' is non-zero the tweak loaded from iv[] is first encrypted with
 * the rk2 key schedule (unless the xts_cts_skip_tw macro, which is defined
 * elsewhere, branches past that step).  Data is processed 64 bytes at a
 * time, then 16 bytes at a time; a trailing partial block is handled with
 * ciphertext stealing at .Lxtsdeccts.
 */
AES_FUNC_START(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x			/* fewer than 64 bytes left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64			/* undo the over-subtraction */
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts			/* flags from the preceding subs */
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]			/* return tweak */
	ldp		x29, x30, [sp], #16
	ret

	/*
	 * Ciphertext stealing.  The block in v0 is decrypted with the tweak
	 * that follows the current one (v5); the block assembled from it and
	 * the final partial block is then decrypted with the current tweak
	 * (v4) back at .Lxtsdecctsout.
	 */
.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* nothing left after this block */
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

828	/*
829	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
830	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
831	 */
832AES_FUNC_START(aes_mac_update)
833	ld1		{v0.16b}, [x4]			/* get dg */
834	enc_prepare	w2, x1, x7
835	cbz		w5, .Lmacloop4x
836
837	encrypt_block	v0, w2, x1, x7, w8
838
839.Lmacloop4x:
840	subs		w3, w3, #4
841	bmi		.Lmac1x
842	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
843	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
844	encrypt_block	v0, w2, x1, x7, w8
845	eor		v0.16b, v0.16b, v2.16b
846	encrypt_block	v0, w2, x1, x7, w8
847	eor		v0.16b, v0.16b, v3.16b
848	encrypt_block	v0, w2, x1, x7, w8
849	eor		v0.16b, v0.16b, v4.16b
850	cmp		w3, wzr
851	csinv		x5, x6, xzr, eq
852	cbz		w5, .Lmacout
853	encrypt_block	v0, w2, x1, x7, w8
854	st1		{v0.16b}, [x4]			/* return dg */
855	cond_yield	.Lmacout, x7, x8
856	b		.Lmacloop4x
857.Lmac1x:
858	add		w3, w3, #4
859.Lmacloop:
860	cbz		w3, .Lmacout
861	ld1		{v1.16b}, [x0], #16		/* get next pt block */
862	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
863
864	subs		w3, w3, #1
865	csinv		x5, x6, xzr, eq
866	cbz		w5, .Lmacout
867
868.Lmacenc:
869	encrypt_block	v0, w2, x1, x7, w8
870	b		.Lmacloop
871
872.Lmacout:
873	st1		{v0.16b}, [x4]			/* return dg */
874	mov		w0, w3
875	ret
876AES_FUNC_END(aes_mac_update)
877