xref: /linux/arch/mips/crypto/chacha-core.S (revision 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6)
1/* SPDX-License-Identifier: GPL-2.0 OR MIT */
2/*
3 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
4 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
5 */
6
/* Size constants.
 *   MASK_U32            - ANDed with BYTES to get the byte length of the
 *                         whole 32-bit words in a partial block (a multiple
 *                         of 4 in the range 0..60).
 *   CHACHA20_BLOCK_SIZE - bytes consumed from IN and written to OUT per block.
 *   STACK_SIZE          - stack frame size; the frame holds the eight saved
 *                         registers $s0-$s7 (8 * 4 bytes).
 */
7#define MASK_U32		0x3c
8#define CHACHA20_BLOCK_SIZE	64
9#define STACK_SIZE		32
10
/* Working state: X0..X15 name the sixteen 32-bit words of the block being
 * computed; each word is held in its own register during the round loop.
 * X11..X15 use $s6..$s2, which chacha20_mips saves on entry and restores
 * on exit.
 */
11#define X0	$t0
12#define X1	$t1
13#define X2	$t2
14#define X3	$t3
15#define X4	$t4
16#define X5	$t5
17#define X6	$t6
18#define X7	$t7
19#define X8	$t8
20#define X9	$t9
21#define X10	$v1
22#define X11	$s6
23#define X12	$s5
24#define X13	$s4
25#define X14	$s3
26#define X15	$s2
27/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
28#define T0	$s1
29#define T1	$s0
/* Token-pasting helpers: T(n) / X(n) turn an index n into the alias Tn / Xn,
 * so a macro such as AXR can select a state word by number.
 */
30#define T(n)	T ## n
31#define X(n)	X ## n
32
33/* Input arguments */
34#define STATE		$a0
35#define OUT		$a1
36#define IN		$a2
37#define BYTES		$a3
38
39/* Loop-carried copy of state word 12 (byte offset 48 of STATE) */
40/* NONCE[0] is kept in a register and not in memory.
41 * We don't want to touch original value in memory.
42 * Must be incremented every loop iteration.
 * NOTE: the register is loaded from 48(STATE) on entry and stored back to
 * 48(STATE) at .Lchacha20_mips_xor_done.
43 */
44#define NONCE_0		$v0
45
46/* SAVED_X and SAVED_CA are set in the jump table.
47 * Use regs which are overwritten on exit so we don't leak clear data.
48 * They are used to handle the last bytes which are not a multiple of 4.
 * SAVED_CA holds the STATE word that is added to the selected Xn;
 * SAVED_X holds the resulting sum, consumed one byte at a time.
49 */
50#define SAVED_X		X15
51#define SAVED_CA	$s7
52
/* IS_UNALIGNED = (IN | OUT) & 3, computed once on entry.
 * It shares $s7 with SAVED_CA: the partial-block code loads SAVED_CA only
 * after IS_UNALIGNED has been tested for that block, and a partial block is
 * never followed by another loop iteration.
 */
53#define IS_UNALIGNED	$s7
54
/* Endianness helpers.
 *   MSB / LSB      - byte offsets added to the address for lwl/swl (MSB) and
 *                    lwr/swr (LSB) in STORE_UNALIGNED.
 *   CPU_TO_LE32(n) - applied to each keystream word before it is XORed with
 *                    the input. On big-endian it byte-reverses register n
 *                    (wsbh swaps the bytes inside each halfword, rotr 16 then
 *                    swaps the halfwords); on little-endian it is empty.
 *   ROTR(n)        - big-endian only: after CPU_TO_LE32, rotates n right by
 *                    24 so the first trailing keystream byte sits in the low
 *                    8 bits; empty on little-endian.
 *   ROTx           - rotate instruction used (with a count of 8) to bring the
 *                    next trailing keystream byte into the low 8 bits.
 */
55#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
56#define MSB 0
57#define LSB 3
58#define ROTx rotl
59#define ROTR(n) rotr n, 24
60#define	CPU_TO_LE32(n) \
61	wsbh	n; \
62	rotr	n, 16;
63#else
64#define MSB 3
65#define LSB 0
66#define ROTx rotr
67#define CPU_TO_LE32(n)
68#define ROTR(n)
69#endif
70
/* FOR_EACH_WORD(x): expand macro x once per state word, in the order 0..15.
 * The index is passed as a literal token so that x can paste it into
 * register aliases and label names.
 */
71#define FOR_EACH_WORD(x) \
72	x( 0); \
73	x( 1); \
74	x( 2); \
75	x( 3); \
76	x( 4); \
77	x( 5); \
78	x( 6); \
79	x( 7); \
80	x( 8); \
81	x( 9); \
82	x(10); \
83	x(11); \
84	x(12); \
85	x(13); \
86	x(14); \
87	x(15);
88
/* FOR_EACH_WORD_REV(x): same as FOR_EACH_WORD but in the order 15..0.
 * Used for the store sequence, so that entering the sequence part-way
 * through processes only the remaining lower-numbered words down to 0.
 */
89#define FOR_EACH_WORD_REV(x) \
90	x(15); \
91	x(14); \
92	x(13); \
93	x(12); \
94	x(11); \
95	x(10); \
96	x( 9); \
97	x( 8); \
98	x( 7); \
99	x( 6); \
100	x( 5); \
101	x( 4); \
102	x( 3); \
103	x( 2); \
104	x( 1); \
105	x( 0);
106
/* PLUS_ONE(x) expands to the literal x + 1 for x in 0..15, via the lookup
 * table PLUS_ONE_0..PLUS_ONE_15. The STORE_* macros use it so that the
 * label placed in front of the code for word x is numbered x + 1.
 */
107#define PLUS_ONE_0	 1
108#define PLUS_ONE_1	 2
109#define PLUS_ONE_2	 3
110#define PLUS_ONE_3	 4
111#define PLUS_ONE_4	 5
112#define PLUS_ONE_5	 6
113#define PLUS_ONE_6	 7
114#define PLUS_ONE_7	 8
115#define PLUS_ONE_8	 9
116#define PLUS_ONE_9	10
117#define PLUS_ONE_10	11
118#define PLUS_ONE_11	12
119#define PLUS_ONE_12	13
120#define PLUS_ONE_13	14
121#define PLUS_ONE_14	15
122#define PLUS_ONE_15	16
123#define PLUS_ONE(x)	PLUS_ONE_ ## x
/* CONCAT3 pastes three tokens into one. The extra level of indirection lets
 * the arguments (e.g. PLUS_ONE(x)) be macro-expanded before they are pasted.
 */
124#define _CONCAT3(a,b,c)	a ## b ## c
125#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
126
/* STORE_UNALIGNED(x): finish and emit output word x when IN and/or OUT is
 * not 4-byte aligned.
 *   - defines the label .Lchacha20_mips_xor_unaligned_<x+1>_b in front of
 *     the code, which is the branch target of jump-table entry x+1;
 *   - T0 = STATE[x] (skipped for word 12, whose value lives in NONCE_0);
 *   - T1 = input word x, read with lwl/lwr;
 *   - Xx += T0 (or NONCE_0 for word 12), converted with CPU_TO_LE32;
 *   - Xx ^= T1, written to OUT with swl/swr.
 * Overwrites T0, T1 and Xx.
 */
127#define STORE_UNALIGNED(x) \
128CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
129	.if (x != 12); \
130		lw	T0, (x*4)(STATE); \
131	.endif; \
132	lwl	T1, (x*4)+MSB ## (IN); \
133	lwr	T1, (x*4)+LSB ## (IN); \
134	.if (x == 12); \
135		addu	X ## x, NONCE_0; \
136	.else; \
137		addu	X ## x, T0; \
138	.endif; \
139	CPU_TO_LE32(X ## x); \
140	xor	X ## x, T1; \
141	swl	X ## x, (x*4)+MSB ## (OUT); \
142	swr	X ## x, (x*4)+LSB ## (OUT);
143
/* STORE_ALIGNED(x): finish and emit output word x when both IN and OUT are
 * 4-byte aligned.
 *   - defines the label .Lchacha20_mips_xor_aligned_<x+1>_b in front of the
 *     code, which is the branch target of jump-table entry x+1;
 *   - T0 = STATE[x] (skipped for word 12, whose value lives in NONCE_0);
 *   - T1 = input word x, read with lw;
 *   - Xx += T0 (or NONCE_0 for word 12), converted with CPU_TO_LE32;
 *   - Xx ^= T1, written to OUT with sw.
 * Overwrites T0, T1 and Xx.
 */
144#define STORE_ALIGNED(x) \
145CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
146	.if (x != 12); \
147		lw	T0, (x*4)(STATE); \
148	.endif; \
149	lw	T1, (x*4) ## (IN); \
150	.if (x == 12); \
151		addu	X ## x, NONCE_0; \
152	.else; \
153		addu	X ## x, T0; \
154	.endif; \
155	CPU_TO_LE32(X ## x); \
156	xor	X ## x, T1; \
157	sw	X ## x, (x*4) ## (OUT);
158
159/* Jump table macro.
160 * Used for setup and handling the last bytes, which are not multiple of 4.
161 * X15 is free to store Xn
162 * Every jumptable entry must be equal in size.
 *
 * Entry x is selected when the partial block contains x whole words.
 * It consists of exactly two instructions (8 bytes), emitted under
 * .set noreorder so the assembler cannot change that size:
 *   - a branch to .Lchacha20_mips_xor_aligned_<x>_b, which stores the
 *     words x-1 down to 0;
 *   - in the branch delay slot, SAVED_X = Xx + SAVED_CA (Xx + NONCE_0 for
 *     word 12), the word from which the trailing bytes are taken.
 * SAVED_CA must already hold STATE[x] when the entry is entered.
163 */
164#define JMPTBL_ALIGNED(x) \
165.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
166	.set	noreorder; \
167	b	.Lchacha20_mips_xor_aligned_ ## x ## _b; \
168	.if (x == 12); \
169		addu	SAVED_X, X ## x, NONCE_0; \
170	.else; \
171		addu	SAVED_X, X ## x, SAVED_CA; \
172	.endif; \
173	.set	reorder
174
/* Same as JMPTBL_ALIGNED, but branches into the STORE_UNALIGNED sequence
 * (.Lchacha20_mips_xor_unaligned_<x>_b).
 */
175#define JMPTBL_UNALIGNED(x) \
176.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
177	.set	noreorder; \
178	b	.Lchacha20_mips_xor_unaligned_ ## x ## _b; \
179	.if (x == 12); \
180		addu	SAVED_X, X ## x, NONCE_0; \
181	.else; \
182		addu	SAVED_X, X ## x, SAVED_CA; \
183	.endif; \
184	.set	reorder
185
/* AXR: one add-xor-rotate step carried out on four lanes at once.
 * For the lane (A, K, V):  X(A) += X(K);  X(V) ^= X(A);  X(V) = rotl(X(V), S)
 * and likewise for the lanes (B, L, W), (C, M, Y) and (D, N, Z).
 * The instructions of the four lanes are interleaved: all adds first, then
 * all xors, then all rotates.
 */
186#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
187	addu	X(A), X(K); \
188	addu	X(B), X(L); \
189	addu	X(C), X(M); \
190	addu	X(D), X(N); \
191	xor	X(V), X(A); \
192	xor	X(W), X(B); \
193	xor	X(Y), X(C); \
194	xor	X(Z), X(D); \
195	rotl	X(V), S;    \
196	rotl	X(W), S;    \
197	rotl	X(Y), S;    \
198	rotl	X(Z), S;
199
/*
 * chacha20_mips - XOR BYTES bytes from IN with ChaCha20 keystream into OUT.
 *
 * In:   STATE ($a0) - sixteen 32-bit state words; word 12 (offset 48) is
 *                     loaded into NONCE_0 and advanced by one per block
 *       OUT   ($a1) - destination buffer
 *       IN    ($a2) - source buffer
 *       BYTES ($a3) - number of bytes to process; 0 returns immediately
 * Out:  48(STATE) is updated with the advanced NONCE_0.
 * NOTE(review): the C prototype is not visible in this file; presumably
 * (u32 *state, u8 *out, const u8 *in, size_t bytes) - confirm at the caller.
 *
 * Uses $at as round counter / byte offset (hence .set noat), $v0, $v1,
 * $t0-$t9 and $s0-$s7; $s0-$s7 are saved in a STACK_SIZE byte frame and
 * restored before returning.
 *
 * Flow per block: load STATE into X0..X15, run 20 rounds, then either
 *   - full block: add STATE, XOR with IN and store all 16 words, or
 *   - partial block: use the jump table to store only the whole words and
 *     handle the remaining 1-3 bytes at .Lchacha20_mips_xor_bytes.
 * Separate store sequences exist for aligned and unaligned IN/OUT.
 */
200.text
201.set	reorder
202.set	noat
203.globl	chacha20_mips
204.ent	chacha20_mips
205chacha20_mips:
206	.frame	$sp, STACK_SIZE, $ra
207
208	addiu	$sp, -STACK_SIZE
209
210	/* Nothing to do when BYTES == 0: release the frame and return. */
211	beqz	BYTES, .Lchacha20_mips_end
212
213	lw	NONCE_0, 48(STATE)
214
215	/* Save s0-s7 */
216	sw	$s0,  0($sp)
217	sw	$s1,  4($sp)
218	sw	$s2,  8($sp)
219	sw	$s3, 12($sp)
220	sw	$s4, 16($sp)
221	sw	$s5, 20($sp)
222	sw	$s6, 24($sp)
223	sw	$s7, 28($sp)
224
225	/* Test IN or OUT is unaligned.
226	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
227	 */
228	or	IS_UNALIGNED, IN, OUT
229	andi	IS_UNALIGNED, 0x3
230
231	/* Set number of rounds */
232	li	$at, 20
233
	/* First block: skip the pointer/counter advance below. */
234	b	.Lchacha20_rounds_start
235
236.align 4
237.Loop_chacha20_rounds:
	/* Advance to the next block. */
238	addiu	IN,  CHACHA20_BLOCK_SIZE
239	addiu	OUT, CHACHA20_BLOCK_SIZE
240	addiu	NONCE_0, 1
241
242.Lchacha20_rounds_start:
	/* Load the working words from STATE; word 12 comes from NONCE_0. */
243	lw	X0,  0(STATE)
244	lw	X1,  4(STATE)
245	lw	X2,  8(STATE)
246	lw	X3,  12(STATE)
247
248	lw	X4,  16(STATE)
249	lw	X5,  20(STATE)
250	lw	X6,  24(STATE)
251	lw	X7,  28(STATE)
252	lw	X8,  32(STATE)
253	lw	X9,  36(STATE)
254	lw	X10, 40(STATE)
255	lw	X11, 44(STATE)
256
257	move	X12, NONCE_0
258	lw	X13, 52(STATE)
259	lw	X14, 56(STATE)
260	lw	X15, 60(STATE)
261
262.Loop_chacha20_xor_rounds:
	/* Each pass performs two rounds, so $at counts down by 2.
	 * The first four AXR lines work on the columns (0,4,8,12) ...
	 * (3,7,11,15); the last four on the diagonals (0,5,10,15) ...
	 * (3,4,9,14).
	 */
263	addiu	$at, -2
264	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
265	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
266	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
267	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
268	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
269	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
270	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
271	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
272	bnez	$at, .Loop_chacha20_xor_rounds
273
	/* BYTES now holds (remaining bytes - 64): negative means the
	 * current block is a partial one.
	 */
274	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)
275
276	/* Is data src/dst unaligned? Jump */
277	bnez	IS_UNALIGNED, .Loop_chacha20_unaligned
278
279	/* Set number rounds here to fill delayslot. */
280	li	$at, 20
281
282	/* BYTES < 0, it has no full block. */
283	bltz	BYTES, .Lchacha20_mips_no_full_block_aligned
284
	/* Full block: words 15..0. The jump table also enters this
	 * sequence part-way through for a partial block.
	 */
285	FOR_EACH_WORD_REV(STORE_ALIGNED)
286
287	/* BYTES > 0? Loop again. */
288	bgtz	BYTES, .Loop_chacha20_rounds
289
290	/* Place this here to fill delay slot */
291	addiu	NONCE_0, 1
292
293	/* BYTES < 0? Handle last bytes */
294	bltz	BYTES, .Lchacha20_mips_xor_bytes
295
296.Lchacha20_mips_xor_done:
297	/* Restore used registers */
298	lw	$s0,  0($sp)
299	lw	$s1,  4($sp)
300	lw	$s2,  8($sp)
301	lw	$s3, 12($sp)
302	lw	$s4, 16($sp)
303	lw	$s5, 20($sp)
304	lw	$s6, 24($sp)
305	lw	$s7, 28($sp)
306
307	/* Write NONCE_0 back to right location in state */
308	sw	NONCE_0, 48(STATE)
309
310.Lchacha20_mips_end:
311	addiu	$sp, STACK_SIZE
312	jr	$ra
313
314.Lchacha20_mips_no_full_block_aligned:
315	/* Restore the offset on BYTES */
316	addiu	BYTES, CHACHA20_BLOCK_SIZE
317
318	/* Get number of full WORDS (as a byte count: words * 4) */
319	andi	$at, BYTES, MASK_U32
320
321	/* Load upper half of jump table addr */
322	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
323
324	/* Calculate lower half jump table offset:
	 * insert $at at bit 1, i.e. offset = words * 8, the size of one
	 * two-instruction jump table entry.
	 */
325	ins	T0, $at, 1, 6
326
327	/* Add offset to STATE */
328	addu	T1, STATE, $at
329
330	/* Add lower half jump table addr */
331	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
332
333	/* Read value from STATE (the word that follows the last full word) */
334	lw	SAVED_CA, 0(T1)
335
336	/* Store remaining bytecounter as negative value (0 .. -3) */
337	subu	BYTES, $at, BYTES
338
339	jr	T0
340
341	/* Jump table */
342	FOR_EACH_WORD(JMPTBL_ALIGNED)
343
344
345.Loop_chacha20_unaligned:
346	/* Set number rounds here to fill delayslot. */
347	li	$at, 20
348
349	/* BYTES < 0, it has no full block. */
350	bltz	BYTES, .Lchacha20_mips_no_full_block_unaligned
351
	/* Full block: words 15..0. The jump table also enters this
	 * sequence part-way through for a partial block.
	 */
352	FOR_EACH_WORD_REV(STORE_UNALIGNED)
353
354	/* BYTES > 0? Loop again. */
355	bgtz	BYTES, .Loop_chacha20_rounds
356
357	/* Write NONCE_0 back to right location in state.
	 * NOTE(review): this stores the value before the increment below;
	 * .Lchacha20_mips_xor_done stores the incremented value afterwards.
	 */
358	sw	NONCE_0, 48(STATE)
359
360	.set noreorder
361	/* BYTES == 0: done. Otherwise fall through to byte handling.
	 * The addiu below sits in the branch delay slot, so it executes on
	 * both paths. The two *_0_b labels are the targets of jump table
	 * entry 0 (no full word in the partial block).
	 */
362	bgez	BYTES, .Lchacha20_mips_xor_done
363.Lchacha20_mips_xor_unaligned_0_b:
364.Lchacha20_mips_xor_aligned_0_b:
365	/* Place this here to fill delay slot */
366	addiu	NONCE_0, 1
367	.set reorder
368
369.Lchacha20_mips_xor_bytes:
	/* Trailing 1-3 bytes. On entry $at is the byte length of the full
	 * words already stored, BYTES is minus the number of trailing bytes
	 * and SAVED_X is the keystream word they are taken from.
	 */
370	addu	IN, $at
371	addu	OUT, $at
372	/* First byte */
373	lbu	T1, 0(IN)
374	addiu	$at, BYTES, 1
375	CPU_TO_LE32(SAVED_X)
376	ROTR(SAVED_X)
377	xor	T1, SAVED_X
378	sb	T1, 0(OUT)
	/* $at == 0: exactly one trailing byte. */
379	beqz	$at, .Lchacha20_mips_xor_done
380	/* Second byte */
381	lbu	T1, 1(IN)
382	addiu	$at, BYTES, 2
383	ROTx	SAVED_X, 8
384	xor	T1, SAVED_X
385	sb	T1, 1(OUT)
	/* $at == 0: exactly two trailing bytes. */
386	beqz	$at, .Lchacha20_mips_xor_done
387	/* Third byte */
388	lbu	T1, 2(IN)
389	ROTx	SAVED_X, 8
390	xor	T1, SAVED_X
391	sb	T1, 2(OUT)
392	b	.Lchacha20_mips_xor_done
393
394.Lchacha20_mips_no_full_block_unaligned:
395	/* Restore the offset on BYTES */
396	addiu	BYTES, CHACHA20_BLOCK_SIZE
397
398	/* Get number of full WORDS (as a byte count: words * 4) */
399	andi	$at, BYTES, MASK_U32
400
401	/* Load upper half of jump table addr */
402	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
403
404	/* Calculate lower half jump table offset:
	 * insert $at at bit 1, i.e. offset = words * 8, the size of one
	 * two-instruction jump table entry.
	 */
405	ins	T0, $at, 1, 6
406
407	/* Add offset to STATE */
408	addu	T1, STATE, $at
409
410	/* Add lower half jump table addr */
411	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
412
413	/* Read value from STATE (the word that follows the last full word) */
414	lw	SAVED_CA, 0(T1)
415
416	/* Store remaining bytecounter as negative value (0 .. -3) */
417	subu	BYTES, $at, BYTES
418
419	jr	T0
420
421	/* Jump table */
422	FOR_EACH_WORD(JMPTBL_UNALIGNED)
423.end chacha20_mips
424.set at
425