xref: /linux/arch/loongarch/vdso/vgetrandom-chacha.S (revision 570172569238c66a482ec3eb5d766cc9cf255f69)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
4 */
5
6#include <asm/asm.h>
7#include <asm/regdef.h>
8#include <linux/linkage.h>
9
10.text
11
12/* ChaCha quarter-round on the 32-bit words a, b, c, d, updated in place (rotri.w by 32 - n == rotate left by n) */
13.macro	QR	a b c d
14	add.w		\a, \a, \b	/* a += b */
15	xor		\d, \d, \a	/* d ^= a */
16	rotri.w		\d, \d, 16	/* d = rotl32(d, 16) */
17
18	add.w		\c, \c, \d	/* c += d */
19	xor		\b, \b, \c	/* b ^= c */
20	rotri.w		\b, \b, 20	/* b = rotl32(b, 12) */
21
22	add.w		\a, \a, \b	/* a += b */
23	xor		\d, \d, \a	/* d ^= a */
24	rotri.w		\d, \d, 24	/* d = rotl32(d, 8) */
25
26	add.w		\c, \c, \d	/* c += d */
27	xor		\b, \b, \c	/* b ^= c */
28	rotri.w		\b, \b, 25	/* b = rotl32(b, 7) */
29.endm
30
31/*
32 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
33 * number of blocks of output with a nonce of 0, taking an input key and
34 * 8-byte counter. Importantly does not spill to the stack. Its arguments
35 * are:
36 *
37 *	a0: output bytes
38 *	a1: 32-byte key input
39 *	a2: 8-byte counter input/output (incremented once per block, stored back before returning)
40 *	a3: number of 64-byte blocks to write to output (must be non-zero: tested only after the decrement)
41 */
42SYM_FUNC_START(__arch_chacha20_blocks_nostack)
43
44/* We don't need a frame pointer, so fp is used as one more saved register under the name s9 */
45#define s9		fp
46
47#define output		a0
48#define key		a1
49#define counter		a2
50#define nblocks		a3
51#define i		a4
52#define state0		s0
53#define state1		s1
54#define state2		s2
55#define state3		s3
56#define state4		s4
57#define state5		s5
58#define state6		s6
59#define state7		s7
60#define state8		s8
61#define state9		s9
62#define state10		a5
63#define state11		a6
64#define state12		a7
65#define state13		t0
66#define state14		t1
67#define state15		t2
68#define cnt_lo		t3
69#define cnt_hi		t4
70#define copy0		t5
71#define copy1		t6
72#define copy2		t7
73
74/* Reuse i as copy3: i is only live inside .Lpermute, copy3 only after it */
75#define copy3		i
76
77	/*
78	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
79	 * This does not violate the stack-less requirement: no sensitive data
80	 * is spilled onto the stack.
81	 */
82	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
83	REG_S		s0, sp, 0
84	REG_S		s1, sp, SZREG
85	REG_S		s2, sp, SZREG * 2
86	REG_S		s3, sp, SZREG * 3
87	REG_S		s4, sp, SZREG * 4
88	REG_S		s5, sp, SZREG * 5
89	REG_S		s6, sp, SZREG * 6
90	REG_S		s7, sp, SZREG * 7
91	REG_S		s8, sp, SZREG * 8
92	REG_S		s9, sp, SZREG * 9
93
94	li.w		copy0, 0x61707865	/* "expa" */
95	li.w		copy1, 0x3320646e	/* "nd 3" */
96	li.w		copy2, 0x79622d32	/* "2-by" */
97
98	ld.w		cnt_lo, counter, 0
99	ld.w		cnt_hi, counter, 4
100
101.Lblock:
102	/* state[0,1,2,3] = "expand 32-byte k" */
103	move		state0, copy0
104	move		state1, copy1
105	move		state2, copy2
106	li.w		state3, 0x6b206574	/* "te k" */
107
108	/* state[4,5,..,11] = key */
109	ld.w		state4, key, 0
110	ld.w		state5, key, 4
111	ld.w		state6, key, 8
112	ld.w		state7, key, 12
113	ld.w		state8, key, 16
114	ld.w		state9, key, 20
115	ld.w		state10, key, 24
116	ld.w		state11, key, 28
117
118	/* state[12,13] = counter */
119	move		state12, cnt_lo
120	move		state13, cnt_hi
121
122	/* state[14,15] = 0 */
123	move		state14, zero
124	move		state15, zero
125
126	li.w		i, 10	/* 10 iterations of (odd round + even round) */
127.Lpermute:
128	/* odd round: quarter-rounds over the four columns of the 4x4 state */
129	QR		state0, state4, state8, state12
130	QR		state1, state5, state9, state13
131	QR		state2, state6, state10, state14
132	QR		state3, state7, state11, state15
133
134	/* even round: quarter-rounds over the four diagonals of the 4x4 state */
135	QR		state0, state5, state10, state15
136	QR		state1, state6, state11, state12
137	QR		state2, state7, state8, state13
138	QR		state3, state4, state9, state14
139
140	addi.w		i, i, -1
141	bnez		i, .Lpermute
142
143	/*
144	 * copy[3] = "te k", materialize it here because copy[3] shares the
145	 * same register with i which just became dead.
146	 */
147	li.w		copy3, 0x6b206574
148
149	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
150	add.w		state0, state0, copy0
151	add.w		state1, state1, copy1
152	add.w		state2, state2, copy2
153	add.w		state3, state3, copy3
154	st.w		state0, output, 0
155	st.w		state1, output, 4
156	st.w		state2, output, 8
157	st.w		state3, output, 12
158
159	/* from now on state[0,1,2,3] are scratch registers  */
160
161	/* state[0,1,2,3] = key words 0-3 (first 16 bytes of the key) */
162	ld.w		state0, key, 0
163	ld.w		state1, key, 4
164	ld.w		state2, key, 8
165	ld.w		state3, key, 12
166
167	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
168	add.w		state4, state4, state0
169	add.w		state5, state5, state1
170	add.w		state6, state6, state2
171	add.w		state7, state7, state3
172	st.w		state4, output, 16
173	st.w		state5, output, 20
174	st.w		state6, output, 24
175	st.w		state7, output, 28
176
177	/* state[0,1,2,3] = key words 4-7 (last 16 bytes of the key) */
178	ld.w		state0, key, 16
179	ld.w		state1, key, 20
180	ld.w		state2, key, 24
181	ld.w		state3, key, 28
182
183	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
184	add.w		state8, state8, state0
185	add.w		state9, state9, state1
186	add.w		state10, state10, state2
187	add.w		state11, state11, state3
188	st.w		state8, output, 32
189	st.w		state9, output, 36
190	st.w		state10, output, 40
191	st.w		state11, output, 44
192
193	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
194	add.w		state12, state12, cnt_lo
195	add.w		state13, state13, cnt_hi
196	st.w		state12, output, 48
197	st.w		state13, output, 52
198	st.w		state14, output, 56
199	st.w		state15, output, 60
200
201	/* ++counter, as a 64-bit value held in two 32-bit halves */
202	addi.w		cnt_lo, cnt_lo, 1
203	sltui		state0, cnt_lo, 1	/* carry = (cnt_lo wrapped to 0) */
204	add.w		cnt_hi, cnt_hi, state0
205
206	/* output += 64 */
207	PTR_ADDI	output, output, 64
208	/* --nblocks */
209	PTR_ADDI	nblocks, nblocks, -1
210	bnez		nblocks, .Lblock
211
212	/* counter = [cnt_lo, cnt_hi] */
213	st.w		cnt_lo, counter, 0
214	st.w		cnt_hi, counter, 4
215
216	/*
217	 * Zero out the potentially sensitive regs, in case nothing uses these
218	 * again. As of now copy[0,1,2,3] just contains "expand 32-byte k" and
219	 * state[0,...,9] are s0-s9, which we'll restore in the epilogue, so we
220	 * only need to zero state[10,...,15].
221	 */
222	move		state10, zero
223	move		state11, zero
224	move		state12, zero
225	move		state13, zero
226	move		state14, zero
227	move		state15, zero
228
229	REG_L		s0, sp, 0
230	REG_L		s1, sp, SZREG
231	REG_L		s2, sp, SZREG * 2
232	REG_L		s3, sp, SZREG * 3
233	REG_L		s4, sp, SZREG * 4
234	REG_L		s5, sp, SZREG * 5
235	REG_L		s6, sp, SZREG * 6
236	REG_L		s7, sp, SZREG * 7
237	REG_L		s8, sp, SZREG * 8
238	REG_L		s9, sp, SZREG * 9
239	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)
240
241	jr		ra
242SYM_FUNC_END(__arch_chacha20_blocks_nostack)
243