xref: /linux/arch/powerpc/kernel/vdso/vgetrandom-chacha.S (revision c7546e2c3cb739a3c1a2f5acaf9bb629d401afe5)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (C) 2024 Christophe Leroy <christophe.leroy@csgroup.eu>, CS GROUP France
4 */
5
6#include <linux/linkage.h>
7
8#include <asm/ppc_asm.h>
9
/* The four ChaCha constant words that fill state words 0-3. */
#define	const0		0x61707865
#define	const1		0x3320646e
#define	const2		0x79622d32
#define	const3		0x6b206574

/* Incoming arguments. */
#define	dst_bytes	r3
#define	key		r4
#define	val4		r4	/* same register as 'key': holds 4 once the key is loaded */
#define	counter		r5
#define	nblocks		r6

/* Block loop counter; takes over the value passed in 'nblocks'. */
#define	idx_r0		r0

/*
 * The eight key words. key0 and key1 share registers with 'counter' and
 * 'nblocks', so those two arguments have to be moved elsewhere before the
 * key is loaded.
 */
#define	key0		r5
#define	key1		r6
#define	key2		r7
#define	key3		r8
#define	key4		r9
#define	key5		r10
#define	key6		r11
#define	key7		r12

/* The two words of the block counter. */
#define	counter0	r14
#define	counter1	r15

/* The sixteen words of working state, one register each. */
#define	state0		r16
#define	state1		r17
#define	state2		r18
#define	state3		r19
#define	state4		r20
#define	state5		r21
#define	state6		r22
#define	state7		r23
#define	state8		r24
#define	state9		r25
#define	state10		r26
#define	state11		r27
#define	state12		r28
#define	state13		r29
#define	state14		r30
#define	state15		r31
51
/*
 * arx4 - one add/xor/rotate step on four independent register triples:
 *
 *	acc += inc;  mix ^= acc;  mix = rotate-left-32(mix, rot)
 *
 * The four lanes are emitted interleaved (all adds, then all xors, then all
 * rotates), so neighbouring instructions never depend on each other.
 * Clobbers only the acc and mix registers it is given.
 */
.macro arx4 rot, acc1, inc1, mix1, acc2, inc2, mix2, acc3, inc3, mix3, acc4, inc4, mix4
	add	\acc1, \acc1, \inc1
	add	\acc2, \acc2, \inc2
	add	\acc3, \acc3, \inc3
	add	\acc4, \acc4, \inc4
	xor	\mix1, \mix1, \acc1
	xor	\mix2, \mix2, \acc2
	xor	\mix3, \mix3, \acc3
	xor	\mix4, \mix4, \acc4
	rotlwi	\mix1, \mix1, \rot
	rotlwi	\mix2, \mix2, \rot
	rotlwi	\mix3, \mix3, \rot
	rotlwi	\mix4, \mix4, \rot
.endm

/*
 * quarterround4 - four ChaCha quarter-rounds run in lock-step.
 *
 * Each (aN, bN, cN, dN) group goes through:
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * All sixteen registers are read and written; nothing else is touched.
 */
.macro quarterround4 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4
	arx4	16, \a1, \b1, \d1, \a2, \b2, \d2, \a3, \b3, \d3, \a4, \b4, \d4
	arx4	12, \c1, \d1, \b1, \c2, \d2, \b2, \c3, \d3, \b3, \c4, \d4, \b4
	arx4	8, \a1, \b1, \d1, \a2, \b2, \d2, \a3, \b3, \d3, \a4, \b4, \d4
	arx4	7, \c1, \d1, \b1, \c2, \d2, \b2, \c3, \d3, \b3, \c4, \d4, \b4
.endm
102
/*
 * QUARTERROUND4 - invoke quarterround4 on state words selected by number.
 *
 * Takes sixteen state word numbers, four (a, b, c, d) per quarter-round.
 * Each number is pasted onto "state" to form the matching stateN register
 * alias before being handed to the assembler macro.
 */
#define QUARTERROUND4(a_w,b_w,c_w,d_w, a_x,b_x,c_x,d_x, \
		      a_y,b_y,c_y,d_y, a_z,b_z,c_z,d_z) \
	quarterround4 state##a_w state##b_w state##c_w state##d_w \
		      state##a_x state##b_x state##c_x state##d_x \
		      state##a_y state##b_y state##c_y state##d_y \
		      state##a_z state##b_z state##c_z state##d_z
108
/*
 * Plain 32-bit-word implementation of ChaCha20. Writes a given positive
 * number of 64-byte output blocks, using a nonce of 0, a caller-supplied key
 * and an 8-byte counter that is advanced by one per block and written back.
 *
 * No key or state word is ever stored to the stack. The only stack stores
 * happen on entry, before the key is loaded: the caller's r14-r31 and the
 * counter pointer.
 *
 * Arguments:
 *	r3: output bytes
 *	r4: 32-byte key input
 *	r5: 8-byte counter input/output (the pointer is parked on the stack)
 *	r6: number of 64-byte blocks to write to output; must not be 0, the
 *	    block loop only tests the count after a block has been written
 *
 * Register use inside the function:
 *	r0: blocks still to produce (initialised with r6)
 *	r3: output pointer, kept 4 bytes behind the next word to store
 *	r4: Value '4' after key has been read.
 *	r5-r12: key
 *	r14-r15: counter
 *	r16-r31: state (r31 also carries the round count into CTR)
 *
 * On return r6-r12 are zero, r5 holds the counter pointer again and
 * r14-r31 have their entry values back.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
#ifdef __powerpc64__
	/*
	 * 64-bit: no frame is allocated, everything is stored at negative
	 * offsets from r1 (this relies on the ABI's protected area below
	 * the stack pointer).
	 *	-216(r1)	  counter pointer
	 *	-144(r1)..-8(r1)  r14..r31
	 */
	std	counter, -216(r1)

	std	r14, -144(r1)
	std	r15, -136(r1)
	std	r16, -128(r1)
	std	r17, -120(r1)
	std	r18, -112(r1)
	std	r19, -104(r1)
	std	r20, -96(r1)
	std	r21, -88(r1)
	std	r22, -80(r1)
	std	r23, -72(r1)
	std	r24, -64(r1)
	std	r25, -56(r1)
	std	r26, -48(r1)
	std	r27, -40(r1)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
#else
	/*
	 * 32-bit: 96-byte frame.
	 *	20(r1)		counter pointer
	 *	24(r1)..92(r1)	r14..r31
	 */
	stwu	r1, -96(r1)
	stw	counter, 20(r1)
#ifdef __BIG_ENDIAN__
	stmw	r14, 24(r1)		/* r14 through r31 with one instruction */
#else
	stw	r14, 24(r1)
	stw	r15, 28(r1)
	stw	r16, 32(r1)
	stw	r17, 36(r1)
	stw	r18, 40(r1)
	stw	r19, 44(r1)
	stw	r20, 48(r1)
	stw	r21, 52(r1)
	stw	r22, 56(r1)
	stw	r23, 60(r1)
	stw	r24, 64(r1)
	stw	r25, 68(r1)
	stw	r26, 72(r1)
	stw	r27, 76(r1)
	stw	r28, 80(r1)
	stw	r29, 84(r1)
	stw	r30, 88(r1)
	stw	r31, 92(r1)
#endif
#endif	/* __powerpc64__ */

	/*
	 * Move the counter words and the block count out of r5/r6 first:
	 * those registers are about to be reused for key0/key1.
	 */
	lwz	counter0, 0(counter)
	lwz	counter1, 4(counter)
#ifdef __powerpc64__
	/* counter0 = counter1:counter0, so one 64-bit add carries across */
	rldimi	counter0, counter1, 32, 0
#endif
	mr	idx_r0, nblocks
	/* Bias the output pointer by -4 to suit the store offsets used below */
	subi	dst_bytes, dst_bytes, 4

	lwz	key0, 0(key)
	lwz	key1, 4(key)
	lwz	key2, 8(key)
	lwz	key3, 12(key)
	lwz	key4, 16(key)
	lwz	key5, 20(key)
	lwz	key6, 24(key)
	lwz	key7, 28(key)

	/* The key pointer is no longer needed: r4 becomes the constant 4 */
	li	val4, 4
.Lnext_block:
	/* 10 double rounds; r31 is free until state15 is initialised below */
	li	r31, 10

	/* state0-3 = constants */
	lis	state0, const0@ha
	lis	state1, const1@ha
	lis	state2, const2@ha
	lis	state3, const3@ha
	addi	state0, state0, const0@l
	addi	state1, state1, const1@l
	addi	state2, state2, const2@l
	addi	state3, state3, const3@l

	mtctr	r31

	/* state4-11 = key */
	mr	state4, key0
	mr	state5, key1
	mr	state6, key2
	mr	state7, key3
	mr	state8, key4
	mr	state9, key5
	mr	state10, key6
	mr	state11, key7

	/* state12-13 = block counter */
	mr	state12, counter0
	mr	state13, counter1

	/* state14-15 = nonce, always 0 */
	li	state14, 0
	li	state15, 0

.Ldouble_round:
	/* columns, then diagonals */
	QUARTERROUND4( 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15)
	QUARTERROUND4( 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14)

	bdnz	.Ldouble_round

	/*
	 * Add the initial state back onto the permuted state. Words 14 and
	 * 15 started out as 0 and need no addition.
	 */
	addis	state0, state0, const0@ha
	addis	state1, state1, const1@ha
	addis	state2, state2, const2@ha
	addis	state3, state3, const3@ha
	addi	state0, state0, const0@l
	addi	state1, state1, const1@l
	addi	state2, state2, const2@l
	addi	state3, state3, const3@l

	add	state4, state4, key0
	add	state5, state5, key1
	add	state6, state6, key2
	add	state7, state7, key3
	add	state8, state8, key4
	add	state9, state9, key5
	add	state10, state10, key6
	add	state11, state11, key7

	add	state12, state12, counter0
	add	state13, state13, counter1

#ifdef __BIG_ENDIAN__
	/*
	 * Byte-reversed stores only come in indexed form: alternate between
	 * index r4 (= 4) and a literal 0 index, stepping the pointer by 8
	 * after each pair.
	 */
	stwbrx	state0, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state1, 0, dst_bytes
	stwbrx	state2, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state3, 0, dst_bytes
	stwbrx	state4, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state5, 0, dst_bytes
	stwbrx	state6, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state7, 0, dst_bytes
	stwbrx	state8, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state9, 0, dst_bytes
	stwbrx	state10, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state11, 0, dst_bytes
	stwbrx	state12, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state13, 0, dst_bytes
	stwbrx	state14, val4, dst_bytes
	addi	dst_bytes, dst_bytes, 8
	stwbrx	state15, 0, dst_bytes
#else
	/* The final stwu also moves dst_bytes on by 64, to the next block */
	stw	state0, 4(dst_bytes)
	stw	state1, 8(dst_bytes)
	stw	state2, 12(dst_bytes)
	stw	state3, 16(dst_bytes)
	stw	state4, 20(dst_bytes)
	stw	state5, 24(dst_bytes)
	stw	state6, 28(dst_bytes)
	stw	state7, 32(dst_bytes)
	stw	state8, 36(dst_bytes)
	stw	state9, 40(dst_bytes)
	stw	state10, 44(dst_bytes)
	stw	state11, 48(dst_bytes)
	stw	state12, 52(dst_bytes)
	stw	state13, 56(dst_bytes)
	stw	state14, 60(dst_bytes)
	stwu	state15, 64(dst_bytes)
#endif

	/*
	 * One block done. subic. sets CR0 for the bne below; a plain subi
	 * would read r0 as the literal 0 rather than as a register.
	 */
	subic.	idx_r0, idx_r0, 1

	/* Advance the 64-bit block counter; none of this touches CR0 */
#ifdef __powerpc64__
	addi	counter0, counter0, 1
	srdi	counter1, counter0, 32
#else
	addic	counter0, counter0, 1
	addze	counter1, counter1
#endif

	bne	.Lnext_block

	/* Fetch the counter pointer back and store the advanced counter */
#ifdef __powerpc64__
	ld	counter, -216(r1)
#else
	lwz	counter, 20(r1)
#endif
	stw	counter0, 0(counter)
	stw	counter1, 4(counter)

	/*
	 * Wipe key words 1-7 (r6-r12). key0's register, r5, has just been
	 * overwritten with the counter pointer.
	 */
	li	key1, 0
	li	key2, 0
	li	key3, 0
	li	key4, 0
	li	key5, 0
	li	key6, 0
	li	key7, 0

	/* Give the caller its r14-r31 back, overwriting counter and state */
#ifdef __powerpc64__
	ld	r14, -144(r1)
	ld	r15, -136(r1)
	ld	r16, -128(r1)
	ld	r17, -120(r1)
	ld	r18, -112(r1)
	ld	r19, -104(r1)
	ld	r20, -96(r1)
	ld	r21, -88(r1)
	ld	r22, -80(r1)
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
#else
#ifdef __BIG_ENDIAN__
	lmw	r14, 24(r1)
#else
	lwz	r14, 24(r1)
	lwz	r15, 28(r1)
	lwz	r16, 32(r1)
	lwz	r17, 36(r1)
	lwz	r18, 40(r1)
	lwz	r19, 44(r1)
	lwz	r20, 48(r1)
	lwz	r21, 52(r1)
	lwz	r22, 56(r1)
	lwz	r23, 60(r1)
	lwz	r24, 64(r1)
	lwz	r25, 68(r1)
	lwz	r26, 72(r1)
	lwz	r27, 76(r1)
	lwz	r28, 80(r1)
	lwz	r29, 84(r1)
	lwz	r30, 88(r1)
	lwz	r31, 92(r1)
#endif
	addi	r1, r1, 96		/* drop the 96-byte frame */
#endif	/* __powerpc64__ */
	blr
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
366