xref: /linux/arch/riscv/kernel/vdso/vgetrandom-chacha.S (revision 119b1e61a769aa98e68599f44721661a4d8c55f3)
1*ee0d0305SXi Ruoyao/* SPDX-License-Identifier: GPL-2.0 */
2*ee0d0305SXi Ruoyao/*
3*ee0d0305SXi Ruoyao * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
4*ee0d0305SXi Ruoyao *
5*ee0d0305SXi Ruoyao * Based on arch/loongarch/vdso/vgetrandom-chacha.S.
6*ee0d0305SXi Ruoyao */
7*ee0d0305SXi Ruoyao
8*ee0d0305SXi Ruoyao#include <asm/asm.h>
9*ee0d0305SXi Ruoyao#include <linux/linkage.h>
10*ee0d0305SXi Ruoyao
11*ee0d0305SXi Ruoyao.text
12*ee0d0305SXi Ruoyao
/*
 * ROTRI rd, rs, imm
 *
 * 32-bit rotate right by a constant: the low 32 bits of \rd receive the low
 * 32 bits of \rs rotated right by \imm (0 < \imm < 32).
 *
 * Clobbers t0, which is left holding the left-shifted half of \rs.
 *
 * The left-shifted half is captured in t0 before \rd is written, so \rd and
 * \rs may name the same register.  slliw sign-extends its 32-bit result, so
 * only the low 32 bits of \rd should be relied upon afterwards.
 */
.macro	ROTRI	rd rs imm
	slliw	t0, \rs, 32 - \imm	/* t0 = bits that wrap around to the top */
	srliw	\rd, \rs, \imm		/* rd = bits that stay, moved down */
	or	\rd, t0, \rd		/* merge the two halves */
.endm
18*ee0d0305SXi Ruoyao
/*
 * OP_1REG op, d, s
 *
 * Private helper for OP_4REG: emits "\op \d, \d, \s", i.e. applies a
 * three-operand instruction or macro with \d as both destination and first
 * source.
 */
.macro	OP_1REG	op d s
	\op	\d, \d, \s
.endm

/*
 * OP_4REG op, d0, d1, d2, d3, s0, s1, s2, s3
 *
 * Applies \op lane by lane to four destination/source pairs, in order:
 *	d0 = op(d0, s0), d1 = op(d1, s1), d2 = op(d2, s2), d3 = op(d3, s3)
 * \op may be an instruction mnemonic or another three-operand macro; the
 * \sN operands are passed through verbatim, so they may be registers or
 * immediates, whichever \op accepts.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	OP_1REG	\op, \d0, \s0
	OP_1REG	\op, \d1, \s1
	OP_1REG	\op, \d2, \s2
	OP_1REG	\op, \d3, \s3
.endm
25*ee0d0305SXi Ruoyao
/*
 * __arch_chacha20_blocks_nostack: write whole 64-byte ChaCha20 blocks to the
 * output buffer while keeping the cipher state in registers only.
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block loop is bottom-tested, so a3 must be non-zero on entry.
 * The counter is read once with ld, incremented by one per block, and
 * written back once with sd after the last block; together with the 64-bit
 * shifts on the counter this makes the routine RV64-only.
 *
 * On return a0 points just past the last block written and a3 is zero.
 * Clobbers a4-a7 and t0-t6; t0 and the registers holding state[12..15] are
 * zeroed before returning.  s0-s11 are preserved.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		s10
#define state11		s11
#define state12		a5
#define state13		a6
#define state14		a7
#define state15		t1
#define cnt		t2
#define copy0		t3
#define copy1		t4
#define copy2		t5
#define copy3		t6

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/* Rows 1-3 rotated left by 1, 2 and 3 words: the diagonals of the matrix */
#define line1_perm	state5, state6, state7, state4
#define line2_perm	state10, state11, state8, state9
#define line3_perm	state15, state12, state13, state14

#define copy		copy0, copy1, copy2, copy3

/*
 * Rotate amounts for ROTRI, which rotates right: rotating right by
 * 16/20/24/25 is rotating left by 16/12/8/7.
 */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * s0-s11 are callee-saved and all twelve are used to hold
	 * state[0..11], so save them here.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack, only the caller's register values.
	 */
	addi		sp, sp, -12*SZREG
	REG_S		s0,         (sp)
	REG_S		s1,    SZREG(sp)
	REG_S		s2,  2*SZREG(sp)
	REG_S		s3,  3*SZREG(sp)
	REG_S		s4,  4*SZREG(sp)
	REG_S		s5,  5*SZREG(sp)
	REG_S		s6,  6*SZREG(sp)
	REG_S		s7,  7*SZREG(sp)
	REG_S		s8,  8*SZREG(sp)
	REG_S		s9,  9*SZREG(sp)
	REG_S		s10, 10*SZREG(sp)
	REG_S		s11, 11*SZREG(sp)

	/* cnt = 64-bit block counter, kept in one register for all blocks */
	ld		cnt, (counter)

	/* copy[0,1,2,3] = "expand 32-byte k", loaded once and reused per block */
	li		copy0, 0x61707865
	li		copy1, 0x3320646e
	li		copy2, 0x79622d32
	li		copy3, 0x6b206574

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	mv		state0, copy0
	mv		state1, copy1
	mv		state2, copy2
	mv		state3, copy3

	/* state[4,5,..,11] = key */
	lw		state4,   (key)
	lw		state5,  4(key)
	lw		state6,  8(key)
	lw		state7,  12(key)
	lw		state8,  16(key)
	lw		state9,  20(key)
	lw		state10, 24(key)
	lw		state11, 28(key)

	/*
	 * state[12,13] = counter
	 * state12 keeps the high half of cnt in its upper bits; that is
	 * harmless because only the low 32 bits of each state word are
	 * consumed (addw, slliw/srliw and sw below).
	 */
	mv		state12, cnt
	srli		state13, cnt, 32

	/* state[14,15] = 0 */
	mv		state14, zero
	mv		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li		i, 10
.Lpermute:
	/* odd round */
	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _16

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _20

	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _24

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _25

	/* even round */
	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _16

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _20

	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _24

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _25

	addi		i, i, -1
	bnez		i, .Lpermute

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	addw	line0, copy
	sw		state0,   (output)
	sw		state1,  4(output)
	sw		state2,  8(output)
	sw		state3, 12(output)

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = lo(key) */
	lw		state0,   (key)
	lw		state1,  4(key)
	lw		state2,  8(key)
	lw		state3, 12(key)

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	addw	line1, line0
	sw		state4, 16(output)
	sw		state5, 20(output)
	sw		state6, 24(output)
	sw		state7, 28(output)

	/* state[0,1,2,3] = hi(key) */
	lw		state0, 16(key)
	lw		state1, 20(key)
	lw		state2, 24(key)
	lw		state3, 28(key)

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	addw	line2, line0
	sw		state8,  32(output)
	sw		state9,  36(output)
	sw		state10, 40(output)
	sw		state11, 44(output)

	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
	addw		state12, state12, cnt
	srli		state0, cnt, 32
	addw		state13, state13, state0
	sw		state12, 48(output)
	sw		state13, 52(output)
	sw		state14, 56(output)
	sw		state15, 60(output)

	/* ++counter */
	addi		cnt, cnt, 1

	/* output += 64 */
	addi		output, output, 64
	/* --nblocks */
	addi		nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	sd		cnt, (counter)

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again.  copy[0,1,2,3] only contain "expand 32-byte k", and
	 * state[0,...,11] live in s0-s11, which are overwritten with the
	 * caller's values in the epilogue.  That leaves state[12,...,15] and
	 * t0: ROTRI uses t0 as scratch, so it still holds the shifted bits of
	 * a state word from the last round.
	 */
	mv		state12, zero
	mv		state13, zero
	mv		state14, zero
	mv		state15, zero
	mv		t0, zero

	REG_L		s0,         (sp)
	REG_L		s1,    SZREG(sp)
	REG_L		s2,  2*SZREG(sp)
	REG_L		s3,  3*SZREG(sp)
	REG_L		s4,  4*SZREG(sp)
	REG_L		s5,  5*SZREG(sp)
	REG_L		s6,  6*SZREG(sp)
	REG_L		s7,  7*SZREG(sp)
	REG_L		s8,  8*SZREG(sp)
	REG_L		s9,  9*SZREG(sp)
	REG_L		s10, 10*SZREG(sp)
	REG_L		s11, 11*SZREG(sp)
	addi		sp, sp, 12*SZREG

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
250