xref: /linux/arch/arm64/kernel/vdso/vgetrandom-chacha.S (revision 5afca7e996c42aed1b4a42d4712817601ba42aff)
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/linkage.h>
4#include <asm/cache.h>
5#include <asm/assembler.h>
6
7	.text
8
/*
 * NEON register aliases for the ChaCha20 routine in this file.  Only v0-v7
 * and v16-v18 are named here.
 *
 * state0-state3: working state rows, permuted in place.
 * copy0-copy3:   saved input rows (constant, key, key, counter || nonce)
 *                that are added back after the permute; copy0_q and
 *                copy3_d name the same registers as q/d scalars.
 * one_d/q/v:     views of v16, which holds the counter increment.
 * tmp:           scratch for the shl/sri based rotates.
 *
 * rot8, one_q and copy0_q are not referenced anywhere in the visible code.
 */
9#define state0		v0
10#define state1		v1
11#define state2		v2
12#define state3		v3
13#define copy0		v4
14#define copy0_q		q4
15#define copy1		v5
16#define copy2		v6
17#define copy3		v7
18#define copy3_d		d7
19#define one_d		d16
20#define one_q		q16
21#define one_v		v16
22#define tmp		v17
23#define rot8		v18
24
25/*
26 * ARM64 ChaCha20 implementation meant for vDSO.  Produces a given positive
27 * number of blocks of output with nonce 0, taking an input key and an 8-byte
28 * counter.  Importantly does not spill to the stack.
29 *
30 * This implementation avoids d8-d15 because they are callee-save in user
31 * space.
32 *
33 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
34 *				       const uint8_t *key,
35 * 				       uint32_t *counter,
36 *				       size_t nblocks)
37 *
38 * 	x0: output bytes
39 *	x1: 32-byte key input
40 *	x2: 8-byte counter input/output
41 *	x3: number of 64-byte blocks to write to output
42 */
43SYM_FUNC_START(__arch_chacha20_blocks_nostack)
44
45	/*
	 * copy0 = "expand 32-byte k": the 16 constant bytes, built from two
	 * 64-bit immediates that hold the ASCII text in little-endian order.
	 */
46	mov_q		x8, 0x3320646e61707865
47	mov_q		x9, 0x6b20657479622d32
48	mov		copy0.d[0], x8
49	mov		copy0.d[1], x9
50
51	/* copy1,copy2 = key (32 bytes read from x1) */
52	ld1		{ copy1.4s, copy2.4s }, [x1]
53	/*
	 * copy3 = counter || zero nonce: only 8 bytes are read from x2, and the
	 * 64-bit vector load leaves the upper half of the register zero.
	 */
54	ld1		{ copy3.2s }, [x2]
55
	/*
	 * one_v = { 1, 0, 1, 0 }: movi yields { 1, 1, 0, 0 } and uzp1 keeps the
	 * even-numbered words, so one_d reads as the 64-bit value 1.
	 */
56	movi		one_v.2s, #1
57	uzp1		one_v.4s, one_v.4s, one_v.4s
58
59.Lblock:
60	/*
	 * Start each block from the saved input rows; copy0-copy3 stay intact
	 * for the final add after the permute.
	 */
61	mov		state0.16b, copy0.16b
62	mov		state1.16b, copy1.16b
63	mov		state2.16b, copy2.16b
64	mov		state3.16b, copy3.16b
65
	/* w4 = rounds left; each pass of .Ldoubleround consumes two. */
66	mov		w4, 20
67.Lpermute:
68	/*
69	 * Permute one 64-byte block where the state matrix is stored in the four NEON
70	 * registers state0-state3.  It performs matrix operations on four words in parallel,
71	 * but requires shuffling to rearrange the words after each round.
72	 */
73
74.Ldoubleround:
75	/*
	 * state0 += state1, state3 = rotl32(state3 ^ state0, 16)
	 * (rev32 on halfwords swaps the two 16-bit halves of every word)
	 */
76	add		state0.4s, state0.4s, state1.4s
77	eor		state3.16b, state3.16b, state0.16b
78	rev32		state3.8h, state3.8h
79
80	/*
	 * state2 += state3, state1 = rotl32(state1 ^ state2, 12)
	 * (shl writes tmp << 12, sri then inserts tmp >> 20 into the low bits)
	 */
81	add		state2.4s, state2.4s, state3.4s
82	eor		tmp.16b, state1.16b, state2.16b
83	shl		state1.4s, tmp.4s, #12
84	sri		state1.4s, tmp.4s, #20
85
86	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
87	add		state0.4s, state0.4s, state1.4s
88	eor		tmp.16b, state3.16b, state0.16b
89	shl		state3.4s, tmp.4s, #8
90	sri		state3.4s, tmp.4s, #24
91
92	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
93	add		state2.4s, state2.4s, state3.4s
94	eor		tmp.16b, state1.16b, state2.16b
95	shl		state1.4s, tmp.4s, #7
96	sri		state1.4s, tmp.4s, #25
97
	/*
	 * Rotate rows 1-3 by one, two and three words so that the next four
	 * steps mix the diagonals of the matrix.
	 */
98	/* state1[0,1,2,3] = state1[1,2,3,0] */
99	ext		state1.16b, state1.16b, state1.16b, #4
100	/* state2[0,1,2,3] = state2[2,3,0,1] */
101	ext		state2.16b, state2.16b, state2.16b, #8
102	/* state3[0,1,2,3] = state3[3,0,1,2] */
103	ext		state3.16b, state3.16b, state3.16b, #12
104
105	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
106	add		state0.4s, state0.4s, state1.4s
107	eor		state3.16b, state3.16b, state0.16b
108	rev32		state3.8h, state3.8h
109
110	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
111	add		state2.4s, state2.4s, state3.4s
112	eor		tmp.16b, state1.16b, state2.16b
113	shl		state1.4s, tmp.4s, #12
114	sri		state1.4s, tmp.4s, #20
115
116	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
117	add		state0.4s, state0.4s, state1.4s
118	eor		tmp.16b, state3.16b, state0.16b
119	shl		state3.4s, tmp.4s, #8
120	sri		state3.4s, tmp.4s, #24
121
122	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
123	add		state2.4s, state2.4s, state3.4s
124	eor		tmp.16b, state1.16b, state2.16b
125	shl		state1.4s, tmp.4s, #7
126	sri		state1.4s, tmp.4s, #25
127
	/* Undo the row rotation applied before the second half. */
128	/* state1[0,1,2,3] = state1[3,0,1,2] */
129	ext		state1.16b, state1.16b, state1.16b, #12
130	/* state2[0,1,2,3] = state2[2,3,0,1] */
131	ext		state2.16b, state2.16b, state2.16b, #8
132	/* state3[0,1,2,3] = state3[1,2,3,0] */
133	ext		state3.16b, state3.16b, state3.16b, #4
134
135	subs		w4, w4, #2
136	b.ne		.Ldoubleround
137
138	/* output0 = state0 + copy0 */
139	add		state0.4s, state0.4s, copy0.4s
140	/* output1 = state1 + copy1 */
141	add		state1.4s, state1.4s, copy1.4s
142	/* output2 = state2 + copy2 */
143	add		state2.4s, state2.4s, copy2.4s
144	/* output3 = state3 + copy3 */
145	add		state3.4s, state3.4s, copy3.4s
	/* store the four output rows (64 bytes) at x0 */
146	st1		{ state0.16b - state3.16b }, [x0]
147
148	/*
149	 * ++copy3.counter (64-bit scalar add); the 'add' clears the upper half of the
150	 * SIMD register, which is expected here since that half is the zero nonce.
151	 */
152	add		copy3_d, copy3_d, one_d
153
154	/*
	 * output += 64, --nblocks.  The count is only tested after a block has
	 * been written, so nblocks must be non-zero on entry.
	 */
155	add		x0, x0, 64
156	subs		x3, x3, #1
157	b.ne		.Lblock
158
159	/* counter = copy3.counter (8 bytes written back through x2) */
160	st1		{ copy3.2s }, [x2]
161
162	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these again.
	 *
	 * NOTE(review): tmp (v17) is not cleared here.  It still holds
	 * state1 ^ state2 from the last quarter-round of the last block, i.e.
	 * the final state1 row before its rotate -- confirm whether it should
	 * be zeroed as well.  copy0, copy3 and one_v are also left as they are.
	 */
163	movi		state0.16b, #0
164	movi		state1.16b, #0
165	movi		state2.16b, #0
166	movi		state3.16b, #0
167	movi		copy1.16b, #0
168	movi		copy2.16b, #0
169	ret
170SYM_FUNC_END(__arch_chacha20_blocks_nostack)
171
/*
 * NOTE(review): emit_aarch64_feature_1_and is not defined in this file; it is
 * presumably a macro from <asm/assembler.h> that emits the GNU property note
 * advertising the object's AArch64 feature bits -- confirm against that header.
 */
172emit_aarch64_feature_1_and
173