/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/fpu-insn.h>

#define STATE0	%v0
#define STATE1	%v1
#define STATE2	%v2
#define STATE3	%v3
#define COPY0	%v4
#define COPY1	%v5
#define COPY2	%v6
#define COPY3	%v7
#define PERM4	%v16
#define PERM8	%v17
#define PERM12	%v18
#define BEPERM	%v19
#define TMP0	%v20
#define TMP1	%v21
#define TMP2	%v22
#define TMP3	%v23

	.section .rodata

	.balign 128
.Lconstants:
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
	.long	0x04050607,0x08090a0b,0x0c0d0e0f,0x00010203 # rotl  4 bytes
	.long	0x08090a0b,0x0c0d0e0f,0x00010203,0x04050607 # rotl  8 bytes
	.long	0x0c0d0e0f,0x00010203,0x04050607,0x08090a0b # rotl 12 bytes
	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
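# Worked example (illustration only): interpreted as a VPERM byte
# selector, the "rotl 4 bytes" line picks source bytes 04..0f,00..03,
# i.e. it maps the word lanes V[0,1,2,3] to V[1,2,3,0].  The rotl 8/12
# lines likewise rotate by two and three words, and the "byte swap"
# line (BEPERM) reverses the four bytes within each 32-bit word.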

	.text
/*
 * s390 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and an
 * 8-byte counter. Does not spill to the stack.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *				       const uint8_t *key,
 *				       uint32_t *counter,
 *				       size_t nblocks)
 */
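/*
 * For orientation only - a rough C sketch of one iteration of the block
 * loop below (ad-hoc names; the endian handling of the actual loads and
 * stores is described at the instructions themselves).  QR() is the
 * standard ChaCha quarter-round: add/xor/rotate by 16, 12, 8, 7.
 *
 *	uint32_t x[16], copy[16] = {
 *		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
 *		k0, k1, k2, k3, k4, k5, k6, k7,   // eight key words
 *		counter[0], counter[1], 0, 0      // 64-bit counter, zero nonce
 *	};
 *	memcpy(x, copy, sizeof(x));
 *	for (int i = 0; i < 10; i++) {            // ten double rounds
 *		QR(x, 0, 4,  8, 12); QR(x, 1, 5,  9, 13);  // columns
 *		QR(x, 2, 6, 10, 14); QR(x, 3, 7, 11, 15);
 *		QR(x, 0, 5, 10, 15); QR(x, 1, 6, 11, 12);  // diagonals
 *		QR(x, 2, 7,  8, 13); QR(x, 3, 4,  9, 14);
 *	}
 *	for (int i = 0; i < 16; i++)
 *		store_le32(dst_bytes + 4 * i, x[i] + copy[i]);
 */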
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
	larl	%r1,.Lconstants

	/* COPY0 = "expand 32-byte k" */
	VL	COPY0,0,,%r1

	/* PERM4-PERM12,BEPERM = byte selectors for VPERM */
	VLM	PERM4,BEPERM,16,%r1

	/* COPY1,COPY2 = key */
	VLM	COPY1,COPY2,0,%r3

	/* COPY3 = counter || zero nonce */
	lg	%r3,0(%r4)
	VZERO	COPY3
	VLVGG	COPY3,%r3,0
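	/*
	 * Layout sketch (illustration only): the 8-byte load above puts
	 * counter[0] into the high and counter[1] into the low word of
	 * %r3, so COPY3 = { counter[0], counter[1], 0, 0 } - ChaCha
	 * state words 12-15, i.e. the 64-bit counter plus a zero nonce.
	 */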

	lghi	%r1,0
.Lblock:
	VLR	STATE0,COPY0
	VLR	STATE1,COPY1
	VLR	STATE2,COPY2
	VLR	STATE3,COPY3

	lghi	%r0,10
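	/*
	 * Each VAF/VX/VERLLF triple below runs one step of four
	 * quarter-rounds in parallel, one per 32-bit lane: first down
	 * the columns of the 4x4 state matrix, then - after rotating
	 * STATE1/STATE2/STATE3 by one, two and three words - down the
	 * diagonals.  Ten such double rounds make up the 20 rounds of
	 * ChaCha20.
	 */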
.Ldoubleround:
	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,16

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,8

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,7

	/* STATE1[0,1,2,3] = STATE1[1,2,3,0] */
	VPERM	STATE1,STATE1,STATE1,PERM4
	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
	VPERM	STATE2,STATE2,STATE2,PERM8
	/* STATE3[0,1,2,3] = STATE3[3,0,1,2] */
	VPERM	STATE3,STATE3,STATE3,PERM12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,16

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,8

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,7

	/* STATE1[0,1,2,3] = STATE1[3,0,1,2] */
	VPERM	STATE1,STATE1,STATE1,PERM12
	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
	VPERM	STATE2,STATE2,STATE2,PERM8
	/* STATE3[0,1,2,3] = STATE3[1,2,3,0] */
	VPERM	STATE3,STATE3,STATE3,PERM4
	brctg	%r0,.Ldoubleround

	/* OUTPUT0 = STATE0 + COPY0 */
	VAF	STATE0,STATE0,COPY0
	/* OUTPUT1 = STATE1 + COPY1 */
	VAF	STATE1,STATE1,COPY1
	/* OUTPUT2 = STATE2 + COPY2 */
	VAF	STATE2,STATE2,COPY2
	/* OUTPUT3 = STATE3 + COPY3 */
	VAF	STATE3,STATE3,COPY3

	/*
	 * 32-bit-wise little-endian store to OUTPUT. If the vector
	 * enhancements facility 2 is not installed, use the slow path.
	 */
	ALTERNATIVE "brc 0xf,.Lstoreslow", "nop", ALT_FACILITY(148)
	VSTBRF	STATE0,0,,%r2
	VSTBRF	STATE1,16,,%r2
	VSTBRF	STATE2,32,,%r2
	VSTBRF	STATE3,48,,%r2
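	/*
	 * Example (fast path above): VSTBRF stores each 32-bit element
	 * byte-reversed, so a lane holding 0x12345678 reaches memory as
	 * 78 56 34 12 - matching the little-endian output format also
	 * produced by the slow path below.
	 */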
.Lstoredone:

	/* ++COPY3.COUNTER */
	/* alsih %r3,1 */
	.insn	rilu,0xcc0a00000000,%r3,1
	alcr	%r3,%r1
	VLVGG	COPY3,%r3,0
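	/*
	 * Worked example of the increment above: counter[0] sits in the
	 * high word of %r3, so ALSIH adds 1 there and leaves the carry
	 * in the condition code; ALCR then adds %r1 (still 0) plus that
	 * carry into counter[1] in the low word.  A counter of
	 * { 0xffffffff, 0x00000000 } thus becomes { 0, 1 }.
	 */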

	/* OUTPUT += 64, --NBLOCKS */
	aghi	%r2,64
	brctg	%r5,.Lblock

	/* COUNTER = COPY3.COUNTER */
	stg	%r3,0(%r4)

	/* Zero out potentially sensitive regs */
	VZERO	STATE0
	VZERO	STATE1
	VZERO	STATE2
	VZERO	STATE3
	VZERO	COPY1
	VZERO	COPY2

	/* Early exit if TMP0-TMP3 have not been used */
	ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148)

	VZERO	TMP0
	VZERO	TMP1
	VZERO	TMP2
	VZERO	TMP3

	br	%r14

.Lstoreslow:
	/* Convert STATE to little endian format and store to OUTPUT */
	VPERM	TMP0,STATE0,STATE0,BEPERM
	VPERM	TMP1,STATE1,STATE1,BEPERM
	VPERM	TMP2,STATE2,STATE2,BEPERM
	VPERM	TMP3,STATE3,STATE3,BEPERM
	VSTM	TMP0,TMP3,0,%r2
	j	.Lstoredone
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
186