xref: /linux/arch/s390/kernel/vdso64/vgetrandom-chacha.S (revision 7a4ffec9fd54ea27395e24dff726dbf58e2fe06b)
1/* SPDX-License-Identifier: GPL-2.0 */
2
3#include <linux/stringify.h>
4#include <linux/linkage.h>
5#include <asm/alternative.h>
6#include <asm/dwarf.h>
7#include <asm/fpu-insn.h>
8
/*
 * Vector register aliases used by __arch_chacha20_blocks_nostack:
 *
 *   STATE0-STATE3  working rows of the current block; re-initialised from
 *                  COPY0-COPY3 at the start of every block and mixed in place
 *   COPY0          the "expand 32-byte k" constant row
 *   COPY1, COPY2   the key
 *   COPY3          counter || zero nonce
 *   BEPERM         byte selectors for VPERM (second row of
 *                  chacha20_constants); loaded only on the VPERM store path
 *   TMP0-TMP3      byte-swapped output rows; written only on the VPERM
 *                  store path
 */
9#define STATE0	%v0
10#define STATE1	%v1
11#define STATE2	%v2
12#define STATE3	%v3
13#define COPY0	%v4
14#define COPY1	%v5
15#define COPY2	%v6
16#define COPY3	%v7
17#define BEPERM	%v19
18#define TMP0	%v20
19#define TMP1	%v21
20#define TMP2	%v22
21#define TMP3	%v23
22
23	.section .rodata
24
25	.balign 32
/*
 * Constant table read by __arch_chacha20_blocks_nostack.
 *
 *   offset  0: the four 32-bit words of "expand 32-byte k", loaded into COPY0
 *   offset 16: byte indices 3,2,1,0, 7,6,5,4, ... which reverse the bytes
 *              within each 32-bit word when used as VPERM selectors; loaded
 *              into BEPERM
 */
26SYM_DATA_START_LOCAL(chacha20_constants)
27	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
28	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
29SYM_DATA_END(chacha20_constants)
30
31	.text
32/*
33 * s390 ChaCha20 implementation meant for vDSO. Produces a given positive
34 * number of blocks of output with nonce 0, taking an input key and 8-byte
35 * counter. Does not spill to the stack.
36 *
37 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
38 *				       const uint8_t *key,
39 *				       uint32_t *counter,
40 *				       size_t nblocks)
 *
 * Register usage, as read from the code below:
 *   %r2  dst_bytes; advanced by 64 after each block
 *   %r3  key pointer on entry; reused for the 64-bit counter value once the
 *        32-byte key has been loaded into COPY1/COPY2
 *   %r4  counter pointer; 8 bytes are read once on entry and written back
 *        once on exit
 *   %r5  nblocks; must be non-zero because the block loop only tests it
 *        after a block has already been produced
 *   %r0  double-round loop count
 *   %r1  address of chacha20_constants, then the constant zero
 *
 * Nothing is returned in a register; the results are the 64 * nblocks bytes
 * stored at dst_bytes and the updated counter.
41 */
42SYM_FUNC_START(__arch_chacha20_blocks_nostack)
43	CFI_STARTPROC
44	larl	%r1,chacha20_constants
45
46	/* COPY0 = "expand 32-byte k" */
47	VL	COPY0,0,,%r1
48
49	/* BEPERM = byte selectors for VPERM */
	/*
	 * Only the VPERM store path below reads BEPERM. With facility 148 the
	 * load is replaced by "brcl 0,0" (a branch with mask 0, i.e. never
	 * taken).
	 */
50	ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148)
51
52	/* COPY1,COPY2 = key */
53	VLM	COPY1,COPY2,0,%r3
54
55	/* COPY3 = counter || zero nonce */
	/* The key pointer in %r3 is dead from here on; %r3 now holds the counter */
56	lg	%r3,0(%r4)
57	VZERO	COPY3
58	VLVGG	COPY3,%r3,0
59
	/* %r1 = 0 for the rest of the function; used as the addend of alcr below */
60	lghi	%r1,0
61.Lblock:
	/* Start every block from the unmodified input rows */
62	VLR	STATE0,COPY0
63	VLR	STATE1,COPY1
64	VLR	STATE2,COPY2
65	VLR	STATE3,COPY3
66
	/* 10 iterations of the double round below */
67	lghi	%r0,10
68.Ldoubleround:
69	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
70	VAF	STATE0,STATE0,STATE1
71	VX	STATE3,STATE3,STATE0
72	VERLLF	STATE3,STATE3,16
73
74	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
75	VAF	STATE2,STATE2,STATE3
76	VX	STATE1,STATE1,STATE2
77	VERLLF	STATE1,STATE1,12
78
79	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
80	VAF	STATE0,STATE0,STATE1
81	VX	STATE3,STATE3,STATE0
82	VERLLF	STATE3,STATE3,8
83
84	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
85	VAF	STATE2,STATE2,STATE3
86	VX	STATE1,STATE1,STATE2
87	VERLLF	STATE1,STATE1,7
88
	/*
	 * Rotate rows 1-3 by one, two and three words so that the same
	 * element-wise sequence, repeated below, mixes a different grouping
	 * of words.
	 */
89	/* STATE1[0,1,2,3] = STATE1[1,2,3,0] */
90	VSLDB	STATE1,STATE1,STATE1,4
91	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
92	VSLDB	STATE2,STATE2,STATE2,8
93	/* STATE3[0,1,2,3] = STATE3[3,0,1,2] */
94	VSLDB	STATE3,STATE3,STATE3,12
95
96	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
97	VAF	STATE0,STATE0,STATE1
98	VX	STATE3,STATE3,STATE0
99	VERLLF	STATE3,STATE3,16
100
101	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
102	VAF	STATE2,STATE2,STATE3
103	VX	STATE1,STATE1,STATE2
104	VERLLF	STATE1,STATE1,12
105
106	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
107	VAF	STATE0,STATE0,STATE1
108	VX	STATE3,STATE3,STATE0
109	VERLLF	STATE3,STATE3,8
110
111	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
112	VAF	STATE2,STATE2,STATE3
113	VX	STATE1,STATE1,STATE2
114	VERLLF	STATE1,STATE1,7
115
	/* Undo the row rotation applied in the middle of the double round */
116	/* STATE1[0,1,2,3] = STATE1[3,0,1,2] */
117	VSLDB	STATE1,STATE1,STATE1,12
118	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
119	VSLDB	STATE2,STATE2,STATE2,8
120	/* STATE3[0,1,2,3] = STATE3[1,2,3,0] */
121	VSLDB	STATE3,STATE3,STATE3,4
122	brctg	%r0,.Ldoubleround
123
124	/* OUTPUT0 = STATE0 + COPY0 */
125	VAF	STATE0,STATE0,COPY0
126	/* OUTPUT1 = STATE1 + COPY1 */
127	VAF	STATE1,STATE1,COPY1
128	/* OUTPUT2 = STATE2 + COPY2 */
129	VAF	STATE2,STATE2,COPY2
130	/* OUTPUT3 = STATE3 + COPY3 */
131	VAF	STATE3,STATE3,COPY3
132
	/*
	 * Store the 64-byte block at %r2 as little-endian 32-bit words.
	 * Default: byte-swap each row into TMP0-TMP3 with VPERM/BEPERM and
	 * store all four with one VSTM. With facility 148: store each row
	 * with VSTBRF, leaving TMP0-TMP3 and BEPERM untouched; the trailing
	 * "brcl 0,0" presumably pads the replacement to the length of the
	 * default sequence - confirm against the ALTERNATIVE requirements.
	 */
133	ALTERNATIVE							\
134		__stringify(						\
135		/* Convert STATE to little endian and store to OUTPUT */\
136		VPERM	TMP0,STATE0,STATE0,BEPERM;			\
137		VPERM	TMP1,STATE1,STATE1,BEPERM;			\
138		VPERM	TMP2,STATE2,STATE2,BEPERM;			\
139		VPERM	TMP3,STATE3,STATE3,BEPERM;			\
140		VSTM	TMP0,TMP3,0,%r2),				\
141		__stringify(						\
142		/* 32 bit wise little endian store to OUTPUT */		\
143		VSTBRF	STATE0,0,,%r2;					\
144		VSTBRF	STATE1,16,,%r2;					\
145		VSTBRF	STATE2,32,,%r2;					\
146		VSTBRF	STATE3,48,,%r2;					\
147		brcl	0,0),						\
148		ALT_FACILITY(148)
149
150	/* ++COPY3.COUNTER */
	/*
	 * The counter is kept as two 32-bit words in %r3: alsih adds 1 to the
	 * upper word (the first word in memory), then alcr adds %r1 (zero)
	 * plus the resulting carry to the lower word.
	 */
151	/* alsih %r3,1 */
152	.insn	rilu,0xcc0a00000000,%r3,1
153	alcr	%r3,%r1
154	VLVGG	COPY3,%r3,0
155
156	/* OUTPUT += 64, --NBLOCKS */
157	aghi	%r2,64
158	brctg	%r5,.Lblock
159
160	/* COUNTER = COPY3.COUNTER */
161	stg	%r3,0(%r4)
162
163	/* Zero out potentially sensitive regs */
	/* STATE0-STATE3 hold the last output block, COPY1/COPY2 hold the key */
164	VZERO	STATE0
165	VZERO	STATE1
166	VZERO	STATE2
167	VZERO	STATE3
168	VZERO	COPY1
169	VZERO	COPY2
170
171	/* Early exit if TMP0-TMP3 have not been used */
	/*
	 * Default "nopr" falls through to the TMP clearing below; with
	 * facility 148 (the VSTBRF store path) it is replaced by the return.
	 */
172	ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148)
173
	/* VPERM store path only: TMP0-TMP3 hold a copy of the last output block */
174	VZERO	TMP0
175	VZERO	TMP1
176	VZERO	TMP2
177	VZERO	TMP3
178
179	br	%r14
180	CFI_ENDPROC
181SYM_FUNC_END(__arch_chacha20_blocks_nostack)
182