// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * First row of the ChaCha state: the ASCII string "expand 32-byte k" laid out
 * as four little-endian 32-bit words. It is 16-byte aligned because the
 * function below fetches it with an aligned load (movaps).
 */
.section	.rodata, "a"
.align 16
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
 * of blocks of output with a nonce of 0, taking an input key and 8-byte
 * counter. Importantly does not spill to the stack. Its arguments are:
 *
 *	rdi: output bytes
 *	rsi: 32-byte key input
 *	rdx: 8-byte counter input/output
 *	rcx: number of 64-byte blocks to write to output
 *
 * Contract details that follow from the code below:
 *
 *  - rcx must be non-zero. Both loops are bottom-tested (dec + jnz), so one
 *    block is always written before the count is checked, and a count of zero
 *    would wrap around rather than return immediately.
 *  - The key and the output are accessed with unaligned moves (movups), so
 *    neither needs any particular alignment.
 *  - On return the 8 bytes at rdx hold the input counter advanced by the
 *    number of blocks produced.
 *  - Clobbers: rax, rcx (counted down to zero), rdi (advanced past the last
 *    block written), xmm0-xmm9 and the flags. rsi and rdx are left unchanged.
 *  - No stack memory is read or written.
 *
 * Each stateN/copyN register holds one 16-byte row of the 4x4 word matrix:
 * row 0 is the constant, rows 1-2 are the key, row 3 is counter || nonce.
 * copyN keeps the block's input rows for the final feed-forward addition,
 * while stateN is the working copy that gets permuted.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	i,		%al
/* xmm registers are *not* callee-save. */
.set	temp,		%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
.set	one,		%xmm9

	/* copy0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),copy0
	/* copy1,copy2 = key */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* copy3 = counter || zero nonce (the 8-byte load clears the upper half) */
	movq		0x00(counter),copy3
	/* one = 1 || 0, so adding it bumps only the low 64-bit counter lane */
	movq		$1,%rax
	movq		%rax,one

.Lblock:
	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/* 10 iterations of the double round below = 20 ChaCha rounds */
	movb		$10,i
.Lpermute:
	/*
	 * First half: four quarter-round steps applied to all four columns at
	 * once. Each rotl32(x, n) is built from a shift pair:
	 * (x << n) | (x >> (32 - n)), with temp holding the left-shifted half.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/*
	 * Rotate rows 1-3 by one, two and three lanes so that the same
	 * lane-wise steps below operate on the diagonals.
	 */
	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* Second half: the same four steps, now on the diagonals. */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Undo the lane rotation so the rows line up as columns again. */
	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		i
	jnz		.Lpermute

	/*
	 * Feed-forward: add the block's input rows to the permuted rows with
	 * 32-bit lane additions and store the 64-byte result.
	 */
	/* output0 = state0 + copy0 */
	paddd		copy0,state0
	movups		state0,0x00(output)
	/* output1 = state1 + copy1 */
	paddd		copy1,state1
	movups		state1,0x10(output)
	/* output2 = state2 + copy2 */
	paddd		copy2,state2
	movups		state2,0x20(output)
	/* output3 = state3 + copy3 */
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* ++copy3.counter (64-bit lane add; the nonce lane gets +0) */
	paddq		one,copy3

	/* output += 64, --nblocks */
	addq		$64,output
	decq		nblocks
	jnz		.Lblock

	/* counter = copy3.counter (stores only the low 8 bytes) */
	movq		copy3,0x00(counter)

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. That covers the working state, the key copies and temp; copy0
	 * (constant), copy3 (counter) and one are left as they are.
	 */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		copy1,copy1
	pxor		copy2,copy2
	pxor		temp,temp

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)