/* xref: /linux/lib/crypto/x86/blake2s-core.S (revision c19bdf24cc274e96006267173d664df2ef4b13db) */
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

9.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
10.align 32
11IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
12	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
13.section .rodata.cst16.ROT16, "aM", @progbits, 16
14.align 16
15ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
16.section .rodata.cst16.ROR328, "aM", @progbits, 16
17.align 16
18ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
19.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
20.align 64
21SIGMA:
22.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
23.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
24.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
25.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
26.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
27.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
28.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
29.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
30.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
31.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
32.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160
33.align 64
34SIGMA2:
35.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
36.byte  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
37.byte 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
38.byte 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
39.byte  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
40.byte  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
41.byte  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
42.byte  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
43.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
44.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
45
46.text
47SYM_FUNC_START(blake2s_compress_ssse3)
48	movdqu		(%rdi),%xmm0
49	movdqu		0x10(%rdi),%xmm1
50	movdqa		ROT16(%rip),%xmm12
51	movdqa		ROR328(%rip),%xmm13
52	movdqu		0x20(%rdi),%xmm14
53	movd		%ecx,%xmm15
54	leaq		SIGMA+0xa0(%rip),%r8
55	jmp		.Lbeginofloop
56	.align		32
57.Lbeginofloop:
58	movdqa		%xmm0,%xmm10
59	movdqa		%xmm1,%xmm11
60	paddq		%xmm15,%xmm14
61	movdqa		IV(%rip),%xmm2
62	movdqa		%xmm14,%xmm3
63	pxor		IV+0x10(%rip),%xmm3
64	leaq		SIGMA(%rip),%rcx
65.Lroundloop:
66	movzbl		(%rcx),%eax
67	movd		(%rsi,%rax,4),%xmm4
68	movzbl		0x1(%rcx),%eax
69	movd		(%rsi,%rax,4),%xmm5
70	movzbl		0x2(%rcx),%eax
71	movd		(%rsi,%rax,4),%xmm6
72	movzbl		0x3(%rcx),%eax
73	movd		(%rsi,%rax,4),%xmm7
74	punpckldq	%xmm5,%xmm4
75	punpckldq	%xmm7,%xmm6
76	punpcklqdq	%xmm6,%xmm4
77	paddd		%xmm4,%xmm0
78	paddd		%xmm1,%xmm0
79	pxor		%xmm0,%xmm3
80	pshufb		%xmm12,%xmm3
81	paddd		%xmm3,%xmm2
82	pxor		%xmm2,%xmm1
83	movdqa		%xmm1,%xmm8
84	psrld		$0xc,%xmm1
85	pslld		$0x14,%xmm8
86	por		%xmm8,%xmm1
87	movzbl		0x4(%rcx),%eax
88	movd		(%rsi,%rax,4),%xmm5
89	movzbl		0x5(%rcx),%eax
90	movd		(%rsi,%rax,4),%xmm6
91	movzbl		0x6(%rcx),%eax
92	movd		(%rsi,%rax,4),%xmm7
93	movzbl		0x7(%rcx),%eax
94	movd		(%rsi,%rax,4),%xmm4
95	punpckldq	%xmm6,%xmm5
96	punpckldq	%xmm4,%xmm7
97	punpcklqdq	%xmm7,%xmm5
98	paddd		%xmm5,%xmm0
99	paddd		%xmm1,%xmm0
100	pxor		%xmm0,%xmm3
101	pshufb		%xmm13,%xmm3
102	paddd		%xmm3,%xmm2
103	pxor		%xmm2,%xmm1
104	movdqa		%xmm1,%xmm8
105	psrld		$0x7,%xmm1
106	pslld		$0x19,%xmm8
107	por		%xmm8,%xmm1
108	pshufd		$0x93,%xmm0,%xmm0
109	pshufd		$0x4e,%xmm3,%xmm3
110	pshufd		$0x39,%xmm2,%xmm2
111	movzbl		0x8(%rcx),%eax
112	movd		(%rsi,%rax,4),%xmm6
113	movzbl		0x9(%rcx),%eax
114	movd		(%rsi,%rax,4),%xmm7
115	movzbl		0xa(%rcx),%eax
116	movd		(%rsi,%rax,4),%xmm4
117	movzbl		0xb(%rcx),%eax
118	movd		(%rsi,%rax,4),%xmm5
119	punpckldq	%xmm7,%xmm6
120	punpckldq	%xmm5,%xmm4
121	punpcklqdq	%xmm4,%xmm6
122	paddd		%xmm6,%xmm0
123	paddd		%xmm1,%xmm0
124	pxor		%xmm0,%xmm3
125	pshufb		%xmm12,%xmm3
126	paddd		%xmm3,%xmm2
127	pxor		%xmm2,%xmm1
128	movdqa		%xmm1,%xmm8
129	psrld		$0xc,%xmm1
130	pslld		$0x14,%xmm8
131	por		%xmm8,%xmm1
132	movzbl		0xc(%rcx),%eax
133	movd		(%rsi,%rax,4),%xmm7
134	movzbl		0xd(%rcx),%eax
135	movd		(%rsi,%rax,4),%xmm4
136	movzbl		0xe(%rcx),%eax
137	movd		(%rsi,%rax,4),%xmm5
138	movzbl		0xf(%rcx),%eax
139	movd		(%rsi,%rax,4),%xmm6
140	punpckldq	%xmm4,%xmm7
141	punpckldq	%xmm6,%xmm5
142	punpcklqdq	%xmm5,%xmm7
143	paddd		%xmm7,%xmm0
144	paddd		%xmm1,%xmm0
145	pxor		%xmm0,%xmm3
146	pshufb		%xmm13,%xmm3
147	paddd		%xmm3,%xmm2
148	pxor		%xmm2,%xmm1
149	movdqa		%xmm1,%xmm8
150	psrld		$0x7,%xmm1
151	pslld		$0x19,%xmm8
152	por		%xmm8,%xmm1
153	pshufd		$0x39,%xmm0,%xmm0
154	pshufd		$0x4e,%xmm3,%xmm3
155	pshufd		$0x93,%xmm2,%xmm2
156	addq		$0x10,%rcx
157	cmpq		%r8,%rcx
158	jnz		.Lroundloop
159	pxor		%xmm2,%xmm0
160	pxor		%xmm3,%xmm1
161	pxor		%xmm10,%xmm0
162	pxor		%xmm11,%xmm1
163	addq		$0x40,%rsi
164	decq		%rdx
165	jnz		.Lbeginofloop
166	movdqu		%xmm0,(%rdi)
167	movdqu		%xmm1,0x10(%rdi)
168	movdqu		%xmm14,0x20(%rdi)
169	RET
170SYM_FUNC_END(blake2s_compress_ssse3)
171
172SYM_FUNC_START(blake2s_compress_avx512)
173	vmovdqu		(%rdi),%xmm0
174	vmovdqu		0x10(%rdi),%xmm1
175	vmovdqu		0x20(%rdi),%xmm4
176	vmovd		%ecx,%xmm5
177	vmovdqa		IV(%rip),%xmm14
178	vmovdqa		IV+16(%rip),%xmm15
179	jmp		.Lblake2s_compress_avx512_mainloop
180.align 32
181.Lblake2s_compress_avx512_mainloop:
182	vmovdqa		%xmm0,%xmm10
183	vmovdqa		%xmm1,%xmm11
184	vpaddq		%xmm5,%xmm4,%xmm4
185	vmovdqa		%xmm14,%xmm2
186	vpxor		%xmm15,%xmm4,%xmm3
187	vmovdqu		(%rsi),%ymm6
188	vmovdqu		0x20(%rsi),%ymm7
189	addq		$0x40,%rsi
190	leaq		SIGMA2(%rip),%rax
191	movb		$0xa,%cl
192.Lblake2s_compress_avx512_roundloop:
193	vpmovzxbd	(%rax),%ymm8
194	vpmovzxbd	0x8(%rax),%ymm9
195	addq		$0x10,%rax
196	vpermi2d	%ymm7,%ymm6,%ymm8
197	vpermi2d	%ymm7,%ymm6,%ymm9
198	vmovdqa		%ymm8,%ymm6
199	vmovdqa		%ymm9,%ymm7
200	vpaddd		%xmm8,%xmm0,%xmm0
201	vpaddd		%xmm1,%xmm0,%xmm0
202	vpxor		%xmm0,%xmm3,%xmm3
203	vprord		$0x10,%xmm3,%xmm3
204	vpaddd		%xmm3,%xmm2,%xmm2
205	vpxor		%xmm2,%xmm1,%xmm1
206	vprord		$0xc,%xmm1,%xmm1
207	vextracti128	$0x1,%ymm8,%xmm8
208	vpaddd		%xmm8,%xmm0,%xmm0
209	vpaddd		%xmm1,%xmm0,%xmm0
210	vpxor		%xmm0,%xmm3,%xmm3
211	vprord		$0x8,%xmm3,%xmm3
212	vpaddd		%xmm3,%xmm2,%xmm2
213	vpxor		%xmm2,%xmm1,%xmm1
214	vprord		$0x7,%xmm1,%xmm1
215	vpshufd		$0x93,%xmm0,%xmm0
216	vpshufd		$0x4e,%xmm3,%xmm3
217	vpshufd		$0x39,%xmm2,%xmm2
218	vpaddd		%xmm9,%xmm0,%xmm0
219	vpaddd		%xmm1,%xmm0,%xmm0
220	vpxor		%xmm0,%xmm3,%xmm3
221	vprord		$0x10,%xmm3,%xmm3
222	vpaddd		%xmm3,%xmm2,%xmm2
223	vpxor		%xmm2,%xmm1,%xmm1
224	vprord		$0xc,%xmm1,%xmm1
225	vextracti128	$0x1,%ymm9,%xmm9
226	vpaddd		%xmm9,%xmm0,%xmm0
227	vpaddd		%xmm1,%xmm0,%xmm0
228	vpxor		%xmm0,%xmm3,%xmm3
229	vprord		$0x8,%xmm3,%xmm3
230	vpaddd		%xmm3,%xmm2,%xmm2
231	vpxor		%xmm2,%xmm1,%xmm1
232	vprord		$0x7,%xmm1,%xmm1
233	vpshufd		$0x39,%xmm0,%xmm0
234	vpshufd		$0x4e,%xmm3,%xmm3
235	vpshufd		$0x93,%xmm2,%xmm2
236	decb		%cl
237	jne		.Lblake2s_compress_avx512_roundloop
238	vpxor		%xmm10,%xmm0,%xmm0
239	vpxor		%xmm11,%xmm1,%xmm1
240	vpxor		%xmm2,%xmm0,%xmm0
241	vpxor		%xmm3,%xmm1,%xmm1
242	decq		%rdx
243	jne		.Lblake2s_compress_avx512_mainloop
244	vmovdqu		%xmm0,(%rdi)
245	vmovdqu		%xmm1,0x10(%rdi)
246	vmovdqu		%xmm4,0x20(%rdi)
247	vzeroupper
248	RET
249SYM_FUNC_END(blake2s_compress_avx512)
250