/* xref: /linux/lib/crypto/x86/blake2s-core.S (revision 13150742b09e720fdf021de14cd2b98b37415a89) */
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/* The eight 32-bit BLAKE2s IV words (RFC 7693), stored little endian. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb mask: rotate each 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/* pshufb mask: rotate each 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * Message schedule for the SSSE3 path: ten rounds of 16 byte-sized message
 * word indices.  NOTE(review): this is a permuted form of the standard
 * BLAKE2s sigma (round 0 is not the identity), matched to the word-gathering
 * order used by the round loop below.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
/*
 * Message schedule for the AVX-512 path: ten rounds of 16 dword-sized
 * indices, used directly as vpermi2d control vectors.  NOTE(review): also a
 * permuted schedule, and distinct from SIGMA above because the permutation
 * is applied cumulatively to the in-register message words each round.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9

.text
/*
 * void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *block,
 *			       size_t nblocks, u32 inc)
 *
 * SysV AMD64 ABI: rdi = state, rsi = message block(s), rdx = nblocks,
 * rcx = counter increment applied per block.
 * Assumes the state layout h[8] at +0x00 and t[2],f[2] at +0x20 —
 * matches struct blake2s_state.
 *
 * Register roles inside the loop:
 *   xmm0/xmm1   = working rows a/b (h[0..3] / h[4..7])
 *   xmm2/xmm3   = working rows c/d (IV words; d xored with t/f)
 *   xmm10/xmm11 = saved h for the final feed-forward
 *   xmm12/xmm13 = pshufb masks for ror16 / ror8
 *   xmm14       = t[0],t[1],f[0],f[1] from state+0x20
 *   xmm15       = 64-bit counter increment (from rcx)
 *   rcx/r8      = SIGMA schedule cursor / end-of-schedule sentinel
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop		/* no-op when nblocks == 0 */
	movdqu		(%rdi),%xmm0		/* h[0..3] */
	movdqu		0x10(%rdi),%xmm1	/* h[4..7] */
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14	/* t[0],t[1],f[0],f[1] */
	movq		%rcx,%xmm15		/* inc, zero-extended */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of 10 * 16-byte schedule */
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	/* One 64-byte message block per iteration. */
	movdqa		%xmm0,%xmm10		/* save h for feed-forward */
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14		/* t += inc (64-bit lane add) */
	movdqa		IV(%rip),%xmm2		/* c = IV[0..3] */
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3	/* d = IV[4..7] ^ (t,f) */
	leaq		SIGMA(%rip),%rcx	/* restart the round schedule */
.Lroundloop:
	/* Gather m[sigma[0..3]] one dword at a time into xmm4. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* Column step, first half: a += m + b; d = ror16(d ^ a);
	 * c += d; b = ror12(b ^ c). */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		/* ror16 via byte shuffle */
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1		/* ror12 = (b >> 12) | (b << 20) */
	/* Gather m[sigma[4..7]]. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* Column step, second half: same shape with ror8/ror7. */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		/* ror8 via byte shuffle */
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1		/* ror7 = (b >> 7) | (b << 25) */
	/* Rotate rows so the diagonals line up in columns. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather m[sigma[8..11]] for the diagonal step. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* Diagonal step, first half (ror16/ror12). */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather m[sigma[12..15]]. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* Diagonal step, second half (ror8/ror7). */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undo the diagonalization. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx		/* next round's 16 indices */
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: h' = h ^ v[0..7] ^ v[8..15]. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi		/* advance to next message block */
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)		/* store updated h */
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)	/* store updated t (f unchanged) */
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
/*
 * void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block,
 *				size_t nblocks, u32 inc)
 *
 * Same contract and register conventions as blake2s_compress_ssse3, using
 * AVX-512VL: vprord replaces the shift/or and pshufb rotates, and vpermi2d
 * gathers a full round's message words in one shot from SIGMA2.
 *
 * NOTE(review): added the nblocks == 0 guard for parity with the SSSE3
 * variant; without it, rdx == 0 would wrap through ~2^64 iterations.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_end	/* no-op when nblocks == 0 */
	vmovdqu		(%rdi),%xmm0		/* h[0..3] */
	vmovdqu		0x10(%rdi),%xmm1	/* h[4..7] */
	vmovdqu		0x20(%rdi),%xmm4	/* t[0],t[1],f[0],f[1] */
	vmovq		%rcx,%xmm5		/* counter increment */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10		/* save h for feed-forward */
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4	/* t += inc (64-bit lane add) */
	vmovdqa		%xmm14,%xmm2		/* c = IV[0..3] */
	vpxor		%xmm15,%xmm4,%xmm3	/* d = IV[4..7] ^ (t,f) */
	vmovdqu		(%rsi),%ymm6		/* whole 64-byte message block */
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* 10 rounds; rcx arg consumed above */
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8	/* this round's permutation vectors */
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8	/* gather 16 message words */
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6		/* permuted words feed next round */
	vmovdqa		%ymm9,%ymm7
	/* Column step: a += m + b; d = ror16(d^a); c += d; b = ror12(b^c). */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8	/* next four message words */
	/* Column step, second half (ror8/ror7). */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Diagonalize. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* Diagonal step, first half (ror16/ror12). */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	/* Diagonal step, second half (ror8/ror7). */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the diagonalization. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h' = h ^ v[0..7] ^ v[8..15]. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)		/* store updated h */
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)	/* store updated t (f unchanged) */
	vzeroupper				/* clean AVX state before return */
.Lblake2s_compress_avx512_end:
	RET
SYM_FUNC_END(blake2s_compress_avx512)