xref: /linux/lib/crypto/x86/blake2s-core.S (revision 5abe8d8efc022cc78b6273d01e4a453242b9f4d8)
1/* SPDX-License-Identifier: GPL-2.0 OR MIT */
2/*
3 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
5 */
6
7#include <linux/linkage.h>
8
// BLAKE2s initialization vector iv[0..7], stored as eight little-endian
// 32-bit words so that each 16-byte half can be loaded with one movdqa.
.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
.Liv:
	.long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A	// iv[0..3]
	.long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19	// iv[4..7]
14
// pshufb control mask: output byte i takes input byte mask[i], which rotates
// every 32-bit lane right by 16 bits.
.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
.Lror16:
	.byte  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
19
// pshufb control mask: output byte i takes input byte mask[i], which rotates
// every 32-bit lane right by 8 bits.
.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
.Lror8:
	.byte  1,  2,  3,  0,  5,  6,  7,  4,  9, 10, 11,  8, 13, 14, 15, 12
24
// Message schedule for blake2s_compress_ssse3: 10 rounds x 16 bytes.  Each
// byte is the index of a 32-bit message word within the current 64-byte block.
// A round consumes its 16 indices in four groups of four; the first line of
// each round below is used before the mid-round lane rotation and the second
// line after it.
.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
.Lsigma:
	// Round 0
	.byte  0,  2,  4,  6,  1,  3,  5,  7
	.byte 14,  8, 10, 12, 15,  9, 11, 13
	// Round 1
	.byte 14,  4,  9, 13, 10,  8, 15,  6
	.byte  5,  1,  0, 11,  3, 12,  2,  7
	// Round 2
	.byte 11, 12,  5, 15,  8,  0,  2, 13
	.byte  9, 10,  3,  7,  4, 14,  6,  1
	// Round 3
	.byte  7,  3, 13, 11,  9,  1, 12, 14
	.byte 15,  2,  5,  4,  8,  6, 10,  0
	// Round 4
	.byte  9,  5,  2, 10,  0,  7,  4, 15
	.byte  3, 14, 11,  6, 13,  1, 12,  8
	// Round 5
	.byte  2,  6,  0,  8, 12, 10, 11,  3
	.byte  1,  4,  7, 15,  9, 13,  5, 14
	// Round 6
	.byte 12,  1, 14,  4,  5, 15, 13, 10
	.byte  8,  0,  6,  9, 11,  7,  3,  2
	// Round 7
	.byte 13,  7, 12,  3, 11, 14,  1,  9
	.byte  2,  5, 15,  8, 10,  0,  4,  6
	// Round 8
	.byte  6, 14, 11,  0, 15,  9,  3,  8
	.byte 10, 12, 13,  1,  5,  2,  7,  4
	// Round 9
	.byte 10,  8,  7,  1,  2,  4,  6,  5
	.byte 13, 15,  9,  3,  0, 11, 14, 12
38
// Message schedule for blake2s_compress_avx512: 10 rounds x 16 bytes of
// vpermi2d indices.  That function overwrites its two message registers with
// the permuted words every round, so each row after the first indexes into the
// message order left behind by the previous round rather than into the
// original block.  The first line of each round fills the register holding
// the words used before the mid-round lane rotation; the second line fills
// the register holding the words used after it.
.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
.Lsigma2:
	// Round 0
	.byte  0,  2,  4,  6,  1,  3,  5,  7
	.byte 14,  8, 10, 12, 15,  9, 11, 13
	// Round 1
	.byte  8,  2, 13, 15, 10,  9, 12,  3
	.byte  6,  4,  0, 14,  5, 11,  1,  7
	// Round 2
	.byte 11, 13,  8,  6,  5, 10, 14,  3
	.byte  2,  4, 12, 15,  1,  0,  7,  9
	// Round 3
	.byte 11, 10,  7,  0,  8, 15,  1, 13
	.byte  3,  6,  2, 12,  4, 14,  9,  5
	// Round 4
	.byte  4, 10,  9, 14, 15,  0, 11,  8
	.byte  1,  7,  3, 13,  2,  5,  6, 12
	// Round 5
	.byte  2, 11,  4, 15, 14,  3, 10,  8
	.byte 13,  6,  5,  7,  0, 12,  1,  9
	// Round 6
	.byte  4,  8, 15,  9, 14, 11, 13,  5
	.byte  3,  2,  1, 12,  6, 10,  7,  0
	// Round 7
	.byte  6, 13,  0, 14, 12,  2,  1, 11
	.byte 15,  4,  5,  8,  7,  9,  3, 10
	// Round 8
	.byte 15,  5,  4, 13, 10,  7,  3, 11
	.byte 12,  2,  0,  6,  9,  8,  1, 14
	// Round 9
	.byte  8,  7, 14, 11, 13, 15,  0, 12
	.byte 10,  4,  5,  6,  3,  2,  1,  9
52
// Registers holding the four arguments (ctx, data, nblocks, inc) of the
// compression functions in this file.  Both functions copy INC into a vector
// register first and then reuse %rcx/%cl as a scratch register, so INC is only
// valid at function entry.
#define CTX		%rdi
#define DATA		%rsi
#define NBLOCKS		%rdx
#define INC		%ecx
57
.text
//
// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
//			       const u8 *data, size_t nblocks, u32 inc);
//
// Compress 'nblocks' consecutive 64-byte blocks starting at 'data' into the
// hash state.  'inc' is added to the 64-bit counter t before each block.
// If nblocks == 0 the function returns without touching *ctx.
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Register usage:
//	%xmm0..%xmm3	v[0..3], v[4..7], v[8..11], v[12..15]
//	%xmm4..%xmm7	message words gathered for the current quarter-round
//	%xmm8		temporary for the 12-bit and 7-bit rotations
//	%xmm10,%xmm11	h[0..7] as it was before the current block
//	%xmm12,%xmm13	pshufb masks .Lror16 and .Lror8
//	%xmm14		[t, f]
//	%xmm15		inc, zero-extended to 128 bits
//	%rcx		pointer to the current round's row of .Lsigma
//			(overwrites INC, which has been copied to %xmm15)
//	%r8		end of .Lsigma
//	%rax		current message word index
//
SYM_FUNC_START(blake2s_compress_ssse3)
	// The main loop tests NBLOCKS only after processing a block, so a
	// count of zero would wrap around; bail out before touching anything.
	testq		NBLOCKS,NBLOCKS
	jz		.Lssse3_done

	movdqu		(CTX),%xmm0		// Load h[0..3]
	movdqu		16(CTX),%xmm1		// Load h[4..7]
	movdqa		.Lror16(%rip),%xmm12
	movdqa		.Lror8(%rip),%xmm13
	movdqu		32(CTX),%xmm14		// Load t and f
	movd		INC,%xmm15		// Load inc
	leaq		.Lsigma+160(%rip),%r8	// End of the 10 x 16-byte table
	jmp		.Lssse3_mainloop

	.align		32
.Lssse3_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	movdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	movdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	paddq		%xmm15,%xmm14		// t += inc (64-bit addition)
	movdqa		.Liv(%rip),%xmm2	// v[8..11] = iv[0..3]
	movdqa		%xmm14,%xmm3
	pxor		.Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	leaq		.Lsigma(%rip),%rcx	// Start at round 0's indices

.Lssse3_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// Gather the message words selected by sigma bytes 0..3 into %xmm4.
	movzbl		(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		1(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		2(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		3(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	// a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12)
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1

	// Gather the message words selected by sigma bytes 4..7 into %xmm5.
	movzbl		4(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		5(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		6(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		7(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	// a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7)
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1

	// Rotate the lanes of a, d and c for the second half of the round.
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2

	// Gather the message words selected by sigma bytes 8..11 into %xmm6.
	movzbl		8(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		9(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		10(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		11(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	// a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12)
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1

	// Gather the message words selected by sigma bytes 12..15 into %xmm7.
	movzbl		12(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		13(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		14(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		15(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	// a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7)
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1

	// Undo the mid-round lane rotation.
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2

	addq		$16,%rcx		// Next row of .Lsigma
	cmpq		%r8,%rcx
	jnz		.Lssse3_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$64,DATA
	decq		NBLOCKS
	jnz		.Lssse3_mainloop

	movdqu		%xmm0,(CTX)		// Store new h[0..3]
	movdqu		%xmm1,16(CTX)		// Store new h[4..7]
	movq		%xmm14,32(CTX)		// Store new t (f is unchanged)
.Lssse3_done:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
199
//
// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
//				const u8 *data, size_t nblocks, u32 inc);
//
// Compress 'nblocks' consecutive 64-byte blocks starting at 'data' into the
// hash state.  'inc' is added to the 64-bit counter t before each block.
// If nblocks == 0 the function returns without touching *ctx.
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Register usage:
//	%xmm0..%xmm3	v[0..3], v[4..7], v[8..11], v[12..15]
//	%xmm4		[t, f]
//	%xmm5		inc, zero-extended to 128 bits
//	%ymm6,%ymm7	the 16 message words, in the order left by the
//			previous round's permutation
//	%ymm8,%ymm9	.Lsigma2 indices, then the permuted message words
//	%xmm10,%xmm11	h[0..7] as it was before the current block
//	%xmm14,%xmm15	iv[0..3], iv[4..7]
//	%rax		pointer to the current round's row of .Lsigma2
//	%cl		rounds remaining
//			(overwrites INC, which has been copied to %xmm5)
//
SYM_FUNC_START(blake2s_compress_avx512)
	// The main loop tests NBLOCKS only after processing a block, so a
	// count of zero would wrap around; bail out before touching anything.
	// No vector register has been written yet, so no vzeroupper is needed
	// on this path.
	testq		NBLOCKS,NBLOCKS
	jz		.Lavx512_done

	vmovdqu		(CTX),%xmm0		// Load h[0..3]
	vmovdqu		16(CTX),%xmm1		// Load h[4..7]
	vmovdqu		32(CTX),%xmm4		// Load t and f
	vmovd		INC,%xmm5		// Load inc
	vmovdqa		.Liv(%rip),%xmm14	// Load iv[0..3]
	vmovdqa		.Liv+16(%rip),%xmm15	// Load iv[4..7]
	jmp		.Lavx512_mainloop

	.align		32
.Lavx512_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	vmovdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	vmovdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	vpaddq		%xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
	vmovdqa		%xmm14,%xmm2		// v[8..11] = iv[0..3]
	vpxor		%xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	vmovdqu		(DATA),%ymm6		// Load first 8 data words
	vmovdqu		32(DATA),%ymm7		// Load second 8 data words
	addq		$64,DATA
	leaq		.Lsigma2(%rip),%rax
	movb		$10,%cl			// Set num rounds remaining

.Lavx512_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// Zero-extend this round's 16 index bytes to dwords and use them to
	// pick words out of the %ymm6:%ymm7 pair.  The permuted words then
	// become the message registers for the next round, which is why
	// .Lsigma2 holds round-to-round relative indices.
	vpmovzxbd	(%rax),%ymm8
	vpmovzxbd	8(%rax),%ymm9
	addq		$16,%rax
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7

	// a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12)
	// with m = low half of %ymm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$12,%xmm1,%xmm1

	// a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7)
	// with m = high half of %ymm8
	vextracti128	$1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1

	// Rotate the lanes of a, d and c for the second half of the round.
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2

	// a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12)
	// with m = low half of %ymm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$12,%xmm1,%xmm1

	// a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7)
	// with m = high half of %ymm9
	vextracti128	$1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1

	// Undo the mid-round lane rotation.
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lavx512_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	// (truth table 0x96 is the XOR of all three operands)
	vpternlogd	$0x96,%xmm10,%xmm2,%xmm0
	vpternlogd	$0x96,%xmm11,%xmm3,%xmm1
	decq		NBLOCKS
	jne		.Lavx512_mainloop

	vmovdqu		%xmm0,(CTX)		// Store new h[0..3]
	vmovdqu		%xmm1,16(CTX)		// Store new h[4..7]
	vmovq		%xmm4,32(CTX)		// Store new t (f is unchanged)
	vzeroupper				// %ymm6..%ymm9 were written above
.Lavx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
292