xref: /freebsd/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
25  */
26 
27 #include <sys/simd.h>
28 #include <sys/zfs_context.h>
29 #include <sys/zfs_impl.h>
30 #include <sys/blake3.h>
31 
32 #include "blake3_impl.h"
33 
/*
 * USE_SIMD gates every SIMD-backed implementation in this file.  It is
 * defined, unless OMIT_SIMD is set, on aarch64, on x86_64 builds that
 * have HAVE_SSE2, and on little-endian PPC64.
 */
#if !defined(OMIT_SIMD)
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE2)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
#define	USE_SIMD
#endif
#endif
39 
40 #ifdef USE_SIMD
41 extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
42     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
43     uint64_t counter, uint8_t flags);
44 
45 extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
46     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
47     uint64_t counter, uint8_t flags, uint8_t out[64]);
48 
49 extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
50     size_t num_inputs, size_t blocks, const uint32_t key[8],
51     uint64_t counter, boolean_t increment_counter, uint8_t flags,
52     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
53 
/*
 * ops->compress_in_place for the "sse2" implementation: forwards all
 * arguments unchanged to zfs_blake3_compress_in_place_sse2(), with the
 * call bracketed by kfpu_begin() and kfpu_end().
 */
static void
blake3_compress_in_place_sse2(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags)
{
	kfpu_begin();
	zfs_blake3_compress_in_place_sse2(cv, block, block_len,
	    counter, flags);
	kfpu_end();
}
62 
/*
 * ops->compress_xof for the "sse2" implementation: forwards all
 * arguments unchanged to zfs_blake3_compress_xof_sse2(), with the call
 * bracketed by kfpu_begin() and kfpu_end().
 */
static void
blake3_compress_xof_sse2(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64])
{
	kfpu_begin();
	zfs_blake3_compress_xof_sse2(cv, block, block_len,
	    counter, flags, out);
	kfpu_end();
}
71 
/*
 * ops->hash_many for the "sse2" implementation: forwards all arguments
 * unchanged to zfs_blake3_hash_many_sse2(), with the call bracketed by
 * kfpu_begin() and kfpu_end().
 */
static void
blake3_hash_many_sse2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
81 
blake3_is_sse2_supported(void)82 static boolean_t blake3_is_sse2_supported(void)
83 {
84 #if defined(__x86_64)
85 	return (kfpu_allowed() && zfs_sse2_available());
86 #elif defined(__PPC64__)
87 	return (kfpu_allowed() && zfs_vsx_available());
88 #else
89 	return (kfpu_allowed());
90 #endif
91 }
92 
93 const blake3_ops_t blake3_sse2_impl = {
94 	.compress_in_place = blake3_compress_in_place_sse2,
95 	.compress_xof = blake3_compress_xof_sse2,
96 	.hash_many = blake3_hash_many_sse2,
97 	.is_supported = blake3_is_sse2_supported,
98 	.degree = 4,
99 	.name = "sse2"
100 };
101 #endif
102 
103 #ifdef USE_SIMD
104 
105 extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
106     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
107     uint64_t counter, uint8_t flags);
108 
109 extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
110     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
111     uint64_t counter, uint8_t flags, uint8_t out[64]);
112 
113 extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
114     size_t num_inputs, size_t blocks, const uint32_t key[8],
115     uint64_t counter, boolean_t increment_counter, uint8_t flags,
116     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
117 
/*
 * ops->compress_in_place for the "sse41" implementation (also used by
 * the "avx2" ops table): forwards all arguments unchanged to
 * zfs_blake3_compress_in_place_sse41(), with the call bracketed by
 * kfpu_begin() and kfpu_end().
 */
static void
blake3_compress_in_place_sse41(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags)
{
	kfpu_begin();
	zfs_blake3_compress_in_place_sse41(cv, block, block_len,
	    counter, flags);
	kfpu_end();
}
126 
/*
 * ops->compress_xof for the "sse41" implementation (also used by the
 * "avx2" ops table): forwards all arguments unchanged to
 * zfs_blake3_compress_xof_sse41(), with the call bracketed by
 * kfpu_begin() and kfpu_end().
 */
static void
blake3_compress_xof_sse41(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64])
{
	kfpu_begin();
	zfs_blake3_compress_xof_sse41(cv, block, block_len,
	    counter, flags, out);
	kfpu_end();
}
135 
/*
 * ops->hash_many for the "sse41" implementation: forwards all arguments
 * unchanged to zfs_blake3_hash_many_sse41(), with the call bracketed by
 * kfpu_begin() and kfpu_end().
 */
static void
blake3_hash_many_sse41(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
145 
blake3_is_sse41_supported(void)146 static boolean_t blake3_is_sse41_supported(void)
147 {
148 #if defined(__x86_64)
149 	return (kfpu_allowed() && zfs_sse4_1_available());
150 #elif defined(__PPC64__)
151 	return (kfpu_allowed() && zfs_vsx_available());
152 #else
153 	return (kfpu_allowed());
154 #endif
155 }
156 
157 const blake3_ops_t blake3_sse41_impl = {
158 	.compress_in_place = blake3_compress_in_place_sse41,
159 	.compress_xof = blake3_compress_xof_sse41,
160 	.hash_many = blake3_hash_many_sse41,
161 	.is_supported = blake3_is_sse41_supported,
162 	.degree = 4,
163 	.name = "sse41"
164 };
165 #endif
166 
167 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
168 extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
169     size_t num_inputs, size_t blocks, const uint32_t key[8],
170     uint64_t counter, boolean_t increment_counter, uint8_t flags,
171     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
172 
/*
 * ops->hash_many for the "avx2" implementation: forwards all arguments
 * unchanged to zfs_blake3_hash_many_avx2(), with the call bracketed by
 * kfpu_begin() and kfpu_end().
 */
static void
blake3_hash_many_avx2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
182 
blake3_is_avx2_supported(void)183 static boolean_t blake3_is_avx2_supported(void)
184 {
185 	return (kfpu_allowed() && zfs_sse4_1_available() &&
186 	    zfs_avx2_available());
187 }
188 
189 const blake3_ops_t
190 blake3_avx2_impl = {
191 	.compress_in_place = blake3_compress_in_place_sse41,
192 	.compress_xof = blake3_compress_xof_sse41,
193 	.hash_many = blake3_hash_many_avx2,
194 	.is_supported = blake3_is_avx2_supported,
195 	.degree = 8,
196 	.name = "avx2"
197 };
198 #endif
199 
200 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
201 extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
202     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
203     uint64_t counter, uint8_t flags);
204 
205 extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
206     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
207     uint64_t counter, uint8_t flags, uint8_t out[64]);
208 
209 extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
210     size_t num_inputs, size_t blocks, const uint32_t key[8],
211     uint64_t counter, boolean_t increment_counter, uint8_t flags,
212     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
213 
/*
 * ops->compress_in_place for the "avx512" implementation: forwards all
 * arguments unchanged to zfs_blake3_compress_in_place_avx512(), with
 * the call bracketed by kfpu_begin() and kfpu_end().
 */
static void
blake3_compress_in_place_avx512(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags)
{
	kfpu_begin();
	zfs_blake3_compress_in_place_avx512(cv, block, block_len,
	    counter, flags);
	kfpu_end();
}
222 
/*
 * ops->compress_xof for the "avx512" implementation: forwards all
 * arguments unchanged to zfs_blake3_compress_xof_avx512(), with the
 * call bracketed by kfpu_begin() and kfpu_end().
 */
static void
blake3_compress_xof_avx512(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64])
{
	kfpu_begin();
	zfs_blake3_compress_xof_avx512(cv, block, block_len,
	    counter, flags, out);
	kfpu_end();
}
231 
/*
 * ops->hash_many for the "avx512" implementation: forwards all
 * arguments unchanged to zfs_blake3_hash_many_avx512(), with the call
 * bracketed by kfpu_begin() and kfpu_end().
 */
static void
blake3_hash_many_avx512(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
241 
blake3_is_avx512_supported(void)242 static boolean_t blake3_is_avx512_supported(void)
243 {
244 	return (kfpu_allowed() && zfs_avx512f_available() &&
245 	    zfs_avx512vl_available());
246 }
247 
248 const blake3_ops_t blake3_avx512_impl = {
249 	.compress_in_place = blake3_compress_in_place_avx512,
250 	.compress_xof = blake3_compress_xof_avx512,
251 	.hash_many = blake3_hash_many_avx512,
252 	.is_supported = blake3_is_avx512_supported,
253 	.degree = 16,
254 	.name = "avx512"
255 };
256 #endif
257 
/*
 * Ops table of the generic implementation; not defined in this file.
 * It is always the first entry of blake3_impls[].
 */
extern const blake3_ops_t blake3_generic_impl;
259 
/*
 * All implementations compiled into this build.  The generic one is
 * unconditional; each SIMD entry is present only when its preprocessor
 * condition holds for the target.  This array is passed to
 * generic_impl.c through the IMPL_ARRAY macro.
 */
static const blake3_ops_t *const blake3_impls[] = {
	&blake3_generic_impl,
#ifdef USE_SIMD
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE2)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
	&blake3_sse2_impl,
#endif
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
	&blake3_sse41_impl,
#endif
#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
	&blake3_avx2_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
	&blake3_avx512_impl,
#endif
#endif
};
281 
/*
 * Use the generic implementation functions: generic_impl.c is included
 * textually and parameterized by the IMPL_* / ZFS_IMPL_OPS macros
 * below, so they must be defined before the #include.
 * NOTE(review): the generic_impl_*() helpers, generic_impl_chosen,
 * generic_supp_impls[] and generic_supp_impls_cnt used later in this
 * file presumably come from that include -- confirm.
 */
#define	IMPL_NAME		"blake3"
#define	IMPL_OPS_T		blake3_ops_t
#define	IMPL_ARRAY		blake3_impls
#define	IMPL_GET_OPS		blake3_get_ops
#define	ZFS_IMPL_OPS		zfs_blake3_ops
#include <generic_impl.c>
289 
290 #ifdef _KERNEL
/*
 * Table of max_ncpus pointers, each to a BLAKE3_CTX-sized allocation.
 * Populated by blake3_per_cpu_ctx_init() and released by
 * blake3_per_cpu_ctx_fini().
 */
void **blake3_per_cpu_ctx;
292 
293 void
blake3_per_cpu_ctx_init(void)294 blake3_per_cpu_ctx_init(void)
295 {
296 	/*
297 	 * Create "The Godfather" ptr to hold all blake3 ctx
298 	 */
299 	blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP);
300 	for (int i = 0; i < max_ncpus; i++) {
301 		blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX),
302 		    KM_SLEEP);
303 	}
304 }
305 
306 void
blake3_per_cpu_ctx_fini(void)307 blake3_per_cpu_ctx_fini(void)
308 {
309 	for (int i = 0; i < max_ncpus; i++) {
310 		memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX));
311 		kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX));
312 	}
313 	memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *));
314 	kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *));
315 }
316 
/*
 * Pick the printf-style format for one entry of the implementation
 * list: the entry whose index equals the selected one is bracketed.
 */
#define	IMPL_FMT(impl, i)	(((impl) != (i)) ? "%s " : "[%s] ")
318 
319 #if defined(__linux__)
320 
321 static int
blake3_param_get(char * buffer,zfs_kernel_param_t * unused)322 blake3_param_get(char *buffer, zfs_kernel_param_t *unused)
323 {
324 	const uint32_t impl = IMPL_READ(generic_impl_chosen);
325 	char *fmt;
326 	int cnt = 0;
327 
328 	/* cycling */
329 	fmt = IMPL_FMT(impl, IMPL_CYCLE);
330 	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle");
331 
332 	/* list fastest */
333 	fmt = IMPL_FMT(impl, IMPL_FASTEST);
334 	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");
335 
336 	/* list all supported implementations */
337 	generic_impl_init();
338 	for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
339 		fmt = IMPL_FMT(impl, i);
340 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
341 		    blake3_impls[i]->name);
342 	}
343 
344 	return (cnt);
345 }
346 
347 static int
blake3_param_set(const char * val,zfs_kernel_param_t * unused)348 blake3_param_set(const char *val, zfs_kernel_param_t *unused)
349 {
350 	(void) unused;
351 	return (generic_impl_setname(val));
352 }
353 
354 #elif defined(__FreeBSD__)
355 
356 #include <sys/sbuf.h>
357 
358 static int
blake3_param(ZFS_MODULE_PARAM_ARGS)359 blake3_param(ZFS_MODULE_PARAM_ARGS)
360 {
361 	int err;
362 
363 	generic_impl_init();
364 	if (req->newptr == NULL) {
365 		const uint32_t impl = IMPL_READ(generic_impl_chosen);
366 		const int init_buflen = 64;
367 		const char *fmt;
368 		struct sbuf *s;
369 
370 		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
371 
372 		/* cycling */
373 		fmt = IMPL_FMT(impl, IMPL_CYCLE);
374 		(void) sbuf_printf(s, fmt, "cycle");
375 
376 		/* list fastest */
377 		fmt = IMPL_FMT(impl, IMPL_FASTEST);
378 		(void) sbuf_printf(s, fmt, "fastest");
379 
380 		/* list all supported implementations */
381 		for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
382 			fmt = IMPL_FMT(impl, i);
383 			(void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
384 		}
385 
386 		err = sbuf_finish(s);
387 		sbuf_delete(s);
388 
389 		return (err);
390 	}
391 
392 	char buf[16];
393 
394 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
395 	if (err) {
396 		return (err);
397 	}
398 
399 	return (-generic_impl_setname(buf));
400 }
401 #endif
402 
403 #undef IMPL_FMT
404 
405 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl,
406     blake3_param_set, blake3_param_get, ZMOD_RW, \
407 	"Select BLAKE3 implementation.");
408 #endif
409