1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
24 */
25
26 #include <sys/simd.h>
27 #include <sys/zfs_context.h>
28 #include <sys/zfs_impl.h>
29 #include <sys/blake3.h>
30
31 #include "blake3_impl.h"
32
/*
 * USE_SIMD is defined unless OMIT_SIMD is set, and only when the target is
 * aarch64, x86_64 built with HAVE_SSE2, or little-endian PPC64.
 * It gates the "sse2" and "sse41" implementations defined in this file.
 */
#if !defined(OMIT_SIMD)
#if defined(__aarch64__) || \
    (defined(__x86_64) && defined(HAVE_SSE2)) || \
    (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
#define	USE_SIMD
#endif
#endif
38
39 #ifdef USE_SIMD
40 extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
41 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
42 uint64_t counter, uint8_t flags);
43
44 extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
45 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
46 uint64_t counter, uint8_t flags, uint8_t out[64]);
47
48 extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
49 size_t num_inputs, size_t blocks, const uint32_t key[8],
50 uint64_t counter, boolean_t increment_counter, uint8_t flags,
51 uint8_t flags_start, uint8_t flags_end, uint8_t *out);
52
/*
 * compress_in_place entry of the "sse2" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_compress_in_place_sse2()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_compress_in_place_sse2(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags)
{
	kfpu_begin();
	zfs_blake3_compress_in_place_sse2(cv, block, block_len,
	    counter, flags);
	kfpu_end();
}
61
/*
 * compress_xof entry of the "sse2" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_compress_xof_sse2()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_compress_xof_sse2(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64])
{
	kfpu_begin();
	zfs_blake3_compress_xof_sse2(cv, block, block_len,
	    counter, flags, out);
	kfpu_end();
}
70
/*
 * hash_many entry of the "sse2" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_hash_many_sse2()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_hash_many_sse2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
80
blake3_is_sse2_supported(void)81 static boolean_t blake3_is_sse2_supported(void)
82 {
83 #if defined(__x86_64)
84 return (kfpu_allowed() && zfs_sse2_available());
85 #elif defined(__PPC64__)
86 return (kfpu_allowed() && zfs_vsx_available());
87 #else
88 return (kfpu_allowed());
89 #endif
90 }
91
92 const blake3_ops_t blake3_sse2_impl = {
93 .compress_in_place = blake3_compress_in_place_sse2,
94 .compress_xof = blake3_compress_xof_sse2,
95 .hash_many = blake3_hash_many_sse2,
96 .is_supported = blake3_is_sse2_supported,
97 .degree = 4,
98 .name = "sse2"
99 };
100 #endif
101
102 #ifdef USE_SIMD
103
104 extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
105 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
106 uint64_t counter, uint8_t flags);
107
108 extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
109 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
110 uint64_t counter, uint8_t flags, uint8_t out[64]);
111
112 extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
113 size_t num_inputs, size_t blocks, const uint32_t key[8],
114 uint64_t counter, boolean_t increment_counter, uint8_t flags,
115 uint8_t flags_start, uint8_t flags_end, uint8_t *out);
116
/*
 * compress_in_place entry of the "sse41" implementation (also reused by
 * the "avx2" operations table in this file).
 *
 * Forwards every argument unchanged to zfs_blake3_compress_in_place_sse41()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_compress_in_place_sse41(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags)
{
	kfpu_begin();
	zfs_blake3_compress_in_place_sse41(cv, block, block_len,
	    counter, flags);
	kfpu_end();
}
125
/*
 * compress_xof entry of the "sse41" implementation (also reused by the
 * "avx2" operations table in this file).
 *
 * Forwards every argument unchanged to zfs_blake3_compress_xof_sse41()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_compress_xof_sse41(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64])
{
	kfpu_begin();
	zfs_blake3_compress_xof_sse41(cv, block, block_len,
	    counter, flags, out);
	kfpu_end();
}
134
/*
 * hash_many entry of the "sse41" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_hash_many_sse41()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_hash_many_sse41(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
144
blake3_is_sse41_supported(void)145 static boolean_t blake3_is_sse41_supported(void)
146 {
147 #if defined(__x86_64)
148 return (kfpu_allowed() && zfs_sse4_1_available());
149 #elif defined(__PPC64__)
150 return (kfpu_allowed() && zfs_vsx_available());
151 #else
152 return (kfpu_allowed());
153 #endif
154 }
155
156 const blake3_ops_t blake3_sse41_impl = {
157 .compress_in_place = blake3_compress_in_place_sse41,
158 .compress_xof = blake3_compress_xof_sse41,
159 .hash_many = blake3_hash_many_sse41,
160 .is_supported = blake3_is_sse41_supported,
161 .degree = 4,
162 .name = "sse41"
163 };
164 #endif
165
166 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
167 extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
168 size_t num_inputs, size_t blocks, const uint32_t key[8],
169 uint64_t counter, boolean_t increment_counter, uint8_t flags,
170 uint8_t flags_start, uint8_t flags_end, uint8_t *out);
171
/*
 * hash_many entry of the "avx2" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_hash_many_avx2()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_hash_many_avx2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
181
blake3_is_avx2_supported(void)182 static boolean_t blake3_is_avx2_supported(void)
183 {
184 return (kfpu_allowed() && zfs_sse4_1_available() &&
185 zfs_avx2_available());
186 }
187
188 const blake3_ops_t
189 blake3_avx2_impl = {
190 .compress_in_place = blake3_compress_in_place_sse41,
191 .compress_xof = blake3_compress_xof_sse41,
192 .hash_many = blake3_hash_many_avx2,
193 .is_supported = blake3_is_avx2_supported,
194 .degree = 8,
195 .name = "avx2"
196 };
197 #endif
198
199 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
200 extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
201 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
202 uint64_t counter, uint8_t flags);
203
204 extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
205 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
206 uint64_t counter, uint8_t flags, uint8_t out[64]);
207
208 extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
209 size_t num_inputs, size_t blocks, const uint32_t key[8],
210 uint64_t counter, boolean_t increment_counter, uint8_t flags,
211 uint8_t flags_start, uint8_t flags_end, uint8_t *out);
212
/*
 * compress_in_place entry of the "avx512" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_compress_in_place_avx512()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_compress_in_place_avx512(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags)
{
	kfpu_begin();
	zfs_blake3_compress_in_place_avx512(cv, block, block_len,
	    counter, flags);
	kfpu_end();
}
221
/*
 * compress_xof entry of the "avx512" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_compress_xof_avx512()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_compress_xof_avx512(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64])
{
	kfpu_begin();
	zfs_blake3_compress_xof_avx512(cv, block, block_len,
	    counter, flags, out);
	kfpu_end();
}
230
/*
 * hash_many entry of the "avx512" implementation.
 *
 * Forwards every argument unchanged to zfs_blake3_hash_many_avx512()
 * and brackets that call with kfpu_begin()/kfpu_end().
 */
static void
blake3_hash_many_avx512(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key,
	    counter, increment_counter, flags, flags_start, flags_end,
	    out);
	kfpu_end();
}
240
blake3_is_avx512_supported(void)241 static boolean_t blake3_is_avx512_supported(void)
242 {
243 return (kfpu_allowed() && zfs_avx512f_available() &&
244 zfs_avx512vl_available());
245 }
246
247 const blake3_ops_t blake3_avx512_impl = {
248 .compress_in_place = blake3_compress_in_place_avx512,
249 .compress_xof = blake3_compress_xof_avx512,
250 .hash_many = blake3_hash_many_avx512,
251 .is_supported = blake3_is_avx512_supported,
252 .degree = 16,
253 .name = "avx512"
254 };
255 #endif
256
/* The generic implementation is defined in another translation unit. */
extern const blake3_ops_t blake3_generic_impl;

/*
 * All implementations compiled into this build. The generic one always
 * comes first; the SIMD ones follow in the order sse2, sse41, avx2, avx512,
 * each guarded by the compile-time conditions below.
 *
 * NOTE(review): this is the compiled-in list, not the list of
 * implementations supported by the running CPU; the latter appears to be
 * kept by generic_impl.c (generic_supp_impls) -- confirm there.
 */
static const blake3_ops_t *const blake3_impls[] = {
	&blake3_generic_impl,
#ifdef USE_SIMD
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE2)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
	&blake3_sse2_impl,
#endif
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
	&blake3_sse41_impl,
#endif
#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
	&blake3_avx2_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
	&blake3_avx512_impl,
#endif
#endif
};
280
/*
 * Use the generic implementation functions.
 *
 * The macros below are set before including generic_impl.c, which is
 * included as source rather than linked. NOTE(review): that file presumably
 * consumes these macros and provides the generic_impl_*() helpers,
 * generic_supp_impls[] and IMPL_READ() used later in this file -- confirm
 * against generic_impl.c.
 */
#define	IMPL_NAME		"blake3"
#define	IMPL_OPS_T		blake3_ops_t
#define	IMPL_ARRAY		blake3_impls
#define	IMPL_GET_OPS		blake3_get_ops
#define	ZFS_IMPL_OPS		zfs_blake3_ops
#include <generic_impl.c>
288
289 #ifdef _KERNEL
/*
 * Table of max_ncpus pointers, each to a separately allocated BLAKE3_CTX.
 * Allocated by blake3_per_cpu_ctx_init() and released by
 * blake3_per_cpu_ctx_fini().
 */
void **blake3_per_cpu_ctx;
291
292 void
blake3_per_cpu_ctx_init(void)293 blake3_per_cpu_ctx_init(void)
294 {
295 /*
296 * Create "The Godfather" ptr to hold all blake3 ctx
297 */
298 blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP);
299 for (int i = 0; i < max_ncpus; i++) {
300 blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX),
301 KM_SLEEP);
302 }
303 }
304
305 void
blake3_per_cpu_ctx_fini(void)306 blake3_per_cpu_ctx_fini(void)
307 {
308 for (int i = 0; i < max_ncpus; i++) {
309 memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX));
310 kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX));
311 }
312 memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *));
313 kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *));
314 }
315
/*
 * Pick the printf format for one entry of the implementation list:
 * "[%s] " when index `i` is the selected implementation `impl`,
 * plain "%s " otherwise.
 */
#define	IMPL_FMT(impl, i)	(((impl) == (i)) ? "[%s] " : "%s ")
317
318 #if defined(__linux__)
319
320 static int
blake3_param_get(char * buffer,zfs_kernel_param_t * unused)321 blake3_param_get(char *buffer, zfs_kernel_param_t *unused)
322 {
323 const uint32_t impl = IMPL_READ(generic_impl_chosen);
324 char *fmt;
325 int cnt = 0;
326
327 /* cycling */
328 fmt = IMPL_FMT(impl, IMPL_CYCLE);
329 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle");
330
331 /* list fastest */
332 fmt = IMPL_FMT(impl, IMPL_FASTEST);
333 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");
334
335 /* list all supported implementations */
336 generic_impl_init();
337 for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
338 fmt = IMPL_FMT(impl, i);
339 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
340 blake3_impls[i]->name);
341 }
342
343 return (cnt);
344 }
345
346 static int
blake3_param_set(const char * val,zfs_kernel_param_t * unused)347 blake3_param_set(const char *val, zfs_kernel_param_t *unused)
348 {
349 (void) unused;
350 return (generic_impl_setname(val));
351 }
352
353 #elif defined(__FreeBSD__)
354
355 #include <sys/sbuf.h>
356
357 static int
blake3_param(ZFS_MODULE_PARAM_ARGS)358 blake3_param(ZFS_MODULE_PARAM_ARGS)
359 {
360 int err;
361
362 generic_impl_init();
363 if (req->newptr == NULL) {
364 const uint32_t impl = IMPL_READ(generic_impl_chosen);
365 const int init_buflen = 64;
366 const char *fmt;
367 struct sbuf *s;
368
369 s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
370
371 /* cycling */
372 fmt = IMPL_FMT(impl, IMPL_CYCLE);
373 (void) sbuf_printf(s, fmt, "cycle");
374
375 /* list fastest */
376 fmt = IMPL_FMT(impl, IMPL_FASTEST);
377 (void) sbuf_printf(s, fmt, "fastest");
378
379 /* list all supported implementations */
380 for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
381 fmt = IMPL_FMT(impl, i);
382 (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
383 }
384
385 err = sbuf_finish(s);
386 sbuf_delete(s);
387
388 return (err);
389 }
390
391 char buf[16];
392
393 err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
394 if (err) {
395 return (err);
396 }
397
398 return (-generic_impl_setname(buf));
399 }
400 #endif
401
402 #undef IMPL_FMT
403
404 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl,
405 blake3_param_set, blake3_param_get, ZMOD_RW, \
406 "Select BLAKE3 implementation.");
407 #endif
408