1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> 25 */ 26 27 #include <sys/simd.h> 28 #include <sys/zfs_context.h> 29 #include <sys/zfs_impl.h> 30 #include <sys/blake3.h> 31 32 #include "blake3_impl.h" 33 34 #if !defined(OMIT_SIMD) && (defined(__aarch64__) || \ 35 (defined(__x86_64) && defined(HAVE_SSE2)) || \ 36 (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))) 37 #define USE_SIMD 38 #endif 39 40 #ifdef USE_SIMD 41 extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8], 42 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 43 uint64_t counter, uint8_t flags); 44 45 extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8], 46 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 47 uint64_t counter, uint8_t flags, uint8_t out[64]); 48 49 extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, 50 size_t num_inputs, size_t blocks, const uint32_t key[8], 51 uint64_t counter, boolean_t increment_counter, uint8_t flags, 52 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 53 54 static void blake3_compress_in_place_sse2(uint32_t cv[8], 55 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 56 uint64_t counter, uint8_t flags) { 57 kfpu_begin(); 58 zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, 59 flags); 60 kfpu_end(); 61 } 62 63 static void blake3_compress_xof_sse2(const uint32_t cv[8], 64 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 65 uint64_t counter, uint8_t flags, uint8_t out[64]) { 66 kfpu_begin(); 67 zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, 68 out); 69 kfpu_end(); 70 } 71 72 static void blake3_hash_many_sse2(const uint8_t * const *inputs, 73 size_t num_inputs, size_t blocks, const uint32_t key[8], 74 uint64_t counter, boolean_t increment_counter, uint8_t flags, 75 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 76 kfpu_begin(); 77 zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, 78 increment_counter, flags, flags_start, flags_end, out); 79 kfpu_end(); 80 } 81 82 static boolean_t blake3_is_sse2_supported(void) 83 { 84 #if defined(__x86_64) 85 return (kfpu_allowed() && zfs_sse2_available()); 86 #elif defined(__PPC64__) 87 return (kfpu_allowed() && zfs_vsx_available()); 88 #else 89 return (kfpu_allowed()); 90 #endif 91 } 92 93 const blake3_ops_t blake3_sse2_impl = { 94 .compress_in_place = blake3_compress_in_place_sse2, 95 .compress_xof = blake3_compress_xof_sse2, 96 .hash_many = blake3_hash_many_sse2, 97 .is_supported = blake3_is_sse2_supported, 98 .degree = 4, 99 .name = "sse2" 100 }; 101 #endif 102 103 #ifdef USE_SIMD 104 105 extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8], 106 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 107 uint64_t counter, uint8_t flags); 108 109 extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8], 110 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 111 uint64_t counter, uint8_t flags, uint8_t out[64]); 112 113 extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, 114 size_t num_inputs, size_t blocks, const uint32_t key[8], 115 uint64_t counter, boolean_t increment_counter, uint8_t flags, 116 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 117 118 static void blake3_compress_in_place_sse41(uint32_t cv[8], 119 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 120 uint64_t counter, uint8_t flags) { 121 kfpu_begin(); 122 zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, 123 flags); 124 kfpu_end(); 125 } 126 127 static void blake3_compress_xof_sse41(const uint32_t cv[8], 128 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 129 uint64_t counter, uint8_t flags, uint8_t out[64]) { 130 kfpu_begin(); 131 zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, 132 out); 133 kfpu_end(); 134 } 135 136 static void blake3_hash_many_sse41(const uint8_t * const *inputs, 137 size_t num_inputs, size_t blocks, const uint32_t key[8], 138 uint64_t counter, boolean_t increment_counter, uint8_t flags, 139 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 140 kfpu_begin(); 141 zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 142 increment_counter, flags, flags_start, flags_end, out); 143 kfpu_end(); 144 } 145 146 static boolean_t blake3_is_sse41_supported(void) 147 { 148 #if defined(__x86_64) 149 return (kfpu_allowed() && zfs_sse4_1_available()); 150 #elif defined(__PPC64__) 151 return (kfpu_allowed() && zfs_vsx_available()); 152 #else 153 return (kfpu_allowed()); 154 #endif 155 } 156 157 const blake3_ops_t blake3_sse41_impl = { 158 .compress_in_place = blake3_compress_in_place_sse41, 159 .compress_xof = blake3_compress_xof_sse41, 160 .hash_many = blake3_hash_many_sse41, 161 .is_supported = blake3_is_sse41_supported, 162 .degree = 4, 163 .name = "sse41" 164 }; 165 #endif 166 167 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) 168 extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, 169 size_t num_inputs, size_t blocks, const uint32_t key[8], 170 uint64_t counter, boolean_t increment_counter, uint8_t flags, 171 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 172 173 static void blake3_hash_many_avx2(const uint8_t * const *inputs, 174 size_t num_inputs, size_t blocks, const uint32_t key[8], 175 uint64_t counter, boolean_t increment_counter, uint8_t flags, 176 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 177 kfpu_begin(); 178 zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, 179 increment_counter, flags, flags_start, flags_end, out); 180 kfpu_end(); 181 } 182 183 static boolean_t blake3_is_avx2_supported(void) 184 { 185 return (kfpu_allowed() && zfs_sse4_1_available() && 186 zfs_avx2_available()); 187 } 188 189 const blake3_ops_t 190 blake3_avx2_impl = { 191 .compress_in_place = blake3_compress_in_place_sse41, 192 .compress_xof = blake3_compress_xof_sse41, 193 .hash_many = blake3_hash_many_avx2, 194 .is_supported = blake3_is_avx2_supported, 195 .degree = 8, 196 .name = "avx2" 197 }; 198 #endif 199 200 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) 201 extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8], 202 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 203 uint64_t counter, uint8_t flags); 204 205 extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8], 206 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 207 uint64_t counter, uint8_t flags, uint8_t out[64]); 208 209 extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, 210 size_t num_inputs, size_t blocks, const uint32_t key[8], 211 uint64_t counter, boolean_t increment_counter, uint8_t flags, 212 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 213 214 static void blake3_compress_in_place_avx512(uint32_t cv[8], 215 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 216 uint64_t counter, uint8_t flags) { 217 kfpu_begin(); 218 zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, 219 flags); 220 kfpu_end(); 221 } 222 223 static void blake3_compress_xof_avx512(const uint32_t cv[8], 224 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 225 uint64_t counter, uint8_t flags, uint8_t out[64]) { 226 kfpu_begin(); 227 zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, 228 out); 229 kfpu_end(); 230 } 231 232 static void blake3_hash_many_avx512(const uint8_t * const *inputs, 233 size_t num_inputs, size_t blocks, const uint32_t key[8], 234 uint64_t counter, boolean_t increment_counter, uint8_t flags, 235 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 236 kfpu_begin(); 237 zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, 238 increment_counter, flags, flags_start, flags_end, out); 239 kfpu_end(); 240 } 241 242 static boolean_t blake3_is_avx512_supported(void) 243 { 244 return (kfpu_allowed() && zfs_avx512f_available() && 245 zfs_avx512vl_available()); 246 } 247 248 const blake3_ops_t blake3_avx512_impl = { 249 .compress_in_place = blake3_compress_in_place_avx512, 250 .compress_xof = blake3_compress_xof_avx512, 251 .hash_many = blake3_hash_many_avx512, 252 .is_supported = blake3_is_avx512_supported, 253 .degree = 16, 254 .name = "avx512" 255 }; 256 #endif 257 258 extern const blake3_ops_t blake3_generic_impl; 259 260 static const blake3_ops_t *const blake3_impls[] = { 261 &blake3_generic_impl, 262 #ifdef USE_SIMD 263 #if defined(__aarch64__) || \ 264 (defined(__x86_64) && defined(HAVE_SSE2)) || \ 265 (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) 266 &blake3_sse2_impl, 267 #endif 268 #if defined(__aarch64__) || \ 269 (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ 270 (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) 271 &blake3_sse41_impl, 272 #endif 273 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) 274 &blake3_avx2_impl, 275 #endif 276 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) 277 &blake3_avx512_impl, 278 #endif 279 #endif 280 }; 281 282 /* use the generic implementation functions */ 283 #define IMPL_NAME "blake3" 284 #define IMPL_OPS_T blake3_ops_t 285 #define IMPL_ARRAY blake3_impls 286 #define IMPL_GET_OPS blake3_get_ops 287 #define ZFS_IMPL_OPS zfs_blake3_ops 288 #include <generic_impl.c> 289 290 #ifdef _KERNEL 291 void **blake3_per_cpu_ctx; 292 293 void 294 blake3_per_cpu_ctx_init(void) 295 { 296 /* 297 * Create "The Godfather" ptr to hold all blake3 ctx 298 */ 299 blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); 300 for (int i = 0; i < max_ncpus; i++) { 301 blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX), 302 KM_SLEEP); 303 } 304 } 305 306 void 307 blake3_per_cpu_ctx_fini(void) 308 { 309 for (int i = 0; i < max_ncpus; i++) { 310 memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX)); 311 kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX)); 312 } 313 memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *)); 314 kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *)); 315 } 316 317 #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") 318 319 #if defined(__linux__) 320 321 static int 322 blake3_param_get(char *buffer, zfs_kernel_param_t *unused) 323 { 324 const uint32_t impl = IMPL_READ(generic_impl_chosen); 325 char *fmt; 326 int cnt = 0; 327 328 /* cycling */ 329 fmt = IMPL_FMT(impl, IMPL_CYCLE); 330 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle"); 331 332 /* list fastest */ 333 fmt = IMPL_FMT(impl, IMPL_FASTEST); 334 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest"); 335 336 /* list all supported implementations */ 337 generic_impl_init(); 338 for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { 339 fmt = IMPL_FMT(impl, i); 340 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, 341 blake3_impls[i]->name); 342 } 343 344 return (cnt); 345 } 346 347 static int 348 blake3_param_set(const char *val, zfs_kernel_param_t *unused) 349 { 350 (void) unused; 351 return (generic_impl_setname(val)); 352 } 353 354 #elif defined(__FreeBSD__) 355 356 #include <sys/sbuf.h> 357 358 static int 359 blake3_param(ZFS_MODULE_PARAM_ARGS) 360 { 361 int err; 362 363 generic_impl_init(); 364 if (req->newptr == NULL) { 365 const uint32_t impl = IMPL_READ(generic_impl_chosen); 366 const int init_buflen = 64; 367 const char *fmt; 368 struct sbuf *s; 369 370 s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); 371 372 /* cycling */ 373 fmt = IMPL_FMT(impl, IMPL_CYCLE); 374 (void) sbuf_printf(s, fmt, "cycle"); 375 376 /* list fastest */ 377 fmt = IMPL_FMT(impl, IMPL_FASTEST); 378 (void) sbuf_printf(s, fmt, "fastest"); 379 380 /* list all supported implementations */ 381 for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { 382 fmt = IMPL_FMT(impl, i); 383 (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); 384 } 385 386 err = sbuf_finish(s); 387 sbuf_delete(s); 388 389 return (err); 390 } 391 392 char buf[16]; 393 394 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 395 if (err) { 396 return (err); 397 } 398 399 return (-generic_impl_setname(buf)); 400 } 401 #endif 402 403 #undef IMPL_FMT 404 405 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl, 406 blake3_param_set, blake3_param_get, ZMOD_RW, \ 407 "Select BLAKE3 implementation."); 408 #endif 409