1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> 24 */ 25 26 #include <sys/simd.h> 27 #include <sys/zfs_context.h> 28 #include <sys/zfs_impl.h> 29 #include <sys/blake3.h> 30 31 #include "blake3_impl.h" 32 33 #if defined(__aarch64__) || \ 34 (defined(__x86_64) && defined(HAVE_SSE2)) || \ 35 (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) 36 37 extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8], 38 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 39 uint64_t counter, uint8_t flags); 40 41 extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8], 42 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 43 uint64_t counter, uint8_t flags, uint8_t out[64]); 44 45 extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, 46 size_t num_inputs, size_t blocks, const uint32_t key[8], 47 uint64_t counter, boolean_t increment_counter, uint8_t flags, 48 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 49 50 static void blake3_compress_in_place_sse2(uint32_t cv[8], 51 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 52 uint64_t counter, uint8_t flags) { 53 kfpu_begin(); 54 zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, 55 flags); 56 kfpu_end(); 57 } 58 59 static void blake3_compress_xof_sse2(const uint32_t cv[8], 60 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 61 uint64_t counter, uint8_t flags, uint8_t out[64]) { 62 kfpu_begin(); 63 zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, 64 out); 65 kfpu_end(); 66 } 67 68 static void blake3_hash_many_sse2(const uint8_t * const *inputs, 69 size_t num_inputs, size_t blocks, const uint32_t key[8], 70 uint64_t counter, boolean_t increment_counter, uint8_t flags, 71 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 72 kfpu_begin(); 73 zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, 74 increment_counter, flags, flags_start, flags_end, out); 75 kfpu_end(); 76 } 77 78 static boolean_t blake3_is_sse2_supported(void) 79 { 80 #if defined(__x86_64) 81 return (kfpu_allowed() && zfs_sse2_available()); 82 #elif defined(__PPC64__) 83 return (kfpu_allowed() && zfs_vsx_available()); 84 #else 85 return (kfpu_allowed()); 86 #endif 87 } 88 89 const blake3_ops_t blake3_sse2_impl = { 90 .compress_in_place = blake3_compress_in_place_sse2, 91 .compress_xof = blake3_compress_xof_sse2, 92 .hash_many = blake3_hash_many_sse2, 93 .is_supported = blake3_is_sse2_supported, 94 .degree = 4, 95 .name = "sse2" 96 }; 97 #endif 98 99 #if defined(__aarch64__) || \ 100 (defined(__x86_64) && defined(HAVE_SSE2)) || \ 101 (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) 102 103 extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8], 104 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 105 uint64_t counter, uint8_t flags); 106 107 extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8], 108 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 109 uint64_t counter, uint8_t flags, uint8_t out[64]); 110 111 extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, 112 size_t num_inputs, size_t blocks, const uint32_t key[8], 113 uint64_t counter, boolean_t increment_counter, uint8_t flags, 114 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 115 116 static void blake3_compress_in_place_sse41(uint32_t cv[8], 117 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 118 uint64_t counter, uint8_t flags) { 119 kfpu_begin(); 120 zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, 121 flags); 122 kfpu_end(); 123 } 124 125 static void blake3_compress_xof_sse41(const uint32_t cv[8], 126 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 127 uint64_t counter, uint8_t flags, uint8_t out[64]) { 128 kfpu_begin(); 129 zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, 130 out); 131 kfpu_end(); 132 } 133 134 static void blake3_hash_many_sse41(const uint8_t * const *inputs, 135 size_t num_inputs, size_t blocks, const uint32_t key[8], 136 uint64_t counter, boolean_t increment_counter, uint8_t flags, 137 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 138 kfpu_begin(); 139 zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 140 increment_counter, flags, flags_start, flags_end, out); 141 kfpu_end(); 142 } 143 144 static boolean_t blake3_is_sse41_supported(void) 145 { 146 #if defined(__x86_64) 147 return (kfpu_allowed() && zfs_sse4_1_available()); 148 #elif defined(__PPC64__) 149 return (kfpu_allowed() && zfs_vsx_available()); 150 #else 151 return (kfpu_allowed()); 152 #endif 153 } 154 155 const blake3_ops_t blake3_sse41_impl = { 156 .compress_in_place = blake3_compress_in_place_sse41, 157 .compress_xof = blake3_compress_xof_sse41, 158 .hash_many = blake3_hash_many_sse41, 159 .is_supported = blake3_is_sse41_supported, 160 .degree = 4, 161 .name = "sse41" 162 }; 163 #endif 164 165 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) 166 extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, 167 size_t num_inputs, size_t blocks, const uint32_t key[8], 168 uint64_t counter, boolean_t increment_counter, uint8_t flags, 169 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 170 171 static void blake3_hash_many_avx2(const uint8_t * const *inputs, 172 size_t num_inputs, size_t blocks, const uint32_t key[8], 173 uint64_t counter, boolean_t increment_counter, uint8_t flags, 174 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 175 kfpu_begin(); 176 zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, 177 increment_counter, flags, flags_start, flags_end, out); 178 kfpu_end(); 179 } 180 181 static boolean_t blake3_is_avx2_supported(void) 182 { 183 return (kfpu_allowed() && zfs_sse4_1_available() && 184 zfs_avx2_available()); 185 } 186 187 const blake3_ops_t 188 blake3_avx2_impl = { 189 .compress_in_place = blake3_compress_in_place_sse41, 190 .compress_xof = blake3_compress_xof_sse41, 191 .hash_many = blake3_hash_many_avx2, 192 .is_supported = blake3_is_avx2_supported, 193 .degree = 8, 194 .name = "avx2" 195 }; 196 #endif 197 198 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) 199 extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8], 200 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 201 uint64_t counter, uint8_t flags); 202 203 extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8], 204 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 205 uint64_t counter, uint8_t flags, uint8_t out[64]); 206 207 extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, 208 size_t num_inputs, size_t blocks, const uint32_t key[8], 209 uint64_t counter, boolean_t increment_counter, uint8_t flags, 210 uint8_t flags_start, uint8_t flags_end, uint8_t *out); 211 212 static void blake3_compress_in_place_avx512(uint32_t cv[8], 213 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 214 uint64_t counter, uint8_t flags) { 215 kfpu_begin(); 216 zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, 217 flags); 218 kfpu_end(); 219 } 220 221 static void blake3_compress_xof_avx512(const uint32_t cv[8], 222 const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, 223 uint64_t counter, uint8_t flags, uint8_t out[64]) { 224 kfpu_begin(); 225 zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, 226 out); 227 kfpu_end(); 228 } 229 230 static void blake3_hash_many_avx512(const uint8_t * const *inputs, 231 size_t num_inputs, size_t blocks, const uint32_t key[8], 232 uint64_t counter, boolean_t increment_counter, uint8_t flags, 233 uint8_t flags_start, uint8_t flags_end, uint8_t *out) { 234 kfpu_begin(); 235 zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, 236 increment_counter, flags, flags_start, flags_end, out); 237 kfpu_end(); 238 } 239 240 static boolean_t blake3_is_avx512_supported(void) 241 { 242 return (kfpu_allowed() && zfs_avx512f_available() && 243 zfs_avx512vl_available()); 244 } 245 246 const blake3_ops_t blake3_avx512_impl = { 247 .compress_in_place = blake3_compress_in_place_avx512, 248 .compress_xof = blake3_compress_xof_avx512, 249 .hash_many = blake3_hash_many_avx512, 250 .is_supported = blake3_is_avx512_supported, 251 .degree = 16, 252 .name = "avx512" 253 }; 254 #endif 255 256 extern const blake3_ops_t blake3_generic_impl; 257 258 static const blake3_ops_t *const blake3_impls[] = { 259 &blake3_generic_impl, 260 #if defined(__aarch64__) || \ 261 (defined(__x86_64) && defined(HAVE_SSE2)) || \ 262 (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) 263 &blake3_sse2_impl, 264 #endif 265 #if defined(__aarch64__) || \ 266 (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ 267 (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) 268 &blake3_sse41_impl, 269 #endif 270 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) 271 &blake3_avx2_impl, 272 #endif 273 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) 274 &blake3_avx512_impl, 275 #endif 276 }; 277 278 /* use the generic implementation functions */ 279 #define IMPL_NAME "blake3" 280 #define IMPL_OPS_T blake3_ops_t 281 #define IMPL_ARRAY blake3_impls 282 #define IMPL_GET_OPS blake3_get_ops 283 #define ZFS_IMPL_OPS zfs_blake3_ops 284 #include <generic_impl.c> 285 286 #ifdef _KERNEL 287 void **blake3_per_cpu_ctx; 288 289 void 290 blake3_per_cpu_ctx_init(void) 291 { 292 /* 293 * Create "The Godfather" ptr to hold all blake3 ctx 294 */ 295 blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); 296 for (int i = 0; i < max_ncpus; i++) { 297 blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX), 298 KM_SLEEP); 299 } 300 } 301 302 void 303 blake3_per_cpu_ctx_fini(void) 304 { 305 for (int i = 0; i < max_ncpus; i++) { 306 memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX)); 307 kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX)); 308 } 309 memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *)); 310 kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *)); 311 } 312 313 #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") 314 315 #if defined(__linux__) 316 317 static int 318 blake3_param_get(char *buffer, zfs_kernel_param_t *unused) 319 { 320 const uint32_t impl = IMPL_READ(generic_impl_chosen); 321 char *fmt; 322 int cnt = 0; 323 324 /* cycling */ 325 fmt = IMPL_FMT(impl, IMPL_CYCLE); 326 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle"); 327 328 /* list fastest */ 329 fmt = IMPL_FMT(impl, IMPL_FASTEST); 330 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest"); 331 332 /* list all supported implementations */ 333 generic_impl_init(); 334 for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { 335 fmt = IMPL_FMT(impl, i); 336 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, 337 blake3_impls[i]->name); 338 } 339 340 return (cnt); 341 } 342 343 static int 344 blake3_param_set(const char *val, zfs_kernel_param_t *unused) 345 { 346 (void) unused; 347 return (generic_impl_setname(val)); 348 } 349 350 #elif defined(__FreeBSD__) 351 352 #include <sys/sbuf.h> 353 354 static int 355 blake3_param(ZFS_MODULE_PARAM_ARGS) 356 { 357 int err; 358 359 generic_impl_init(); 360 if (req->newptr == NULL) { 361 const uint32_t impl = IMPL_READ(generic_impl_chosen); 362 const int init_buflen = 64; 363 const char *fmt; 364 struct sbuf *s; 365 366 s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); 367 368 /* cycling */ 369 fmt = IMPL_FMT(impl, IMPL_CYCLE); 370 (void) sbuf_printf(s, fmt, "cycle"); 371 372 /* list fastest */ 373 fmt = IMPL_FMT(impl, IMPL_FASTEST); 374 (void) sbuf_printf(s, fmt, "fastest"); 375 376 /* list all supported implementations */ 377 for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { 378 fmt = IMPL_FMT(impl, i); 379 (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); 380 } 381 382 err = sbuf_finish(s); 383 sbuf_delete(s); 384 385 return (err); 386 } 387 388 char buf[16]; 389 390 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 391 if (err) { 392 return (err); 393 } 394 395 return (-generic_impl_setname(buf)); 396 } 397 #endif 398 399 #undef IMPL_FMT 400 401 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl, 402 blake3_param_set, blake3_param_get, ZMOD_RW, \ 403 "Select BLAKE3 implementation."); 404 #endif 405