/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Fletcher Checksums
 * ------------------
 *
 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
 * recurrence relations:
 *
 *	a_i = a_{i-1} + f_{i-1}
 *	b_i = b_{i-1} + a_i
 *	c_i = c_{i-1} + b_i		(fletcher-4 only)
 *	d_i = d_{i-1} + c_i		(fletcher-4 only)
 *
 * Where
 *	a_0 = b_0 = c_0 = d_0 = 0
 * and
 *	f_0 .. f_(n-1) are the input data.
 *
 * Using standard techniques, these translate into the following series:
 *
 *	a_n = SUM[i=1..n] f_{n-i}
 *	b_n = SUM[i=1..n] i * f_{n-i}
 *	c_n = SUM[i=1..n] (i*(i+1)/2) * f_{n-i}
 *	d_n = SUM[i=1..n] (i*(i+1)*(i+2)/6) * f_{n-i}
 *
 * For fletcher-2, the f_i's are 64-bit, and [ab]_i are 64-bit accumulators.
 * Since the additions are done mod (2^64), errors in the high bits may not
 * be noticed.  For this reason, fletcher-2 is deprecated.
 *
 * For fletcher-4, the f_i's are 32-bit, and [abcd]_i are 64-bit accumulators.
 * A conservative estimate of how big the buffer can get before we overflow
 * can be made using f_i = 0xffffffff for all i:
 *
 * % bc
 *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
 * 2264
 *  quit
 * %
 *
 * So blocks of up to 2k will not overflow.  Our largest block size is
 * 128k, which has 32k 4-byte words, so we can compute the largest possible
 * accumulators, then divide by 2^64 to figure the max amount of overflow:
 *
 * % bc
 *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
 *  a/2^64;b/2^64;c/2^64;d/2^64
 * 0
 * 0
 * 1365
 * 11186858
 *  quit
 * %
 *
 * So a and b cannot overflow.  To make sure each bit of input has some
 * effect on the contents of c and d, we can look at what the factors of
 * the coefficients in the equations for c_n and d_n are.  The number of 2s
 * in the factors determines the lowest set bit in the multiplier.  Running
 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 * even for 128k blocks.
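 *
 * As a small illustrative check of the series above, take the two-word input
 * f = {1, 2}.  The recurrence gives a=1, b=1, c=1, d=1 after the first word
 * and a=3, b=4, c=5, d=6 after the second, which matches the series:
 * a_2 = f_0 + f_1 = 3, b_2 = 1*f_1 + 2*f_0 = 4, c_2 = 1*f_1 + 3*f_0 = 5 and
 * d_2 = 1*f_1 + 4*f_0 = 6.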
 *
 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 * we could do our calculations mod (2^32 - 1) by adding in the carries
 * periodically, and store the number of carries in the top 32-bits.
 *
 * --------------------
 * Checksum Performance
 * --------------------
 *
 * There are two interesting components to checksum performance: cached and
 * uncached performance.  With cached data, fletcher-2 is about four times
 * faster than fletcher-4.  With uncached data, the performance difference is
 * negligible, since the cost of a cache fill dominates the processing time.
 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 * efficient pass over the data.
 *
 * In normal operation, the data which is being checksummed is in a buffer
 * which has been filled either by:
 *
 *	1. a compression step, which will be mostly cached, or
 *	2. a bcopy() or copyin(), which will be uncached (because the
 *	   copy is cache-bypassing).
 *
 * For both cached and uncached data, both fletcher checksums are much faster
 * than sha-256, and slower than 'off', which doesn't touch the data at all.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <sys/spa.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>

#define	FLETCHER_MIN_SIMD_SIZE	64

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/disp.h>
#define	KPREEMPT_DISABLE	kpreempt_disable()
#define	KPREEMPT_ENABLE		kpreempt_enable()
#define	MEMBAR_PRODUCER		membar_producer()

#else /* _KERNEL */

#include <atomic.h>
#include <string.h>
#ifndef SET_ERROR
#define	SET_ERROR(err)	(err)
#endif
#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE
#define	MEMBAR_PRODUCER

#endif /* _KERNEL */

static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
    const void *buf, size_t size);
static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, size_t size);
static boolean_t fletcher_4_scalar_valid(void);

static const fletcher_4_ops_t fletcher_4_scalar_ops = {
	.init_native = fletcher_4_scalar_init,
	.fini_native = fletcher_4_scalar_fini,
	.compute_native = fletcher_4_scalar_native,
	.init_byteswap = fletcher_4_scalar_init,
	.fini_byteswap = fletcher_4_scalar_fini,
	.compute_byteswap = fletcher_4_scalar_byteswap,
	.valid = fletcher_4_scalar_valid,
	.uses_fpu_native = B_FALSE,
	.uses_fpu_byteswap = B_FALSE,
	.name = "scalar"
};

static fletcher_4_ops_t fletcher_4_fastest_impl = {
	.name = "fastest",
	.valid = fletcher_4_scalar_valid
};

static const fletcher_4_ops_t *fletcher_4_impls[] = {
	&fletcher_4_scalar_ops,
	&fletcher_4_superscalar_ops,
	&fletcher_4_superscalar4_ops,
#ifdef __amd64
	&fletcher_4_sse2_ops,
	&fletcher_4_ssse3_ops,
	&fletcher_4_avx2_ops,
	&fletcher_4_avx512f_ops,
	&fletcher_4_avx512bw_ops,
#endif
};

/* Hold all supported implementations */
static uint32_t fletcher_4_supp_impls_cnt = 0;
static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];

/* Select fletcher4 implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_SCALAR	(0)
#define	IMPL_SUPERSCALAR	(1)
#define	IMPL_SUPERSCALAR4	(2)

static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;

#define	IMPL_READ(i)	(*(volatile uint32_t *) &(i))

static struct fletcher_4_impl_selector {
	const char *fis_name;
	uint32_t fis_sel;
} fletcher_4_impl_selectors[] = {
	{ "cycle",	IMPL_CYCLE },
	{ "fastest",	IMPL_FASTEST },
	{ "scalar",	IMPL_SCALAR }
};

#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;
static kstat_named_t fletcher_4_kstat_data[ARRAY_SIZE(fletcher_4_impls) * 2];

static struct fletcher_4_bench {
	uint64_t native;
	uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
#endif

/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;

void
fletcher_init(zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

int
fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	const uint64_t *ip = buf;
	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
	uint64_t a0, b0, a1, b1;

	a0 = zcp->zc_word[0];
	a1 = zcp->zc_word[1];
	b0 = zcp->zc_word[2];
	b1 = zcp->zc_word[3];

	for (; ip < ipend; ip += 2) {
		a0 += ip[0];
		a1 += ip[1];
		b0 += a0;
		b1 += a1;
	}

	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
	return (0);
}

void
fletcher_2_native(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	fletcher_init(zcp);
	(void) fletcher_2_incremental_native((void *) buf, size, zcp);
}

int
fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	const uint64_t *ip = buf;
	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
	uint64_t a0, b0, a1, b1;

	a0 = zcp->zc_word[0];
	a1 = zcp->zc_word[1];
	b0 = zcp->zc_word[2];
	b1 = zcp->zc_word[3];

	for (; ip < ipend; ip += 2) {
		a0 += BSWAP_64(ip[0]);
		a1 += BSWAP_64(ip[1]);
		b0 += a0;
		b1 += a1;
	}

	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
	return (0);
}

void
fletcher_2_byteswap(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	fletcher_init(zcp);
	(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}

static void
fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
{
	ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
}

static void
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
	memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
}

static void
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a, b, c, d;

	a = ctx->scalar.zc_word[0];
	b = ctx->scalar.zc_word[1];
	c = ctx->scalar.zc_word[2];
	d = ctx->scalar.zc_word[3];

	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}

	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a, b, c, d;

	a = ctx->scalar.zc_word[0];
	b = ctx->scalar.zc_word[1];
	c = ctx->scalar.zc_word[2];
	d = ctx->scalar.zc_word[3];

	for (; ip < ipend; ip++) {
		a += BSWAP_32(ip[0]);
		b += a;
		c += b;
		d += c;
	}

	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static boolean_t
fletcher_4_scalar_valid(void)
{
	return (B_TRUE);
}
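
/*
 * Select the fletcher 4 implementation by name.  The mandatory selectors
 * "cycle", "fastest" and "scalar" are always accepted; once fletcher_4_init()
 * has completed, the name of any implementation in fletcher_4_supp_impls[]
 * is accepted as well.  Returns 0 on success, EINVAL for an unknown name.
 */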
int
fletcher_4_impl_set(const char *val)
{
	int err = EINVAL;
	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
	size_t i;

	/* check mandatory implementations */
	for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
		const char *name = fletcher_4_impl_selectors[i].fis_name;

		if (strcmp(val, name) == 0) {
			impl = fletcher_4_impl_selectors[i].fis_sel;
			err = 0;
			break;
		}
	}

	if (err != 0 && fletcher_4_initialized) {
		/* check all supported implementations */
		for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
			const char *name = fletcher_4_supp_impls[i]->name;

			if (strcmp(val, name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}

	if (err == 0) {
		atomic_swap_32(&fletcher_4_impl_chosen, impl);
		MEMBAR_PRODUCER;
	}

	return (SET_ERROR(err));
}

/*
 * Returns the Fletcher 4 operations for checksums.  When a SIMD
 * implementation is not allowed in the current context, fall back
 * to the fastest generic implementation.
 */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
	if (!kfpu_allowed())
		return (&fletcher_4_superscalar4_ops);

	const fletcher_4_ops_t *ops = NULL;
	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(fletcher_4_initialized);
		ops = &fletcher_4_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(fletcher_4_initialized);
		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);

		static uint32_t cycle_count = 0;
		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;

		ops = fletcher_4_supp_impls[idx];
		break;
	default:
		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);

		ops = fletcher_4_supp_impls[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

static inline void
fletcher_4_native_impl(const void *buf, size_t size, zio_cksum_t *zcp)
{
	fletcher_4_ctx_t ctx;
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();

	if (ops->uses_fpu_native)
		kfpu_begin();
	ops->init_native(&ctx);
	ops->compute_native(&ctx, buf, size);
	ops->fini_native(&ctx, zcp);
	if (ops->uses_fpu_native)
		kfpu_end();
}

void
fletcher_4_native(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0 || p2size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

		if (size > 0) {
			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
			    buf, size);
		}
	} else {
		fletcher_4_native_impl(buf, p2size, zcp);

		if (p2size < size) {
			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
			    (char *)buf + p2size, size - p2size);
		}
	}
}
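
/*
 * Like fletcher_4_native(), but computed entirely with the scalar
 * implementation, so it places no SIMD-related minimum-size expectations
 * on the buffer.
 */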
void
fletcher_4_native_varsize(const void *buf, size_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
	fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
}

static inline void
fletcher_4_byteswap_impl(const void *buf, size_t size, zio_cksum_t *zcp)
{
	fletcher_4_ctx_t ctx;
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();

	if (ops->uses_fpu_byteswap)
		kfpu_begin();
	ops->init_byteswap(&ctx);
	ops->compute_byteswap(&ctx, buf, size);
	ops->fini_byteswap(&ctx, zcp);
	if (ops->uses_fpu_byteswap)
		kfpu_end();
}

void
fletcher_4_byteswap(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0 || p2size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

		if (size > 0) {
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    buf, size);
		}
	} else {
		fletcher_4_byteswap_impl(buf, p2size, zcp);

		if (p2size < size) {
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    (char *)buf + p2size, size - p2size);
		}
	}
}

/* Incremental Fletcher 4 */

#define	ZFS_FLETCHER_4_INC_MAX_SIZE	(8ULL << 20)

static inline void
fletcher_4_incremental_combine(zio_cksum_t *zcp, const size_t size,
    const zio_cksum_t *nzcp)
{
	const uint64_t c1 = size / sizeof (uint32_t);
	const uint64_t c2 = c1 * (c1 + 1) / 2;
	const uint64_t c3 = c2 * (c1 + 2) / 3;

	/*
	 * Value of 'c3' overflows on buffer sizes close to 16MiB. For that
	 * reason we split incremental fletcher4 computation of large buffers
	 * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
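	 *
	 * The combine itself follows from the series in the block comment at
	 * the top of this file: appending c1 = size/4 words to an existing
	 * stream raises the positional weight of every existing word by c1,
	 * so the old a, b and c sums feed into the new b, c and d with
	 * coefficients c1, c2 and c3 respectively, while the new chunk's own
	 * checksum (*nzcp) is simply added in.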
	 */
	ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);

	zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
	    c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
	zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
	    c2 * zcp->zc_word[0];
	zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
	zcp->zc_word[0] += nzcp->zc_word[0];
}

static inline void
fletcher_4_incremental_impl(boolean_t native, const void *buf, size_t size,
    zio_cksum_t *zcp)
{
	while (size > 0) {
		zio_cksum_t nzc;
		uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);

		if (native)
			fletcher_4_native(buf, len, NULL, &nzc);
		else
			fletcher_4_byteswap(buf, len, NULL, &nzc);

		fletcher_4_incremental_combine(zcp, len, &nzc);

		size -= len;
		buf += len;
	}
}

int
fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	/* Use scalar impl to directly update cksum of small blocks */
	if (size < SPA_MINBLOCKSIZE)
		fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
	else
		fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
	return (0);
}

int
fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	/* Use scalar impl to directly update cksum of small blocks */
	if (size < SPA_MINBLOCKSIZE)
		fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
	else
		fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
	return (0);
}
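
/*
 * For illustration (not a call site in this file): feeding a buffer to
 * fletcher_4_incremental_native() in chunks that are multiples of
 * sizeof (uint32_t) produces the same checksum as a single
 * fletcher_4_native() call over the whole buffer:
 *
 *	zio_cksum_t zc;
 *
 *	fletcher_init(&zc);
 *	(void) fletcher_4_incremental_native(buf, 4096, &zc);
 *	(void) fletcher_4_incremental_native((char *)buf + 4096,
 *	    size - 4096, &zc);
 */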
#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				\
{									\
	fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;	\
	fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;	\
	fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
	fletcher_4_fastest_impl.uses_fpu_ ## type = src->uses_fpu_ ## type; \
}

#define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(1))	/* 1ms */

typedef void fletcher_checksum_func_t(const void *, size_t, const void *,
    zio_cksum_t *);

#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, size_t data_size)
{
	struct fletcher_4_bench *fastest_stat =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	hrtime_t start;
	uint64_t run_bw, run_time_ns, best_run = 0;
	zio_cksum_t zc;
	uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);

	fletcher_checksum_func_t *fletcher_4_test =
	    native ? fletcher_4_native : fletcher_4_byteswap;

	for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
		struct fletcher_4_bench *stat = &fletcher_4_stat_data[i];
		uint64_t run_count = 0;

		/* Temporarily set an implementation */
		fletcher_4_impl_chosen = i;

		KPREEMPT_DISABLE;
		start = gethrtime();
		do {
			for (l = 0; l < 32; l++, run_count++)
				fletcher_4_test(data, data_size, NULL, &zc);

			run_time_ns = gethrtime() - start;
		} while (run_time_ns < FLETCHER_4_BENCH_NS);
		KPREEMPT_ENABLE;

		run_bw = data_size * run_count * NANOSEC;
		run_bw /= run_time_ns;	/* B/s */

		if (native)
			stat->native = run_bw;
		else
			stat->byteswap = run_bw;

		if (run_bw > best_run) {
			best_run = run_bw;

			if (native) {
				fastest_stat->native = i;
				FLETCHER_4_FASTEST_FN_COPY(native,
				    fletcher_4_supp_impls[i]);
			} else {
				fastest_stat->byteswap = i;
				FLETCHER_4_FASTEST_FN_COPY(byteswap,
				    fletcher_4_supp_impls[i]);
			}
		}
	}

	/* restore original selection */
	atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
#endif /* _KERNEL */

/*
 * Initialize and benchmark all supported implementations.
 */
static void
fletcher_4_benchmark(void)
{
	fletcher_4_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into fletcher_4_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];

		if (curr_impl->valid && curr_impl->valid())
			fletcher_4_supp_impls[c++] = curr_impl;
	}
	MEMBAR_PRODUCER;	/* complete fletcher_4_supp_impls[] init */
	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */

#if defined(_KERNEL)
	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
	char *databuf = kmem_alloc(data_size, KM_SLEEP);

	for (i = 0; i < data_size / sizeof (uint64_t); i++)
		((uint64_t *)databuf)[i] = (uintptr_t)(databuf + i); /* warm-up */

	fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);

	kmem_free(databuf, data_size);
#else
	/*
	 * Skip the benchmark in user space to avoid impacting libzpool
	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
	 * is assumed to be the fastest and used by default.
	 */
	memcpy(&fletcher_4_fastest_impl,
	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
	    sizeof (fletcher_4_fastest_impl));
	fletcher_4_fastest_impl.name = "fastest";
#endif /* _KERNEL */
}
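
/*
 * Benchmark the supported implementations and publish the measured
 * per-implementation throughput through the "fletcher_4_bench" kstat.
 * For example (illustrative, kernel context only):
 *
 *	# kstat -m zfs -n fletcher_4_bench
 */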
void
fletcher_4_init(void)
{
	/* Determine the fastest available implementation. */
	fletcher_4_benchmark();

#if defined(_KERNEL)
	/* install kstats for all implementations */
	for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; i++) {
		struct fletcher_4_bench *stat = &fletcher_4_stat_data[i];
		const fletcher_4_ops_t *ops = fletcher_4_supp_impls[i];
		kstat_named_t *kstat_native = &fletcher_4_kstat_data[i * 2];
		kstat_named_t *kstat_byteswap =
		    &fletcher_4_kstat_data[i * 2 + 1];

		(void) snprintf(kstat_native->name,
		    sizeof (kstat_native->name), "%s_native", ops->name);
		kstat_native->data_type = KSTAT_DATA_UINT64;
		kstat_native->value.ui64 = stat->native;

		(void) snprintf(kstat_byteswap->name,
		    sizeof (kstat_byteswap->name), "%s_byteswap", ops->name);
		kstat_byteswap->data_type = KSTAT_DATA_UINT64;
		kstat_byteswap->value.ui64 = stat->byteswap;
	}

	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
	    KSTAT_TYPE_NAMED, ARRAY_SIZE(fletcher_4_supp_impls) * 2,
	    KSTAT_FLAG_VIRTUAL);

	if (fletcher_4_kstat != NULL) {
		fletcher_4_kstat->ks_data = fletcher_4_kstat_data;
		kstat_install(fletcher_4_kstat);
	}
#endif

	/* Finish initialization */
	fletcher_4_initialized = B_TRUE;
}

void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
	if (fletcher_4_kstat != NULL) {
		kstat_delete(fletcher_4_kstat);
		fletcher_4_kstat = NULL;
	}
#endif
}

/* ABD adapters */

static void
abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
{
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
	cdp->acd_private = (void *) ops;

	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
		if (ops->uses_fpu_native)
			kfpu_begin();
		ops->init_native(cdp->acd_ctx);
	} else {
		if (ops->uses_fpu_byteswap)
			kfpu_begin();
		ops->init_byteswap(cdp->acd_ctx);
	}
}

static void
abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
{
	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;

	ASSERT(ops);

	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
		ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
		if (ops->uses_fpu_native)
			kfpu_end();
	} else {
		ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
		if (ops->uses_fpu_byteswap)
			kfpu_end();
	}
}

static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
    zio_abd_checksum_data_t *cdp)
{
	zio_cksum_t *zcp = cdp->acd_zcp;

	ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);

	abd_fletcher_4_fini(cdp);
	cdp->acd_private = (void *)&fletcher_4_scalar_ops;

	if (native)
		fletcher_4_incremental_native(data, size, zcp);
	else
		fletcher_4_incremental_byteswap(data, size, zcp);
}

static int
abd_fletcher_4_iter(void *data, size_t size, void *private)
{
	zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
	fletcher_4_ctx_t *ctx = cdp->acd_ctx;
	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
	boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
	uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (asize > 0) {
		if (native)
			ops->compute_native(ctx, data, asize);
		else
			ops->compute_byteswap(ctx, data, asize);

		size -= asize;
		data = (char *)data + asize;
	}

	if (size > 0) {
		ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
		/* At this point we have to switch to scalar impl */
		abd_fletcher_4_simd2scalar(native, data, size, cdp);
	}

	return (0);
}

zio_abd_checksum_func_t fletcher_4_abd_ops = {
	.acf_init = abd_fletcher_4_init,
	.acf_fini = abd_fletcher_4_fini,
	.acf_iter = abd_fletcher_4_iter
};