/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Fletcher Checksums
 * ------------------
 *
 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
 * recurrence relations:
 *
 *      a_i = a_(i-1) + f_(i-1)
 *      b_i = b_(i-1) + a_i
 *      c_i = c_(i-1) + b_i        (fletcher-4 only)
 *      d_i = d_(i-1) + c_i        (fletcher-4 only)
 *
 * Where
 *      a_0 = b_0 = c_0 = d_0 = 0
 * and
 *      f_0 .. f_(n-1) are the input data.
 *
 * Using standard techniques, these translate into the following series:
 *
 *      a_n = SUM[i=1..n] f_(n-i)
 *      b_n = SUM[i=1..n] i * f_(n-i)
 *      c_n = SUM[i=1..n] (i*(i+1)/2) * f_(n-i)
 *      d_n = SUM[i=1..n] (i*(i+1)*(i+2)/6) * f_(n-i)
 *
 * For fletcher-2, the f_i values are 64-bit, and [ab]_i are 64-bit
 * accumulators.  Since the additions are done mod (2^64), errors in the
 * high bits may not be noticed.  For this reason, fletcher-2 is deprecated.
 *
 * For fletcher-4, the f_i values are 32-bit, and [abcd]_i are 64-bit
 * accumulators.  A conservative estimate of how big the buffer can get
 * before we overflow can be made by setting f_i = 0xffffffff for all i:
 *
 * % bc
 * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
 * 2264
 * quit
 * %
 *
 * So blocks of up to 2k will not overflow.  Our largest block size is
 * 128k, which has 32k 4-byte words, so we can compute the largest possible
 * accumulators, then divide by 2^64 to figure the max amount of overflow:
 *
 * % bc
 * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
 * a/2^64;b/2^64;c/2^64;d/2^64
 * 0
 * 0
 * 1365
 * 11186858
 * quit
 * %
 *
 * So a and b cannot overflow.  To make sure each bit of input has some
 * effect on the contents of c and d, we can look at what the factors of
 * the coefficients in the equations for c_n and d_n are.  The number of 2s
 * in the factors determines the lowest set bit in the multiplier.  Running
 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 * even for 128k blocks.
 *
 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 * we could do our calculations mod (2^32 - 1) by adding in the carries
 * periodically, and store the number of carries in the top 32 bits.
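 *
 * A minimal sketch of that idea (not implemented here; fold32() is a
 * hypothetical helper that folds the high 32 bits of an accumulator back
 * into its low 32 bits while counting the folds in the upper half):
 *
 *      a = fold32(a + f[i]);
 *      b = fold32(b + a);
 *      c = fold32(c + b);
 *      d = fold32(d + c);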
 *
 * --------------------
 * Checksum Performance
 * --------------------
 *
 * There are two interesting components to checksum performance: cached and
 * uncached performance.  With cached data, fletcher-2 is about four times
 * faster than fletcher-4.  With uncached data, the performance difference is
 * negligible, since the cost of a cache fill dominates the processing time.
 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 * efficient pass over the data.
 *
 * In normal operation, the data which is being checksummed is in a buffer
 * which has been filled either by:
 *
 *      1. a compression step, which will be mostly cached, or
 *      2. a bcopy() or copyin(), which will be uncached (because the
 *         copy is cache-bypassing).
 *
 * For both cached and uncached data, both fletcher checksums are much faster
 * than sha-256, and slower than 'off', which doesn't touch the data at all.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <sys/spa.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>

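/*
 * Buffers (or the tail of a buffer) smaller than this are handled by the
 * scalar implementation; only prefixes that are a multiple of this size
 * are handed to the selected (possibly SIMD) implementation.
 */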
#define FLETCHER_MIN_SIMD_SIZE  64

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/disp.h>
#define KPREEMPT_DISABLE        kpreempt_disable()
#define KPREEMPT_ENABLE         kpreempt_enable()
#define MEMBAR_PRODUCER         membar_producer()

#else /* _KERNEL */

#include <atomic.h>
#include <string.h>
#ifndef SET_ERROR
#define SET_ERROR(err)  (err)
#endif
#define KPREEMPT_DISABLE
#define KPREEMPT_ENABLE
#define MEMBAR_PRODUCER

#endif /* _KERNEL */

static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
    const void *buf, size_t size);
static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, size_t size);
static boolean_t fletcher_4_scalar_valid(void);

static const fletcher_4_ops_t fletcher_4_scalar_ops = {
        .init_native = fletcher_4_scalar_init,
        .fini_native = fletcher_4_scalar_fini,
        .compute_native = fletcher_4_scalar_native,
        .init_byteswap = fletcher_4_scalar_init,
        .fini_byteswap = fletcher_4_scalar_fini,
        .compute_byteswap = fletcher_4_scalar_byteswap,
        .valid = fletcher_4_scalar_valid,
        .uses_fpu_native = B_FALSE,
        .uses_fpu_byteswap = B_FALSE,
        .name = "scalar"
};

static fletcher_4_ops_t fletcher_4_fastest_impl = {
        .name = "fastest",
        .valid = fletcher_4_scalar_valid
};

static const fletcher_4_ops_t *fletcher_4_impls[] = {
        &fletcher_4_scalar_ops,
        &fletcher_4_superscalar_ops,
        &fletcher_4_superscalar4_ops,
#ifdef __amd64
        &fletcher_4_sse2_ops,
        &fletcher_4_ssse3_ops,
        &fletcher_4_avx2_ops,
        &fletcher_4_avx512f_ops,
        &fletcher_4_avx512bw_ops,
#endif
};

/* Hold all supported implementations */
static uint32_t fletcher_4_supp_impls_cnt = 0;
static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];

/* Select fletcher4 implementation */
#define IMPL_FASTEST            (UINT32_MAX)
#define IMPL_CYCLE              (UINT32_MAX - 1)
#define IMPL_SCALAR             (0)
#define IMPL_SUPERSCALAR        (1)
#define IMPL_SUPERSCALAR4       (2)

static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;

#define IMPL_READ(i)    (*(volatile uint32_t *) &(i))

static struct fletcher_4_impl_selector {
        const char *fis_name;
        uint32_t fis_sel;
} fletcher_4_impl_selectors[] = {
        { "cycle", IMPL_CYCLE },
        { "fastest", IMPL_FASTEST },
        { "scalar", IMPL_SCALAR }
};

#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;
static kstat_named_t fletcher_4_kstat_data[ARRAY_SIZE(fletcher_4_impls) * 2];

static struct fletcher_4_bench {
        uint64_t native;
        uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
#endif

/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;

void
fletcher_init(zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

int
fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;

        const uint64_t *ip = buf;
        const uint64_t *ipend = ip + (size / sizeof (uint64_t));
        uint64_t a0, b0, a1, b1;

        a0 = zcp->zc_word[0];
        a1 = zcp->zc_word[1];
        b0 = zcp->zc_word[2];
        b1 = zcp->zc_word[3];

        for (; ip < ipend; ip += 2) {
                a0 += ip[0];
                a1 += ip[1];
                b0 += a0;
                b1 += a1;
        }

        ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
        return (0);
}

void
fletcher_2_native(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
        fletcher_init(zcp);
        (void) fletcher_2_incremental_native((void *)buf, size, zcp);
}

int
fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;

        const uint64_t *ip = buf;
        const uint64_t *ipend = ip + (size / sizeof (uint64_t));
        uint64_t a0, b0, a1, b1;

        a0 = zcp->zc_word[0];
        a1 = zcp->zc_word[1];
        b0 = zcp->zc_word[2];
        b1 = zcp->zc_word[3];

        for (; ip < ipend; ip += 2) {
                a0 += BSWAP_64(ip[0]);
                a1 += BSWAP_64(ip[1]);
                b0 += a0;
                b1 += a1;
        }

        ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
        return (0);
}

void
fletcher_2_byteswap(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
        fletcher_init(zcp);
        (void) fletcher_2_incremental_byteswap((void *)buf, size, zcp);
}

static void
fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
{
        ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
}

static void
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
        memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
}

static void
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
{
        const uint32_t *ip = buf;
        const uint32_t *ipend = ip + (size / sizeof (uint32_t));
        uint64_t a, b, c, d;

        a = ctx->scalar.zc_word[0];
        b = ctx->scalar.zc_word[1];
        c = ctx->scalar.zc_word[2];
        d = ctx->scalar.zc_word[3];

        for (; ip < ipend; ip++) {
                a += ip[0];
                b += a;
                c += b;
                d += c;
        }

        ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
{
        const uint32_t *ip = buf;
        const uint32_t *ipend = ip + (size / sizeof (uint32_t));
        uint64_t a, b, c, d;

        a = ctx->scalar.zc_word[0];
        b = ctx->scalar.zc_word[1];
        c = ctx->scalar.zc_word[2];
        d = ctx->scalar.zc_word[3];

        for (; ip < ipend; ip++) {
                a += BSWAP_32(ip[0]);
                b += a;
                c += b;
                d += c;
        }

        ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static boolean_t
fletcher_4_scalar_valid(void)
{
        return (B_TRUE);
}

int
fletcher_4_impl_set(const char *val)
{
        int err = EINVAL;
        uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
        size_t i;

        /* check mandatory implementations */
        for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
                const char *name = fletcher_4_impl_selectors[i].fis_name;

                if (strcmp(val, name) == 0) {
                        impl = fletcher_4_impl_selectors[i].fis_sel;
                        err = 0;
                        break;
                }
        }

        if (err != 0 && fletcher_4_initialized) {
                /* check all supported implementations */
                for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
                        const char *name = fletcher_4_supp_impls[i]->name;

                        if (strcmp(val, name) == 0) {
                                impl = i;
                                err = 0;
                                break;
                        }
                }
        }

        if (err == 0) {
                atomic_swap_32(&fletcher_4_impl_chosen, impl);
                MEMBAR_PRODUCER;
        }

        return (SET_ERROR(err));
}
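
/*
 * For example, a platform-specific tunable handler (hypothetical caller)
 * might select an implementation by name and fall back to the default when
 * the name is neither a known selector nor a supported implementation:
 *
 *      if (fletcher_4_impl_set("avx2") != 0)
 *              (void) fletcher_4_impl_set("fastest");
 */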

/*
 * Returns the Fletcher 4 operations for checksums.  When a SIMD
 * implementation is not allowed in the current context, fall back
 * to the fastest generic implementation.
 */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
        if (!kfpu_allowed())
                return (&fletcher_4_superscalar4_ops);

        const fletcher_4_ops_t *ops = NULL;
        uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);

        switch (impl) {
        case IMPL_FASTEST:
                ASSERT(fletcher_4_initialized);
                ops = &fletcher_4_fastest_impl;
                break;
        case IMPL_CYCLE:
                /* Cycle through supported implementations */
                ASSERT(fletcher_4_initialized);
                ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);

                static uint32_t cycle_count = 0;
                uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;

                ops = fletcher_4_supp_impls[idx];
                break;
        default:
                ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
                ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);

                ops = fletcher_4_supp_impls[impl];
                break;
        }

        ASSERT3P(ops, !=, NULL);

        return (ops);
}

static inline void
fletcher_4_native_impl(const void *buf, size_t size, zio_cksum_t *zcp)
{
        fletcher_4_ctx_t ctx;
        const fletcher_4_ops_t *ops = fletcher_4_impl_get();

        if (ops->uses_fpu_native)
                kfpu_begin();
        ops->init_native(&ctx);
        ops->compute_native(&ctx, buf, size);
        ops->fini_native(&ctx, zcp);
        if (ops->uses_fpu_native)
                kfpu_end();
}

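/*
 * The one-shot entry points below split the buffer in two: the largest
 * prefix that is a multiple of FLETCHER_MIN_SIMD_SIZE is handed to the
 * selected implementation, and any remaining tail (or a buffer too small
 * to be worth vectorizing) is folded in with the scalar implementation,
 * which updates the zio_cksum_t in place.
 */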
void
fletcher_4_native(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
        const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

        ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
        ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

        if (size == 0 || p2size == 0) {
                ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

                if (size > 0) {
                        fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
                            buf, size);
                }
        } else {
                fletcher_4_native_impl(buf, p2size, zcp);

                if (p2size < size) {
                        fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
                            (char *)buf + p2size, size - p2size);
                }
        }
}

void
fletcher_4_native_varsize(const void *buf, size_t size, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
        fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
}

static inline void
fletcher_4_byteswap_impl(const void *buf, size_t size, zio_cksum_t *zcp)
{
        fletcher_4_ctx_t ctx;
        const fletcher_4_ops_t *ops = fletcher_4_impl_get();

        if (ops->uses_fpu_byteswap)
                kfpu_begin();
        ops->init_byteswap(&ctx);
        ops->compute_byteswap(&ctx, buf, size);
        ops->fini_byteswap(&ctx, zcp);
        if (ops->uses_fpu_byteswap)
                kfpu_end();
}

void
fletcher_4_byteswap(const void *buf, size_t size,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
        const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

        ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
        ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

        if (size == 0 || p2size == 0) {
                ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

                if (size > 0) {
                        fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
                            buf, size);
                }
        } else {
                fletcher_4_byteswap_impl(buf, p2size, zcp);

                if (p2size < size) {
                        fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
                            (char *)buf + p2size, size - p2size);
                }
        }
}

/* Incremental Fletcher 4 */

#define ZFS_FLETCHER_4_INC_MAX_SIZE     (8ULL << 20)

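/*
 * Combine the checksum of a trailing block with the running checksum.
 * If (a, b, c, d) is the checksum of the data seen so far and
 * (a', b', c', d') is the checksum of the next 'size' bytes (c1 32-bit
 * words) taken on its own, then by the series above the combined
 * checksum is:
 *
 *      a'' = a + a'
 *      b'' = b' + c1*a
 *      c'' = c' + c1*b + (c1*(c1+1)/2) * a
 *      d'' = d' + c1*c + (c1*(c1+1)/2) * b + (c1*(c1+1)*(c1+2)/6) * a
 *
 * which is what the function below computes, with c2 = c1*(c1+1)/2 and
 * c3 = c1*(c1+1)*(c1+2)/6.
 */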
static inline void
fletcher_4_incremental_combine(zio_cksum_t *zcp, const size_t size,
    const zio_cksum_t *nzcp)
{
        const uint64_t c1 = size / sizeof (uint32_t);
        const uint64_t c2 = c1 * (c1 + 1) / 2;
        const uint64_t c3 = c2 * (c1 + 2) / 3;

        /*
         * The value of 'c3' overflows on buffer sizes close to 16MiB.  For
         * that reason we split incremental fletcher4 computation of large
         * buffers into steps of ZFS_FLETCHER_4_INC_MAX_SIZE.
         */
        ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);

        zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
            c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
        zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
            c2 * zcp->zc_word[0];
        zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
        zcp->zc_word[0] += nzcp->zc_word[0];
}

static inline void
fletcher_4_incremental_impl(boolean_t native, const void *buf, size_t size,
    zio_cksum_t *zcp)
{
        while (size > 0) {
                zio_cksum_t nzc;
                uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);

                if (native)
                        fletcher_4_native(buf, len, NULL, &nzc);
                else
                        fletcher_4_byteswap(buf, len, NULL, &nzc);

                fletcher_4_incremental_combine(zcp, len, &nzc);

                size -= len;
                buf = (const char *)buf + len;
        }
}

int
fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;

        /* Use scalar impl to directly update cksum of small blocks */
        if (size < SPA_MINBLOCKSIZE)
                fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
        else
                fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
        return (0);
}

int
fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;

        /* Use scalar impl to directly update cksum of small blocks */
        if (size < SPA_MINBLOCKSIZE)
                fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
        else
                fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
        return (0);
}
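
/*
 * A minimal usage sketch (hypothetical caller): checksumming a buffer in
 * two chunks with the incremental API yields the same result as a single
 * fletcher_4_native() over the whole buffer, provided each chunk is a
 * multiple of 4 bytes:
 *
 *      zio_cksum_t zc;
 *      fletcher_init(&zc);
 *      (void) fletcher_4_incremental_native(buf, len1, &zc);
 *      (void) fletcher_4_incremental_native((char *)buf + len1, len2, &zc);
 */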
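/*
 * Copy the native or byteswap entry points (init/fini/compute and the FPU
 * flag) of the given implementation into fletcher_4_fastest_impl.
 */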
#define FLETCHER_4_FASTEST_FN_COPY(type, src) \
{ \
        fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
        fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
        fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
        fletcher_4_fastest_impl.uses_fpu_ ## type = src->uses_fpu_ ## type; \
}

#define FLETCHER_4_BENCH_NS     (MSEC2NSEC(1)) /* 1ms */

typedef void fletcher_checksum_func_t(const void *, size_t, const void *,
    zio_cksum_t *);

#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, size_t data_size)
{
        struct fletcher_4_bench *fastest_stat =
            &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
        hrtime_t start;
        uint64_t run_bw, run_time_ns, best_run = 0;
        zio_cksum_t zc;
        uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);

        fletcher_checksum_func_t *fletcher_4_test =
            native ? fletcher_4_native : fletcher_4_byteswap;

        for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
                struct fletcher_4_bench *stat = &fletcher_4_stat_data[i];
                uint64_t run_count = 0;

                /* Temporarily set an implementation */
                fletcher_4_impl_chosen = i;

                KPREEMPT_DISABLE;
                start = gethrtime();
                do {
                        for (l = 0; l < 32; l++, run_count++)
                                fletcher_4_test(data, data_size, NULL, &zc);

                        run_time_ns = gethrtime() - start;
                } while (run_time_ns < FLETCHER_4_BENCH_NS);
                KPREEMPT_ENABLE;

                run_bw = data_size * run_count * NANOSEC;
                run_bw /= run_time_ns;  /* B/s */

                if (native)
                        stat->native = run_bw;
                else
                        stat->byteswap = run_bw;

                if (run_bw > best_run) {
                        best_run = run_bw;

                        if (native) {
                                fastest_stat->native = i;
                                FLETCHER_4_FASTEST_FN_COPY(native,
                                    fletcher_4_supp_impls[i]);
                        } else {
                                fastest_stat->byteswap = i;
                                FLETCHER_4_FASTEST_FN_COPY(byteswap,
                                    fletcher_4_supp_impls[i]);
                        }
                }
        }

        /* restore original selection */
        atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
#endif /* _KERNEL */

/*
 * Initialize and benchmark all supported implementations.
 */
static void
fletcher_4_benchmark(void)
{
        fletcher_4_ops_t *curr_impl;
        int i, c;

        /* Move supported implementations into fletcher_4_supp_impls */
        for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
                curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];

                if (curr_impl->valid && curr_impl->valid())
                        fletcher_4_supp_impls[c++] = curr_impl;
        }
        MEMBAR_PRODUCER;        /* complete fletcher_4_supp_impls[] init */
        fletcher_4_supp_impls_cnt = c;  /* number of supported impl */

#if defined(_KERNEL)
        static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
        char *databuf = kmem_alloc(data_size, KM_SLEEP);

        for (i = 0; i < data_size / sizeof (uint64_t); i++)
                ((uint64_t *)databuf)[i] = (uintptr_t)(databuf + i); /* warm-up */

        fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
        fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);

        kmem_free(databuf, data_size);
#else
        /*
         * Skip the benchmark in user space to avoid impacting libzpool
         * consumers (zdb, zhack, zinject, ztest).  The last implementation
         * is assumed to be the fastest and used by default.
         */
        memcpy(&fletcher_4_fastest_impl,
            fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
            sizeof (fletcher_4_fastest_impl));
        fletcher_4_fastest_impl.name = "fastest";
#endif /* _KERNEL */
}

void
fletcher_4_init(void)
{
        /* Determine the fastest available implementation. */
        fletcher_4_benchmark();

#if defined(_KERNEL)
        /* install kstats for all implementations */
        for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; i++) {
                struct fletcher_4_bench *stat = &fletcher_4_stat_data[i];
                const fletcher_4_ops_t *ops = fletcher_4_supp_impls[i];
                kstat_named_t *kstat_native = &fletcher_4_kstat_data[i * 2];
                kstat_named_t *kstat_byteswap =
                    &fletcher_4_kstat_data[i * 2 + 1];

                (void) snprintf(kstat_native->name,
                    sizeof (kstat_native->name), "%s_native", ops->name);
                kstat_native->data_type = KSTAT_DATA_UINT64;
                kstat_native->value.ui64 = stat->native;

                (void) snprintf(kstat_byteswap->name,
                    sizeof (kstat_byteswap->name), "%s_byteswap", ops->name);
                kstat_byteswap->data_type = KSTAT_DATA_UINT64;
                kstat_byteswap->value.ui64 = stat->byteswap;
        }

        fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
            KSTAT_TYPE_NAMED, ARRAY_SIZE(fletcher_4_supp_impls) * 2,
            KSTAT_FLAG_VIRTUAL);

        if (fletcher_4_kstat != NULL) {
                fletcher_4_kstat->ks_data = fletcher_4_kstat_data;
                kstat_install(fletcher_4_kstat);
        }
#endif

        /* Finish initialization */
        fletcher_4_initialized = B_TRUE;
}

void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
        if (fletcher_4_kstat != NULL) {
                kstat_delete(fletcher_4_kstat);
                fletcher_4_kstat = NULL;
        }
#endif
}

/* ABD adapters */

static void
abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
{
        const fletcher_4_ops_t *ops = fletcher_4_impl_get();
        cdp->acd_private = (void *)ops;

        if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
                if (ops->uses_fpu_native)
                        kfpu_begin();
                ops->init_native(cdp->acd_ctx);
        } else {
                if (ops->uses_fpu_byteswap)
                        kfpu_begin();
                ops->init_byteswap(cdp->acd_ctx);
        }
}

static void
abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
{
        fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;

        ASSERT(ops);

        if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
                ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
                if (ops->uses_fpu_native)
                        kfpu_end();
        } else {
                ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
                if (ops->uses_fpu_byteswap)
                        kfpu_end();
        }
}

static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
    zio_abd_checksum_data_t *cdp)
{
        zio_cksum_t *zcp = cdp->acd_zcp;

        ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);

        abd_fletcher_4_fini(cdp);
        cdp->acd_private = (void *)&fletcher_4_scalar_ops;

        if (native)
                fletcher_4_incremental_native(data, size, zcp);
        else
                fletcher_4_incremental_byteswap(data, size, zcp);
}

static int
abd_fletcher_4_iter(void *data, size_t size, void *private)
{
        zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
        fletcher_4_ctx_t *ctx = cdp->acd_ctx;
        fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
        boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
        uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

        ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

        if (asize > 0) {
                if (native)
                        ops->compute_native(ctx, data, asize);
                else
                        ops->compute_byteswap(ctx, data, asize);

                size -= asize;
                data = (char *)data + asize;
        }

        if (size > 0) {
                ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
                /* At this point we have to switch to scalar impl */
                abd_fletcher_4_simd2scalar(native, data, size, cdp);
        }

        return (0);
}

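/*
 * Operations vector handed to the ABD checksum machinery: acf_init picks an
 * implementation and (when needed) enters FPU context, acf_iter is applied
 * to each ABD segment, and acf_fini produces the final zio_cksum_t and
 * leaves FPU context.
 */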
zio_abd_checksum_func_t fletcher_4_abd_ops = {
        .acf_init = abd_fletcher_4_init,
        .acf_fini = abd_fletcher_4_fini,
        .acf_iter = abd_fletcher_4_iter
};