xref: /illumos-gate/usr/src/common/zfs/zfs_fletcher.c (revision bc0ee17c150fbf29e52c0ff365163e4e7b1c2f0a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright 2013 Saso Kiselkov. All rights reserved.
27  * Copyright (c) 2016 by Delphix. All rights reserved.
28  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
29  * Copyright 2024 Oxide Computer Company
30  */
31 
32 /*
33  * Fletcher Checksums
34  * ------------------
35  *
36  * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
37  * recurrence relations:
38  *
39  *	a  = a    + f
40  *	 i    i-1    i-1
41  *
42  *	b  = b    + a
43  *	 i    i-1    i
44  *
45  *	c  = c    + b		(fletcher-4 only)
46  *	 i    i-1    i
47  *
48  *	d  = d    + c		(fletcher-4 only)
49  *	 i    i-1    i
50  *
51  * Where
52  *	a_0 = b_0 = c_0 = d_0 = 0
53  * and
54  *	f_0 .. f_(n-1) are the input data.
55  *
56  * Using standard techniques, these translate into the following series:
57  *
58  *	     __n_			     __n_
59  *	     \   |			     \   |
60  *	a  =  >     f			b  =  >     i * f
61  *	 n   /___|   n - i		 n   /___|	 n - i
62  *	     i = 1			     i = 1
63  *
64  *
65  *	     __n_			     __n_
66  *	     \   |  i*(i+1)		     \   |  i*(i+1)*(i+2)
67  *	c  =  >     ------- f		d  =  >     ------------- f
68  *	 n   /___|     2     n - i	 n   /___|	  6	   n - i
69  *	     i = 1			     i = 1
70  *
71  * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
72  * Since the additions are done mod (2^64), errors in the high bits may not
73  * be noticed.  For this reason, fletcher-2 is deprecated.
74  *
75  * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
76  * A conservative bound on how big the buffer can get before we overflow
77  * can be computed using f_i = 0xffffffff for all i:
78  *
79  * % bc
80  *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
81  * 2264
82  *  quit
83  * %
84  *
85  * So blocks of up to 2k will not overflow.  Our largest block size is
86  * 128k, which has 32k 4-byte words, so we can compute the largest possible
87  * accumulators, then divide by 2^64 to figure the max amount of overflow:
88  *
89  * % bc
90  *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
91  *  a/2^64;b/2^64;c/2^64;d/2^64
92  * 0
93  * 0
94  * 1365
95  * 11186858
96  *  quit
97  * %
98  *
99  * So a and b cannot overflow.  To make sure each bit of input has some
100  * effect on the contents of c and d, we can look at what the factors of
101  * the coefficients in the equations for c_n and d_n are.  The number of 2s
102  * in the factors determines the lowest set bit in the multiplier.  Running
103  * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
104  * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
105  * the 64-bit accumulators, every bit of every f_i affects every accumulator,
106  * even for 128k blocks.
107  *
108  * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
109  * we could do our calculations mod (2^32 - 1) by adding in the carries
110  * periodically, and store the number of carries in the top 32-bits.
111  *
112  * --------------------
113  * Checksum Performance
114  * --------------------
115  *
116  * There are two interesting components to checksum performance: cached and
117  * uncached performance.  With cached data, fletcher-2 is about four times
118  * faster than fletcher-4.  With uncached data, the performance difference is
119  * negligible, since the cost of a cache fill dominates the processing time.
120  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
121  * efficient pass over the data.
122  *
123  * In normal operation, the data which is being checksummed is in a buffer
124  * which has been filled either by:
125  *
126  *	1. a compression step, which will be mostly cached, or
127  *	2. a bcopy() or copyin(), which will be uncached (because the
128  *	   copy is cache-bypassing).
129  *
130  * For both cached and uncached data, both fletcher checksums are much faster
131  * than sha-256, and slower than 'off', which doesn't touch the data at all.
132  */
133 
134 #include <sys/types.h>
135 #include <sys/sysmacros.h>
136 #include <sys/byteorder.h>
137 #include <sys/simd.h>
138 #include <sys/spa.h>
139 #include <sys/zio_checksum.h>
140 #include <sys/zfs_context.h>
141 #include <zfs_fletcher.h>
142 
143 #define	FLETCHER_MIN_SIMD_SIZE	64
144 
145 #ifdef _KERNEL
146 
147 #include <sys/atomic.h>
148 #include <sys/disp.h>
149 #define	KPREEMPT_DISABLE	kpreempt_disable()
150 #define	KPREEMPT_ENABLE		kpreempt_enable()
151 #define	MEMBAR_PRODUCER		membar_producer()
152 
153 #else	/* _KERNEL */
154 
155 #include <atomic.h>
156 #include <string.h>
157 #ifndef SET_ERROR
158 #define	SET_ERROR(err) (err)
159 #endif
160 #define	KPREEMPT_DISABLE
161 #define	KPREEMPT_ENABLE
162 #define	MEMBAR_PRODUCER
163 
164 #endif	/* _KERNEL */
165 
166 static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
167 static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
168 static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
169     const void *buf, size_t size);
170 static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
171     const void *buf, size_t size);
172 static boolean_t fletcher_4_scalar_valid(void);
173 
174 static const fletcher_4_ops_t fletcher_4_scalar_ops = {
175 	.init_native = fletcher_4_scalar_init,
176 	.fini_native = fletcher_4_scalar_fini,
177 	.compute_native = fletcher_4_scalar_native,
178 	.init_byteswap = fletcher_4_scalar_init,
179 	.fini_byteswap = fletcher_4_scalar_fini,
180 	.compute_byteswap = fletcher_4_scalar_byteswap,
181 	.valid = fletcher_4_scalar_valid,
182 	.uses_fpu_native = B_FALSE,
183 	.uses_fpu_byteswap = B_FALSE,
184 	.name = "scalar"
185 };
186 
187 static fletcher_4_ops_t fletcher_4_fastest_impl = {
188 	.name = "fastest",
189 	.valid = fletcher_4_scalar_valid
190 };
191 
192 static const fletcher_4_ops_t *fletcher_4_impls[] = {
193 	&fletcher_4_scalar_ops,
194 	&fletcher_4_superscalar_ops,
195 	&fletcher_4_superscalar4_ops,
196 #ifdef __amd64
197 	&fletcher_4_sse2_ops,
198 	&fletcher_4_ssse3_ops,
199 	&fletcher_4_avx2_ops,
200 	&fletcher_4_avx512f_ops,
201 	&fletcher_4_avx512bw_ops,
202 #endif
203 };
204 
205 /* Hold all supported implementations */
206 static uint32_t fletcher_4_supp_impls_cnt = 0;
207 static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
208 
/*
 * Implementation selectors.  IMPL_FASTEST and IMPL_CYCLE are sentinel
 * values; fletcher_4_impl_get() treats any other value as an index into
 * fletcher_4_supp_impls[].
 */
#define	IMPL_FASTEST		(UINT32_MAX)
#define	IMPL_CYCLE		(UINT32_MAX - 1)
#define	IMPL_SCALAR		(0)
#define	IMPL_SUPERSCALAR	(1)
#define	IMPL_SUPERSCALAR4	(2)

/* Currently selected implementation; starts out as "fastest". */
static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;

/* Read a selector through a volatile-qualified lvalue. */
#define	IMPL_READ(i)	(*(volatile uint32_t *) &(i))
219 
220 static struct fletcher_4_impl_selector {
221 	const char	*fis_name;
222 	uint32_t	fis_sel;
223 } fletcher_4_impl_selectors[] = {
224 	{ "cycle",	IMPL_CYCLE },
225 	{ "fastest",	IMPL_FASTEST },
226 	{ "scalar",	IMPL_SCALAR }
227 };
228 
#if defined(_KERNEL)
/*
 * Kernel-only benchmark bookkeeping.
 *
 * fletcher_4_kstat_data holds two named kstat entries (native, byteswap)
 * per implementation.  fletcher_4_stat_data holds the measured bandwidth
 * of each supported implementation; the slot at index
 * fletcher_4_supp_impls_cnt records the index of the fastest one for each
 * byte order, which is why the array has one extra element.
 */
static kstat_t *fletcher_4_kstat;
static kstat_named_t fletcher_4_kstat_data[ARRAY_SIZE(fletcher_4_impls) * 2];

static struct fletcher_4_bench {
	uint64_t native;	/* native byte order result */
	uint64_t byteswap;	/* byteswapped result */
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
#endif
238 
239 /* Indicate that benchmark has been completed */
240 static boolean_t fletcher_4_initialized = B_FALSE;
241 
242 void
243 fletcher_init(zio_cksum_t *zcp)
244 {
245 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
246 }
247 
248 int
249 fletcher_2_incremental_native(void *buf, size_t size, void *data)
250 {
251 	zio_cksum_t *zcp = data;
252 
253 	const uint64_t *ip = buf;
254 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
255 	uint64_t a0, b0, a1, b1;
256 
257 	a0 = zcp->zc_word[0];
258 	a1 = zcp->zc_word[1];
259 	b0 = zcp->zc_word[2];
260 	b1 = zcp->zc_word[3];
261 
262 	for (; ip < ipend; ip += 2) {
263 		a0 += ip[0];
264 		a1 += ip[1];
265 		b0 += a0;
266 		b1 += a1;
267 	}
268 
269 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
270 	return (0);
271 }
272 
273 void
274 fletcher_2_native(const void *buf, size_t size,
275     const void *ctx_template __unused, zio_cksum_t *zcp)
276 {
277 	fletcher_init(zcp);
278 	(void) fletcher_2_incremental_native((void *) buf, size, zcp);
279 }
280 
281 int
282 fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
283 {
284 	zio_cksum_t *zcp = data;
285 
286 	const uint64_t *ip = buf;
287 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
288 	uint64_t a0, b0, a1, b1;
289 
290 	a0 = zcp->zc_word[0];
291 	a1 = zcp->zc_word[1];
292 	b0 = zcp->zc_word[2];
293 	b1 = zcp->zc_word[3];
294 
295 	for (; ip < ipend; ip += 2) {
296 		a0 += BSWAP_64(ip[0]);
297 		a1 += BSWAP_64(ip[1]);
298 		b0 += a0;
299 		b1 += a1;
300 	}
301 
302 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
303 	return (0);
304 }
305 
306 void
307 fletcher_2_byteswap(const void *buf, size_t size,
308     const void *ctx_template __unused, zio_cksum_t *zcp)
309 {
310 	fletcher_init(zcp);
311 	(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
312 }
313 
314 static void
315 fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
316 {
317 	ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
318 }
319 
320 static void
321 fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
322 {
323 	memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
324 }
325 
326 static void
327 fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
328 {
329 	const uint32_t *ip = buf;
330 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
331 	uint64_t a, b, c, d;
332 
333 	a = ctx->scalar.zc_word[0];
334 	b = ctx->scalar.zc_word[1];
335 	c = ctx->scalar.zc_word[2];
336 	d = ctx->scalar.zc_word[3];
337 
338 	for (; ip < ipend; ip++) {
339 		a += ip[0];
340 		b += a;
341 		c += b;
342 		d += c;
343 	}
344 
345 	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
346 }
347 
348 static void
349 fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
350 {
351 	const uint32_t *ip = buf;
352 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
353 	uint64_t a, b, c, d;
354 
355 	a = ctx->scalar.zc_word[0];
356 	b = ctx->scalar.zc_word[1];
357 	c = ctx->scalar.zc_word[2];
358 	d = ctx->scalar.zc_word[3];
359 
360 	for (; ip < ipend; ip++) {
361 		a += BSWAP_32(ip[0]);
362 		b += a;
363 		c += b;
364 		d += c;
365 	}
366 
367 	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
368 }
369 
370 static boolean_t
371 fletcher_4_scalar_valid(void)
372 {
373 	return (B_TRUE);
374 }
375 
376 int
377 fletcher_4_impl_set(const char *val)
378 {
379 	int err = EINVAL;
380 	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
381 	size_t i;
382 
383 	/* check mandatory implementations */
384 	for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
385 		const char *name = fletcher_4_impl_selectors[i].fis_name;
386 
387 		if (strcmp(val, name) == 0) {
388 			impl = fletcher_4_impl_selectors[i].fis_sel;
389 			err = 0;
390 			break;
391 		}
392 	}
393 
394 	if (err != 0 && fletcher_4_initialized) {
395 		/* check all supported implementations */
396 		for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
397 			const char *name = fletcher_4_supp_impls[i]->name;
398 
399 			if (strcmp(val, name) == 0) {
400 				impl = i;
401 				err = 0;
402 				break;
403 			}
404 		}
405 	}
406 
407 	if (err == 0) {
408 		atomic_swap_32(&fletcher_4_impl_chosen, impl);
409 		MEMBAR_PRODUCER;
410 	}
411 
412 	return (SET_ERROR(err));
413 }
414 
415 /*
416  * Returns the Fletcher 4 operations for checksums. When a SIMD
417  * implementation is not allowed in the current context, then fallback
418  * to the fastest generic implementation.
419  */
420 static inline const fletcher_4_ops_t *
421 fletcher_4_impl_get(void)
422 {
423 	if (!kfpu_allowed())
424 		return (&fletcher_4_superscalar4_ops);
425 
426 	const fletcher_4_ops_t *ops = NULL;
427 	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
428 
429 	switch (impl) {
430 	case IMPL_FASTEST:
431 		ASSERT(fletcher_4_initialized);
432 		ops = &fletcher_4_fastest_impl;
433 		break;
434 	case IMPL_CYCLE:
435 		/* Cycle through supported implementations */
436 		ASSERT(fletcher_4_initialized);
437 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
438 
439 		static uint32_t cycle_count = 0;
440 		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
441 
442 		ops = fletcher_4_supp_impls[idx];
443 		break;
444 	default:
445 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
446 		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
447 
448 		ops = fletcher_4_supp_impls[impl];
449 		break;
450 	}
451 
452 	ASSERT3P(ops, !=, NULL);
453 
454 	return (ops);
455 }
456 
457 static inline void
458 fletcher_4_native_impl(const void *buf, size_t size, zio_cksum_t *zcp)
459 {
460 	fletcher_4_ctx_t ctx;
461 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
462 
463 	if (ops->uses_fpu_native)
464 		kfpu_begin();
465 	ops->init_native(&ctx);
466 	ops->compute_native(&ctx, buf, size);
467 	ops->fini_native(&ctx, zcp);
468 	if (ops->uses_fpu_native)
469 		kfpu_end();
470 }
471 
472 void
473 fletcher_4_native(const void *buf, size_t size,
474     const void *ctx_template __unused, zio_cksum_t *zcp)
475 {
476 	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
477 
478 	ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
479 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
480 
481 	if (size == 0 || p2size == 0) {
482 		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
483 
484 		if (size > 0) {
485 			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
486 			    buf, size);
487 		}
488 	} else {
489 		fletcher_4_native_impl(buf, p2size, zcp);
490 
491 		if (p2size < size) {
492 			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
493 			    (char *)buf + p2size, size - p2size);
494 		}
495 	}
496 }
497 
498 void
499 fletcher_4_native_varsize(const void *buf, size_t size, zio_cksum_t *zcp)
500 {
501 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
502 	fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
503 }
504 
505 static inline void
506 fletcher_4_byteswap_impl(const void *buf, size_t size, zio_cksum_t *zcp)
507 {
508 	fletcher_4_ctx_t ctx;
509 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
510 
511 	if (ops->uses_fpu_byteswap)
512 		kfpu_begin();
513 	ops->init_byteswap(&ctx);
514 	ops->compute_byteswap(&ctx, buf, size);
515 	ops->fini_byteswap(&ctx, zcp);
516 	if (ops->uses_fpu_byteswap)
517 		kfpu_end();
518 }
519 
520 void
521 fletcher_4_byteswap(const void *buf, size_t size,
522     const void *ctx_template __unused, zio_cksum_t *zcp)
523 {
524 	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
525 
526 	ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
527 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
528 
529 	if (size == 0 || p2size == 0) {
530 		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
531 
532 		if (size > 0) {
533 			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
534 			    buf, size);
535 		}
536 	} else {
537 		fletcher_4_byteswap_impl(buf, p2size, zcp);
538 
539 		if (p2size < size) {
540 			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
541 			    (char *)buf + p2size, size - p2size);
542 		}
543 	}
544 }
545 
546 /* Incremental Fletcher 4 */
547 
548 #define	ZFS_FLETCHER_4_INC_MAX_SIZE	(8ULL << 20)
549 
550 static inline void
551 fletcher_4_incremental_combine(zio_cksum_t *zcp, const size_t size,
552     const zio_cksum_t *nzcp)
553 {
554 	const uint64_t c1 = size / sizeof (uint32_t);
555 	const uint64_t c2 = c1 * (c1 + 1) / 2;
556 	const uint64_t c3 = c2 * (c1 + 2) / 3;
557 
558 	/*
559 	 * Value of 'c3' overflows on buffer sizes close to 16MiB. For that
560 	 * reason we split incremental fletcher4 computation of large buffers
561 	 * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
562 	 */
563 	ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
564 
565 	zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
566 	    c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
567 	zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
568 	    c2 * zcp->zc_word[0];
569 	zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
570 	zcp->zc_word[0] += nzcp->zc_word[0];
571 }
572 
573 static inline void
574 fletcher_4_incremental_impl(boolean_t native, const void *buf, size_t size,
575     zio_cksum_t *zcp)
576 {
577 	while (size > 0) {
578 		zio_cksum_t nzc;
579 		uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
580 
581 		if (native)
582 			fletcher_4_native(buf, len, NULL, &nzc);
583 		else
584 			fletcher_4_byteswap(buf, len, NULL, &nzc);
585 
586 		fletcher_4_incremental_combine(zcp, len, &nzc);
587 
588 		size -= len;
589 		buf += len;
590 	}
591 }
592 
593 int
594 fletcher_4_incremental_native(void *buf, size_t size, void *data)
595 {
596 	zio_cksum_t *zcp = data;
597 
598 	/* Use scalar impl to directly update cksum of small blocks */
599 	if (size < SPA_MINBLOCKSIZE)
600 		fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
601 	else
602 		fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
603 	return (0);
604 }
605 
606 int
607 fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
608 {
609 	zio_cksum_t *zcp = data;
610 
611 	/* Use scalar impl to directly update cksum of small blocks */
612 	if (size < SPA_MINBLOCKSIZE)
613 		fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
614 	else
615 		fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
616 	return (0);
617 }
618 
/*
 * Copy the init/fini/compute hooks and the FPU flag for one byte order
 * ('type' is native or byteswap) from the ops vector 'src' into
 * fletcher_4_fastest_impl.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement wherever it is followed by a semicolon, and 'src' is
 * parenthesized so that any pointer expression may be passed.
 */
#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				  \
do {									  \
	fletcher_4_fastest_impl.init_ ## type = (src)->init_ ## type;	  \
	fletcher_4_fastest_impl.fini_ ## type = (src)->fini_ ## type;	  \
	fletcher_4_fastest_impl.compute_ ## type =			  \
	    (src)->compute_ ## type;					  \
	fletcher_4_fastest_impl.uses_fpu_ ## type =			  \
	    (src)->uses_fpu_ ## type;					  \
} while (0)
626 
627 #define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(1))		/* 1ms */
628 
629 typedef void fletcher_checksum_func_t(const void *, size_t, const void *,
630     zio_cksum_t *);
631 
#if defined(_KERNEL)
/*
 * Benchmark every supported implementation for one byte order.
 *
 * Each implementation is selected in turn and run over 'data' in batches of
 * 32 checksums, with preemption disabled, until at least
 * FLETCHER_4_BENCH_NS has elapsed.  The resulting bandwidth in bytes per
 * second is stored in fletcher_4_stat_data[].  Whenever an implementation
 * beats the best bandwidth so far, its index is recorded in the extra slot
 * at fletcher_4_stat_data[fletcher_4_supp_impls_cnt] and its hooks are
 * copied into fletcher_4_fastest_impl.  The selection that was in effect
 * on entry is restored before returning.
 */
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, size_t data_size)
{
	const uint32_t saved_sel = IMPL_READ(fletcher_4_impl_chosen);
	struct fletcher_4_bench *fastest =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	fletcher_checksum_func_t *cksum_fn;
	uint64_t best_bw = 0;
	zio_cksum_t scratch;
	uint32_t impl;

	if (native)
		cksum_fn = fletcher_4_native;
	else
		cksum_fn = fletcher_4_byteswap;

	for (impl = 0; impl < fletcher_4_supp_impls_cnt; impl++) {
		struct fletcher_4_bench *stat = &fletcher_4_stat_data[impl];
		uint64_t runs = 0;
		uint64_t elapsed_ns, bw;
		hrtime_t t0;
		uint32_t k;

		/* Temporarily force this implementation. */
		fletcher_4_impl_chosen = impl;

		KPREEMPT_DISABLE;
		t0 = gethrtime();
		do {
			for (k = 0; k < 32; k++) {
				cksum_fn(data, data_size, NULL, &scratch);
				runs++;
			}

			elapsed_ns = gethrtime() - t0;
		} while (elapsed_ns < FLETCHER_4_BENCH_NS);
		KPREEMPT_ENABLE;

		/* bytes processed per second */
		bw = data_size * runs * NANOSEC;
		bw /= elapsed_ns;

		if (native)
			stat->native = bw;
		else
			stat->byteswap = bw;

		if (bw <= best_bw)
			continue;

		/* New best: remember it and adopt its hooks. */
		best_bw = bw;
		if (native) {
			fastest->native = impl;
			FLETCHER_4_FASTEST_FN_COPY(native,
			    fletcher_4_supp_impls[impl]);
		} else {
			fastest->byteswap = impl;
			FLETCHER_4_FASTEST_FN_COPY(byteswap,
			    fletcher_4_supp_impls[impl]);
		}
	}

	/* Put back the selection that was active on entry. */
	atomic_swap_32(&fletcher_4_impl_chosen, saved_sel);
}
#endif /* _KERNEL */
690 
691 /*
692  * Initialize and benchmark all supported implementations.
693  */
694 static void
695 fletcher_4_benchmark(void)
696 {
697 	fletcher_4_ops_t *curr_impl;
698 	int i, c;
699 
700 	/* Move supported implementations into fletcher_4_supp_impls */
701 	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
702 		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
703 
704 		if (curr_impl->valid && curr_impl->valid())
705 			fletcher_4_supp_impls[c++] = curr_impl;
706 	}
707 	MEMBAR_PRODUCER;	/* complete fletcher_4_supp_impls[] init */
708 	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */
709 
710 #if defined(_KERNEL)
711 	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
712 	char *databuf = kmem_alloc(data_size, KM_SLEEP);
713 
714 	for (i = 0; i < data_size / sizeof (uint64_t); i++)
715 		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
716 
717 	fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
718 	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
719 
720 	kmem_free(databuf, data_size);
721 #else
722 	/*
723 	 * Skip the benchmark in user space to avoid impacting libzpool
724 	 * consumers (zdb, zhack, zinject, ztest). The last implementation
725 	 * is assumed to be the fastest and used by default.
726 	 */
727 	memcpy(&fletcher_4_fastest_impl,
728 	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
729 	    sizeof (fletcher_4_fastest_impl));
730 	fletcher_4_fastest_impl.name = "fastest";
731 #endif /* _KERNEL */
732 }
733 
734 void
735 fletcher_4_init(void)
736 {
737 	/* Determine the fastest available implementation. */
738 	fletcher_4_benchmark();
739 
740 #if defined(_KERNEL)
741 	/* install kstats for all implementations */
742 	for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; i++) {
743 		struct fletcher_4_bench *stat = &fletcher_4_stat_data[i];
744 		const fletcher_4_ops_t *ops = fletcher_4_supp_impls[i];
745 		kstat_named_t *kstat_native = &fletcher_4_kstat_data[i * 2];
746 		kstat_named_t *kstat_byteswap =
747 		    &fletcher_4_kstat_data[i * 2 + 1];
748 
749 		(void) snprintf(kstat_native->name,
750 		    sizeof (kstat_native->name), "%s_native", ops->name);
751 		kstat_native->data_type = KSTAT_DATA_UINT64;
752 		kstat_native->value.ui64 = stat->native;
753 
754 		(void) snprintf(kstat_byteswap->name,
755 		    sizeof (kstat_byteswap->name), "%s_byteswap", ops->name);
756 		kstat_byteswap->data_type = KSTAT_DATA_UINT64;
757 		kstat_byteswap->value.ui64 = stat->byteswap;
758 	}
759 
760 	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
761 	    KSTAT_TYPE_NAMED, ARRAY_SIZE(fletcher_4_supp_impls) * 2,
762 	    KSTAT_FLAG_VIRTUAL);
763 
764 	if (fletcher_4_kstat != NULL) {
765 		fletcher_4_kstat->ks_data = fletcher_4_kstat_data;
766 		kstat_install(fletcher_4_kstat);
767 	}
768 #endif
769 
770 	/* Finish initialization */
771 	fletcher_4_initialized = B_TRUE;
772 }
773 
/*
 * Tear down fletcher-4 state.  In the kernel this deletes the benchmark
 * kstat if one was created; in user space there is nothing to do.
 */
void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
	if (fletcher_4_kstat == NULL)
		return;

	kstat_delete(fletcher_4_kstat);
	fletcher_4_kstat = NULL;
#endif
}
784 
785 /* ABD adapters */
786 
787 static void
788 abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
789 {
790 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
791 	cdp->acd_private = (void *) ops;
792 
793 	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
794 		if (ops->uses_fpu_native)
795 			kfpu_begin();
796 		ops->init_native(cdp->acd_ctx);
797 	} else {
798 		if (ops->uses_fpu_byteswap)
799 			kfpu_begin();
800 		ops->init_byteswap(cdp->acd_ctx);
801 	}
802 }
803 
804 static void
805 abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
806 {
807 	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
808 
809 	ASSERT(ops);
810 
811 	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
812 		ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
813 		if (ops->uses_fpu_native)
814 			kfpu_end();
815 	} else {
816 		ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
817 		if (ops->uses_fpu_byteswap)
818 			kfpu_end();
819 	}
820 }
821 
822 static void
823 abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
824     zio_abd_checksum_data_t *cdp)
825 {
826 	zio_cksum_t *zcp = cdp->acd_zcp;
827 
828 	ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
829 
830 	abd_fletcher_4_fini(cdp);
831 	cdp->acd_private = (void *)&fletcher_4_scalar_ops;
832 
833 	if (native)
834 		fletcher_4_incremental_native(data, size, zcp);
835 	else
836 		fletcher_4_incremental_byteswap(data, size, zcp);
837 }
838 
839 static int
840 abd_fletcher_4_iter(void *data, size_t size, void *private)
841 {
842 	zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
843 	fletcher_4_ctx_t *ctx = cdp->acd_ctx;
844 	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
845 	boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
846 	uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
847 
848 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
849 
850 	if (asize > 0) {
851 		if (native)
852 			ops->compute_native(ctx, data, asize);
853 		else
854 			ops->compute_byteswap(ctx, data, asize);
855 
856 		size -= asize;
857 		data = (char *)data + asize;
858 	}
859 
860 	if (size > 0) {
861 		ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
862 		/* At this point we have to switch to scalar impl */
863 		abd_fletcher_4_simd2scalar(native, data, size, cdp);
864 	}
865 
866 	return (0);
867 }
868 
869 zio_abd_checksum_func_t fletcher_4_abd_ops = {
870 	.acf_init = abd_fletcher_4_init,
871 	.acf_fini = abd_fletcher_4_fini,
872 	.acf_iter = abd_fletcher_4_iter
873 };
874