1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9eda14cbcSMatt Macy * or http://www.opensolaris.org/os/licensing. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23eda14cbcSMatt Macy * Use is subject to license terms. 24eda14cbcSMatt Macy * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 25eda14cbcSMatt Macy */ 26eda14cbcSMatt Macy /* 27eda14cbcSMatt Macy * Copyright 2013 Saso Kiselkov. All rights reserved. 28eda14cbcSMatt Macy */ 29eda14cbcSMatt Macy 30eda14cbcSMatt Macy /* 31eda14cbcSMatt Macy * Copyright (c) 2016 by Delphix. All rights reserved. 
 */

/*
 * Fletcher Checksums
 * ------------------
 *
 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
 * recurrence relations:
 *
 *      a  = a    + f
 *       i    i-1    i-1
 *
 *      b  = b    + a
 *       i    i-1    i
 *
 *      c  = c    + b           (fletcher-4 only)
 *       i    i-1    i
 *
 *      d  = d    + c           (fletcher-4 only)
 *       i    i-1    i
 *
 * Where
 *      a_0 = b_0 = c_0 = d_0 = 0
 * and
 *      f_0 .. f_(n-1) are the input data.
 *
 * Using standard techniques, these translate into the following series:
 *
 *           __n_                            __n_
 *           \   |                           \   |
 *      a  =  >     f                   b  =  >     i * f
 *       n   /___|   n - i               n   /___|       n - i
 *           i = 1                           i = 1
 *
 *
 *           __n_                            __n_
 *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
 *      c  =  >     ------- f           d  =  >     ------------- f
 *       n   /___|     2     n - i       n   /___|        6        n - i
 *           i = 1                           i = 1
 *
 * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
 * Since the additions are done mod (2^64), errors in the high bits may not
 * be noticed.  For this reason, fletcher-2 is deprecated.
 *
 * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
 * A conservative estimate of how big the buffer can get before we overflow
 * can be computed using f_i = 0xffffffff for all i:
 *
 * % bc
 *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
 * 2264
 *  quit
 * %
 *
 * So blocks of up to 2k will not overflow.  Our largest block size is
 * 128k, which has 32k 4-byte words, so we can compute the largest possible
 * accumulators, then divide by 2^64 to figure the max amount of overflow:
 *
 * % bc
 *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
 *  a/2^64;b/2^64;c/2^64;d/2^64
 * 0
 * 0
 * 1365
 * 11186858
 *  quit
 * %
 *
 * So a and b cannot overflow.  To make sure each bit of input has some
 * effect on the contents of c and d, we can look at what the factors of
 * the coefficients in the equations for c_n and d_n are.  The number of 2s
 * in the factors determines the lowest set bit in the multiplier.  Running
 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 * even for 128k blocks.
109eda14cbcSMatt Macy * 110eda14cbcSMatt Macy * If we wanted to make a stronger version of fletcher4 (fletcher4c?), 111eda14cbcSMatt Macy * we could do our calculations mod (2^32 - 1) by adding in the carries 112eda14cbcSMatt Macy * periodically, and store the number of carries in the top 32-bits. 113eda14cbcSMatt Macy * 114eda14cbcSMatt Macy * -------------------- 115eda14cbcSMatt Macy * Checksum Performance 116eda14cbcSMatt Macy * -------------------- 117eda14cbcSMatt Macy * 118eda14cbcSMatt Macy * There are two interesting components to checksum performance: cached and 119eda14cbcSMatt Macy * uncached performance. With cached data, fletcher-2 is about four times 120eda14cbcSMatt Macy * faster than fletcher-4. With uncached data, the performance difference is 121eda14cbcSMatt Macy * negligible, since the cost of a cache fill dominates the processing time. 122eda14cbcSMatt Macy * Even though fletcher-4 is slower than fletcher-2, it is still a pretty 123eda14cbcSMatt Macy * efficient pass over the data. 124eda14cbcSMatt Macy * 125eda14cbcSMatt Macy * In normal operation, the data which is being checksummed is in a buffer 126eda14cbcSMatt Macy * which has been filled either by: 127eda14cbcSMatt Macy * 128eda14cbcSMatt Macy * 1. a compression step, which will be mostly cached, or 129eda14cbcSMatt Macy * 2. a bcopy() or copyin(), which will be uncached (because the 130eda14cbcSMatt Macy * copy is cache-bypassing). 131eda14cbcSMatt Macy * 132eda14cbcSMatt Macy * For both cached and uncached data, both fletcher checksums are much faster 133eda14cbcSMatt Macy * than sha-256, and slower than 'off', which doesn't touch the data at all. 
134eda14cbcSMatt Macy */ 135eda14cbcSMatt Macy 136eda14cbcSMatt Macy #include <sys/types.h> 137eda14cbcSMatt Macy #include <sys/sysmacros.h> 138eda14cbcSMatt Macy #include <sys/byteorder.h> 139eda14cbcSMatt Macy #include <sys/spa.h> 140eda14cbcSMatt Macy #include <sys/simd.h> 141eda14cbcSMatt Macy #include <sys/zio_checksum.h> 142eda14cbcSMatt Macy #include <sys/zfs_context.h> 143eda14cbcSMatt Macy #include <zfs_fletcher.h> 144eda14cbcSMatt Macy 145eda14cbcSMatt Macy #define FLETCHER_MIN_SIMD_SIZE 64 146eda14cbcSMatt Macy 147eda14cbcSMatt Macy static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx); 148eda14cbcSMatt Macy static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp); 149eda14cbcSMatt Macy static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, 150eda14cbcSMatt Macy const void *buf, uint64_t size); 151eda14cbcSMatt Macy static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, 152eda14cbcSMatt Macy const void *buf, uint64_t size); 153eda14cbcSMatt Macy static boolean_t fletcher_4_scalar_valid(void); 154eda14cbcSMatt Macy 155eda14cbcSMatt Macy static const fletcher_4_ops_t fletcher_4_scalar_ops = { 156eda14cbcSMatt Macy .init_native = fletcher_4_scalar_init, 157eda14cbcSMatt Macy .fini_native = fletcher_4_scalar_fini, 158eda14cbcSMatt Macy .compute_native = fletcher_4_scalar_native, 159eda14cbcSMatt Macy .init_byteswap = fletcher_4_scalar_init, 160eda14cbcSMatt Macy .fini_byteswap = fletcher_4_scalar_fini, 161eda14cbcSMatt Macy .compute_byteswap = fletcher_4_scalar_byteswap, 162eda14cbcSMatt Macy .valid = fletcher_4_scalar_valid, 163eda14cbcSMatt Macy .name = "scalar" 164eda14cbcSMatt Macy }; 165eda14cbcSMatt Macy 166eda14cbcSMatt Macy static fletcher_4_ops_t fletcher_4_fastest_impl = { 167eda14cbcSMatt Macy .name = "fastest", 168eda14cbcSMatt Macy .valid = fletcher_4_scalar_valid 169eda14cbcSMatt Macy }; 170eda14cbcSMatt Macy 171eda14cbcSMatt Macy static const fletcher_4_ops_t *fletcher_4_impls[] = { 172eda14cbcSMatt Macy 
&fletcher_4_scalar_ops, 173eda14cbcSMatt Macy &fletcher_4_superscalar_ops, 174eda14cbcSMatt Macy &fletcher_4_superscalar4_ops, 175eda14cbcSMatt Macy #if defined(HAVE_SSE2) 176eda14cbcSMatt Macy &fletcher_4_sse2_ops, 177eda14cbcSMatt Macy #endif 178eda14cbcSMatt Macy #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) 179eda14cbcSMatt Macy &fletcher_4_ssse3_ops, 180eda14cbcSMatt Macy #endif 181eda14cbcSMatt Macy #if defined(HAVE_AVX) && defined(HAVE_AVX2) 182eda14cbcSMatt Macy &fletcher_4_avx2_ops, 183eda14cbcSMatt Macy #endif 184eda14cbcSMatt Macy #if defined(__x86_64) && defined(HAVE_AVX512F) 185eda14cbcSMatt Macy &fletcher_4_avx512f_ops, 186eda14cbcSMatt Macy #endif 187eda14cbcSMatt Macy #if defined(__x86_64) && defined(HAVE_AVX512BW) 188eda14cbcSMatt Macy &fletcher_4_avx512bw_ops, 189eda14cbcSMatt Macy #endif 190*ac0bf12eSMatt Macy #if defined(__aarch64__) && !defined(__FreeBSD__) 191eda14cbcSMatt Macy &fletcher_4_aarch64_neon_ops, 192eda14cbcSMatt Macy #endif 193eda14cbcSMatt Macy }; 194eda14cbcSMatt Macy 195eda14cbcSMatt Macy /* Hold all supported implementations */ 196eda14cbcSMatt Macy static uint32_t fletcher_4_supp_impls_cnt = 0; 197eda14cbcSMatt Macy static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)]; 198eda14cbcSMatt Macy 199eda14cbcSMatt Macy /* Select fletcher4 implementation */ 200eda14cbcSMatt Macy #define IMPL_FASTEST (UINT32_MAX) 201eda14cbcSMatt Macy #define IMPL_CYCLE (UINT32_MAX - 1) 202eda14cbcSMatt Macy #define IMPL_SCALAR (0) 203eda14cbcSMatt Macy 204eda14cbcSMatt Macy static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST; 205eda14cbcSMatt Macy 206eda14cbcSMatt Macy #define IMPL_READ(i) (*(volatile uint32_t *) &(i)) 207eda14cbcSMatt Macy 208eda14cbcSMatt Macy static struct fletcher_4_impl_selector { 209eda14cbcSMatt Macy const char *fis_name; 210eda14cbcSMatt Macy uint32_t fis_sel; 211eda14cbcSMatt Macy } fletcher_4_impl_selectors[] = { 212eda14cbcSMatt Macy { "cycle", IMPL_CYCLE }, 213eda14cbcSMatt Macy { "fastest", 
IMPL_FASTEST }, 214eda14cbcSMatt Macy { "scalar", IMPL_SCALAR } 215eda14cbcSMatt Macy }; 216eda14cbcSMatt Macy 217eda14cbcSMatt Macy #if defined(_KERNEL) 218eda14cbcSMatt Macy static kstat_t *fletcher_4_kstat; 219eda14cbcSMatt Macy 220eda14cbcSMatt Macy static struct fletcher_4_kstat { 221eda14cbcSMatt Macy uint64_t native; 222eda14cbcSMatt Macy uint64_t byteswap; 223eda14cbcSMatt Macy } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; 224eda14cbcSMatt Macy #endif 225eda14cbcSMatt Macy 226eda14cbcSMatt Macy /* Indicate that benchmark has been completed */ 227eda14cbcSMatt Macy static boolean_t fletcher_4_initialized = B_FALSE; 228eda14cbcSMatt Macy 229eda14cbcSMatt Macy /*ARGSUSED*/ 230eda14cbcSMatt Macy void 231eda14cbcSMatt Macy fletcher_init(zio_cksum_t *zcp) 232eda14cbcSMatt Macy { 233eda14cbcSMatt Macy ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 234eda14cbcSMatt Macy } 235eda14cbcSMatt Macy 236eda14cbcSMatt Macy int 237eda14cbcSMatt Macy fletcher_2_incremental_native(void *buf, size_t size, void *data) 238eda14cbcSMatt Macy { 239eda14cbcSMatt Macy zio_cksum_t *zcp = data; 240eda14cbcSMatt Macy 241eda14cbcSMatt Macy const uint64_t *ip = buf; 242eda14cbcSMatt Macy const uint64_t *ipend = ip + (size / sizeof (uint64_t)); 243eda14cbcSMatt Macy uint64_t a0, b0, a1, b1; 244eda14cbcSMatt Macy 245eda14cbcSMatt Macy a0 = zcp->zc_word[0]; 246eda14cbcSMatt Macy a1 = zcp->zc_word[1]; 247eda14cbcSMatt Macy b0 = zcp->zc_word[2]; 248eda14cbcSMatt Macy b1 = zcp->zc_word[3]; 249eda14cbcSMatt Macy 250eda14cbcSMatt Macy for (; ip < ipend; ip += 2) { 251eda14cbcSMatt Macy a0 += ip[0]; 252eda14cbcSMatt Macy a1 += ip[1]; 253eda14cbcSMatt Macy b0 += a0; 254eda14cbcSMatt Macy b1 += a1; 255eda14cbcSMatt Macy } 256eda14cbcSMatt Macy 257eda14cbcSMatt Macy ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); 258eda14cbcSMatt Macy return (0); 259eda14cbcSMatt Macy } 260eda14cbcSMatt Macy 261eda14cbcSMatt Macy /*ARGSUSED*/ 262eda14cbcSMatt Macy void 263eda14cbcSMatt Macy fletcher_2_native(const void *buf, 
uint64_t size, 264eda14cbcSMatt Macy const void *ctx_template, zio_cksum_t *zcp) 265eda14cbcSMatt Macy { 266eda14cbcSMatt Macy fletcher_init(zcp); 267eda14cbcSMatt Macy (void) fletcher_2_incremental_native((void *) buf, size, zcp); 268eda14cbcSMatt Macy } 269eda14cbcSMatt Macy 270eda14cbcSMatt Macy int 271eda14cbcSMatt Macy fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) 272eda14cbcSMatt Macy { 273eda14cbcSMatt Macy zio_cksum_t *zcp = data; 274eda14cbcSMatt Macy 275eda14cbcSMatt Macy const uint64_t *ip = buf; 276eda14cbcSMatt Macy const uint64_t *ipend = ip + (size / sizeof (uint64_t)); 277eda14cbcSMatt Macy uint64_t a0, b0, a1, b1; 278eda14cbcSMatt Macy 279eda14cbcSMatt Macy a0 = zcp->zc_word[0]; 280eda14cbcSMatt Macy a1 = zcp->zc_word[1]; 281eda14cbcSMatt Macy b0 = zcp->zc_word[2]; 282eda14cbcSMatt Macy b1 = zcp->zc_word[3]; 283eda14cbcSMatt Macy 284eda14cbcSMatt Macy for (; ip < ipend; ip += 2) { 285eda14cbcSMatt Macy a0 += BSWAP_64(ip[0]); 286eda14cbcSMatt Macy a1 += BSWAP_64(ip[1]); 287eda14cbcSMatt Macy b0 += a0; 288eda14cbcSMatt Macy b1 += a1; 289eda14cbcSMatt Macy } 290eda14cbcSMatt Macy 291eda14cbcSMatt Macy ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); 292eda14cbcSMatt Macy return (0); 293eda14cbcSMatt Macy } 294eda14cbcSMatt Macy 295eda14cbcSMatt Macy /*ARGSUSED*/ 296eda14cbcSMatt Macy void 297eda14cbcSMatt Macy fletcher_2_byteswap(const void *buf, uint64_t size, 298eda14cbcSMatt Macy const void *ctx_template, zio_cksum_t *zcp) 299eda14cbcSMatt Macy { 300eda14cbcSMatt Macy fletcher_init(zcp); 301eda14cbcSMatt Macy (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); 302eda14cbcSMatt Macy } 303eda14cbcSMatt Macy 304eda14cbcSMatt Macy static void 305eda14cbcSMatt Macy fletcher_4_scalar_init(fletcher_4_ctx_t *ctx) 306eda14cbcSMatt Macy { 307eda14cbcSMatt Macy ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0); 308eda14cbcSMatt Macy } 309eda14cbcSMatt Macy 310eda14cbcSMatt Macy static void 311eda14cbcSMatt Macy 
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) 312eda14cbcSMatt Macy { 313eda14cbcSMatt Macy memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t)); 314eda14cbcSMatt Macy } 315eda14cbcSMatt Macy 316eda14cbcSMatt Macy static void 317eda14cbcSMatt Macy fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, 318eda14cbcSMatt Macy uint64_t size) 319eda14cbcSMatt Macy { 320eda14cbcSMatt Macy const uint32_t *ip = buf; 321eda14cbcSMatt Macy const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 322eda14cbcSMatt Macy uint64_t a, b, c, d; 323eda14cbcSMatt Macy 324eda14cbcSMatt Macy a = ctx->scalar.zc_word[0]; 325eda14cbcSMatt Macy b = ctx->scalar.zc_word[1]; 326eda14cbcSMatt Macy c = ctx->scalar.zc_word[2]; 327eda14cbcSMatt Macy d = ctx->scalar.zc_word[3]; 328eda14cbcSMatt Macy 329eda14cbcSMatt Macy for (; ip < ipend; ip++) { 330eda14cbcSMatt Macy a += ip[0]; 331eda14cbcSMatt Macy b += a; 332eda14cbcSMatt Macy c += b; 333eda14cbcSMatt Macy d += c; 334eda14cbcSMatt Macy } 335eda14cbcSMatt Macy 336eda14cbcSMatt Macy ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); 337eda14cbcSMatt Macy } 338eda14cbcSMatt Macy 339eda14cbcSMatt Macy static void 340eda14cbcSMatt Macy fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, 341eda14cbcSMatt Macy uint64_t size) 342eda14cbcSMatt Macy { 343eda14cbcSMatt Macy const uint32_t *ip = buf; 344eda14cbcSMatt Macy const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 345eda14cbcSMatt Macy uint64_t a, b, c, d; 346eda14cbcSMatt Macy 347eda14cbcSMatt Macy a = ctx->scalar.zc_word[0]; 348eda14cbcSMatt Macy b = ctx->scalar.zc_word[1]; 349eda14cbcSMatt Macy c = ctx->scalar.zc_word[2]; 350eda14cbcSMatt Macy d = ctx->scalar.zc_word[3]; 351eda14cbcSMatt Macy 352eda14cbcSMatt Macy for (; ip < ipend; ip++) { 353eda14cbcSMatt Macy a += BSWAP_32(ip[0]); 354eda14cbcSMatt Macy b += a; 355eda14cbcSMatt Macy c += b; 356eda14cbcSMatt Macy d += c; 357eda14cbcSMatt Macy } 358eda14cbcSMatt Macy 359eda14cbcSMatt Macy 
ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); 360eda14cbcSMatt Macy } 361eda14cbcSMatt Macy 362eda14cbcSMatt Macy static boolean_t 363eda14cbcSMatt Macy fletcher_4_scalar_valid(void) 364eda14cbcSMatt Macy { 365eda14cbcSMatt Macy return (B_TRUE); 366eda14cbcSMatt Macy } 367eda14cbcSMatt Macy 368eda14cbcSMatt Macy int 369eda14cbcSMatt Macy fletcher_4_impl_set(const char *val) 370eda14cbcSMatt Macy { 371eda14cbcSMatt Macy int err = -EINVAL; 372eda14cbcSMatt Macy uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); 373eda14cbcSMatt Macy size_t i, val_len; 374eda14cbcSMatt Macy 375eda14cbcSMatt Macy val_len = strlen(val); 376eda14cbcSMatt Macy while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ 377eda14cbcSMatt Macy val_len--; 378eda14cbcSMatt Macy 379eda14cbcSMatt Macy /* check mandatory implementations */ 380eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) { 381eda14cbcSMatt Macy const char *name = fletcher_4_impl_selectors[i].fis_name; 382eda14cbcSMatt Macy 383eda14cbcSMatt Macy if (val_len == strlen(name) && 384eda14cbcSMatt Macy strncmp(val, name, val_len) == 0) { 385eda14cbcSMatt Macy impl = fletcher_4_impl_selectors[i].fis_sel; 386eda14cbcSMatt Macy err = 0; 387eda14cbcSMatt Macy break; 388eda14cbcSMatt Macy } 389eda14cbcSMatt Macy } 390eda14cbcSMatt Macy 391eda14cbcSMatt Macy if (err != 0 && fletcher_4_initialized) { 392eda14cbcSMatt Macy /* check all supported implementations */ 393eda14cbcSMatt Macy for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { 394eda14cbcSMatt Macy const char *name = fletcher_4_supp_impls[i]->name; 395eda14cbcSMatt Macy 396eda14cbcSMatt Macy if (val_len == strlen(name) && 397eda14cbcSMatt Macy strncmp(val, name, val_len) == 0) { 398eda14cbcSMatt Macy impl = i; 399eda14cbcSMatt Macy err = 0; 400eda14cbcSMatt Macy break; 401eda14cbcSMatt Macy } 402eda14cbcSMatt Macy } 403eda14cbcSMatt Macy } 404eda14cbcSMatt Macy 405eda14cbcSMatt Macy if (err == 0) { 406eda14cbcSMatt Macy 
atomic_swap_32(&fletcher_4_impl_chosen, impl); 407eda14cbcSMatt Macy membar_producer(); 408eda14cbcSMatt Macy } 409eda14cbcSMatt Macy 410eda14cbcSMatt Macy return (err); 411eda14cbcSMatt Macy } 412eda14cbcSMatt Macy 413eda14cbcSMatt Macy /* 414eda14cbcSMatt Macy * Returns the Fletcher 4 operations for checksums. When a SIMD 415eda14cbcSMatt Macy * implementation is not allowed in the current context, then fallback 416eda14cbcSMatt Macy * to the fastest generic implementation. 417eda14cbcSMatt Macy */ 418eda14cbcSMatt Macy static inline const fletcher_4_ops_t * 419eda14cbcSMatt Macy fletcher_4_impl_get(void) 420eda14cbcSMatt Macy { 421eda14cbcSMatt Macy if (!kfpu_allowed()) 422eda14cbcSMatt Macy return (&fletcher_4_superscalar4_ops); 423eda14cbcSMatt Macy 424eda14cbcSMatt Macy const fletcher_4_ops_t *ops = NULL; 425eda14cbcSMatt Macy uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); 426eda14cbcSMatt Macy 427eda14cbcSMatt Macy switch (impl) { 428eda14cbcSMatt Macy case IMPL_FASTEST: 429eda14cbcSMatt Macy ASSERT(fletcher_4_initialized); 430eda14cbcSMatt Macy ops = &fletcher_4_fastest_impl; 431eda14cbcSMatt Macy break; 432eda14cbcSMatt Macy case IMPL_CYCLE: 433eda14cbcSMatt Macy /* Cycle through supported implementations */ 434eda14cbcSMatt Macy ASSERT(fletcher_4_initialized); 435eda14cbcSMatt Macy ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); 436eda14cbcSMatt Macy static uint32_t cycle_count = 0; 437eda14cbcSMatt Macy uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; 438eda14cbcSMatt Macy ops = fletcher_4_supp_impls[idx]; 439eda14cbcSMatt Macy break; 440eda14cbcSMatt Macy default: 441eda14cbcSMatt Macy ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); 442eda14cbcSMatt Macy ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); 443eda14cbcSMatt Macy ops = fletcher_4_supp_impls[impl]; 444eda14cbcSMatt Macy break; 445eda14cbcSMatt Macy } 446eda14cbcSMatt Macy 447eda14cbcSMatt Macy ASSERT3P(ops, !=, NULL); 448eda14cbcSMatt Macy 449eda14cbcSMatt Macy return (ops); 450eda14cbcSMatt 
Macy } 451eda14cbcSMatt Macy 452eda14cbcSMatt Macy static inline void 453eda14cbcSMatt Macy fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) 454eda14cbcSMatt Macy { 455eda14cbcSMatt Macy fletcher_4_ctx_t ctx; 456eda14cbcSMatt Macy const fletcher_4_ops_t *ops = fletcher_4_impl_get(); 457eda14cbcSMatt Macy 458eda14cbcSMatt Macy ops->init_native(&ctx); 459eda14cbcSMatt Macy ops->compute_native(&ctx, buf, size); 460eda14cbcSMatt Macy ops->fini_native(&ctx, zcp); 461eda14cbcSMatt Macy } 462eda14cbcSMatt Macy 463eda14cbcSMatt Macy /*ARGSUSED*/ 464eda14cbcSMatt Macy void 465eda14cbcSMatt Macy fletcher_4_native(const void *buf, uint64_t size, 466eda14cbcSMatt Macy const void *ctx_template, zio_cksum_t *zcp) 467eda14cbcSMatt Macy { 468eda14cbcSMatt Macy const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); 469eda14cbcSMatt Macy 470eda14cbcSMatt Macy ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); 471eda14cbcSMatt Macy 472eda14cbcSMatt Macy if (size == 0 || p2size == 0) { 473eda14cbcSMatt Macy ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 474eda14cbcSMatt Macy 475eda14cbcSMatt Macy if (size > 0) 476eda14cbcSMatt Macy fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, 477eda14cbcSMatt Macy buf, size); 478eda14cbcSMatt Macy } else { 479eda14cbcSMatt Macy fletcher_4_native_impl(buf, p2size, zcp); 480eda14cbcSMatt Macy 481eda14cbcSMatt Macy if (p2size < size) 482eda14cbcSMatt Macy fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, 483eda14cbcSMatt Macy (char *)buf + p2size, size - p2size); 484eda14cbcSMatt Macy } 485eda14cbcSMatt Macy } 486eda14cbcSMatt Macy 487eda14cbcSMatt Macy void 488eda14cbcSMatt Macy fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) 489eda14cbcSMatt Macy { 490eda14cbcSMatt Macy ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 491eda14cbcSMatt Macy fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); 492eda14cbcSMatt Macy } 493eda14cbcSMatt Macy 494eda14cbcSMatt Macy static inline void 495eda14cbcSMatt Macy 
fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) 496eda14cbcSMatt Macy { 497eda14cbcSMatt Macy fletcher_4_ctx_t ctx; 498eda14cbcSMatt Macy const fletcher_4_ops_t *ops = fletcher_4_impl_get(); 499eda14cbcSMatt Macy 500eda14cbcSMatt Macy ops->init_byteswap(&ctx); 501eda14cbcSMatt Macy ops->compute_byteswap(&ctx, buf, size); 502eda14cbcSMatt Macy ops->fini_byteswap(&ctx, zcp); 503eda14cbcSMatt Macy } 504eda14cbcSMatt Macy 505eda14cbcSMatt Macy /*ARGSUSED*/ 506eda14cbcSMatt Macy void 507eda14cbcSMatt Macy fletcher_4_byteswap(const void *buf, uint64_t size, 508eda14cbcSMatt Macy const void *ctx_template, zio_cksum_t *zcp) 509eda14cbcSMatt Macy { 510eda14cbcSMatt Macy const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); 511eda14cbcSMatt Macy 512eda14cbcSMatt Macy ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); 513eda14cbcSMatt Macy 514eda14cbcSMatt Macy if (size == 0 || p2size == 0) { 515eda14cbcSMatt Macy ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 516eda14cbcSMatt Macy 517eda14cbcSMatt Macy if (size > 0) 518eda14cbcSMatt Macy fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, 519eda14cbcSMatt Macy buf, size); 520eda14cbcSMatt Macy } else { 521eda14cbcSMatt Macy fletcher_4_byteswap_impl(buf, p2size, zcp); 522eda14cbcSMatt Macy 523eda14cbcSMatt Macy if (p2size < size) 524eda14cbcSMatt Macy fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, 525eda14cbcSMatt Macy (char *)buf + p2size, size - p2size); 526eda14cbcSMatt Macy } 527eda14cbcSMatt Macy } 528eda14cbcSMatt Macy 529eda14cbcSMatt Macy /* Incremental Fletcher 4 */ 530eda14cbcSMatt Macy 531eda14cbcSMatt Macy #define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20) 532eda14cbcSMatt Macy 533eda14cbcSMatt Macy static inline void 534eda14cbcSMatt Macy fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size, 535eda14cbcSMatt Macy const zio_cksum_t *nzcp) 536eda14cbcSMatt Macy { 537eda14cbcSMatt Macy const uint64_t c1 = size / sizeof (uint32_t); 538eda14cbcSMatt Macy const uint64_t c2 = c1 * 
(c1 + 1) / 2; 539eda14cbcSMatt Macy const uint64_t c3 = c2 * (c1 + 2) / 3; 540eda14cbcSMatt Macy 541eda14cbcSMatt Macy /* 542eda14cbcSMatt Macy * Value of 'c3' overflows on buffer sizes close to 16MiB. For that 543eda14cbcSMatt Macy * reason we split incremental fletcher4 computation of large buffers 544eda14cbcSMatt Macy * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size. 545eda14cbcSMatt Macy */ 546eda14cbcSMatt Macy ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE); 547eda14cbcSMatt Macy 548eda14cbcSMatt Macy zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] + 549eda14cbcSMatt Macy c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0]; 550eda14cbcSMatt Macy zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] + 551eda14cbcSMatt Macy c2 * zcp->zc_word[0]; 552eda14cbcSMatt Macy zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0]; 553eda14cbcSMatt Macy zcp->zc_word[0] += nzcp->zc_word[0]; 554eda14cbcSMatt Macy } 555eda14cbcSMatt Macy 556eda14cbcSMatt Macy static inline void 557eda14cbcSMatt Macy fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, 558eda14cbcSMatt Macy zio_cksum_t *zcp) 559eda14cbcSMatt Macy { 560eda14cbcSMatt Macy while (size > 0) { 561eda14cbcSMatt Macy zio_cksum_t nzc; 562eda14cbcSMatt Macy uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE); 563eda14cbcSMatt Macy 564eda14cbcSMatt Macy if (native) 565eda14cbcSMatt Macy fletcher_4_native(buf, len, NULL, &nzc); 566eda14cbcSMatt Macy else 567eda14cbcSMatt Macy fletcher_4_byteswap(buf, len, NULL, &nzc); 568eda14cbcSMatt Macy 569eda14cbcSMatt Macy fletcher_4_incremental_combine(zcp, len, &nzc); 570eda14cbcSMatt Macy 571eda14cbcSMatt Macy size -= len; 572eda14cbcSMatt Macy buf += len; 573eda14cbcSMatt Macy } 574eda14cbcSMatt Macy } 575eda14cbcSMatt Macy 576eda14cbcSMatt Macy int 577eda14cbcSMatt Macy fletcher_4_incremental_native(void *buf, size_t size, void *data) 578eda14cbcSMatt Macy { 579eda14cbcSMatt Macy zio_cksum_t *zcp = data; 580eda14cbcSMatt Macy /* Use 
scalar impl to directly update cksum of small blocks */ 581eda14cbcSMatt Macy if (size < SPA_MINBLOCKSIZE) 582eda14cbcSMatt Macy fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); 583eda14cbcSMatt Macy else 584eda14cbcSMatt Macy fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); 585eda14cbcSMatt Macy return (0); 586eda14cbcSMatt Macy } 587eda14cbcSMatt Macy 588eda14cbcSMatt Macy int 589eda14cbcSMatt Macy fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) 590eda14cbcSMatt Macy { 591eda14cbcSMatt Macy zio_cksum_t *zcp = data; 592eda14cbcSMatt Macy /* Use scalar impl to directly update cksum of small blocks */ 593eda14cbcSMatt Macy if (size < SPA_MINBLOCKSIZE) 594eda14cbcSMatt Macy fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); 595eda14cbcSMatt Macy else 596eda14cbcSMatt Macy fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); 597eda14cbcSMatt Macy return (0); 598eda14cbcSMatt Macy } 599eda14cbcSMatt Macy 600eda14cbcSMatt Macy #if defined(_KERNEL) 601eda14cbcSMatt Macy /* 602eda14cbcSMatt Macy * Fletcher 4 kstats 603eda14cbcSMatt Macy */ 604eda14cbcSMatt Macy static int 605eda14cbcSMatt Macy fletcher_4_kstat_headers(char *buf, size_t size) 606eda14cbcSMatt Macy { 607eda14cbcSMatt Macy ssize_t off = 0; 608eda14cbcSMatt Macy 609eda14cbcSMatt Macy off += snprintf(buf + off, size, "%-17s", "implementation"); 610eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-15s", "native"); 611eda14cbcSMatt Macy (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap"); 612eda14cbcSMatt Macy 613eda14cbcSMatt Macy return (0); 614eda14cbcSMatt Macy } 615eda14cbcSMatt Macy 616eda14cbcSMatt Macy static int 617eda14cbcSMatt Macy fletcher_4_kstat_data(char *buf, size_t size, void *data) 618eda14cbcSMatt Macy { 619eda14cbcSMatt Macy struct fletcher_4_kstat *fastest_stat = 620eda14cbcSMatt Macy &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; 621eda14cbcSMatt Macy struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat 
*)data; 622eda14cbcSMatt Macy ssize_t off = 0; 623eda14cbcSMatt Macy 624eda14cbcSMatt Macy if (curr_stat == fastest_stat) { 625eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-17s", "fastest"); 626eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-15s", 627eda14cbcSMatt Macy fletcher_4_supp_impls[fastest_stat->native]->name); 628eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-15s\n", 629eda14cbcSMatt Macy fletcher_4_supp_impls[fastest_stat->byteswap]->name); 630eda14cbcSMatt Macy } else { 631eda14cbcSMatt Macy ptrdiff_t id = curr_stat - fletcher_4_stat_data; 632eda14cbcSMatt Macy 633eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-17s", 634eda14cbcSMatt Macy fletcher_4_supp_impls[id]->name); 635eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-15llu", 636eda14cbcSMatt Macy (u_longlong_t)curr_stat->native); 637eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-15llu\n", 638eda14cbcSMatt Macy (u_longlong_t)curr_stat->byteswap); 639eda14cbcSMatt Macy } 640eda14cbcSMatt Macy 641eda14cbcSMatt Macy return (0); 642eda14cbcSMatt Macy } 643eda14cbcSMatt Macy 644eda14cbcSMatt Macy static void * 645eda14cbcSMatt Macy fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) 646eda14cbcSMatt Macy { 647eda14cbcSMatt Macy if (n <= fletcher_4_supp_impls_cnt) 648eda14cbcSMatt Macy ksp->ks_private = (void *) (fletcher_4_stat_data + n); 649eda14cbcSMatt Macy else 650eda14cbcSMatt Macy ksp->ks_private = NULL; 651eda14cbcSMatt Macy 652eda14cbcSMatt Macy return (ksp->ks_private); 653eda14cbcSMatt Macy } 654eda14cbcSMatt Macy #endif 655eda14cbcSMatt Macy 656eda14cbcSMatt Macy #define FLETCHER_4_FASTEST_FN_COPY(type, src) \ 657eda14cbcSMatt Macy { \ 658eda14cbcSMatt Macy fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \ 659eda14cbcSMatt Macy fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \ 660eda14cbcSMatt Macy fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ 661eda14cbcSMatt Macy } 
662eda14cbcSMatt Macy 663eda14cbcSMatt Macy #define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ 664eda14cbcSMatt Macy 665eda14cbcSMatt Macy typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, 666eda14cbcSMatt Macy zio_cksum_t *); 667eda14cbcSMatt Macy 668eda14cbcSMatt Macy #if defined(_KERNEL) 669eda14cbcSMatt Macy static void 670eda14cbcSMatt Macy fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) 671eda14cbcSMatt Macy { 672eda14cbcSMatt Macy 673eda14cbcSMatt Macy struct fletcher_4_kstat *fastest_stat = 674eda14cbcSMatt Macy &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; 675eda14cbcSMatt Macy hrtime_t start; 676eda14cbcSMatt Macy uint64_t run_bw, run_time_ns, best_run = 0; 677eda14cbcSMatt Macy zio_cksum_t zc; 678eda14cbcSMatt Macy uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); 679eda14cbcSMatt Macy 680eda14cbcSMatt Macy fletcher_checksum_func_t *fletcher_4_test = native ? 681eda14cbcSMatt Macy fletcher_4_native : fletcher_4_byteswap; 682eda14cbcSMatt Macy 683eda14cbcSMatt Macy for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { 684eda14cbcSMatt Macy struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; 685eda14cbcSMatt Macy uint64_t run_count = 0; 686eda14cbcSMatt Macy 687eda14cbcSMatt Macy /* temporary set an implementation */ 688eda14cbcSMatt Macy fletcher_4_impl_chosen = i; 689eda14cbcSMatt Macy 690eda14cbcSMatt Macy kpreempt_disable(); 691eda14cbcSMatt Macy start = gethrtime(); 692eda14cbcSMatt Macy do { 693eda14cbcSMatt Macy for (l = 0; l < 32; l++, run_count++) 694eda14cbcSMatt Macy fletcher_4_test(data, data_size, NULL, &zc); 695eda14cbcSMatt Macy 696eda14cbcSMatt Macy run_time_ns = gethrtime() - start; 697eda14cbcSMatt Macy } while (run_time_ns < FLETCHER_4_BENCH_NS); 698eda14cbcSMatt Macy kpreempt_enable(); 699eda14cbcSMatt Macy 700eda14cbcSMatt Macy run_bw = data_size * run_count * NANOSEC; 701eda14cbcSMatt Macy run_bw /= run_time_ns; /* B/s */ 702eda14cbcSMatt Macy 703eda14cbcSMatt 
Macy if (native) 704eda14cbcSMatt Macy stat->native = run_bw; 705eda14cbcSMatt Macy else 706eda14cbcSMatt Macy stat->byteswap = run_bw; 707eda14cbcSMatt Macy 708eda14cbcSMatt Macy if (run_bw > best_run) { 709eda14cbcSMatt Macy best_run = run_bw; 710eda14cbcSMatt Macy 711eda14cbcSMatt Macy if (native) { 712eda14cbcSMatt Macy fastest_stat->native = i; 713eda14cbcSMatt Macy FLETCHER_4_FASTEST_FN_COPY(native, 714eda14cbcSMatt Macy fletcher_4_supp_impls[i]); 715eda14cbcSMatt Macy } else { 716eda14cbcSMatt Macy fastest_stat->byteswap = i; 717eda14cbcSMatt Macy FLETCHER_4_FASTEST_FN_COPY(byteswap, 718eda14cbcSMatt Macy fletcher_4_supp_impls[i]); 719eda14cbcSMatt Macy } 720eda14cbcSMatt Macy } 721eda14cbcSMatt Macy } 722eda14cbcSMatt Macy 723eda14cbcSMatt Macy /* restore original selection */ 724eda14cbcSMatt Macy atomic_swap_32(&fletcher_4_impl_chosen, sel_save); 725eda14cbcSMatt Macy } 726eda14cbcSMatt Macy #endif /* _KERNEL */ 727eda14cbcSMatt Macy 728eda14cbcSMatt Macy /* 729eda14cbcSMatt Macy * Initialize and benchmark all supported implementations. 
730eda14cbcSMatt Macy */ 731eda14cbcSMatt Macy static void 732eda14cbcSMatt Macy fletcher_4_benchmark(void) 733eda14cbcSMatt Macy { 734eda14cbcSMatt Macy fletcher_4_ops_t *curr_impl; 735eda14cbcSMatt Macy int i, c; 736eda14cbcSMatt Macy 737eda14cbcSMatt Macy /* Move supported implementations into fletcher_4_supp_impls */ 738eda14cbcSMatt Macy for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { 739eda14cbcSMatt Macy curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; 740eda14cbcSMatt Macy 741eda14cbcSMatt Macy if (curr_impl->valid && curr_impl->valid()) 742eda14cbcSMatt Macy fletcher_4_supp_impls[c++] = curr_impl; 743eda14cbcSMatt Macy } 744eda14cbcSMatt Macy membar_producer(); /* complete fletcher_4_supp_impls[] init */ 745eda14cbcSMatt Macy fletcher_4_supp_impls_cnt = c; /* number of supported impl */ 746eda14cbcSMatt Macy 747eda14cbcSMatt Macy #if defined(_KERNEL) 748eda14cbcSMatt Macy static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ 749eda14cbcSMatt Macy char *databuf = vmem_alloc(data_size, KM_SLEEP); 750eda14cbcSMatt Macy 751eda14cbcSMatt Macy for (i = 0; i < data_size / sizeof (uint64_t); i++) 752eda14cbcSMatt Macy ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ 753eda14cbcSMatt Macy 754eda14cbcSMatt Macy fletcher_4_benchmark_impl(B_FALSE, databuf, data_size); 755eda14cbcSMatt Macy fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); 756eda14cbcSMatt Macy 757eda14cbcSMatt Macy vmem_free(databuf, data_size); 758eda14cbcSMatt Macy #else 759eda14cbcSMatt Macy /* 760eda14cbcSMatt Macy * Skip the benchmark in user space to avoid impacting libzpool 761eda14cbcSMatt Macy * consumers (zdb, zhack, zinject, ztest). The last implementation 762eda14cbcSMatt Macy * is assumed to be the fastest and used by default. 
763eda14cbcSMatt Macy */ 764eda14cbcSMatt Macy memcpy(&fletcher_4_fastest_impl, 765eda14cbcSMatt Macy fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], 766eda14cbcSMatt Macy sizeof (fletcher_4_fastest_impl)); 767eda14cbcSMatt Macy fletcher_4_fastest_impl.name = "fastest"; 768eda14cbcSMatt Macy membar_producer(); 769eda14cbcSMatt Macy #endif /* _KERNEL */ 770eda14cbcSMatt Macy } 771eda14cbcSMatt Macy 772eda14cbcSMatt Macy void 773eda14cbcSMatt Macy fletcher_4_init(void) 774eda14cbcSMatt Macy { 775eda14cbcSMatt Macy /* Determine the fastest available implementation. */ 776eda14cbcSMatt Macy fletcher_4_benchmark(); 777eda14cbcSMatt Macy 778eda14cbcSMatt Macy #if defined(_KERNEL) 779eda14cbcSMatt Macy /* Install kstats for all implementations */ 780eda14cbcSMatt Macy fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", 781eda14cbcSMatt Macy KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 782eda14cbcSMatt Macy if (fletcher_4_kstat != NULL) { 783eda14cbcSMatt Macy fletcher_4_kstat->ks_data = NULL; 784eda14cbcSMatt Macy fletcher_4_kstat->ks_ndata = UINT32_MAX; 785eda14cbcSMatt Macy kstat_set_raw_ops(fletcher_4_kstat, 786eda14cbcSMatt Macy fletcher_4_kstat_headers, 787eda14cbcSMatt Macy fletcher_4_kstat_data, 788eda14cbcSMatt Macy fletcher_4_kstat_addr); 789eda14cbcSMatt Macy kstat_install(fletcher_4_kstat); 790eda14cbcSMatt Macy } 791eda14cbcSMatt Macy #endif 792eda14cbcSMatt Macy 793eda14cbcSMatt Macy /* Finish initialization */ 794eda14cbcSMatt Macy fletcher_4_initialized = B_TRUE; 795eda14cbcSMatt Macy } 796eda14cbcSMatt Macy 797eda14cbcSMatt Macy void 798eda14cbcSMatt Macy fletcher_4_fini(void) 799eda14cbcSMatt Macy { 800eda14cbcSMatt Macy #if defined(_KERNEL) 801eda14cbcSMatt Macy if (fletcher_4_kstat != NULL) { 802eda14cbcSMatt Macy kstat_delete(fletcher_4_kstat); 803eda14cbcSMatt Macy fletcher_4_kstat = NULL; 804eda14cbcSMatt Macy } 805eda14cbcSMatt Macy #endif 806eda14cbcSMatt Macy } 807eda14cbcSMatt Macy 808eda14cbcSMatt Macy /* ABD adapters */ 
809eda14cbcSMatt Macy 810eda14cbcSMatt Macy static void 811eda14cbcSMatt Macy abd_fletcher_4_init(zio_abd_checksum_data_t *cdp) 812eda14cbcSMatt Macy { 813eda14cbcSMatt Macy const fletcher_4_ops_t *ops = fletcher_4_impl_get(); 814eda14cbcSMatt Macy cdp->acd_private = (void *) ops; 815eda14cbcSMatt Macy 816eda14cbcSMatt Macy if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) 817eda14cbcSMatt Macy ops->init_native(cdp->acd_ctx); 818eda14cbcSMatt Macy else 819eda14cbcSMatt Macy ops->init_byteswap(cdp->acd_ctx); 820eda14cbcSMatt Macy } 821eda14cbcSMatt Macy 822eda14cbcSMatt Macy static void 823eda14cbcSMatt Macy abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp) 824eda14cbcSMatt Macy { 825eda14cbcSMatt Macy fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; 826eda14cbcSMatt Macy 827eda14cbcSMatt Macy ASSERT(ops); 828eda14cbcSMatt Macy 829eda14cbcSMatt Macy if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) 830eda14cbcSMatt Macy ops->fini_native(cdp->acd_ctx, cdp->acd_zcp); 831eda14cbcSMatt Macy else 832eda14cbcSMatt Macy ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp); 833eda14cbcSMatt Macy } 834eda14cbcSMatt Macy 835eda14cbcSMatt Macy static void 836eda14cbcSMatt Macy abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size, 837eda14cbcSMatt Macy zio_abd_checksum_data_t *cdp) 838eda14cbcSMatt Macy { 839eda14cbcSMatt Macy zio_cksum_t *zcp = cdp->acd_zcp; 840eda14cbcSMatt Macy 841eda14cbcSMatt Macy ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); 842eda14cbcSMatt Macy 843eda14cbcSMatt Macy abd_fletcher_4_fini(cdp); 844eda14cbcSMatt Macy cdp->acd_private = (void *)&fletcher_4_scalar_ops; 845eda14cbcSMatt Macy 846eda14cbcSMatt Macy if (native) 847eda14cbcSMatt Macy fletcher_4_incremental_native(data, size, zcp); 848eda14cbcSMatt Macy else 849eda14cbcSMatt Macy fletcher_4_incremental_byteswap(data, size, zcp); 850eda14cbcSMatt Macy } 851eda14cbcSMatt Macy 852eda14cbcSMatt Macy static int 853eda14cbcSMatt Macy abd_fletcher_4_iter(void *data, size_t size, void 
*private) 854eda14cbcSMatt Macy { 855eda14cbcSMatt Macy zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private; 856eda14cbcSMatt Macy fletcher_4_ctx_t *ctx = cdp->acd_ctx; 857eda14cbcSMatt Macy fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; 858eda14cbcSMatt Macy boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; 859eda14cbcSMatt Macy uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); 860eda14cbcSMatt Macy 861eda14cbcSMatt Macy ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); 862eda14cbcSMatt Macy 863eda14cbcSMatt Macy if (asize > 0) { 864eda14cbcSMatt Macy if (native) 865eda14cbcSMatt Macy ops->compute_native(ctx, data, asize); 866eda14cbcSMatt Macy else 867eda14cbcSMatt Macy ops->compute_byteswap(ctx, data, asize); 868eda14cbcSMatt Macy 869eda14cbcSMatt Macy size -= asize; 870eda14cbcSMatt Macy data = (char *)data + asize; 871eda14cbcSMatt Macy } 872eda14cbcSMatt Macy 873eda14cbcSMatt Macy if (size > 0) { 874eda14cbcSMatt Macy ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); 875eda14cbcSMatt Macy /* At this point we have to switch to scalar impl */ 876eda14cbcSMatt Macy abd_fletcher_4_simd2scalar(native, data, size, cdp); 877eda14cbcSMatt Macy } 878eda14cbcSMatt Macy 879eda14cbcSMatt Macy return (0); 880eda14cbcSMatt Macy } 881eda14cbcSMatt Macy 882eda14cbcSMatt Macy zio_abd_checksum_func_t fletcher_4_abd_ops = { 883eda14cbcSMatt Macy .acf_init = abd_fletcher_4_init, 884eda14cbcSMatt Macy .acf_fini = abd_fletcher_4_fini, 885eda14cbcSMatt Macy .acf_iter = abd_fletcher_4_iter 886eda14cbcSMatt Macy }; 887eda14cbcSMatt Macy 888eda14cbcSMatt Macy 889eda14cbcSMatt Macy #if defined(_KERNEL) && defined(__linux__) 890eda14cbcSMatt Macy 891eda14cbcSMatt Macy static int 892eda14cbcSMatt Macy fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) 893eda14cbcSMatt Macy { 894eda14cbcSMatt Macy const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); 895eda14cbcSMatt Macy char *fmt; 896eda14cbcSMatt Macy int i, cnt = 0; 897eda14cbcSMatt 
Macy 898eda14cbcSMatt Macy /* list fastest */ 899eda14cbcSMatt Macy fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s "; 900eda14cbcSMatt Macy cnt += sprintf(buffer + cnt, fmt, "fastest"); 901eda14cbcSMatt Macy 902eda14cbcSMatt Macy /* list all supported implementations */ 903eda14cbcSMatt Macy for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { 904eda14cbcSMatt Macy fmt = (i == impl) ? "[%s] " : "%s "; 905eda14cbcSMatt Macy cnt += sprintf(buffer + cnt, fmt, 906eda14cbcSMatt Macy fletcher_4_supp_impls[i]->name); 907eda14cbcSMatt Macy } 908eda14cbcSMatt Macy 909eda14cbcSMatt Macy return (cnt); 910eda14cbcSMatt Macy } 911eda14cbcSMatt Macy 912eda14cbcSMatt Macy static int 913eda14cbcSMatt Macy fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) 914eda14cbcSMatt Macy { 915eda14cbcSMatt Macy return (fletcher_4_impl_set(val)); 916eda14cbcSMatt Macy } 917eda14cbcSMatt Macy 918eda14cbcSMatt Macy /* 919eda14cbcSMatt Macy * Choose a fletcher 4 implementation in ZFS. 920eda14cbcSMatt Macy * Users can choose "cycle" to exercise all implementations, but this is 921eda14cbcSMatt Macy * for testing purpose therefore it can only be set in user space. 
922eda14cbcSMatt Macy */ 923eda14cbcSMatt Macy module_param_call(zfs_fletcher_4_impl, 924eda14cbcSMatt Macy fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); 925eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); 926eda14cbcSMatt Macy 927eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_init); 928eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_2_incremental_native); 929eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_2_incremental_byteswap); 930eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_init); 931eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_fini); 932eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_2_native); 933eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_2_byteswap); 934eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_native); 935eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_native_varsize); 936eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_byteswap); 937eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_incremental_native); 938eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_incremental_byteswap); 939eda14cbcSMatt Macy EXPORT_SYMBOL(fletcher_4_abd_ops); 940eda14cbcSMatt Macy #endif 941