/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */
/*
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Fletcher Checksums
 * ------------------
 *
 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
 * recurrence relations:
 *
 *	a  = a    + f
 *	 i    i-1    i-1
 *
 *	b  = b    + a
 *	 i    i-1    i
 *
 *	c  = c    + b		(fletcher-4 only)
 *	 i    i-1    i
 *
 *	d  = d    + c		(fletcher-4 only)
 *	 i    i-1    i
 *
 * Where
 *	a_0 = b_0 = c_0 = d_0 = 0
 * and
 *	f_0 .. f_(n-1) are the input data.
 *
 * Using standard techniques, these translate into the following series:
 *
 *	       __n_                            __n_
 *	       \   |                           \   |
 *	a  =   >     f                  b  =   >     i * f
 *	 n     /___|  n - i              n     /___|       n - i
 *	       i = 1                           i = 1
 *
 *
 *	       __n_                            __n_
 *	       \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
 *	c  =   >     ------- f          d  =   >     ------------- f
 *	 n     /___|     2    n - i      n     /___|        6       n - i
 *	       i = 1                           i = 1
 *
 * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
 * Since the additions are done mod (2^64), errors in the high bits may not
 * be noticed.
 * For this reason, fletcher-2 is deprecated.
 *
 * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
 * A conservative bound on how big the buffer can get before we overflow
 * comes from taking f_i = 0xffffffff for all i:
 *
 *	% bc
 *	f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
 *	2264
 *	quit
 *	%
 *
 * So blocks of up to 2k will not overflow.  Our largest block size is
 * 128k, which has 32k 4-byte words, so we can compute the largest possible
 * accumulators, then divide by 2^64 to figure the max amount of overflow:
 *
 *	% bc
 *	a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
 *	a/2^64;b/2^64;c/2^64;d/2^64
 *	0
 *	0
 *	1365
 *	11186858
 *	quit
 *	%
 *
 * So a and b cannot overflow.  To make sure each bit of input has some
 * effect on the contents of c and d, we can look at what the factors of
 * the coefficients in the equations for c_n and d_n are.  The number of 2s
 * in the factors determines the lowest set bit in the multiplier.  Running
 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 * even for 128k blocks.
 *
 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 * we could do our calculations mod (2^32 - 1) by adding in the carries
 * periodically, and store the number of carries in the top 32-bits.
 *
 * --------------------
 * Checksum Performance
 * --------------------
 *
 * There are two interesting components to checksum performance: cached and
 * uncached performance.  With cached data, fletcher-2 is about four times
 * faster than fletcher-4.  With uncached data, the performance difference is
 * negligible, since the cost of a cache fill dominates the processing time.
 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 * efficient pass over the data.
 *
 * In normal operation, the data which is being checksummed is in a buffer
 * which has been filled either by:
 *
 *	1. a compression step, which will be mostly cached, or
 *	2. a bcopy() or copyin(), which will be uncached
 *	   (because the copy is cache-bypassing).
 *
 * For both cached and uncached data, both fletcher checksums are much faster
 * than sha-256, and slower than 'off', which doesn't touch the data at all.
 */
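
/*
 * Illustrative sketch only (not part of ZFS): one way the periodic carry
 * fold suggested above could look.  Since 2^32 == 1 (mod 2^32 - 1), adding
 * the high 32 bits of an accumulator back into the low 32 bits yields a
 * value congruent to it mod (2^32 - 1).  The helper name is hypothetical.
 *
 *	static inline uint64_t
 *	fold_mod_2_32_minus_1(uint64_t x)
 *	{
 *		x = (x >> 32) + (x & 0xffffffffULL);	// fold high into low
 *		x = (x >> 32) + (x & 0xffffffffULL);	// fold remaining carry
 *		return (x);
 *	}
 */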

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
#include <sys/spa.h>
#include <sys/simd.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>

#define	FLETCHER_MIN_SIMD_SIZE	64

static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size);
static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size);
static boolean_t fletcher_4_scalar_valid(void);

static const fletcher_4_ops_t fletcher_4_scalar_ops = {
	.init_native = fletcher_4_scalar_init,
	.fini_native = fletcher_4_scalar_fini,
	.compute_native = fletcher_4_scalar_native,
	.init_byteswap = fletcher_4_scalar_init,
	.fini_byteswap = fletcher_4_scalar_fini,
	.compute_byteswap = fletcher_4_scalar_byteswap,
	.valid = fletcher_4_scalar_valid,
	.name = "scalar"
};

static fletcher_4_ops_t fletcher_4_fastest_impl = {
	.name = "fastest",
	.valid = fletcher_4_scalar_valid
};

static const fletcher_4_ops_t *fletcher_4_impls[] = {
	&fletcher_4_scalar_ops,
	&fletcher_4_superscalar_ops,
	&fletcher_4_superscalar4_ops,
#if defined(HAVE_SSE2)
	&fletcher_4_sse2_ops,
#endif
#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
	&fletcher_4_ssse3_ops,
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
	&fletcher_4_avx2_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
	&fletcher_4_avx512f_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW)
	&fletcher_4_avx512bw_ops,
#endif
#if defined(__aarch64__)
	&fletcher_4_aarch64_neon_ops,
#endif
};

/* Hold all supported implementations */
static uint32_t fletcher_4_supp_impls_cnt = 0;
static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];

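/*
 * Note: fletcher_4_supp_impls[] and fletcher_4_supp_impls_cnt are populated
 * from fletcher_4_impls[] by fletcher_4_benchmark() during fletcher_4_init(),
 * keeping only the implementations whose valid() callback reports that they
 * can run on the current hardware.
 */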
/* Select fletcher4 implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_SCALAR	(0)

static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;

#define	IMPL_READ(i)	(*(volatile uint32_t *) &(i))

static struct fletcher_4_impl_selector {
	const char *fis_name;
	uint32_t fis_sel;
} fletcher_4_impl_selectors[] = {
	{ "cycle",	IMPL_CYCLE },
	{ "fastest",	IMPL_FASTEST },
	{ "scalar",	IMPL_SCALAR }
};

#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;

static struct fletcher_4_kstat {
	uint64_t native;
	uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
#endif

/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;

/*ARGSUSED*/
void
fletcher_init(zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

int
fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	const uint64_t *ip = buf;
	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
	uint64_t a0, b0, a1, b1;

	a0 = zcp->zc_word[0];
	a1 = zcp->zc_word[1];
	b0 = zcp->zc_word[2];
	b1 = zcp->zc_word[3];

	for (; ip < ipend; ip += 2) {
		a0 += ip[0];
		a1 += ip[1];
		b0 += a0;
		b1 += a1;
	}

	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
	return (0);
}

/*ARGSUSED*/
void
fletcher_2_native(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	fletcher_init(zcp);
	(void) fletcher_2_incremental_native((void *) buf, size, zcp);
}

int
fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	const uint64_t *ip = buf;
	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
	uint64_t a0, b0, a1, b1;

	a0 = zcp->zc_word[0];
	a1 = zcp->zc_word[1];
	b0 = zcp->zc_word[2];
	b1 = zcp->zc_word[3];

	for (; ip < ipend; ip += 2) {
		a0 += BSWAP_64(ip[0]);
		a1 += BSWAP_64(ip[1]);
		b0 += a0;
		b1 += a1;
	}

	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
	return (0);
}

/*ARGSUSED*/
void
fletcher_2_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	fletcher_init(zcp);
	(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}

static void
fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
{
	ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
}

static void
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
	memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
}

static void
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a, b, c, d;

	a = ctx->scalar.zc_word[0];
	b = ctx->scalar.zc_word[1];
	c = ctx->scalar.zc_word[2];
	d = ctx->scalar.zc_word[3];

	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}

	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a, b, c, d;

	a = ctx->scalar.zc_word[0];
	b = ctx->scalar.zc_word[1];
	c = ctx->scalar.zc_word[2];
	d = ctx->scalar.zc_word[3];

	for (; ip < ipend; ip++) {
		a += BSWAP_32(ip[0]);
		b += a;
		c += b;
		d += c;
	}

	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static boolean_t
fletcher_4_scalar_valid(void)
{
	return (B_TRUE);
}
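
/*
 * Select the fletcher 4 implementation by name.  Accepts the generic
 * selectors "cycle", "fastest" and "scalar", or (once initialization has
 * completed) the name of any supported implementation.  Returns 0 on
 * success, -EINVAL if the name is not recognized.
 */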
int
fletcher_4_impl_set(const char *val)
{
	int err = -EINVAL;
	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
	size_t i, val_len;

	val_len = strlen(val);
	while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
		val_len--;

	/* check mandatory implementations */
	for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
		const char *name = fletcher_4_impl_selectors[i].fis_name;

		if (val_len == strlen(name) &&
		    strncmp(val, name, val_len) == 0) {
			impl = fletcher_4_impl_selectors[i].fis_sel;
			err = 0;
			break;
		}
	}

	if (err != 0 && fletcher_4_initialized) {
		/* check all supported implementations */
		for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
			const char *name = fletcher_4_supp_impls[i]->name;

			if (val_len == strlen(name) &&
			    strncmp(val, name, val_len) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}

	if (err == 0) {
		atomic_swap_32(&fletcher_4_impl_chosen, impl);
		membar_producer();
	}

	return (err);
}

/*
 * Returns the Fletcher 4 operations for checksums.  When a SIMD
 * implementation is not allowed in the current context, fall back
 * to the fastest generic implementation.
 */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
	if (!kfpu_allowed())
		return (&fletcher_4_superscalar4_ops);

	const fletcher_4_ops_t *ops = NULL;
	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(fletcher_4_initialized);
		ops = &fletcher_4_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(fletcher_4_initialized);
		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
		static uint32_t cycle_count = 0;
		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
		ops = fletcher_4_supp_impls[idx];
		break;
	default:
		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
		ops = fletcher_4_supp_impls[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

static inline void
fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	fletcher_4_ctx_t ctx;
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();

	ops->init_native(&ctx);
	ops->compute_native(&ctx, buf, size);
	ops->fini_native(&ctx, zcp);
}

/*ARGSUSED*/
void
fletcher_4_native(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0 || p2size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

		if (size > 0)
			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
			    buf, size);
	} else {
		fletcher_4_native_impl(buf, p2size, zcp);

		if (p2size < size)
			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
			    (char *)buf + p2size, size - p2size);
	}
}

void
fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
	fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
}
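
/*
 * Example (illustrative only): a one-shot fletcher 4 checksum of a buffer
 * in native byte order.  No context template is required.
 *
 *	zio_cksum_t zc;
 *	fletcher_4_native(buf, size, NULL, &zc);
 */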
static inline void
fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	fletcher_4_ctx_t ctx;
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();

	ops->init_byteswap(&ctx);
	ops->compute_byteswap(&ctx, buf, size);
	ops->fini_byteswap(&ctx, zcp);
}

/*ARGSUSED*/
void
fletcher_4_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0 || p2size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

		if (size > 0)
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    buf, size);
	} else {
		fletcher_4_byteswap_impl(buf, p2size, zcp);

		if (p2size < size)
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    (char *)buf + p2size, size - p2size);
	}
}

/* Incremental Fletcher 4 */

#define	ZFS_FLETCHER_4_INC_MAX_SIZE	(8ULL << 20)
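
/*
 * Combining identity (for reference): if (a, b, c, d) is the checksum of a
 * leading chunk and (a', b', c', d') the checksum of a trailing chunk of n
 * 32-bit words, then the checksum of the concatenation is
 *
 *	a_tot = a + a'
 *	b_tot = b + b' + n*a
 *	c_tot = c + c' + n*b + n*(n+1)/2 * a
 *	d_tot = d + d' + n*c + n*(n+1)/2 * b + n*(n+1)*(n+2)/6 * a
 *
 * which is exactly what fletcher_4_incremental_combine() below computes,
 * with c1 = n, c2 = n*(n+1)/2 and c3 = n*(n+1)*(n+2)/6.
 */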
static inline void
fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
    const zio_cksum_t *nzcp)
{
	const uint64_t c1 = size / sizeof (uint32_t);
	const uint64_t c2 = c1 * (c1 + 1) / 2;
	const uint64_t c3 = c2 * (c1 + 2) / 3;

	/*
	 * The value of 'c3' overflows on buffer sizes close to 16MiB.  For
	 * that reason we split incremental fletcher4 computation of large
	 * buffers into steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
	 */
	ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);

	zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
	    c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
	zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
	    c2 * zcp->zc_word[0];
	zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
	zcp->zc_word[0] += nzcp->zc_word[0];
}

static inline void
fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
    zio_cksum_t *zcp)
{
	while (size > 0) {
		zio_cksum_t nzc;
		uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);

		if (native)
			fletcher_4_native(buf, len, NULL, &nzc);
		else
			fletcher_4_byteswap(buf, len, NULL, &nzc);

		fletcher_4_incremental_combine(zcp, len, &nzc);

		size -= len;
		buf += len;
	}
}

int
fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;
	/* Use scalar impl to directly update cksum of small blocks */
	if (size < SPA_MINBLOCKSIZE)
		fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
	else
		fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
	return (0);
}

int
fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;
	/* Use scalar impl to directly update cksum of small blocks */
	if (size < SPA_MINBLOCKSIZE)
		fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
	else
		fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
	return (0);
}
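
/*
 * Example (illustrative only): feeding a buffer to the checksum in two
 * chunks yields the same result as a single call over the whole buffer,
 * provided the chunk lengths stay 4-byte aligned.
 *
 *	zio_cksum_t zc;
 *	fletcher_init(&zc);
 *	(void) fletcher_4_incremental_native(buf, len1, &zc);
 *	(void) fletcher_4_incremental_native((char *)buf + len1, len2, &zc);
 */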
#if defined(_KERNEL)
/*
 * Fletcher 4 kstats
 */
static int
fletcher_4_kstat_headers(char *buf, size_t size)
{
	ssize_t off = 0;

	off += snprintf(buf + off, size, "%-17s", "implementation");
	off += snprintf(buf + off, size - off, "%-15s", "native");
	(void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");

	return (0);
}

static int
fletcher_4_kstat_data(char *buf, size_t size, void *data)
{
	struct fletcher_4_kstat *fastest_stat =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data;
	ssize_t off = 0;

	if (curr_stat == fastest_stat) {
		off += snprintf(buf + off, size - off, "%-17s", "fastest");
		off += snprintf(buf + off, size - off, "%-15s",
		    fletcher_4_supp_impls[fastest_stat->native]->name);
		off += snprintf(buf + off, size - off, "%-15s\n",
		    fletcher_4_supp_impls[fastest_stat->byteswap]->name);
	} else {
		ptrdiff_t id = curr_stat - fletcher_4_stat_data;

		off += snprintf(buf + off, size - off, "%-17s",
		    fletcher_4_supp_impls[id]->name);
		off += snprintf(buf + off, size - off, "%-15llu",
		    (u_longlong_t)curr_stat->native);
		off += snprintf(buf + off, size - off, "%-15llu\n",
		    (u_longlong_t)curr_stat->byteswap);
	}

	return (0);
}

static void *
fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
{
	if (n <= fletcher_4_supp_impls_cnt)
		ksp->ks_private = (void *) (fletcher_4_stat_data + n);
	else
		ksp->ks_private = NULL;

	return (ksp->ks_private);
}
#endif

#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				  \
{									  \
	fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;	  \
	fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;	  \
	fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
}

#define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(50))	/* 50ms */

typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
    zio_cksum_t *);
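
/*
 * Benchmark one byte order for every supported implementation: run each in
 * a tight loop for at least FLETCHER_4_BENCH_NS (50ms), record the measured
 * bandwidth in the kstat table, and wire the fastest one into
 * fletcher_4_fastest_impl via FLETCHER_4_FASTEST_FN_COPY().
 */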
#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
	struct fletcher_4_kstat *fastest_stat =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	hrtime_t start;
	uint64_t run_bw, run_time_ns, best_run = 0;
	zio_cksum_t zc;
	uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);

	fletcher_checksum_func_t *fletcher_4_test = native ?
	    fletcher_4_native : fletcher_4_byteswap;

	for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
		struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
		uint64_t run_count = 0;

		/* temporarily set an implementation */
		fletcher_4_impl_chosen = i;

		kpreempt_disable();
		start = gethrtime();
		do {
			for (l = 0; l < 32; l++, run_count++)
				fletcher_4_test(data, data_size, NULL, &zc);

			run_time_ns = gethrtime() - start;
		} while (run_time_ns < FLETCHER_4_BENCH_NS);
		kpreempt_enable();

		run_bw = data_size * run_count * NANOSEC;
		run_bw /= run_time_ns;	/* B/s */

		if (native)
			stat->native = run_bw;
		else
			stat->byteswap = run_bw;

		if (run_bw > best_run) {
			best_run = run_bw;

			if (native) {
				fastest_stat->native = i;
				FLETCHER_4_FASTEST_FN_COPY(native,
				    fletcher_4_supp_impls[i]);
			} else {
				fastest_stat->byteswap = i;
				FLETCHER_4_FASTEST_FN_COPY(byteswap,
				    fletcher_4_supp_impls[i]);
			}
		}
	}

	/* restore original selection */
	atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
#endif /* _KERNEL */

/*
 * Initialize and benchmark all supported implementations.
 */
static void
fletcher_4_benchmark(void)
{
	fletcher_4_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into fletcher_4_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];

		if (curr_impl->valid && curr_impl->valid())
			fletcher_4_supp_impls[c++] = curr_impl;
	}
	membar_producer();	/* complete fletcher_4_supp_impls[] init */
	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */

#if defined(_KERNEL)
	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
	char *databuf = vmem_alloc(data_size, KM_SLEEP);

	for (i = 0; i < data_size / sizeof (uint64_t); i++)
		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */

	fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);

	vmem_free(databuf, data_size);
#else
	/*
	 * Skip the benchmark in user space to avoid impacting libzpool
	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
	 * is assumed to be the fastest and used by default.
	 */
	memcpy(&fletcher_4_fastest_impl,
	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
	    sizeof (fletcher_4_fastest_impl));
	fletcher_4_fastest_impl.name = "fastest";
	membar_producer();
#endif /* _KERNEL */
}
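
/*
 * Module lifecycle: fletcher_4_init() runs the benchmark and installs the
 * kstats; fletcher_4_fini() removes them.  The "fastest" and "cycle"
 * selectors assert that this initialization has completed.
 */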
void
fletcher_4_init(void)
{
	/* Determine the fastest available implementation. */
	fletcher_4_benchmark();

#if defined(_KERNEL)
	/* Install kstats for all implementations */
	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	if (fletcher_4_kstat != NULL) {
		fletcher_4_kstat->ks_data = NULL;
		fletcher_4_kstat->ks_ndata = UINT32_MAX;
		kstat_set_raw_ops(fletcher_4_kstat,
		    fletcher_4_kstat_headers,
		    fletcher_4_kstat_data,
		    fletcher_4_kstat_addr);
		kstat_install(fletcher_4_kstat);
	}
#endif

	/* Finish initialization */
	fletcher_4_initialized = B_TRUE;
}

void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
	if (fletcher_4_kstat != NULL) {
		kstat_delete(fletcher_4_kstat);
		fletcher_4_kstat = NULL;
	}
#endif
}

/* ABD adapters */

static void
abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
{
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
	cdp->acd_private = (void *) ops;

	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
		ops->init_native(cdp->acd_ctx);
	else
		ops->init_byteswap(cdp->acd_ctx);
}

static void
abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
{
	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;

	ASSERT(ops);

	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
		ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
	else
		ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
}

static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
    zio_abd_checksum_data_t *cdp)
{
	zio_cksum_t *zcp = cdp->acd_zcp;

	ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);

	abd_fletcher_4_fini(cdp);
	cdp->acd_private = (void *)&fletcher_4_scalar_ops;

	if (native)
		fletcher_4_incremental_native(data, size, zcp);
	else
		fletcher_4_incremental_byteswap(data, size, zcp);
}
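
/*
 * Per-segment iterator callback for ABD checksumming: the selected SIMD
 * implementation handles the prefix that is a multiple of
 * FLETCHER_MIN_SIMD_SIZE, and any sub-64-byte tail is folded in through
 * abd_fletcher_4_simd2scalar(), which permanently switches the context to
 * the scalar ops.
 */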
static int
abd_fletcher_4_iter(void *data, size_t size, void *private)
{
	zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
	fletcher_4_ctx_t *ctx = cdp->acd_ctx;
	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
	boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
	uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (asize > 0) {
		if (native)
			ops->compute_native(ctx, data, asize);
		else
			ops->compute_byteswap(ctx, data, asize);

		size -= asize;
		data = (char *)data + asize;
	}

	if (size > 0) {
		ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
		/* At this point we have to switch to scalar impl */
		abd_fletcher_4_simd2scalar(native, data, size, cdp);
	}

	return (0);
}

zio_abd_checksum_func_t fletcher_4_abd_ops = {
	.acf_init = abd_fletcher_4_init,
	.acf_fini = abd_fletcher_4_fini,
	.acf_iter = abd_fletcher_4_iter
};


#if defined(_KERNEL) && defined(__linux__)

static int
fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
{
	const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
	char *fmt;
	int i, cnt = 0;

	/* list fastest */
	fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s ";
	cnt += sprintf(buffer + cnt, fmt, "fastest");

	/* list all supported implementations */
	for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += sprintf(buffer + cnt, fmt,
		    fletcher_4_supp_impls[i]->name);
	}

	return (cnt);
}

static int
fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
{
	return (fletcher_4_impl_set(val));
}

/*
 * Choose a fletcher 4 implementation in ZFS.
 * Users can choose "cycle" to exercise all implementations, but this is
 * for testing purposes only; therefore, it can only be set in user space.
 */
module_param_call(zfs_fletcher_4_impl,
    fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
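
/*
 * Example (illustrative only; the exact sysfs path may vary by build):
 *
 *	# echo scalar > /sys/module/zcommon/parameters/zfs_fletcher_4_impl
 *	# cat /sys/module/zcommon/parameters/zfs_fletcher_4_impl
 */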

EXPORT_SYMBOL(fletcher_init);
EXPORT_SYMBOL(fletcher_2_incremental_native);
EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_init);
EXPORT_SYMBOL(fletcher_4_fini);
EXPORT_SYMBOL(fletcher_2_native);
EXPORT_SYMBOL(fletcher_2_byteswap);
EXPORT_SYMBOL(fletcher_4_native);
EXPORT_SYMBOL(fletcher_4_native_varsize);
EXPORT_SYMBOL(fletcher_4_byteswap);
EXPORT_SYMBOL(fletcher_4_incremental_native);
EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_abd_ops);
#endif