1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> 24 */ 25 26 #include <sys/types.h> 27 #include <sys/spa.h> 28 #include <sys/zio_checksum.h> 29 #include <sys/zfs_context.h> 30 #include <sys/zfs_chksum.h> 31 32 #include <sys/blake3.h> 33 34 /* limit benchmarking to max 256KiB, when EdonR is slower then this: */ 35 #define LIMIT_PERF_MBS 300 36 37 typedef struct { 38 const char *name; 39 const char *impl; 40 uint64_t bs1k; 41 uint64_t bs4k; 42 uint64_t bs16k; 43 uint64_t bs64k; 44 uint64_t bs256k; 45 uint64_t bs1m; 46 uint64_t bs4m; 47 uint64_t bs16m; 48 zio_cksum_salt_t salt; 49 zio_checksum_t *(func); 50 zio_checksum_tmpl_init_t *(init); 51 zio_checksum_tmpl_free_t *(free); 52 } chksum_stat_t; 53 54 static chksum_stat_t *chksum_stat_data = 0; 55 static int chksum_stat_cnt = 0; 56 static kstat_t *chksum_kstat = NULL; 57 58 /* 59 * i3-1005G1 test output: 60 * 61 * implementation 1k 4k 16k 64k 256k 1m 4m 62 * fletcher-4 5421 15001 26468 32555 34720 32801 18847 63 * edonr-generic 1196 1602 1761 1749 1762 1759 1751 64 * skein-generic 546 591 608 615 619 612 616 65 * sha256-generic 246 270 274 274 277 275 276 66 * sha256-avx 262 296 304 307 307 307 306 67 * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228 68 * sha256-openssl 240 300 316 314 304 285 276 69 * sha512-generic 333 374 385 392 391 393 392 70 * sha512-openssl 353 441 467 476 472 467 426 71 * sha512-avx 362 444 473 475 479 476 478 72 * sha512-avx2 394 500 530 538 543 545 542 73 * blake3-generic 308 313 313 313 312 313 312 74 * blake3-sse2 402 1289 1423 1446 1432 1458 1413 75 * blake3-sse41 427 1470 1625 1704 1679 1607 1629 76 * blake3-avx2 428 1920 3095 3343 3356 3318 3204 77 * blake3-avx512 473 2687 4905 5836 5844 5643 5374 78 */ 79 static int 80 chksum_kstat_headers(char *buf, size_t size) 81 { 82 ssize_t off = 0; 83 84 off += snprintf(buf + off, size, "%-23s", "implementation"); 85 off += snprintf(buf + off, size - off, "%8s", "1k"); 86 off += snprintf(buf + off, size - off, "%8s", "4k"); 87 off += snprintf(buf + off, size - off, "%8s", "16k"); 88 off += snprintf(buf + off, size - off, "%8s", "64k"); 89 off += snprintf(buf + off, size - off, "%8s", "256k"); 90 off += snprintf(buf + off, size - off, "%8s", "1m"); 91 off += snprintf(buf + off, size - off, "%8s", "4m"); 92 (void) snprintf(buf + off, size - off, "%8s\n", "16m"); 93 94 return (0); 95 } 96 97 static int 98 chksum_kstat_data(char *buf, size_t size, void *data) 99 { 100 chksum_stat_t *cs; 101 ssize_t off = 0; 102 char b[24]; 103 104 cs = (chksum_stat_t *)data; 105 snprintf(b, 23, "%s-%s", cs->name, cs->impl); 106 off += snprintf(buf + off, size - off, "%-23s", b); 107 off += snprintf(buf + off, size - off, "%8llu", 108 (u_longlong_t)cs->bs1k); 109 off += snprintf(buf + off, size - off, "%8llu", 110 (u_longlong_t)cs->bs4k); 111 off += snprintf(buf + off, size - off, "%8llu", 112 (u_longlong_t)cs->bs16k); 113 off += snprintf(buf + off, size - off, "%8llu", 114 (u_longlong_t)cs->bs64k); 115 off += snprintf(buf + off, size - off, "%8llu", 116 (u_longlong_t)cs->bs256k); 117 off += snprintf(buf + off, size - off, "%8llu", 118 (u_longlong_t)cs->bs1m); 119 off += snprintf(buf + off, size - off, "%8llu", 120 (u_longlong_t)cs->bs4m); 121 (void) snprintf(buf + off, size - off, "%8llu\n", 122 (u_longlong_t)cs->bs16m); 123 124 return (0); 125 } 126 127 static void * 128 chksum_kstat_addr(kstat_t *ksp, loff_t n) 129 { 130 if (n < chksum_stat_cnt) 131 ksp->ks_private = (void *)(chksum_stat_data + n); 132 else 133 ksp->ks_private = NULL; 134 135 return (ksp->ks_private); 136 } 137 138 static void 139 chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, 140 uint64_t *result) 141 { 142 hrtime_t start; 143 uint64_t run_bw, run_time_ns, run_count = 0, size = 0; 144 uint32_t l, loops = 0; 145 zio_cksum_t zcp; 146 147 switch (round) { 148 case 1: /* 1k */ 149 size = 1<<10; loops = 128; break; 150 case 2: /* 2k */ 151 size = 1<<12; loops = 64; break; 152 case 3: /* 4k */ 153 size = 1<<14; loops = 32; break; 154 case 4: /* 16k */ 155 size = 1<<16; loops = 16; break; 156 case 5: /* 256k */ 157 size = 1<<18; loops = 8; break; 158 case 6: /* 1m */ 159 size = 1<<20; loops = 4; break; 160 case 7: /* 4m */ 161 size = 1<<22; loops = 1; break; 162 case 8: /* 16m */ 163 size = 1<<24; loops = 1; break; 164 } 165 166 kpreempt_disable(); 167 start = gethrtime(); 168 do { 169 for (l = 0; l < loops; l++, run_count++) 170 cs->func(abd, size, ctx, &zcp); 171 172 run_time_ns = gethrtime() - start; 173 } while (run_time_ns < MSEC2NSEC(1)); 174 kpreempt_enable(); 175 176 run_bw = size * run_count * NANOSEC; 177 run_bw /= run_time_ns; /* B/s */ 178 *result = run_bw/1024/1024; /* MiB/s */ 179 } 180 181 #define LIMIT_INIT 0 182 #define LIMIT_NEEDED 1 183 #define LIMIT_NOLIMIT 2 184 185 static void 186 chksum_benchit(chksum_stat_t *cs) 187 { 188 abd_t *abd; 189 void *ctx = 0; 190 void *salt = &cs->salt.zcs_bytes; 191 static int chksum_stat_limit = LIMIT_INIT; 192 193 memset(salt, 0, sizeof (cs->salt.zcs_bytes)); 194 if (cs->init) 195 ctx = cs->init(&cs->salt); 196 197 /* allocate test memory via abd linear interface */ 198 abd = abd_alloc_linear(1<<20, B_FALSE); 199 chksum_run(cs, abd, ctx, 1, &cs->bs1k); 200 chksum_run(cs, abd, ctx, 2, &cs->bs4k); 201 chksum_run(cs, abd, ctx, 3, &cs->bs16k); 202 chksum_run(cs, abd, ctx, 4, &cs->bs64k); 203 chksum_run(cs, abd, ctx, 5, &cs->bs256k); 204 205 /* check if we ran on a slow cpu */ 206 if (chksum_stat_limit == LIMIT_INIT) { 207 if (cs->bs1k < LIMIT_PERF_MBS) { 208 chksum_stat_limit = LIMIT_NEEDED; 209 } else { 210 chksum_stat_limit = LIMIT_NOLIMIT; 211 } 212 } 213 214 /* skip benchmarks >= 1MiB when the CPU is to slow */ 215 if (chksum_stat_limit == LIMIT_NEEDED) 216 goto abort; 217 218 chksum_run(cs, abd, ctx, 6, &cs->bs1m); 219 abd_free(abd); 220 221 /* allocate test memory via abd non linear interface */ 222 abd = abd_alloc(1<<24, B_FALSE); 223 chksum_run(cs, abd, ctx, 7, &cs->bs4m); 224 chksum_run(cs, abd, ctx, 8, &cs->bs16m); 225 226 abort: 227 abd_free(abd); 228 229 /* free up temp memory */ 230 if (cs->free) 231 cs->free(ctx); 232 } 233 234 /* 235 * Initialize and benchmark all supported implementations. 236 */ 237 static void 238 chksum_benchmark(void) 239 { 240 241 #ifndef _KERNEL 242 /* we need the benchmark only for the kernel module */ 243 return; 244 #endif 245 246 chksum_stat_t *cs; 247 int cbid = 0, id; 248 uint64_t max = 0; 249 250 /* space for the benchmark times */ 251 chksum_stat_cnt = 4; 252 chksum_stat_cnt += blake3_get_impl_count(); 253 chksum_stat_data = (chksum_stat_t *)kmem_zalloc( 254 sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); 255 256 /* edonr - needs to be the first one here (slow CPU check) */ 257 cs = &chksum_stat_data[cbid++]; 258 cs->init = abd_checksum_edonr_tmpl_init; 259 cs->func = abd_checksum_edonr_native; 260 cs->free = abd_checksum_edonr_tmpl_free; 261 cs->name = "edonr"; 262 cs->impl = "generic"; 263 chksum_benchit(cs); 264 265 /* skein */ 266 cs = &chksum_stat_data[cbid++]; 267 cs->init = abd_checksum_skein_tmpl_init; 268 cs->func = abd_checksum_skein_native; 269 cs->free = abd_checksum_skein_tmpl_free; 270 cs->name = "skein"; 271 cs->impl = "generic"; 272 chksum_benchit(cs); 273 274 /* sha256 */ 275 cs = &chksum_stat_data[cbid++]; 276 cs->init = 0; 277 cs->func = abd_checksum_SHA256; 278 cs->free = 0; 279 cs->name = "sha256"; 280 cs->impl = "generic"; 281 chksum_benchit(cs); 282 283 /* sha512 */ 284 cs = &chksum_stat_data[cbid++]; 285 cs->init = 0; 286 cs->func = abd_checksum_SHA512_native; 287 cs->free = 0; 288 cs->name = "sha512"; 289 cs->impl = "generic"; 290 chksum_benchit(cs); 291 292 /* blake3 */ 293 for (id = 0; id < blake3_get_impl_count(); id++) { 294 blake3_set_impl_id(id); 295 cs = &chksum_stat_data[cbid++]; 296 cs->init = abd_checksum_blake3_tmpl_init; 297 cs->func = abd_checksum_blake3_native; 298 cs->free = abd_checksum_blake3_tmpl_free; 299 cs->name = "blake3"; 300 cs->impl = blake3_get_impl_name(); 301 chksum_benchit(cs); 302 if (cs->bs256k > max) { 303 max = cs->bs256k; 304 blake3_set_impl_fastest(id); 305 } 306 } 307 } 308 309 void 310 chksum_init(void) 311 { 312 #ifdef _KERNEL 313 blake3_per_cpu_ctx_init(); 314 #endif 315 316 /* Benchmark supported implementations */ 317 chksum_benchmark(); 318 319 /* Install kstats for all implementations */ 320 chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", 321 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 322 323 if (chksum_kstat != NULL) { 324 chksum_kstat->ks_data = NULL; 325 chksum_kstat->ks_ndata = UINT32_MAX; 326 kstat_set_raw_ops(chksum_kstat, 327 chksum_kstat_headers, 328 chksum_kstat_data, 329 chksum_kstat_addr); 330 kstat_install(chksum_kstat); 331 } 332 333 /* setup implementations */ 334 blake3_setup_impl(); 335 } 336 337 void 338 chksum_fini(void) 339 { 340 if (chksum_kstat != NULL) { 341 kstat_delete(chksum_kstat); 342 chksum_kstat = NULL; 343 } 344 345 if (chksum_stat_cnt) { 346 kmem_free(chksum_stat_data, 347 sizeof (chksum_stat_t) * chksum_stat_cnt); 348 chksum_stat_cnt = 0; 349 chksum_stat_data = 0; 350 } 351 352 #ifdef _KERNEL 353 blake3_per_cpu_ctx_fini(); 354 #endif 355 } 356