1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> 25 */ 26 27 #include <sys/zio_checksum.h> 28 #include <sys/zfs_context.h> 29 #include <sys/zfs_chksum.h> 30 #include <sys/zfs_impl.h> 31 32 #include <sys/blake3.h> 33 #include <sys/sha2.h> 34 35 typedef struct { 36 const char *name; 37 const char *impl; 38 uint64_t bs1k; 39 uint64_t bs4k; 40 uint64_t bs16k; 41 uint64_t bs64k; 42 uint64_t bs256k; 43 uint64_t bs1m; 44 uint64_t bs4m; 45 uint64_t bs16m; 46 zio_cksum_salt_t salt; 47 zio_checksum_t *(func); 48 zio_checksum_tmpl_init_t *(init); 49 zio_checksum_tmpl_free_t *(free); 50 } chksum_stat_t; 51 52 #define AT_STARTUP 0 53 #define AT_BENCHMARK 1 54 #define AT_DONE 2 55 56 static chksum_stat_t *chksum_stat_data = 0; 57 static kstat_t *chksum_kstat = NULL; 58 static int chksum_stat_limit = AT_STARTUP; 59 static int chksum_stat_cnt = 0; 60 static void chksum_benchmark(void); 61 62 /* 63 * Sample output on i3-1005G1 System: 64 * 65 * implementation 1k 4k 16k 64k 256k 1m 4m 16m 66 * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767 67 * skein-generic 548 594 613 623 621 623 621 486 68 * sha256-generic 255 270 281 278 279 281 283 283 69 * sha256-x64 288 310 316 317 318 317 317 316 70 * sha256-ssse3 304 342 351 355 356 357 356 356 71 * sha256-avx 311 348 359 362 362 363 363 362 72 * sha256-avx2 330 378 389 395 395 395 395 395 73 * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230 74 * sha512-generic 359 409 431 427 429 430 428 423 75 * sha512-x64 420 473 490 496 497 497 496 495 76 * sha512-avx 406 522 546 560 560 560 556 560 77 * sha512-avx2 464 568 601 606 609 610 607 608 78 * blake3-generic 330 327 324 323 324 320 323 322 79 * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408 80 * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630 81 * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101 82 * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005 83 */ 84 static int 85 chksum_kstat_headers(char *buf, size_t size) 86 { 87 ssize_t off = 0; 88 89 off += kmem_scnprintf(buf + off, size, "%-23s", "implementation"); 90 off += kmem_scnprintf(buf + off, size - off, "%8s", "1k"); 91 off += kmem_scnprintf(buf + off, size - off, "%8s", "4k"); 92 off += kmem_scnprintf(buf + off, size - off, "%8s", "16k"); 93 off += kmem_scnprintf(buf + off, size - off, "%8s", "64k"); 94 off += kmem_scnprintf(buf + off, size - off, "%8s", "256k"); 95 off += kmem_scnprintf(buf + off, size - off, "%8s", "1m"); 96 off += kmem_scnprintf(buf + off, size - off, "%8s", "4m"); 97 (void) kmem_scnprintf(buf + off, size - off, "%8s\n", "16m"); 98 99 return (0); 100 } 101 102 static int 103 chksum_kstat_data(char *buf, size_t size, void *data) 104 { 105 chksum_stat_t *cs; 106 ssize_t off = 0; 107 char b[24]; 108 109 cs = (chksum_stat_t *)data; 110 kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl); 111 off += kmem_scnprintf(buf + off, size - off, "%-23s", b); 112 off += kmem_scnprintf(buf + off, size - off, "%8llu", 113 (u_longlong_t)cs->bs1k); 114 off += kmem_scnprintf(buf + off, size - off, "%8llu", 115 (u_longlong_t)cs->bs4k); 116 off += kmem_scnprintf(buf + off, size - off, "%8llu", 117 (u_longlong_t)cs->bs16k); 118 off += kmem_scnprintf(buf + off, size - off, "%8llu", 119 (u_longlong_t)cs->bs64k); 120 off += kmem_scnprintf(buf + off, size - off, "%8llu", 121 (u_longlong_t)cs->bs256k); 122 off += kmem_scnprintf(buf + off, size - off, "%8llu", 123 (u_longlong_t)cs->bs1m); 124 off += kmem_scnprintf(buf + off, size - off, "%8llu", 125 (u_longlong_t)cs->bs4m); 126 (void) kmem_scnprintf(buf + off, size - off, "%8llu\n", 127 (u_longlong_t)cs->bs16m); 128 129 return (0); 130 } 131 132 static void * 133 chksum_kstat_addr(kstat_t *ksp, loff_t n) 134 { 135 /* full benchmark */ 136 chksum_benchmark(); 137 138 if (n < chksum_stat_cnt) 139 ksp->ks_private = (void *)(chksum_stat_data + n); 140 else 141 ksp->ks_private = NULL; 142 143 return (ksp->ks_private); 144 } 145 146 static void 147 chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, 148 uint64_t *result) 149 { 150 hrtime_t start; 151 uint64_t run_bw, run_time_ns, run_count = 0, size = 0; 152 uint32_t l, loops = 0; 153 zio_cksum_t zcp; 154 155 switch (round) { 156 case 1: /* 1k */ 157 size = 1<<10; loops = 128; break; 158 case 2: /* 2k */ 159 size = 1<<12; loops = 64; break; 160 case 3: /* 4k */ 161 size = 1<<14; loops = 32; break; 162 case 4: /* 16k */ 163 size = 1<<16; loops = 16; break; 164 case 5: /* 256k */ 165 size = 1<<18; loops = 8; break; 166 case 6: /* 1m */ 167 size = 1<<20; loops = 4; break; 168 case 7: /* 4m */ 169 size = 1<<22; loops = 1; break; 170 case 8: /* 16m */ 171 size = 1<<24; loops = 1; break; 172 } 173 174 kpreempt_disable(); 175 start = gethrtime(); 176 do { 177 for (l = 0; l < loops; l++, run_count++) 178 cs->func(abd, size, ctx, &zcp); 179 180 run_time_ns = gethrtime() - start; 181 } while (run_time_ns < MSEC2NSEC(1)); 182 kpreempt_enable(); 183 184 run_bw = size * run_count * NANOSEC; 185 run_bw /= run_time_ns; /* B/s */ 186 *result = run_bw/1024/1024; /* MiB/s */ 187 } 188 189 static void 190 chksum_benchit(chksum_stat_t *cs) 191 { 192 abd_t *abd; 193 void *ctx = 0; 194 void *salt = &cs->salt.zcs_bytes; 195 196 memset(salt, 0, sizeof (cs->salt.zcs_bytes)); 197 if (cs->init) 198 ctx = cs->init(&cs->salt); 199 200 /* benchmarks in startup mode */ 201 if (chksum_stat_limit == AT_STARTUP) { 202 abd = abd_alloc_linear(1<<18, B_FALSE); 203 chksum_run(cs, abd, ctx, 5, &cs->bs256k); 204 goto done; 205 } 206 207 /* allocate test memory via abd linear interface */ 208 abd = abd_alloc_linear(1<<20, B_FALSE); 209 210 /* benchmarks when requested */ 211 chksum_run(cs, abd, ctx, 1, &cs->bs1k); 212 chksum_run(cs, abd, ctx, 2, &cs->bs4k); 213 chksum_run(cs, abd, ctx, 3, &cs->bs16k); 214 chksum_run(cs, abd, ctx, 4, &cs->bs64k); 215 chksum_run(cs, abd, ctx, 6, &cs->bs1m); 216 abd_free(abd); 217 218 /* allocate test memory via abd non linear interface */ 219 abd = abd_alloc(1<<24, B_FALSE); 220 chksum_run(cs, abd, ctx, 7, &cs->bs4m); 221 chksum_run(cs, abd, ctx, 8, &cs->bs16m); 222 223 done: 224 abd_free(abd); 225 226 /* free up temp memory */ 227 if (cs->free) 228 cs->free(ctx); 229 } 230 231 /* 232 * Initialize and benchmark all supported implementations. 233 */ 234 static void 235 chksum_benchmark(void) 236 { 237 #ifndef _KERNEL 238 /* we need the benchmark only for the kernel module */ 239 return; 240 #endif 241 chksum_stat_t *cs; 242 uint64_t max; 243 uint32_t id, cbid = 0, id_save; 244 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 245 const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); 246 const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); 247 248 /* benchmarks are done */ 249 if (chksum_stat_limit == AT_DONE) 250 return; 251 252 253 /* count implementations */ 254 chksum_stat_cnt = 1; /* edonr */ 255 chksum_stat_cnt += 1; /* skein */ 256 chksum_stat_cnt += sha256->getcnt(); 257 chksum_stat_cnt += sha512->getcnt(); 258 chksum_stat_cnt += blake3->getcnt(); 259 chksum_stat_data = kmem_zalloc( 260 sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); 261 262 /* edonr - needs to be the first one here (slow CPU check) */ 263 cs = &chksum_stat_data[cbid++]; 264 265 /* edonr */ 266 cs->init = abd_checksum_edonr_tmpl_init; 267 cs->func = abd_checksum_edonr_native; 268 cs->free = abd_checksum_edonr_tmpl_free; 269 cs->name = "edonr"; 270 cs->impl = "generic"; 271 chksum_benchit(cs); 272 273 /* skein */ 274 cs = &chksum_stat_data[cbid++]; 275 cs->init = abd_checksum_skein_tmpl_init; 276 cs->func = abd_checksum_skein_native; 277 cs->free = abd_checksum_skein_tmpl_free; 278 cs->name = "skein"; 279 cs->impl = "generic"; 280 chksum_benchit(cs); 281 282 /* sha256 */ 283 id_save = sha256->getid(); 284 for (max = 0, id = 0; id < sha256->getcnt(); id++) { 285 sha256->setid(id); 286 cs = &chksum_stat_data[cbid++]; 287 cs->init = 0; 288 cs->func = abd_checksum_sha256; 289 cs->free = 0; 290 cs->name = sha256->name; 291 cs->impl = sha256->getname(); 292 chksum_benchit(cs); 293 if (cs->bs256k > max) { 294 max = cs->bs256k; 295 sha256->set_fastest(id); 296 } 297 } 298 sha256->setid(id_save); 299 300 /* sha512 */ 301 id_save = sha512->getid(); 302 for (max = 0, id = 0; id < sha512->getcnt(); id++) { 303 sha512->setid(id); 304 cs = &chksum_stat_data[cbid++]; 305 cs->init = 0; 306 cs->func = abd_checksum_sha512_native; 307 cs->free = 0; 308 cs->name = sha512->name; 309 cs->impl = sha512->getname(); 310 chksum_benchit(cs); 311 if (cs->bs256k > max) { 312 max = cs->bs256k; 313 sha512->set_fastest(id); 314 } 315 } 316 sha512->setid(id_save); 317 318 /* blake3 */ 319 id_save = blake3->getid(); 320 for (max = 0, id = 0; id < blake3->getcnt(); id++) { 321 blake3->setid(id); 322 cs = &chksum_stat_data[cbid++]; 323 cs->init = abd_checksum_blake3_tmpl_init; 324 cs->func = abd_checksum_blake3_native; 325 cs->free = abd_checksum_blake3_tmpl_free; 326 cs->name = blake3->name; 327 cs->impl = blake3->getname(); 328 chksum_benchit(cs); 329 if (cs->bs256k > max) { 330 max = cs->bs256k; 331 blake3->set_fastest(id); 332 } 333 } 334 blake3->setid(id_save); 335 336 switch (chksum_stat_limit) { 337 case AT_STARTUP: 338 /* next time we want a full benchmark */ 339 chksum_stat_limit = AT_BENCHMARK; 340 break; 341 case AT_BENCHMARK: 342 /* no further benchmarks */ 343 chksum_stat_limit = AT_DONE; 344 break; 345 } 346 } 347 348 void 349 chksum_init(void) 350 { 351 #ifdef _KERNEL 352 blake3_per_cpu_ctx_init(); 353 #endif 354 355 /* 256KiB benchmark */ 356 chksum_benchmark(); 357 358 /* Install kstats for all implementations */ 359 chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", 360 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 361 362 if (chksum_kstat != NULL) { 363 chksum_kstat->ks_data = NULL; 364 chksum_kstat->ks_ndata = UINT32_MAX; 365 kstat_set_raw_ops(chksum_kstat, 366 chksum_kstat_headers, 367 chksum_kstat_data, 368 chksum_kstat_addr); 369 kstat_install(chksum_kstat); 370 } 371 } 372 373 void 374 chksum_fini(void) 375 { 376 if (chksum_kstat != NULL) { 377 kstat_delete(chksum_kstat); 378 chksum_kstat = NULL; 379 } 380 381 if (chksum_stat_cnt) { 382 kmem_free(chksum_stat_data, 383 sizeof (chksum_stat_t) * chksum_stat_cnt); 384 chksum_stat_cnt = 0; 385 chksum_stat_data = 0; 386 } 387 388 #ifdef _KERNEL 389 blake3_per_cpu_ctx_fini(); 390 #endif 391 } 392