1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> 24 */ 25 26 #include <sys/zio_checksum.h> 27 #include <sys/zfs_context.h> 28 #include <sys/zfs_chksum.h> 29 #include <sys/zfs_impl.h> 30 31 #include <sys/blake3.h> 32 #include <sys/sha2.h> 33 34 /* limit benchmarking to max 256KiB, when EdonR is slower then this: */ 35 #define LIMIT_PERF_MBS 300 36 37 typedef struct { 38 const char *name; 39 const char *impl; 40 uint64_t bs1k; 41 uint64_t bs4k; 42 uint64_t bs16k; 43 uint64_t bs64k; 44 uint64_t bs256k; 45 uint64_t bs1m; 46 uint64_t bs4m; 47 uint64_t bs16m; 48 zio_cksum_salt_t salt; 49 zio_checksum_t *(func); 50 zio_checksum_tmpl_init_t *(init); 51 zio_checksum_tmpl_free_t *(free); 52 } chksum_stat_t; 53 54 static chksum_stat_t *chksum_stat_data = 0; 55 static int chksum_stat_cnt = 0; 56 static kstat_t *chksum_kstat = NULL; 57 58 /* 59 * Sample output on i3-1005G1 System: 60 * 61 * implementation 1k 4k 16k 64k 256k 1m 4m 16m 62 * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767 63 * skein-generic 548 594 613 623 621 623 621 486 64 * sha256-generic 255 270 281 278 279 281 283 283 65 * sha256-x64 288 310 316 317 318 317 317 316 66 * sha256-ssse3 304 342 351 355 356 357 356 356 67 * sha256-avx 311 348 359 362 362 363 363 362 68 * sha256-avx2 330 378 389 395 395 395 395 395 69 * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230 70 * sha512-generic 359 409 431 427 429 430 428 423 71 * sha512-x64 420 473 490 496 497 497 496 495 72 * sha512-avx 406 522 546 560 560 560 556 560 73 * sha512-avx2 464 568 601 606 609 610 607 608 74 * blake3-generic 330 327 324 323 324 320 323 322 75 * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408 76 * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630 77 * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101 78 * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005 79 */ 80 static int 81 chksum_kstat_headers(char *buf, size_t size) 82 { 83 ssize_t off = 0; 84 85 off += kmem_scnprintf(buf + off, size, "%-23s", "implementation"); 86 off += kmem_scnprintf(buf + off, size - off, "%8s", "1k"); 87 off += kmem_scnprintf(buf + off, size - off, "%8s", "4k"); 88 off += kmem_scnprintf(buf + off, size - off, "%8s", "16k"); 89 off += kmem_scnprintf(buf + off, size - off, "%8s", "64k"); 90 off += kmem_scnprintf(buf + off, size - off, "%8s", "256k"); 91 off += kmem_scnprintf(buf + off, size - off, "%8s", "1m"); 92 off += kmem_scnprintf(buf + off, size - off, "%8s", "4m"); 93 (void) kmem_scnprintf(buf + off, size - off, "%8s\n", "16m"); 94 95 return (0); 96 } 97 98 static int 99 chksum_kstat_data(char *buf, size_t size, void *data) 100 { 101 chksum_stat_t *cs; 102 ssize_t off = 0; 103 char b[24]; 104 105 cs = (chksum_stat_t *)data; 106 kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl); 107 off += kmem_scnprintf(buf + off, size - off, "%-23s", b); 108 off += kmem_scnprintf(buf + off, size - off, "%8llu", 109 (u_longlong_t)cs->bs1k); 110 off += kmem_scnprintf(buf + off, size - off, "%8llu", 111 (u_longlong_t)cs->bs4k); 112 off += kmem_scnprintf(buf + off, size - off, "%8llu", 113 (u_longlong_t)cs->bs16k); 114 off += kmem_scnprintf(buf + off, size - off, "%8llu", 115 (u_longlong_t)cs->bs64k); 116 off += kmem_scnprintf(buf + off, size - off, "%8llu", 117 (u_longlong_t)cs->bs256k); 118 off += kmem_scnprintf(buf + off, size - off, "%8llu", 119 (u_longlong_t)cs->bs1m); 120 off += kmem_scnprintf(buf + off, size - off, "%8llu", 121 (u_longlong_t)cs->bs4m); 122 (void) kmem_scnprintf(buf + off, size - off, "%8llu\n", 123 (u_longlong_t)cs->bs16m); 124 125 return (0); 126 } 127 128 static void * 129 chksum_kstat_addr(kstat_t *ksp, loff_t n) 130 { 131 if (n < chksum_stat_cnt) 132 ksp->ks_private = (void *)(chksum_stat_data + n); 133 else 134 ksp->ks_private = NULL; 135 136 return (ksp->ks_private); 137 } 138 139 static void 140 chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, 141 uint64_t *result) 142 { 143 hrtime_t start; 144 uint64_t run_bw, run_time_ns, run_count = 0, size = 0; 145 uint32_t l, loops = 0; 146 zio_cksum_t zcp; 147 148 switch (round) { 149 case 1: /* 1k */ 150 size = 1<<10; loops = 128; break; 151 case 2: /* 2k */ 152 size = 1<<12; loops = 64; break; 153 case 3: /* 4k */ 154 size = 1<<14; loops = 32; break; 155 case 4: /* 16k */ 156 size = 1<<16; loops = 16; break; 157 case 5: /* 256k */ 158 size = 1<<18; loops = 8; break; 159 case 6: /* 1m */ 160 size = 1<<20; loops = 4; break; 161 case 7: /* 4m */ 162 size = 1<<22; loops = 1; break; 163 case 8: /* 16m */ 164 size = 1<<24; loops = 1; break; 165 } 166 167 kpreempt_disable(); 168 start = gethrtime(); 169 do { 170 for (l = 0; l < loops; l++, run_count++) 171 cs->func(abd, size, ctx, &zcp); 172 173 run_time_ns = gethrtime() - start; 174 } while (run_time_ns < MSEC2NSEC(1)); 175 kpreempt_enable(); 176 177 run_bw = size * run_count * NANOSEC; 178 run_bw /= run_time_ns; /* B/s */ 179 *result = run_bw/1024/1024; /* MiB/s */ 180 } 181 182 #define LIMIT_INIT 0 183 #define LIMIT_NEEDED 1 184 #define LIMIT_NOLIMIT 2 185 186 static void 187 chksum_benchit(chksum_stat_t *cs) 188 { 189 abd_t *abd; 190 void *ctx = 0; 191 void *salt = &cs->salt.zcs_bytes; 192 static int chksum_stat_limit = LIMIT_INIT; 193 194 memset(salt, 0, sizeof (cs->salt.zcs_bytes)); 195 if (cs->init) 196 ctx = cs->init(&cs->salt); 197 198 /* allocate test memory via abd linear interface */ 199 abd = abd_alloc_linear(1<<20, B_FALSE); 200 chksum_run(cs, abd, ctx, 1, &cs->bs1k); 201 chksum_run(cs, abd, ctx, 2, &cs->bs4k); 202 chksum_run(cs, abd, ctx, 3, &cs->bs16k); 203 chksum_run(cs, abd, ctx, 4, &cs->bs64k); 204 chksum_run(cs, abd, ctx, 5, &cs->bs256k); 205 206 /* check if we ran on a slow cpu */ 207 if (chksum_stat_limit == LIMIT_INIT) { 208 if (cs->bs1k < LIMIT_PERF_MBS) { 209 chksum_stat_limit = LIMIT_NEEDED; 210 } else { 211 chksum_stat_limit = LIMIT_NOLIMIT; 212 } 213 } 214 215 /* skip benchmarks >= 1MiB when the CPU is to slow */ 216 if (chksum_stat_limit == LIMIT_NEEDED) 217 goto abort; 218 219 chksum_run(cs, abd, ctx, 6, &cs->bs1m); 220 abd_free(abd); 221 222 /* allocate test memory via abd non linear interface */ 223 abd = abd_alloc(1<<24, B_FALSE); 224 chksum_run(cs, abd, ctx, 7, &cs->bs4m); 225 chksum_run(cs, abd, ctx, 8, &cs->bs16m); 226 227 abort: 228 abd_free(abd); 229 230 /* free up temp memory */ 231 if (cs->free) 232 cs->free(ctx); 233 } 234 235 /* 236 * Initialize and benchmark all supported implementations. 237 */ 238 static void 239 chksum_benchmark(void) 240 { 241 #ifndef _KERNEL 242 /* we need the benchmark only for the kernel module */ 243 return; 244 #endif 245 246 chksum_stat_t *cs; 247 uint64_t max; 248 uint32_t id, cbid = 0, id_save; 249 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 250 const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); 251 const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); 252 253 /* count implementations */ 254 chksum_stat_cnt = 2; 255 chksum_stat_cnt += sha256->getcnt(); 256 chksum_stat_cnt += sha512->getcnt(); 257 chksum_stat_cnt += blake3->getcnt(); 258 chksum_stat_data = kmem_zalloc( 259 sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); 260 261 /* edonr - needs to be the first one here (slow CPU check) */ 262 cs = &chksum_stat_data[cbid++]; 263 264 /* edonr */ 265 cs->init = abd_checksum_edonr_tmpl_init; 266 cs->func = abd_checksum_edonr_native; 267 cs->free = abd_checksum_edonr_tmpl_free; 268 cs->name = "edonr"; 269 cs->impl = "generic"; 270 chksum_benchit(cs); 271 272 /* skein */ 273 cs = &chksum_stat_data[cbid++]; 274 cs->init = abd_checksum_skein_tmpl_init; 275 cs->func = abd_checksum_skein_native; 276 cs->free = abd_checksum_skein_tmpl_free; 277 cs->name = "skein"; 278 cs->impl = "generic"; 279 chksum_benchit(cs); 280 281 /* sha256 */ 282 id_save = sha256->getid(); 283 for (max = 0, id = 0; id < sha256->getcnt(); id++) { 284 sha256->setid(id); 285 cs = &chksum_stat_data[cbid++]; 286 cs->init = 0; 287 cs->func = abd_checksum_sha256; 288 cs->free = 0; 289 cs->name = sha256->name; 290 cs->impl = sha256->getname(); 291 chksum_benchit(cs); 292 if (cs->bs256k > max) { 293 max = cs->bs256k; 294 sha256->set_fastest(id); 295 } 296 } 297 sha256->setid(id_save); 298 299 /* sha512 */ 300 id_save = sha512->getid(); 301 for (max = 0, id = 0; id < sha512->getcnt(); id++) { 302 sha512->setid(id); 303 cs = &chksum_stat_data[cbid++]; 304 cs->init = 0; 305 cs->func = abd_checksum_sha512_native; 306 cs->free = 0; 307 cs->name = sha512->name; 308 cs->impl = sha512->getname(); 309 chksum_benchit(cs); 310 if (cs->bs256k > max) { 311 max = cs->bs256k; 312 sha512->set_fastest(id); 313 } 314 } 315 sha512->setid(id_save); 316 317 /* blake3 */ 318 id_save = blake3->getid(); 319 for (max = 0, id = 0; id < blake3->getcnt(); id++) { 320 blake3->setid(id); 321 cs = &chksum_stat_data[cbid++]; 322 cs->init = abd_checksum_blake3_tmpl_init; 323 cs->func = abd_checksum_blake3_native; 324 cs->free = abd_checksum_blake3_tmpl_free; 325 cs->name = blake3->name; 326 cs->impl = blake3->getname(); 327 chksum_benchit(cs); 328 if (cs->bs256k > max) { 329 max = cs->bs256k; 330 blake3->set_fastest(id); 331 } 332 } 333 blake3->setid(id_save); 334 } 335 336 void 337 chksum_init(void) 338 { 339 #ifdef _KERNEL 340 blake3_per_cpu_ctx_init(); 341 #endif 342 343 /* Benchmark supported implementations */ 344 chksum_benchmark(); 345 346 /* Install kstats for all implementations */ 347 chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", 348 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 349 350 if (chksum_kstat != NULL) { 351 chksum_kstat->ks_data = NULL; 352 chksum_kstat->ks_ndata = UINT32_MAX; 353 kstat_set_raw_ops(chksum_kstat, 354 chksum_kstat_headers, 355 chksum_kstat_data, 356 chksum_kstat_addr); 357 kstat_install(chksum_kstat); 358 } 359 } 360 361 void 362 chksum_fini(void) 363 { 364 if (chksum_kstat != NULL) { 365 kstat_delete(chksum_kstat); 366 chksum_kstat = NULL; 367 } 368 369 if (chksum_stat_cnt) { 370 kmem_free(chksum_stat_data, 371 sizeof (chksum_stat_t) * chksum_stat_cnt); 372 chksum_stat_cnt = 0; 373 chksum_stat_data = 0; 374 } 375 376 #ifdef _KERNEL 377 blake3_per_cpu_ctx_fini(); 378 #endif 379 } 380