1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 23 */ 24 25 #include <sys/zfs_context.h> 26 #include <sys/types.h> 27 #include <sys/zio.h> 28 #include <sys/debug.h> 29 #include <sys/zfs_debug.h> 30 #include <sys/vdev_raidz.h> 31 #include <sys/vdev_raidz_impl.h> 32 #include <sys/simd.h> 33 34 #ifndef isspace 35 #define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || \ 36 (c) == '\r' || (c) == '\f' || (c) == '\013') 37 #endif 38 39 extern boolean_t raidz_will_scalar_work(void); 40 41 /* Opaque implementation with NULL methods to represent original methods */ 42 static const raidz_impl_ops_t vdev_raidz_original_impl = { 43 .name = "original", 44 .is_supported = raidz_will_scalar_work, 45 }; 46 47 /* RAIDZ parity op that contain the fastest methods */ 48 static raidz_impl_ops_t vdev_raidz_fastest_impl = { 49 .name = "fastest" 50 }; 51 52 /* All compiled in implementations */ 53 const raidz_impl_ops_t *raidz_all_maths[] = { 54 &vdev_raidz_original_impl, 55 &vdev_raidz_scalar_impl, 56 #if defined(__amd64) 57 &vdev_raidz_sse2_impl, 58 #endif 59 #if defined(__amd64) 60 &vdev_raidz_ssse3_impl, 61 #endif 62 #if defined(__amd64) 63 &vdev_raidz_avx2_impl, 64 #endif 65 }; 66 67 /* Indicate that benchmark has been completed */ 68 static boolean_t raidz_math_initialized = B_FALSE; 69 70 /* Select raidz implementation */ 71 #define IMPL_FASTEST (UINT32_MAX) 72 #define IMPL_CYCLE (UINT32_MAX - 1) 73 #define IMPL_ORIGINAL (0) 74 #define IMPL_SCALAR (1) 75 76 #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i)) 77 78 static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; 79 static uint32_t user_sel_impl = IMPL_FASTEST; 80 81 /* Hold all supported implementations */ 82 static size_t raidz_supp_impl_cnt = 0; 83 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; 84 85 #if defined(_KERNEL) 86 /* 87 * kstats values for supported implementations 88 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] 89 * 90 * PORTING NOTE: 91 * On illumos this is not a kstat. OpenZFS uses their home-grown kstat code 92 * which implements a free-form kstat using additional functionality that does 93 * not exist in illumos. Because there are no software consumers of this 94 * information, we omit a kstat API. If an administrator needs to see this 95 * data for some reason, they can use mdb. 96 * 97 * The format of the kstat data on OpenZFS would be a "header" that looks like 98 * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name" 99 * arrays, starting with the parity function "implementation" name): 100 * impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr 101 * This is followed by a row for each parity function implementation, showing 102 * the "speed" values calculated for that implementation for each of the 103 * parity generation and reconstruction functions in the "raidz_all_maths" 104 * array. 105 */ 106 static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; 107 108 #endif 109 110 /* 111 * Returns the RAIDZ operations for raidz_map() parity calculations. When 112 * a SIMD implementation is not allowed in the current context, then fallback 113 * to the fastest generic implementation. 114 */ 115 const raidz_impl_ops_t * 116 vdev_raidz_math_get_ops(void) 117 { 118 if (!kfpu_allowed()) 119 return (&vdev_raidz_scalar_impl); 120 121 raidz_impl_ops_t *ops = NULL; 122 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 123 124 switch (impl) { 125 case IMPL_FASTEST: 126 ASSERT(raidz_math_initialized); 127 ops = &vdev_raidz_fastest_impl; 128 break; 129 case IMPL_CYCLE: 130 /* Cycle through all supported implementations */ 131 ASSERT(raidz_math_initialized); 132 ASSERT3U(raidz_supp_impl_cnt, >, 0); 133 static size_t cycle_impl_idx = 0; 134 size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; 135 ops = raidz_supp_impl[idx]; 136 break; 137 case IMPL_ORIGINAL: 138 ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; 139 break; 140 case IMPL_SCALAR: 141 ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; 142 break; 143 default: 144 ASSERT3U(impl, <, raidz_supp_impl_cnt); 145 ASSERT3U(raidz_supp_impl_cnt, >, 0); 146 if (impl < ARRAY_SIZE(raidz_all_maths)) 147 ops = raidz_supp_impl[impl]; 148 break; 149 } 150 151 ASSERT3P(ops, !=, NULL); 152 153 return (ops); 154 } 155 156 /* 157 * Select parity generation method for raidz_map 158 */ 159 int 160 vdev_raidz_math_generate(raidz_map_t *rm) 161 { 162 raidz_gen_f gen_parity = NULL; 163 164 switch (raidz_parity(rm)) { 165 case 1: 166 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; 167 break; 168 case 2: 169 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; 170 break; 171 case 3: 172 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; 173 break; 174 default: 175 gen_parity = NULL; 176 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", 177 (uint_t)raidz_parity(rm)); 178 break; 179 } 180 181 /* if method is NULL execute the original implementation */ 182 if (gen_parity == NULL) 183 return (RAIDZ_ORIGINAL_IMPL); 184 185 gen_parity(rm); 186 187 return (0); 188 } 189 190 static raidz_rec_f 191 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, 192 const int nbaddata) 193 { 194 if (nbaddata == 1 && parity_valid[CODE_P]) { 195 return (rm->rm_ops->rec[RAIDZ_REC_P]); 196 } 197 return ((raidz_rec_f) NULL); 198 } 199 200 static raidz_rec_f 201 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, 202 const int nbaddata) 203 { 204 if (nbaddata == 1) { 205 if (parity_valid[CODE_P]) { 206 return (rm->rm_ops->rec[RAIDZ_REC_P]); 207 } else if (parity_valid[CODE_Q]) { 208 return (rm->rm_ops->rec[RAIDZ_REC_Q]); 209 } 210 } else if (nbaddata == 2 && 211 parity_valid[CODE_P] && parity_valid[CODE_Q]) { 212 return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 213 } 214 return ((raidz_rec_f) NULL); 215 } 216 217 static raidz_rec_f 218 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, 219 const int nbaddata) 220 { 221 if (nbaddata == 1) { 222 if (parity_valid[CODE_P]) { 223 return (rm->rm_ops->rec[RAIDZ_REC_P]); 224 } else if (parity_valid[CODE_Q]) { 225 return (rm->rm_ops->rec[RAIDZ_REC_Q]); 226 } else if (parity_valid[CODE_R]) { 227 return (rm->rm_ops->rec[RAIDZ_REC_R]); 228 } 229 } else if (nbaddata == 2) { 230 if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { 231 return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 232 } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { 233 return (rm->rm_ops->rec[RAIDZ_REC_PR]); 234 } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { 235 return (rm->rm_ops->rec[RAIDZ_REC_QR]); 236 } 237 } else if (nbaddata == 3 && 238 parity_valid[CODE_P] && parity_valid[CODE_Q] && 239 parity_valid[CODE_R]) { 240 return (rm->rm_ops->rec[RAIDZ_REC_PQR]); 241 } 242 return ((raidz_rec_f) NULL); 243 } 244 245 /* 246 * Select data reconstruction method for raidz_map 247 * @parity_valid - Parity validity flag 248 * @dt - Failed data index array 249 * @nbaddata - Number of failed data columns 250 */ 251 int 252 vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, 253 const int *dt, const int nbaddata) 254 { 255 raidz_rec_f rec_fn = NULL; 256 257 switch (raidz_parity(rm)) { 258 case PARITY_P: 259 rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); 260 break; 261 case PARITY_PQ: 262 rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); 263 break; 264 case PARITY_PQR: 265 rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); 266 break; 267 default: 268 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", 269 (uint_t)raidz_parity(rm)); 270 break; 271 } 272 273 if (rec_fn == NULL) 274 return (RAIDZ_ORIGINAL_IMPL); 275 else 276 return (rec_fn(rm, dt)); 277 } 278 279 const char *raidz_gen_name[] = { 280 "gen_p", "gen_pq", "gen_pqr" 281 }; 282 const char *raidz_rec_name[] = { 283 "rec_p", "rec_q", "rec_r", 284 "rec_pq", "rec_pr", "rec_qr", "rec_pqr" 285 }; 286 287 #if defined(_KERNEL) 288 289 #define BENCH_D_COLS (8ULL) 290 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) 291 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ 292 #define BENCH_NS MSEC2NSEC(25) /* 25ms */ 293 294 typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); 295 296 static void 297 benchmark_gen_impl(raidz_map_t *rm, const int fn) 298 { 299 (void) fn; 300 vdev_raidz_generate_parity(rm); 301 } 302 303 static void 304 benchmark_rec_impl(raidz_map_t *rm, const int fn) 305 { 306 static const int rec_tgt[7][3] = { 307 {1, 2, 3}, /* rec_p: bad QR & D[0] */ 308 {0, 2, 3}, /* rec_q: bad PR & D[0] */ 309 {0, 1, 3}, /* rec_r: bad PQ & D[0] */ 310 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ 311 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ 312 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ 313 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ 314 }; 315 316 vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); 317 } 318 319 /* 320 * Benchmarking of all supported implementations (raidz_supp_impl_cnt) 321 * is performed by setting the rm_ops pointer and calling the top level 322 * generate/reconstruct methods of bench_rm. 323 */ 324 static void 325 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) 326 { 327 uint64_t run_cnt, speed, best_speed = 0; 328 hrtime_t t_start, t_diff; 329 raidz_impl_ops_t *curr_impl; 330 raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; 331 int impl, i; 332 333 for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { 334 /* set an implementation to benchmark */ 335 curr_impl = raidz_supp_impl[impl]; 336 bench_rm->rm_ops = curr_impl; 337 338 run_cnt = 0; 339 t_start = gethrtime(); 340 341 do { 342 for (i = 0; i < 25; i++, run_cnt++) 343 bench_fn(bench_rm, fn); 344 345 t_diff = gethrtime() - t_start; 346 } while (t_diff < BENCH_NS); 347 348 speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; 349 speed /= (t_diff * BENCH_COLS); 350 351 if (bench_fn == benchmark_gen_impl) 352 raidz_impl_kstats[impl].gen[fn] = speed; 353 else 354 raidz_impl_kstats[impl].rec[fn] = speed; 355 356 /* Update fastest implementation method */ 357 if (speed > best_speed) { 358 best_speed = speed; 359 360 if (bench_fn == benchmark_gen_impl) { 361 fstat->gen[fn] = impl; 362 vdev_raidz_fastest_impl.gen[fn] = 363 curr_impl->gen[fn]; 364 } else { 365 fstat->rec[fn] = impl; 366 vdev_raidz_fastest_impl.rec[fn] = 367 curr_impl->rec[fn]; 368 } 369 } 370 } 371 } 372 #endif 373 374 /* 375 * Initialize and benchmark all supported implementations. 376 */ 377 static void 378 benchmark_raidz(void) 379 { 380 raidz_impl_ops_t *curr_impl; 381 int i, c; 382 383 /* Move supported impl into raidz_supp_impl */ 384 for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { 385 curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; 386 387 if (curr_impl->init) 388 curr_impl->init(); 389 390 if (curr_impl->is_supported()) 391 raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; 392 } 393 membar_producer(); /* complete raidz_supp_impl[] init */ 394 raidz_supp_impl_cnt = c; /* number of supported impl */ 395 396 #if defined(_KERNEL) 397 zio_t *bench_zio = NULL; 398 raidz_map_t *bench_rm = NULL; 399 uint64_t bench_parity; 400 401 /* Fake a zio and run the benchmark on a warmed up buffer */ 402 bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 403 bench_zio->io_offset = 0; 404 bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ 405 bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); 406 memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); 407 408 /* Benchmark parity generation methods */ 409 for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { 410 bench_parity = fn + 1; 411 /* New raidz_map is needed for each generate_p/q/r */ 412 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, 413 BENCH_D_COLS + bench_parity, bench_parity); 414 415 benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); 416 417 vdev_raidz_map_free(bench_rm); 418 } 419 420 /* Benchmark data reconstruction methods */ 421 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, 422 BENCH_COLS, PARITY_PQR); 423 424 for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) 425 benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); 426 427 vdev_raidz_map_free(bench_rm); 428 429 /* cleanup the bench zio */ 430 abd_free(bench_zio->io_abd); 431 kmem_free(bench_zio, sizeof (zio_t)); 432 #else 433 /* 434 * Skip the benchmark in user space to avoid impacting libzpool 435 * consumers (zdb, zhack, zinject, ztest). The last implementation 436 * is assumed to be the fastest and used by default. 437 */ 438 memcpy(&vdev_raidz_fastest_impl, 439 raidz_supp_impl[raidz_supp_impl_cnt - 1], 440 sizeof (vdev_raidz_fastest_impl)); 441 strcpy(vdev_raidz_fastest_impl.name, "fastest"); 442 #endif /* _KERNEL */ 443 } 444 445 void 446 vdev_raidz_math_init(void) 447 { 448 /* Determine the fastest available implementation. */ 449 benchmark_raidz(); 450 451 /* Finish initialization */ 452 atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); 453 raidz_math_initialized = B_TRUE; 454 } 455 456 void 457 vdev_raidz_math_fini(void) 458 { 459 raidz_impl_ops_t const *curr_impl; 460 461 for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { 462 curr_impl = raidz_all_maths[i]; 463 if (curr_impl->fini) 464 curr_impl->fini(); 465 } 466 } 467 468 static const struct { 469 char *name; 470 uint32_t sel; 471 } math_impl_opts[] = { 472 { "cycle", IMPL_CYCLE }, 473 { "fastest", IMPL_FASTEST }, 474 { "original", IMPL_ORIGINAL }, 475 { "scalar", IMPL_SCALAR } 476 }; 477 478 /* 479 * Function sets desired raidz implementation. 480 * 481 * If we are called before init(), user preference will be saved in 482 * user_sel_impl, and applied in later init() call. This occurs when module 483 * parameter is specified on module load. Otherwise, directly update 484 * zfs_vdev_raidz_impl. 485 * 486 * @val Name of raidz implementation to use 487 * @param Unused. 488 */ 489 int 490 vdev_raidz_impl_set(const char *val) 491 { 492 int err = -EINVAL; 493 char req_name[RAIDZ_IMPL_NAME_MAX]; 494 uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl); 495 size_t i; 496 497 /* sanitize input */ 498 i = strnlen(val, RAIDZ_IMPL_NAME_MAX); 499 if (i == 0 || i == RAIDZ_IMPL_NAME_MAX) 500 return (err); 501 502 strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX); 503 while (i > 0 && !!isspace(req_name[i-1])) 504 i--; 505 req_name[i] = '\0'; 506 507 /* Check mandatory options */ 508 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) { 509 if (strcmp(req_name, math_impl_opts[i].name) == 0) { 510 impl = math_impl_opts[i].sel; 511 err = 0; 512 break; 513 } 514 } 515 516 /* check all supported impl if init() was already called */ 517 if (err != 0 && raidz_math_initialized) { 518 /* check all supported implementations */ 519 for (i = 0; i < raidz_supp_impl_cnt; i++) { 520 if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) { 521 impl = i; 522 err = 0; 523 break; 524 } 525 } 526 } 527 528 if (err == 0) { 529 if (raidz_math_initialized) 530 atomic_swap_32(&zfs_vdev_raidz_impl, impl); 531 else 532 atomic_swap_32(&user_sel_impl, impl); 533 } 534 535 return (err); 536 } 537 538 #if defined(_KERNEL) && defined(__linux__) 539 540 static int 541 zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) 542 { 543 return (vdev_raidz_impl_set(val)); 544 } 545 546 static int 547 zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) 548 { 549 int i, cnt = 0; 550 char *fmt; 551 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 552 553 ASSERT(raidz_math_initialized); 554 555 /* list mandatory options */ 556 for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { 557 fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; 558 cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); 559 } 560 561 /* list all supported implementations */ 562 for (i = 0; i < raidz_supp_impl_cnt; i++) { 563 fmt = (i == impl) ? "[%s] " : "%s "; 564 cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); 565 } 566 567 return (cnt); 568 } 569 570 module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, 571 zfs_vdev_raidz_impl_get, NULL, 0644); 572 MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); 573 #endif 574