/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/types.h>
#include <sys/zio.h>
#include <sys/debug.h>
#include <sys/zfs_debug.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/simd.h>

/* Fallback when no isspace() is visible in this compilation environment. */
#ifndef isspace
#define	isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || \
	(c) == '\r' || (c) == '\f' || (c) == '\013')
#endif

extern boolean_t raidz_will_scalar_work(void);

/* Opaque implementation with NULL methods to represent original methods */
static const raidz_impl_ops_t vdev_raidz_original_impl = {
	.name = "original",
	.is_supported = raidz_will_scalar_work,
};

/*
 * RAIDZ parity op that contain the fastest methods.
 * Populated per-function by the benchmark (or, in user space, copied from
 * the last supported implementation).
 */
static raidz_impl_ops_t vdev_raidz_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
const raidz_impl_ops_t *raidz_all_maths[] = {
	&vdev_raidz_original_impl,
	&vdev_raidz_scalar_impl,
};

/* Indicate that benchmark has been completed */
static boolean_t raidz_math_initialized = B_FALSE;

/* Select raidz implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_ORIGINAL	(0)
#define	IMPL_SCALAR	(1)

/* Volatile load so a concurrent selector update is observed exactly once. */
#define	RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))

/* Current selector; replaced with the saved user preference at init time. */
static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
/* Preference recorded by vdev_raidz_impl_set() before init() has run. */
static uint32_t user_sel_impl = IMPL_FASTEST;

/* Hold all supported implementations */
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];

#if defined(_KERNEL)
/*
 * kstats values for supported implementations
 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
 *
 * PORTING NOTE:
 * On illumos this is not a kstat. OpenZFS uses their home-grown kstat code
 * which implements a free-form kstat using additional functionality that does
 * not exist in illumos. Because there are no software consumers of this
 * information, we omit a kstat API. If an administrator needs to see this
 * data for some reason, they can use mdb.
 *
 * The format of the kstat data on OpenZFS would be a "header" that looks like
 * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name"
 * arrays, starting with the parity function "implementation" name):
 * impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr
 * This is followed by a row for each parity function implementation, showing
 * the "speed" values calculated for that implementation for each of the
 * parity generation and reconstruction functions in the "raidz_all_maths"
 * array.
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

#endif

/*
 * Returns the RAIDZ operations for raidz_map() parity calculations. When
 * a SIMD implementation is not allowed in the current context, then fallback
 * to the fastest generic implementation.
105 */ 106 const raidz_impl_ops_t * 107 vdev_raidz_math_get_ops(void) 108 { 109 /* 110 * illumos porting note: 111 * The following check from OpenZFS is disabled since we don't have 112 * this compiled in yet and we need to be able to change the 113 * implementation for the user-level test suite. 114 * 115 * if (!kfpu_allowed()) 116 * return (&vdev_raidz_scalar_impl); 117 */ 118 119 raidz_impl_ops_t *ops = NULL; 120 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 121 122 switch (impl) { 123 case IMPL_FASTEST: 124 ASSERT(raidz_math_initialized); 125 ops = &vdev_raidz_fastest_impl; 126 break; 127 case IMPL_CYCLE: 128 /* Cycle through all supported implementations */ 129 ASSERT(raidz_math_initialized); 130 ASSERT3U(raidz_supp_impl_cnt, >, 0); 131 static size_t cycle_impl_idx = 0; 132 size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; 133 ops = raidz_supp_impl[idx]; 134 break; 135 case IMPL_ORIGINAL: 136 ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; 137 break; 138 case IMPL_SCALAR: 139 ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; 140 break; 141 default: 142 ASSERT3U(impl, <, raidz_supp_impl_cnt); 143 ASSERT3U(raidz_supp_impl_cnt, >, 0); 144 if (impl < ARRAY_SIZE(raidz_all_maths)) 145 ops = raidz_supp_impl[impl]; 146 break; 147 } 148 149 ASSERT3P(ops, !=, NULL); 150 151 return (ops); 152 } 153 154 /* 155 * Select parity generation method for raidz_map 156 */ 157 int 158 vdev_raidz_math_generate(raidz_map_t *rm) 159 { 160 raidz_gen_f gen_parity = NULL; 161 162 switch (raidz_parity(rm)) { 163 case 1: 164 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; 165 break; 166 case 2: 167 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; 168 break; 169 case 3: 170 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; 171 break; 172 default: 173 gen_parity = NULL; 174 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", 175 (uint_t)raidz_parity(rm)); 176 break; 177 } 178 179 /* if method is NULL execute the original implementation */ 180 if (gen_parity == NULL) 181 return 
(RAIDZ_ORIGINAL_IMPL); 182 183 gen_parity(rm); 184 185 return (0); 186 } 187 188 static raidz_rec_f 189 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, 190 const int nbaddata) 191 { 192 if (nbaddata == 1 && parity_valid[CODE_P]) { 193 return (rm->rm_ops->rec[RAIDZ_REC_P]); 194 } 195 return ((raidz_rec_f) NULL); 196 } 197 198 static raidz_rec_f 199 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, 200 const int nbaddata) 201 { 202 if (nbaddata == 1) { 203 if (parity_valid[CODE_P]) { 204 return (rm->rm_ops->rec[RAIDZ_REC_P]); 205 } else if (parity_valid[CODE_Q]) { 206 return (rm->rm_ops->rec[RAIDZ_REC_Q]); 207 } 208 } else if (nbaddata == 2 && 209 parity_valid[CODE_P] && parity_valid[CODE_Q]) { 210 return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 211 } 212 return ((raidz_rec_f) NULL); 213 } 214 215 static raidz_rec_f 216 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, 217 const int nbaddata) 218 { 219 if (nbaddata == 1) { 220 if (parity_valid[CODE_P]) { 221 return (rm->rm_ops->rec[RAIDZ_REC_P]); 222 } else if (parity_valid[CODE_Q]) { 223 return (rm->rm_ops->rec[RAIDZ_REC_Q]); 224 } else if (parity_valid[CODE_R]) { 225 return (rm->rm_ops->rec[RAIDZ_REC_R]); 226 } 227 } else if (nbaddata == 2) { 228 if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { 229 return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 230 } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { 231 return (rm->rm_ops->rec[RAIDZ_REC_PR]); 232 } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { 233 return (rm->rm_ops->rec[RAIDZ_REC_QR]); 234 } 235 } else if (nbaddata == 3 && 236 parity_valid[CODE_P] && parity_valid[CODE_Q] && 237 parity_valid[CODE_R]) { 238 return (rm->rm_ops->rec[RAIDZ_REC_PQR]); 239 } 240 return ((raidz_rec_f) NULL); 241 } 242 243 /* 244 * Select data reconstruction method for raidz_map 245 * @parity_valid - Parity validity flag 246 * @dt - Failed data index array 247 * @nbaddata - Number of failed data columns 248 */ 249 int 250 
vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, 251 const int *dt, const int nbaddata) 252 { 253 raidz_rec_f rec_fn = NULL; 254 255 switch (raidz_parity(rm)) { 256 case PARITY_P: 257 rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); 258 break; 259 case PARITY_PQ: 260 rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); 261 break; 262 case PARITY_PQR: 263 rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); 264 break; 265 default: 266 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", 267 (uint_t)raidz_parity(rm)); 268 break; 269 } 270 271 if (rec_fn == NULL) 272 return (RAIDZ_ORIGINAL_IMPL); 273 else 274 return (rec_fn(rm, dt)); 275 } 276 277 const char *raidz_gen_name[] = { 278 "gen_p", "gen_pq", "gen_pqr" 279 }; 280 const char *raidz_rec_name[] = { 281 "rec_p", "rec_q", "rec_r", 282 "rec_pq", "rec_pr", "rec_qr", "rec_pqr" 283 }; 284 285 #if defined(_KERNEL) 286 287 #define BENCH_D_COLS (8ULL) 288 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) 289 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ 290 #define BENCH_NS MSEC2NSEC(25) /* 25ms */ 291 292 typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); 293 294 static void 295 benchmark_gen_impl(raidz_map_t *rm, const int fn) 296 { 297 (void) fn; 298 vdev_raidz_generate_parity(rm); 299 } 300 301 static void 302 benchmark_rec_impl(raidz_map_t *rm, const int fn) 303 { 304 static const int rec_tgt[7][3] = { 305 {1, 2, 3}, /* rec_p: bad QR & D[0] */ 306 {0, 2, 3}, /* rec_q: bad PR & D[0] */ 307 {0, 1, 3}, /* rec_r: bad PQ & D[0] */ 308 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ 309 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ 310 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ 311 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ 312 }; 313 314 vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); 315 } 316 317 /* 318 * Benchmarking of all supported implementations (raidz_supp_impl_cnt) 319 * is performed by setting the rm_ops pointer and calling the top level 320 * 
generate/reconstruct methods of bench_rm. 321 */ 322 static void 323 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) 324 { 325 uint64_t run_cnt, speed, best_speed = 0; 326 hrtime_t t_start, t_diff; 327 raidz_impl_ops_t *curr_impl; 328 raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; 329 int impl, i; 330 331 for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { 332 /* set an implementation to benchmark */ 333 curr_impl = raidz_supp_impl[impl]; 334 bench_rm->rm_ops = curr_impl; 335 336 run_cnt = 0; 337 t_start = gethrtime(); 338 339 do { 340 for (i = 0; i < 25; i++, run_cnt++) 341 bench_fn(bench_rm, fn); 342 343 t_diff = gethrtime() - t_start; 344 } while (t_diff < BENCH_NS); 345 346 speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; 347 speed /= (t_diff * BENCH_COLS); 348 349 if (bench_fn == benchmark_gen_impl) 350 raidz_impl_kstats[impl].gen[fn] = speed; 351 else 352 raidz_impl_kstats[impl].rec[fn] = speed; 353 354 /* Update fastest implementation method */ 355 if (speed > best_speed) { 356 best_speed = speed; 357 358 if (bench_fn == benchmark_gen_impl) { 359 fstat->gen[fn] = impl; 360 vdev_raidz_fastest_impl.gen[fn] = 361 curr_impl->gen[fn]; 362 } else { 363 fstat->rec[fn] = impl; 364 vdev_raidz_fastest_impl.rec[fn] = 365 curr_impl->rec[fn]; 366 } 367 } 368 } 369 } 370 #endif 371 372 /* 373 * Initialize and benchmark all supported implementations. 
 */
static void
benchmark_raidz(void)
{
	raidz_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported impl into raidz_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];

		/* Give each implementation a chance to set itself up first */
		if (curr_impl->init)
			curr_impl->init();

		if (curr_impl->is_supported())
			raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
	}
	/*
	 * Publish the array contents before the count so a concurrent
	 * reader that sees the new count also sees the filled slots.
	 */
	membar_producer();		/* complete raidz_supp_impl[] init */
	raidz_supp_impl_cnt = c;	/* number of supported impl */

#if defined(_KERNEL)
	zio_t *bench_zio = NULL;
	raidz_map_t *bench_rm = NULL;
	uint64_t bench_parity;

	/* Fake a zio and run the benchmark on a warmed up buffer */
	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
	bench_zio->io_offset = 0;
	bench_zio->io_size = BENCH_ZIO_SIZE;	/* only data columns */
	bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);

	/* Benchmark parity generation methods */
	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
		/* gen function index maps to parity count: P=1, PQ=2, PQR=3 */
		bench_parity = fn + 1;
		/* New raidz_map is needed for each generate_p/q/r */
		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
		    BENCH_D_COLS + bench_parity, bench_parity);

		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);

		vdev_raidz_map_free(bench_rm);
	}

	/* Benchmark data reconstruction methods */
	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
	    BENCH_COLS, PARITY_PQR);

	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);

	vdev_raidz_map_free(bench_rm);

	/* cleanup the bench zio */
	abd_free(bench_zio->io_abd);
	kmem_free(bench_zio, sizeof (zio_t));
#else
	/*
	 * Skip the benchmark in user space to avoid impacting libzpool
	 * consumers (zdb, zhack, zinject, ztest). The last implementation
	 * is assumed to be the fastest and used by default.
	 */
	memcpy(&vdev_raidz_fastest_impl,
	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
	    sizeof (vdev_raidz_fastest_impl));
	strcpy(vdev_raidz_fastest_impl.name, "fastest");
#endif /* _KERNEL */
}

/*
 * Module-init entry point: benchmark all implementations, then apply any
 * implementation choice the user made before initialization completed.
 */
void
vdev_raidz_math_init(void)
{
	/* Determine the fastest available implementation. */
	benchmark_raidz();

	/* Finish initialization */
	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
	raidz_math_initialized = B_TRUE;
}

/*
 * Module-fini entry point: run each implementation's teardown hook.
 */
void
vdev_raidz_math_fini(void)
{
	raidz_impl_ops_t const *curr_impl;

	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
		curr_impl = raidz_all_maths[i];
		if (curr_impl->fini)
			curr_impl->fini();
	}
}

/* Name-to-selector table for the always-available implementation choices */
static const struct {
	char *name;
	uint32_t sel;
} math_impl_opts[] = {
	{ "cycle", IMPL_CYCLE },
	{ "fastest", IMPL_FASTEST },
	{ "original", IMPL_ORIGINAL },
	{ "scalar", IMPL_SCALAR }
};

/*
 * Function sets desired raidz implementation.
 *
 * If we are called before init(), user preference will be saved in
 * user_sel_impl, and applied in later init() call. This occurs when module
 * parameter is specified on module load. Otherwise, directly update
 * zfs_vdev_raidz_impl.
 *
 * @val		Name of raidz implementation to use
 * @param	Unused.
486 */ 487 int 488 vdev_raidz_impl_set(const char *val) 489 { 490 int err = -EINVAL; 491 char req_name[RAIDZ_IMPL_NAME_MAX]; 492 uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl); 493 size_t i; 494 495 /* sanitize input */ 496 i = strnlen(val, RAIDZ_IMPL_NAME_MAX); 497 if (i == 0 || i == RAIDZ_IMPL_NAME_MAX) 498 return (err); 499 500 strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX); 501 while (i > 0 && !!isspace(req_name[i-1])) 502 i--; 503 req_name[i] = '\0'; 504 505 /* Check mandatory options */ 506 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) { 507 if (strcmp(req_name, math_impl_opts[i].name) == 0) { 508 impl = math_impl_opts[i].sel; 509 err = 0; 510 break; 511 } 512 } 513 514 /* check all supported impl if init() was already called */ 515 if (err != 0 && raidz_math_initialized) { 516 /* check all supported implementations */ 517 for (i = 0; i < raidz_supp_impl_cnt; i++) { 518 if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) { 519 impl = i; 520 err = 0; 521 break; 522 } 523 } 524 } 525 526 if (err == 0) { 527 if (raidz_math_initialized) 528 atomic_swap_32(&zfs_vdev_raidz_impl, impl); 529 else 530 atomic_swap_32(&user_sel_impl, impl); 531 } 532 533 return (err); 534 } 535 536 #if defined(_KERNEL) && defined(__linux__) 537 538 static int 539 zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) 540 { 541 return (vdev_raidz_impl_set(val)); 542 } 543 544 static int 545 zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) 546 { 547 int i, cnt = 0; 548 char *fmt; 549 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 550 551 ASSERT(raidz_math_initialized); 552 553 /* list mandatory options */ 554 for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { 555 fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; 556 cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); 557 } 558 559 /* list all supported implementations */ 560 for (i = 0; i < raidz_supp_impl_cnt; i++) { 561 fmt = (i == impl) ? 
"[%s] " : "%s "; 562 cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); 563 } 564 565 return (cnt); 566 } 567 568 module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, 569 zfs_vdev_raidz_impl_get, NULL, 0644); 570 MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); 571 #endif 572