1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 23 */ 24 25 #include <sys/zfs_context.h> 26 #include <sys/types.h> 27 #include <sys/zio.h> 28 #include <sys/debug.h> 29 #include <sys/zfs_debug.h> 30 #include <sys/vdev_raidz.h> 31 #include <sys/vdev_raidz_impl.h> 32 #include <sys/simd.h> 33 34 #ifndef isspace 35 #define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || \ 36 (c) == '\r' || (c) == '\f' || (c) == '\013') 37 #endif 38 39 extern boolean_t raidz_will_scalar_work(void); 40 41 /* Opaque implementation with NULL methods to represent original methods */ 42 static const raidz_impl_ops_t vdev_raidz_original_impl = { 43 .name = "original", 44 .is_supported = raidz_will_scalar_work, 45 }; 46 47 /* RAIDZ parity op that contain the fastest methods */ 48 static raidz_impl_ops_t vdev_raidz_fastest_impl = { 49 .name = "fastest" 50 }; 51 52 /* All compiled in implementations */ 53 const raidz_impl_ops_t *raidz_all_maths[] = { 54 &vdev_raidz_original_impl, 55 &vdev_raidz_scalar_impl, 56 #if defined(__amd64) 57 &vdev_raidz_sse2_impl, 58 &vdev_raidz_ssse3_impl, 59 &vdev_raidz_avx2_impl, 60 #endif 61 }; 62 63 /* Indicate that benchmark has been completed */ 64 static boolean_t raidz_math_initialized = B_FALSE; 65 66 /* Select raidz implementation */ 67 #define IMPL_FASTEST (UINT32_MAX) 68 #define IMPL_CYCLE (UINT32_MAX - 1) 69 #define IMPL_ORIGINAL (0) 70 #define IMPL_SCALAR (1) 71 72 #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i)) 73 74 static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; 75 static uint32_t user_sel_impl = IMPL_FASTEST; 76 77 /* Hold all supported implementations */ 78 static size_t raidz_supp_impl_cnt = 0; 79 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; 80 81 #if defined(_KERNEL) 82 /* 83 * kstats values for supported implementations 84 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] 85 * 86 * PORTING NOTE: 87 * On illumos this is not a kstat. OpenZFS uses their home-grown kstat code 88 * which implements a free-form kstat using additional functionality that does 89 * not exist in illumos. Because there are no software consumers of this 90 * information, we omit a kstat API. If an administrator needs to see this 91 * data for some reason, they can use mdb. 92 * 93 * The format of the kstat data on OpenZFS would be a "header" that looks like 94 * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name" 95 * arrays, starting with the parity function "implementation" name): 96 * impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr 97 * This is followed by a row for each parity function implementation, showing 98 * the "speed" values calculated for that implementation for each of the 99 * parity generation and reconstruction functions in the "raidz_all_maths" 100 * array. 101 */ 102 static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; 103 104 #endif 105 106 /* 107 * Returns the RAIDZ operations for raidz_map() parity calculations. When 108 * a SIMD implementation is not allowed in the current context, then fallback 109 * to the fastest generic implementation. 110 */ 111 const raidz_impl_ops_t * 112 vdev_raidz_math_get_ops(void) 113 { 114 if (!kfpu_allowed()) 115 return (&vdev_raidz_scalar_impl); 116 117 raidz_impl_ops_t *ops = NULL; 118 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 119 120 switch (impl) { 121 case IMPL_FASTEST: 122 ASSERT(raidz_math_initialized); 123 ops = &vdev_raidz_fastest_impl; 124 break; 125 case IMPL_CYCLE: 126 /* Cycle through all supported implementations */ 127 ASSERT(raidz_math_initialized); 128 ASSERT3U(raidz_supp_impl_cnt, >, 0); 129 static size_t cycle_impl_idx = 0; 130 size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; 131 ops = raidz_supp_impl[idx]; 132 break; 133 case IMPL_ORIGINAL: 134 ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; 135 break; 136 case IMPL_SCALAR: 137 ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; 138 break; 139 default: 140 ASSERT3U(impl, <, raidz_supp_impl_cnt); 141 ASSERT3U(raidz_supp_impl_cnt, >, 0); 142 if (impl < ARRAY_SIZE(raidz_all_maths)) 143 ops = raidz_supp_impl[impl]; 144 break; 145 } 146 147 ASSERT3P(ops, !=, NULL); 148 149 return (ops); 150 } 151 152 /* 153 * Select parity generation method for raidz_map 154 */ 155 int 156 vdev_raidz_math_generate(raidz_map_t *rm) 157 { 158 raidz_gen_f gen_parity = NULL; 159 160 switch (raidz_parity(rm)) { 161 case 1: 162 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; 163 break; 164 case 2: 165 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; 166 break; 167 case 3: 168 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; 169 break; 170 default: 171 gen_parity = NULL; 172 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", 173 (uint_t)raidz_parity(rm)); 174 break; 175 } 176 177 /* if method is NULL execute the original implementation */ 178 if (gen_parity == NULL) 179 return (RAIDZ_ORIGINAL_IMPL); 180 181 gen_parity(rm); 182 183 return (0); 184 } 185 186 static raidz_rec_f 187 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, 188 const int nbaddata) 189 { 190 if (nbaddata == 1 && parity_valid[CODE_P]) { 191 return (rm->rm_ops->rec[RAIDZ_REC_P]); 192 } 193 return ((raidz_rec_f) NULL); 194 } 195 196 static raidz_rec_f 197 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, 198 const int nbaddata) 199 { 200 if (nbaddata == 1) { 201 if (parity_valid[CODE_P]) { 202 return (rm->rm_ops->rec[RAIDZ_REC_P]); 203 } else if (parity_valid[CODE_Q]) { 204 return (rm->rm_ops->rec[RAIDZ_REC_Q]); 205 } 206 } else if (nbaddata == 2 && 207 parity_valid[CODE_P] && parity_valid[CODE_Q]) { 208 return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 209 } 210 return ((raidz_rec_f) NULL); 211 } 212 213 static raidz_rec_f 214 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, 215 const int nbaddata) 216 { 217 if (nbaddata == 1) { 218 if (parity_valid[CODE_P]) { 219 return (rm->rm_ops->rec[RAIDZ_REC_P]); 220 } else if (parity_valid[CODE_Q]) { 221 return (rm->rm_ops->rec[RAIDZ_REC_Q]); 222 } else if (parity_valid[CODE_R]) { 223 return (rm->rm_ops->rec[RAIDZ_REC_R]); 224 } 225 } else if (nbaddata == 2) { 226 if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { 227 return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 228 } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { 229 return (rm->rm_ops->rec[RAIDZ_REC_PR]); 230 } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { 231 return (rm->rm_ops->rec[RAIDZ_REC_QR]); 232 } 233 } else if (nbaddata == 3 && 234 parity_valid[CODE_P] && parity_valid[CODE_Q] && 235 parity_valid[CODE_R]) { 236 return (rm->rm_ops->rec[RAIDZ_REC_PQR]); 237 } 238 return ((raidz_rec_f) NULL); 239 } 240 241 /* 242 * Select data reconstruction method for raidz_map 243 * @parity_valid - Parity validity flag 244 * @dt - Failed data index array 245 * @nbaddata - Number of failed data columns 246 */ 247 int 248 vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, 249 const int *dt, const int nbaddata) 250 { 251 raidz_rec_f rec_fn = NULL; 252 253 switch (raidz_parity(rm)) { 254 case PARITY_P: 255 rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); 256 break; 257 case PARITY_PQ: 258 rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); 259 break; 260 case PARITY_PQR: 261 rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); 262 break; 263 default: 264 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", 265 (uint_t)raidz_parity(rm)); 266 break; 267 } 268 269 if (rec_fn == NULL) 270 return (RAIDZ_ORIGINAL_IMPL); 271 else 272 return (rec_fn(rm, dt)); 273 } 274 275 const char *raidz_gen_name[] = { 276 "gen_p", "gen_pq", "gen_pqr" 277 }; 278 const char *raidz_rec_name[] = { 279 "rec_p", "rec_q", "rec_r", 280 "rec_pq", "rec_pr", "rec_qr", "rec_pqr" 281 }; 282 283 #if defined(_KERNEL) 284 285 #define BENCH_D_COLS (8ULL) 286 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) 287 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ 288 #define BENCH_NS MSEC2NSEC(1) /* 1ms */ 289 290 typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); 291 292 static void 293 benchmark_gen_impl(raidz_map_t *rm, const int fn) 294 { 295 (void) fn; 296 vdev_raidz_generate_parity(rm); 297 } 298 299 static void 300 benchmark_rec_impl(raidz_map_t *rm, const int fn) 301 { 302 static const int rec_tgt[7][3] = { 303 {1, 2, 3}, /* rec_p: bad QR & D[0] */ 304 {0, 2, 3}, /* rec_q: bad PR & D[0] */ 305 {0, 1, 3}, /* rec_r: bad PQ & D[0] */ 306 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ 307 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ 308 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ 309 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ 310 }; 311 312 vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); 313 } 314 315 /* 316 * Benchmarking of all supported implementations (raidz_supp_impl_cnt) 317 * is performed by setting the rm_ops pointer and calling the top level 318 * generate/reconstruct methods of bench_rm. 319 */ 320 static void 321 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) 322 { 323 uint64_t run_cnt, speed, best_speed = 0; 324 hrtime_t t_start, t_diff; 325 raidz_impl_ops_t *curr_impl; 326 raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; 327 int impl, i; 328 329 for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { 330 /* set an implementation to benchmark */ 331 curr_impl = raidz_supp_impl[impl]; 332 bench_rm->rm_ops = curr_impl; 333 334 run_cnt = 0; 335 t_start = gethrtime(); 336 337 do { 338 for (i = 0; i < 5; i++, run_cnt++) 339 bench_fn(bench_rm, fn); 340 341 t_diff = gethrtime() - t_start; 342 } while (t_diff < BENCH_NS); 343 344 speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; 345 speed /= (t_diff * BENCH_COLS); 346 347 if (bench_fn == benchmark_gen_impl) 348 raidz_impl_kstats[impl].gen[fn] = speed; 349 else 350 raidz_impl_kstats[impl].rec[fn] = speed; 351 352 /* Update fastest implementation method */ 353 if (speed > best_speed) { 354 best_speed = speed; 355 356 if (bench_fn == benchmark_gen_impl) { 357 fstat->gen[fn] = impl; 358 vdev_raidz_fastest_impl.gen[fn] = 359 curr_impl->gen[fn]; 360 } else { 361 fstat->rec[fn] = impl; 362 vdev_raidz_fastest_impl.rec[fn] = 363 curr_impl->rec[fn]; 364 } 365 } 366 } 367 } 368 #endif 369 370 /* 371 * Initialize and benchmark all supported implementations. 372 */ 373 static void 374 benchmark_raidz(void) 375 { 376 raidz_impl_ops_t *curr_impl; 377 int i, c; 378 379 /* Move supported impl into raidz_supp_impl */ 380 for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { 381 curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; 382 383 if (curr_impl->init) 384 curr_impl->init(); 385 386 if (curr_impl->is_supported()) 387 raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; 388 } 389 membar_producer(); /* complete raidz_supp_impl[] init */ 390 raidz_supp_impl_cnt = c; /* number of supported impl */ 391 392 #if defined(_KERNEL) 393 zio_t *bench_zio = NULL; 394 raidz_map_t *bench_rm = NULL; 395 uint64_t bench_parity; 396 397 /* Fake a zio and run the benchmark on a warmed up buffer */ 398 bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 399 bench_zio->io_offset = 0; 400 bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ 401 bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); 402 memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); 403 404 /* Benchmark parity generation methods */ 405 for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { 406 bench_parity = fn + 1; 407 /* New raidz_map is needed for each generate_p/q/r */ 408 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, 409 BENCH_D_COLS + bench_parity, bench_parity); 410 411 benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); 412 413 vdev_raidz_map_free(bench_rm); 414 } 415 416 /* Benchmark data reconstruction methods */ 417 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, 418 BENCH_COLS, PARITY_PQR); 419 420 for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) 421 benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); 422 423 vdev_raidz_map_free(bench_rm); 424 425 /* cleanup the bench zio */ 426 abd_free(bench_zio->io_abd); 427 kmem_free(bench_zio, sizeof (zio_t)); 428 #else 429 /* 430 * Skip the benchmark in user space to avoid impacting libzpool 431 * consumers (zdb, zhack, zinject, ztest). The last implementation 432 * is assumed to be the fastest and used by default. 433 */ 434 memcpy(&vdev_raidz_fastest_impl, 435 raidz_supp_impl[raidz_supp_impl_cnt - 1], 436 sizeof (vdev_raidz_fastest_impl)); 437 strcpy(vdev_raidz_fastest_impl.name, "fastest"); 438 #endif /* _KERNEL */ 439 } 440 441 void 442 vdev_raidz_math_init(void) 443 { 444 /* Determine the fastest available implementation. */ 445 benchmark_raidz(); 446 447 /* Finish initialization */ 448 atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); 449 raidz_math_initialized = B_TRUE; 450 } 451 452 void 453 vdev_raidz_math_fini(void) 454 { 455 raidz_impl_ops_t const *curr_impl; 456 457 for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { 458 curr_impl = raidz_all_maths[i]; 459 if (curr_impl->fini) 460 curr_impl->fini(); 461 } 462 } 463 464 static const struct { 465 char *name; 466 uint32_t sel; 467 } math_impl_opts[] = { 468 { "cycle", IMPL_CYCLE }, 469 { "fastest", IMPL_FASTEST }, 470 { "original", IMPL_ORIGINAL }, 471 { "scalar", IMPL_SCALAR } 472 }; 473 474 /* 475 * Function sets desired raidz implementation. 476 * 477 * If we are called before init(), user preference will be saved in 478 * user_sel_impl, and applied in later init() call. This occurs when module 479 * parameter is specified on module load. Otherwise, directly update 480 * zfs_vdev_raidz_impl. 481 * 482 * @val Name of raidz implementation to use 483 * @param Unused. 484 */ 485 int 486 vdev_raidz_impl_set(const char *val) 487 { 488 int err = EINVAL; 489 char req_name[RAIDZ_IMPL_NAME_MAX]; 490 uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl); 491 size_t i; 492 493 /* sanitize input */ 494 i = strnlen(val, RAIDZ_IMPL_NAME_MAX); 495 if (i == 0 || i == RAIDZ_IMPL_NAME_MAX) 496 return (err); 497 498 strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX); 499 while (i > 0 && !!isspace(req_name[i-1])) 500 i--; 501 req_name[i] = '\0'; 502 503 /* Check mandatory options */ 504 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) { 505 if (strcmp(req_name, math_impl_opts[i].name) == 0) { 506 impl = math_impl_opts[i].sel; 507 err = 0; 508 break; 509 } 510 } 511 512 /* check all supported impl if init() was already called */ 513 if (err != 0 && raidz_math_initialized) { 514 /* check all supported implementations */ 515 for (i = 0; i < raidz_supp_impl_cnt; i++) { 516 if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) { 517 impl = i; 518 err = 0; 519 break; 520 } 521 } 522 } 523 524 if (err == 0) { 525 if (raidz_math_initialized) 526 atomic_swap_32(&zfs_vdev_raidz_impl, impl); 527 else 528 atomic_swap_32(&user_sel_impl, impl); 529 } 530 531 return (err); 532 } 533 534 #if defined(_KERNEL) && defined(__linux__) 535 536 static int 537 zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) 538 { 539 return (vdev_raidz_impl_set(val)); 540 } 541 542 static int 543 zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) 544 { 545 int i, cnt = 0; 546 char *fmt; 547 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 548 549 ASSERT(raidz_math_initialized); 550 551 /* list mandatory options */ 552 for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { 553 fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; 554 cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); 555 } 556 557 /* list all supported implementations */ 558 for (i = 0; i < raidz_supp_impl_cnt; i++) { 559 fmt = (i == impl) ? "[%s] " : "%s "; 560 cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); 561 } 562 563 return (cnt); 564 } 565 566 module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, 567 zfs_vdev_raidz_impl_get, NULL, 0644); 568 MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); 569 #endif 570