1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/time.h> 28 #include <sys/wait.h> 29 #include <sys/zio.h> 30 #include <umem.h> 31 #include <sys/vdev_raidz.h> 32 #include <sys/vdev_raidz_impl.h> 33 #include <assert.h> 34 #include <stdio.h> 35 #include "raidz_test.h" 36 37 static int *rand_data; 38 raidz_test_opts_t rto_opts; 39 40 static char pid_s[16]; 41 42 static void sig_handler(int signo) 43 { 44 int old_errno = errno; 45 struct sigaction action; 46 /* 47 * Restore default action and re-raise signal so SIGSEGV and 48 * SIGABRT can trigger a core dump. 49 */ 50 action.sa_handler = SIG_DFL; 51 sigemptyset(&action.sa_mask); 52 action.sa_flags = 0; 53 (void) sigaction(signo, &action, NULL); 54 55 if (rto_opts.rto_gdb) { 56 pid_t pid = fork(); 57 if (pid == 0) { 58 execlp("gdb", "gdb", "-ex", "set pagination 0", 59 "-p", pid_s, NULL); 60 _exit(-1); 61 } else if (pid > 0) 62 while (waitpid(pid, NULL, 0) == -1 && errno == EINTR) 63 ; 64 } 65 66 raise(signo); 67 errno = old_errno; 68 } 69 70 static void print_opts(raidz_test_opts_t *opts, boolean_t force) 71 { 72 char *verbose; 73 switch (opts->rto_v) { 74 case D_ALL: 75 verbose = "no"; 76 break; 77 case D_INFO: 78 verbose = "info"; 79 break; 80 case D_DEBUG: 81 default: 82 verbose = "debug"; 83 break; 84 } 85 86 if (force || opts->rto_v >= D_INFO) { 87 (void) fprintf(stdout, DBLSEP "Running with options:\n" 88 " (-a) zio ashift : %zu\n" 89 " (-o) zio offset : 1 << %zu\n" 90 " (-e) expanded map : %s\n" 91 " (-r) reflow offset : %llx\n" 92 " (-d) number of raidz data columns : %zu\n" 93 " (-s) size of DATA : 1 << %zu\n" 94 " (-S) sweep parameters : %s \n" 95 " (-v) verbose : %s \n\n", 96 opts->rto_ashift, /* -a */ 97 ilog2(opts->rto_offset), /* -o */ 98 opts->rto_expand ? "yes" : "no", /* -e */ 99 (u_longlong_t)opts->rto_expand_offset, /* -r */ 100 opts->rto_dcols, /* -d */ 101 ilog2(opts->rto_dsize), /* -s */ 102 opts->rto_sweep ? "yes" : "no", /* -S */ 103 verbose); /* -v */ 104 } 105 } 106 107 static void usage(boolean_t requested) 108 { 109 const raidz_test_opts_t *o = &rto_opts_defaults; 110 111 FILE *fp = requested ? stdout : stderr; 112 113 (void) fprintf(fp, "Usage:\n" 114 "\t[-a zio ashift (default: %zu)]\n" 115 "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" 116 "\t[-d number of raidz data columns (default: %zu)]\n" 117 "\t[-s zio size, exponent radix 2 (default: %zu)]\n" 118 "\t[-S parameter sweep (default: %s)]\n" 119 "\t[-t timeout for parameter sweep test]\n" 120 "\t[-B benchmark all raidz implementations]\n" 121 "\t[-e use expanded raidz map (default: %s)]\n" 122 "\t[-r expanded raidz map reflow offset (default: %llx)]\n" 123 "\t[-v increase verbosity (default: %d)]\n" 124 "\t[-h (print help)]\n" 125 "\t[-T test the test, see if failure would be detected]\n" 126 "\t[-D debug (attach gdb on SIGSEGV)]\n" 127 "", 128 o->rto_ashift, /* -a */ 129 ilog2(o->rto_offset), /* -o */ 130 o->rto_dcols, /* -d */ 131 ilog2(o->rto_dsize), /* -s */ 132 rto_opts.rto_sweep ? "yes" : "no", /* -S */ 133 rto_opts.rto_expand ? "yes" : "no", /* -e */ 134 (u_longlong_t)o->rto_expand_offset, /* -r */ 135 o->rto_v); /* -v */ 136 137 exit(requested ? 0 : 1); 138 } 139 140 static void process_options(int argc, char **argv) 141 { 142 size_t value; 143 int opt; 144 145 raidz_test_opts_t *o = &rto_opts; 146 147 bcopy(&rto_opts_defaults, o, sizeof (*o)); 148 149 while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { 150 value = 0; 151 152 switch (opt) { 153 case 'a': 154 value = strtoull(optarg, NULL, 0); 155 o->rto_ashift = MIN(13, MAX(9, value)); 156 break; 157 case 'e': 158 o->rto_expand = 1; 159 break; 160 case 'r': 161 o->rto_expand_offset = strtoull(optarg, NULL, 0); 162 break; 163 case 'o': 164 value = strtoull(optarg, NULL, 0); 165 o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; 166 break; 167 case 'd': 168 value = strtoull(optarg, NULL, 0); 169 o->rto_dcols = MIN(255, MAX(1, value)); 170 break; 171 case 's': 172 value = strtoull(optarg, NULL, 0); 173 o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT, 174 MAX(SPA_MINBLOCKSHIFT, value)); 175 break; 176 case 't': 177 value = strtoull(optarg, NULL, 0); 178 o->rto_sweep_timeout = value; 179 break; 180 case 'v': 181 o->rto_v++; 182 break; 183 case 'S': 184 o->rto_sweep = 1; 185 break; 186 case 'B': 187 o->rto_benchmark = 1; 188 break; 189 case 'D': 190 o->rto_gdb = 1; 191 break; 192 case 'T': 193 o->rto_sanity = 1; 194 break; 195 case 'h': 196 usage(B_TRUE); 197 break; 198 case '?': 199 default: 200 usage(B_FALSE); 201 break; 202 } 203 } 204 } 205 206 #define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd) 207 #define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size) 208 209 #define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd) 210 #define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size) 211 212 static int 213 cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) 214 { 215 int r, i, ret = 0; 216 217 VERIFY(parity >= 1 && parity <= 3); 218 219 for (r = 0; r < rm->rm_nrows; r++) { 220 raidz_row_t * const rr = rm->rm_row[r]; 221 raidz_row_t * const rrg = opts->rm_golden->rm_row[r]; 222 for (i = 0; i < parity; i++) { 223 if (CODE_COL_SIZE(rrg, i) == 0) { 224 VERIFY0(CODE_COL_SIZE(rr, i)); 225 continue; 226 } 227 228 if (abd_cmp(CODE_COL(rr, i), 229 CODE_COL(rrg, i)) != 0) { 230 ret++; 231 LOG_OPT(D_DEBUG, opts, 232 "\nParity block [%d] different!\n", i); 233 } 234 } 235 } 236 return (ret); 237 } 238 239 static int 240 cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) 241 { 242 int r, i, dcols, ret = 0; 243 244 for (r = 0; r < rm->rm_nrows; r++) { 245 raidz_row_t *rr = rm->rm_row[r]; 246 raidz_row_t *rrg = opts->rm_golden->rm_row[r]; 247 dcols = opts->rm_golden->rm_row[0]->rr_cols - 248 raidz_parity(opts->rm_golden); 249 for (i = 0; i < dcols; i++) { 250 if (DATA_COL_SIZE(rrg, i) == 0) { 251 VERIFY0(DATA_COL_SIZE(rr, i)); 252 continue; 253 } 254 255 if (abd_cmp(DATA_COL(rrg, i), 256 DATA_COL(rr, i)) != 0) { 257 ret++; 258 259 LOG_OPT(D_DEBUG, opts, 260 "\nData block [%d] different!\n", i); 261 } 262 } 263 } 264 return (ret); 265 } 266 267 static int 268 init_rand(void *data, size_t size, void *private) 269 { 270 (void) private; 271 memcpy(data, rand_data, size); 272 return (0); 273 } 274 275 static void 276 corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) 277 { 278 for (int r = 0; r < rm->rm_nrows; r++) { 279 raidz_row_t *rr = rm->rm_row[r]; 280 for (int i = 0; i < cnt; i++) { 281 raidz_col_t *col = &rr->rr_col[tgts[i]]; 282 abd_iterate_func(col->rc_abd, 0, col->rc_size, 283 init_rand, NULL); 284 } 285 } 286 } 287 288 void 289 init_zio_abd(zio_t *zio) 290 { 291 abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); 292 } 293 294 static void 295 fini_raidz_map(zio_t **zio, raidz_map_t **rm) 296 { 297 vdev_raidz_map_free(*rm); 298 raidz_free((*zio)->io_abd, (*zio)->io_size); 299 umem_free(*zio, sizeof (zio_t)); 300 301 *zio = NULL; 302 *rm = NULL; 303 } 304 305 static int 306 init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) 307 { 308 int err = 0; 309 zio_t *zio_test; 310 raidz_map_t *rm_test; 311 const size_t total_ncols = opts->rto_dcols + parity; 312 313 if (opts->rm_golden) { 314 fini_raidz_map(&opts->zio_golden, &opts->rm_golden); 315 } 316 317 opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); 318 zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); 319 320 opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; 321 opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; 322 323 opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); 324 zio_test->io_abd = raidz_alloc(opts->rto_dsize); 325 326 init_zio_abd(opts->zio_golden); 327 init_zio_abd(zio_test); 328 329 VERIFY0(vdev_raidz_impl_set("original")); 330 331 if (opts->rto_expand) { 332 opts->rm_golden = 333 vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, 334 opts->zio_golden->io_size, opts->zio_golden->io_offset, 335 opts->rto_ashift, total_ncols+1, total_ncols, 336 parity, opts->rto_expand_offset); 337 rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, 338 zio_test->io_size, zio_test->io_offset, 339 opts->rto_ashift, total_ncols+1, total_ncols, 340 parity, opts->rto_expand_offset); 341 } else { 342 opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, 343 opts->rto_ashift, total_ncols, parity); 344 rm_test = vdev_raidz_map_alloc(zio_test, 345 opts->rto_ashift, total_ncols, parity); 346 } 347 348 VERIFY(opts->zio_golden); 349 VERIFY(opts->rm_golden); 350 351 vdev_raidz_generate_parity(opts->rm_golden); 352 vdev_raidz_generate_parity(rm_test); 353 354 /* sanity check */ 355 err |= cmp_data(opts, rm_test); 356 err |= cmp_code(opts, rm_test, parity); 357 358 if (err) 359 ERR("initializing the golden copy ... [FAIL]!\n"); 360 361 /* tear down raidz_map of test zio */ 362 fini_raidz_map(&zio_test, &rm_test); 363 364 return (err); 365 } 366 367 /* 368 * If reflow is not in progress, reflow_offset should be UINT64_MAX. 369 * For each row, if the row is entirely before reflow_offset, it will 370 * come from the new location. Otherwise this row will come from the 371 * old location. Therefore, rows that straddle the reflow_offset will 372 * come from the old location. 373 * 374 * NOTE: Until raidz expansion is implemented this function is only 375 * needed by raidz_test.c to the multi-row raid_map_t functionality. 376 */ 377 raidz_map_t * 378 vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, 379 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, 380 uint64_t nparity, uint64_t reflow_offset) 381 { 382 /* The zio's size in units of the vdev's minimum sector size. */ 383 uint64_t s = size >> ashift; 384 uint64_t q, r, bc, devidx, asize = 0, tot; 385 386 /* 387 * "Quotient": The number of data sectors for this stripe on all but 388 * the "big column" child vdevs that also contain "remainder" data. 389 * AKA "full rows" 390 */ 391 q = s / (logical_cols - nparity); 392 393 /* 394 * "Remainder": The number of partial stripe data sectors in this I/O. 395 * This will add a sector to some, but not all, child vdevs. 396 */ 397 r = s - q * (logical_cols - nparity); 398 399 /* The number of "big columns" - those which contain remainder data. */ 400 bc = (r == 0 ? 0 : r + nparity); 401 402 /* 403 * The total number of data and parity sectors associated with 404 * this I/O. 405 */ 406 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 407 408 /* How many rows contain data (not skip) */ 409 uint64_t rows = howmany(tot, logical_cols); 410 int cols = MIN(tot, logical_cols); 411 412 raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), 413 KM_SLEEP); 414 rm->rm_nrows = rows; 415 416 for (uint64_t row = 0; row < rows; row++) { 417 raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, 418 rr_col[cols]), KM_SLEEP); 419 rm->rm_row[row] = rr; 420 421 /* The starting RAIDZ (parent) vdev sector of the row. */ 422 uint64_t b = (offset >> ashift) + row * logical_cols; 423 424 /* 425 * If we are in the middle of a reflow, and any part of this 426 * row has not been copied, then use the old location of 427 * this row. 428 */ 429 int row_phys_cols = physical_cols; 430 if (b + (logical_cols - nparity) > reflow_offset >> ashift) 431 row_phys_cols--; 432 433 /* starting child of this row */ 434 uint64_t child_id = b % row_phys_cols; 435 /* The starting byte offset on each child vdev. */ 436 uint64_t child_offset = (b / row_phys_cols) << ashift; 437 438 /* 439 * We set cols to the entire width of the block, even 440 * if this row is shorter. This is needed because parity 441 * generation (for Q and R) needs to know the entire width, 442 * because it treats the short row as though it was 443 * full-width (and the "phantom" sectors were zero-filled). 444 * 445 * Another approach to this would be to set cols shorter 446 * (to just the number of columns that we might do i/o to) 447 * and have another mechanism to tell the parity generation 448 * about the "entire width". Reconstruction (at least 449 * vdev_raidz_reconstruct_general()) would also need to 450 * know about the "entire width". 451 */ 452 rr->rr_cols = cols; 453 rr->rr_bigcols = bc; 454 rr->rr_missingdata = 0; 455 rr->rr_missingparity = 0; 456 rr->rr_firstdatacol = nparity; 457 rr->rr_abd_empty = NULL; 458 rr->rr_nempty = 0; 459 460 for (int c = 0; c < rr->rr_cols; c++, child_id++) { 461 if (child_id >= row_phys_cols) { 462 child_id -= row_phys_cols; 463 child_offset += 1ULL << ashift; 464 } 465 rr->rr_col[c].rc_devidx = child_id; 466 rr->rr_col[c].rc_offset = child_offset; 467 rr->rr_col[c].rc_orig_data = NULL; 468 rr->rr_col[c].rc_error = 0; 469 rr->rr_col[c].rc_tried = 0; 470 rr->rr_col[c].rc_skipped = 0; 471 rr->rr_col[c].rc_need_orig_restore = B_FALSE; 472 473 uint64_t dc = c - rr->rr_firstdatacol; 474 if (c < rr->rr_firstdatacol) { 475 rr->rr_col[c].rc_size = 1ULL << ashift; 476 rr->rr_col[c].rc_abd = 477 abd_alloc_linear(rr->rr_col[c].rc_size, 478 B_TRUE); 479 } else if (row == rows - 1 && bc != 0 && c >= bc) { 480 /* 481 * Past the end, this for parity generation. 482 */ 483 rr->rr_col[c].rc_size = 0; 484 rr->rr_col[c].rc_abd = NULL; 485 } else { 486 /* 487 * "data column" (col excluding parity) 488 * Add an ASCII art diagram here 489 */ 490 uint64_t off; 491 492 if (c < bc || r == 0) { 493 off = dc * rows + row; 494 } else { 495 off = r * rows + 496 (dc - r) * (rows - 1) + row; 497 } 498 rr->rr_col[c].rc_size = 1ULL << ashift; 499 rr->rr_col[c].rc_abd = abd_get_offset_struct( 500 &rr->rr_col[c].rc_abdstruct, 501 abd, off << ashift, 1 << ashift); 502 } 503 504 asize += rr->rr_col[c].rc_size; 505 } 506 /* 507 * If all data stored spans all columns, there's a danger that 508 * parity will always be on the same device and, since parity 509 * isn't read during normal operation, that that device's I/O 510 * bandwidth won't be used effectively. We therefore switch 511 * the parity every 1MB. 512 * 513 * ...at least that was, ostensibly, the theory. As a practical 514 * matter unless we juggle the parity between all devices 515 * evenly, we won't see any benefit. Further, occasional writes 516 * that aren't a multiple of the LCM of the number of children 517 * and the minimum stripe width are sufficient to avoid pessimal 518 * behavior. Unfortunately, this decision created an implicit 519 * on-disk format requirement that we need to support for all 520 * eternity, but only for single-parity RAID-Z. 521 * 522 * If we intend to skip a sector in the zeroth column for 523 * padding we must make sure to note this swap. We will never 524 * intend to skip the first column since at least one data and 525 * one parity column must appear in each row. 526 */ 527 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && 528 (offset & (1ULL << 20))) { 529 ASSERT(rr->rr_cols >= 2); 530 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); 531 devidx = rr->rr_col[0].rc_devidx; 532 uint64_t o = rr->rr_col[0].rc_offset; 533 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; 534 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; 535 rr->rr_col[1].rc_devidx = devidx; 536 rr->rr_col[1].rc_offset = o; 537 } 538 539 } 540 ASSERT3U(asize, ==, tot << ashift); 541 542 /* init RAIDZ parity ops */ 543 rm->rm_ops = vdev_raidz_math_get_ops(); 544 545 return (rm); 546 } 547 548 static raidz_map_t * 549 init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) 550 { 551 raidz_map_t *rm = NULL; 552 const size_t alloc_dsize = opts->rto_dsize; 553 const size_t total_ncols = opts->rto_dcols + parity; 554 const int ccols[] = { 0, 1, 2 }; 555 556 VERIFY(zio); 557 VERIFY(parity <= 3 && parity >= 1); 558 559 *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); 560 561 (*zio)->io_offset = 0; 562 (*zio)->io_size = alloc_dsize; 563 (*zio)->io_abd = raidz_alloc(alloc_dsize); 564 init_zio_abd(*zio); 565 566 if (opts->rto_expand) { 567 rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, 568 (*zio)->io_size, (*zio)->io_offset, 569 opts->rto_ashift, total_ncols+1, total_ncols, 570 parity, opts->rto_expand_offset); 571 } else { 572 rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, 573 total_ncols, parity); 574 } 575 VERIFY(rm); 576 577 /* Make sure code columns are destroyed */ 578 corrupt_colums(rm, ccols, parity); 579 580 return (rm); 581 } 582 583 static int 584 run_gen_check(raidz_test_opts_t *opts) 585 { 586 char **impl_name; 587 int fn, err = 0; 588 zio_t *zio_test; 589 raidz_map_t *rm_test; 590 591 err = init_raidz_golden_map(opts, PARITY_PQR); 592 if (0 != err) 593 return (err); 594 595 LOG(D_INFO, DBLSEP); 596 LOG(D_INFO, "Testing parity generation...\n"); 597 598 for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; 599 impl_name++) { 600 601 LOG(D_INFO, SEP); 602 LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); 603 604 if (0 != vdev_raidz_impl_set(*impl_name)) { 605 LOG(D_INFO, "[SKIP]\n"); 606 continue; 607 } else { 608 LOG(D_INFO, "[SUPPORTED]\n"); 609 } 610 611 for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { 612 613 /* Check if should stop */ 614 if (rto_opts.rto_should_stop) 615 return (err); 616 617 /* create suitable raidz_map */ 618 rm_test = init_raidz_map(opts, &zio_test, fn+1); 619 VERIFY(rm_test); 620 621 LOG(D_INFO, "\t\tTesting method [%s] ...", 622 raidz_gen_name[fn]); 623 624 if (!opts->rto_sanity) 625 vdev_raidz_generate_parity(rm_test); 626 627 if (cmp_code(opts, rm_test, fn+1) != 0) { 628 LOG(D_INFO, "[FAIL]\n"); 629 err++; 630 } else 631 LOG(D_INFO, "[PASS]\n"); 632 633 fini_raidz_map(&zio_test, &rm_test); 634 } 635 } 636 637 fini_raidz_map(&opts->zio_golden, &opts->rm_golden); 638 639 return (err); 640 } 641 642 static int 643 run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) 644 { 645 int x0, x1, x2; 646 int tgtidx[3]; 647 int err = 0; 648 static const int rec_tgts[7][3] = { 649 {1, 2, 3}, /* rec_p: bad QR & D[0] */ 650 {0, 2, 3}, /* rec_q: bad PR & D[0] */ 651 {0, 1, 3}, /* rec_r: bad PQ & D[0] */ 652 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ 653 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ 654 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ 655 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ 656 }; 657 658 memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx)); 659 660 if (fn < RAIDZ_REC_PQ) { 661 /* can reconstruct 1 failed data disk */ 662 for (x0 = 0; x0 < opts->rto_dcols; x0++) { 663 if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) 664 continue; 665 666 /* Check if should stop */ 667 if (rto_opts.rto_should_stop) 668 return (err); 669 670 LOG(D_DEBUG, "[%d] ", x0); 671 672 tgtidx[2] = x0 + raidz_parity(rm); 673 674 corrupt_colums(rm, tgtidx+2, 1); 675 676 if (!opts->rto_sanity) 677 vdev_raidz_reconstruct(rm, tgtidx, 3); 678 679 if (cmp_data(opts, rm) != 0) { 680 err++; 681 LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0); 682 } 683 } 684 685 } else if (fn < RAIDZ_REC_PQR) { 686 /* can reconstruct 2 failed data disk */ 687 for (x0 = 0; x0 < opts->rto_dcols; x0++) { 688 if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) 689 continue; 690 for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { 691 if (x1 >= rm->rm_row[0]->rr_cols - 692 raidz_parity(rm)) 693 continue; 694 695 /* Check if should stop */ 696 if (rto_opts.rto_should_stop) 697 return (err); 698 699 LOG(D_DEBUG, "[%d %d] ", x0, x1); 700 701 tgtidx[1] = x0 + raidz_parity(rm); 702 tgtidx[2] = x1 + raidz_parity(rm); 703 704 corrupt_colums(rm, tgtidx+1, 2); 705 706 if (!opts->rto_sanity) 707 vdev_raidz_reconstruct(rm, tgtidx, 3); 708 709 if (cmp_data(opts, rm) != 0) { 710 err++; 711 LOG(D_DEBUG, "\nREC D[%d %d]... " 712 "[FAIL]\n", x0, x1); 713 } 714 } 715 } 716 } else { 717 /* can reconstruct 3 failed data disk */ 718 for (x0 = 0; x0 < opts->rto_dcols; x0++) { 719 if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) 720 continue; 721 for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { 722 if (x1 >= rm->rm_row[0]->rr_cols - 723 raidz_parity(rm)) 724 continue; 725 for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { 726 if (x2 >= rm->rm_row[0]->rr_cols - 727 raidz_parity(rm)) 728 continue; 729 730 /* Check if should stop */ 731 if (rto_opts.rto_should_stop) 732 return (err); 733 734 LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2); 735 736 tgtidx[0] = x0 + raidz_parity(rm); 737 tgtidx[1] = x1 + raidz_parity(rm); 738 tgtidx[2] = x2 + raidz_parity(rm); 739 740 corrupt_colums(rm, tgtidx, 3); 741 742 if (!opts->rto_sanity) 743 vdev_raidz_reconstruct(rm, 744 tgtidx, 3); 745 746 if (cmp_data(opts, rm) != 0) { 747 err++; 748 LOG(D_DEBUG, 749 "\nREC D[%d %d %d]... " 750 "[FAIL]\n", x0, x1, x2); 751 } 752 } 753 } 754 } 755 } 756 return (err); 757 } 758 759 static int 760 run_rec_check(raidz_test_opts_t *opts) 761 { 762 char **impl_name; 763 unsigned fn, err = 0; 764 zio_t *zio_test; 765 raidz_map_t *rm_test; 766 767 err = init_raidz_golden_map(opts, PARITY_PQR); 768 if (0 != err) 769 return (err); 770 771 LOG(D_INFO, DBLSEP); 772 LOG(D_INFO, "Testing data reconstruction...\n"); 773 774 for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; 775 impl_name++) { 776 777 LOG(D_INFO, SEP); 778 LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); 779 780 if (vdev_raidz_impl_set(*impl_name) != 0) { 781 LOG(D_INFO, "[SKIP]\n"); 782 continue; 783 } else 784 LOG(D_INFO, "[SUPPORTED]\n"); 785 786 787 /* create suitable raidz_map */ 788 rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR); 789 /* generate parity */ 790 vdev_raidz_generate_parity(rm_test); 791 792 for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { 793 794 LOG(D_INFO, "\t\tTesting method [%s] ...", 795 raidz_rec_name[fn]); 796 797 if (run_rec_check_impl(opts, rm_test, fn) != 0) { 798 LOG(D_INFO, "[FAIL]\n"); 799 err++; 800 801 } else 802 LOG(D_INFO, "[PASS]\n"); 803 804 } 805 /* tear down test raidz_map */ 806 fini_raidz_map(&zio_test, &rm_test); 807 } 808 809 fini_raidz_map(&opts->zio_golden, &opts->rm_golden); 810 811 return (err); 812 } 813 814 static int 815 run_test(raidz_test_opts_t *opts) 816 { 817 int err = 0; 818 819 if (opts == NULL) 820 opts = &rto_opts; 821 822 print_opts(opts, B_FALSE); 823 824 err |= run_gen_check(opts); 825 err |= run_rec_check(opts); 826 827 return (err); 828 } 829 830 #define SWEEP_RUNNING 0 831 #define SWEEP_FINISHED 1 832 #define SWEEP_ERROR 2 833 #define SWEEP_TIMEOUT 3 834 835 static int sweep_state = 0; 836 static raidz_test_opts_t failed_opts; 837 838 static kmutex_t sem_mtx; 839 static kcondvar_t sem_cv; 840 static int max_free_slots; 841 static int free_slots; 842 843 static _Noreturn void 844 sweep_thread(void *arg) 845 { 846 int err = 0; 847 raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; 848 VERIFY(opts != NULL); 849 850 err = run_test(opts); 851 852 if (rto_opts.rto_sanity) { 853 /* 25% chance that a sweep test fails */ 854 if (rand() < (RAND_MAX/4)) 855 err = 1; 856 } 857 858 if (0 != err) { 859 mutex_enter(&sem_mtx); 860 memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t)); 861 sweep_state = SWEEP_ERROR; 862 mutex_exit(&sem_mtx); 863 } 864 865 umem_free(opts, sizeof (raidz_test_opts_t)); 866 867 /* signal the next thread */ 868 mutex_enter(&sem_mtx); 869 free_slots++; 870 cv_signal(&sem_cv); 871 mutex_exit(&sem_mtx); 872 873 thread_exit(); 874 } 875 876 static int 877 run_sweep(void) 878 { 879 static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 }; 880 static const size_t ashift_v[] = { 9, 12, 14 }; 881 static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12), 882 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE }; 883 884 (void) setvbuf(stdout, NULL, _IONBF, 0); 885 886 ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) * 887 ARRAY_SIZE(dcols_v); 888 ulong_t tried_comb = 0; 889 hrtime_t time_diff, start_time = gethrtime(); 890 raidz_test_opts_t *opts; 891 int a, d, s; 892 893 max_free_slots = free_slots = MAX(2, boot_ncpus); 894 895 mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL); 896 cv_init(&sem_cv, NULL, CV_DEFAULT, NULL); 897 898 for (s = 0; s < ARRAY_SIZE(size_v); s++) 899 for (a = 0; a < ARRAY_SIZE(ashift_v); a++) 900 for (d = 0; d < ARRAY_SIZE(dcols_v); d++) { 901 902 if (size_v[s] < (1 << ashift_v[a])) { 903 total_comb--; 904 continue; 905 } 906 907 if (++tried_comb % 20 == 0) 908 LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb); 909 910 /* wait for signal to start new thread */ 911 mutex_enter(&sem_mtx); 912 while (cv_timedwait_sig(&sem_cv, &sem_mtx, 913 ddi_get_lbolt() + hz)) { 914 915 /* check if should stop the test (timeout) */ 916 time_diff = (gethrtime() - start_time) / NANOSEC; 917 if (rto_opts.rto_sweep_timeout > 0 && 918 time_diff >= rto_opts.rto_sweep_timeout) { 919 sweep_state = SWEEP_TIMEOUT; 920 rto_opts.rto_should_stop = B_TRUE; 921 mutex_exit(&sem_mtx); 922 goto exit; 923 } 924 925 /* check if should stop the test (error) */ 926 if (sweep_state != SWEEP_RUNNING) { 927 mutex_exit(&sem_mtx); 928 goto exit; 929 } 930 931 /* exit loop if a slot is available */ 932 if (free_slots > 0) { 933 break; 934 } 935 } 936 937 free_slots--; 938 mutex_exit(&sem_mtx); 939 940 opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL); 941 opts->rto_ashift = ashift_v[a]; 942 opts->rto_dcols = dcols_v[d]; 943 opts->rto_offset = (1 << ashift_v[a]) * rand(); 944 opts->rto_dsize = size_v[s]; 945 opts->rto_expand = rto_opts.rto_expand; 946 opts->rto_expand_offset = rto_opts.rto_expand_offset; 947 opts->rto_v = 0; /* be quiet */ 948 949 VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, 950 0, NULL, TS_RUN, defclsyspri), !=, NULL); 951 } 952 953 exit: 954 LOG(D_ALL, "\nWaiting for test threads to finish...\n"); 955 mutex_enter(&sem_mtx); 956 VERIFY(free_slots <= max_free_slots); 957 while (free_slots < max_free_slots) { 958 (void) cv_wait(&sem_cv, &sem_mtx); 959 } 960 mutex_exit(&sem_mtx); 961 962 if (sweep_state == SWEEP_ERROR) { 963 ERR("Sweep test failed! Failed option: \n"); 964 print_opts(&failed_opts, B_TRUE); 965 } else { 966 if (sweep_state == SWEEP_TIMEOUT) 967 LOG(D_ALL, "Test timeout (%lus). Stopping...\n", 968 (ulong_t)rto_opts.rto_sweep_timeout); 969 970 LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n", 971 (ulong_t)tried_comb); 972 } 973 974 mutex_destroy(&sem_mtx); 975 976 return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0); 977 } 978 979 980 int 981 main(int argc, char **argv) 982 { 983 size_t i; 984 struct sigaction action; 985 int err = 0; 986 987 /* init gdb pid string early */ 988 (void) sprintf(pid_s, "%d", getpid()); 989 990 action.sa_handler = sig_handler; 991 sigemptyset(&action.sa_mask); 992 action.sa_flags = 0; 993 994 if (sigaction(SIGSEGV, &action, NULL) < 0) { 995 ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno)); 996 exit(EXIT_FAILURE); 997 } 998 999 (void) setvbuf(stdout, NULL, _IOLBF, 0); 1000 1001 dprintf_setup(&argc, argv); 1002 1003 process_options(argc, argv); 1004 1005 kernel_init(SPA_MODE_READ); 1006 1007 /* setup random data because rand() is not reentrant */ 1008 rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 1009 srand((unsigned)time(NULL) * getpid()); 1010 for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) 1011 rand_data[i] = rand(); 1012 1013 mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ); 1014 1015 if (rto_opts.rto_benchmark) { 1016 run_raidz_benchmark(); 1017 } else if (rto_opts.rto_sweep) { 1018 err = run_sweep(); 1019 } else { 1020 err = run_test(NULL); 1021 } 1022 1023 umem_free(rand_data, SPA_MAXBLOCKSIZE); 1024 kernel_fini(); 1025 1026 return (err); 1027 } 1028