/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/zio.h>
#include <umem.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <assert.h>
#include <stdio.h>
#include "raidz_test.h"

static int *rand_data;
raidz_test_opts_t rto_opts;

static char pid_s[16];

static void sig_handler(int signo)
{
	int old_errno = errno;
	struct sigaction action;
	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);

	if (rto_opts.rto_gdb) {
		pid_t pid = fork();
		if (pid == 0) {
			execlp("gdb", "gdb", "-ex", "set pagination 0",
			    "-p", pid_s, NULL);
			_exit(-1);
		} else if (pid > 0)
			while (waitpid(pid, NULL, 0) == -1 && errno == EINTR)
				;
	}

	raise(signo);
	errno = old_errno;
}
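
/*
 * For reference, the fork/execlp in the handler above is equivalent to
 * attaching a debugger by hand with:
 *
 *	gdb -ex "set pagination 0" -p <pid>
 *
 * which is what the -D option arms via rto_gdb.
 */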

static void print_opts(raidz_test_opts_t *opts, boolean_t force)
{
	char *verbose;
	switch (opts->rto_v) {
	case 0:
		verbose = "no";
		break;
	case 1:
		verbose = "info";
		break;
	default:
		verbose = "debug";
		break;
	}

	if (force || opts->rto_v >= D_INFO) {
		(void) fprintf(stdout, DBLSEP "Running with options:\n"
		    "  (-a) zio ashift                   : %zu\n"
		    "  (-o) zio offset                   : 1 << %zu\n"
		    "  (-e) expanded map                 : %s\n"
		    "  (-r) reflow offset                : %llx\n"
		    "  (-d) number of raidz data columns : %zu\n"
		    "  (-s) size of DATA                 : 1 << %zu\n"
		    "  (-S) sweep parameters             : %s \n"
		    "  (-v) verbose                      : %s \n\n",
		    opts->rto_ashift,				/* -a */
		    ilog2(opts->rto_offset),			/* -o */
		    opts->rto_expand ? "yes" : "no",		/* -e */
		    (u_longlong_t)opts->rto_expand_offset,	/* -r */
		    opts->rto_dcols,				/* -d */
		    ilog2(opts->rto_dsize),			/* -s */
		    opts->rto_sweep ? "yes" : "no",		/* -S */
		    verbose);					/* -v */
	}
}

static void usage(boolean_t requested)
{
	const raidz_test_opts_t *o = &rto_opts_defaults;

	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage:\n"
	    "\t[-a zio ashift (default: %zu)]\n"
	    "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
	    "\t[-d number of raidz data columns (default: %zu)]\n"
	    "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
	    "\t[-S parameter sweep (default: %s)]\n"
	    "\t[-t timeout for parameter sweep test]\n"
	    "\t[-B benchmark all raidz implementations]\n"
	    "\t[-e use expanded raidz map (default: %s)]\n"
	    "\t[-r expanded raidz map reflow offset (default: %llx)]\n"
	    "\t[-v increase verbosity (default: %zu)]\n"
	    "\t[-h (print help)]\n"
	    "\t[-T test the test, see if failure would be detected]\n"
	    "\t[-D debug (attach gdb on SIGSEGV)]\n"
	    "",
	    o->rto_ashift,				/* -a */
	    ilog2(o->rto_offset),			/* -o */
	    o->rto_dcols,				/* -d */
	    ilog2(o->rto_dsize),			/* -s */
	    o->rto_sweep ? "yes" : "no",		/* -S */
	    o->rto_expand ? "yes" : "no",		/* -e */
	    (u_longlong_t)o->rto_expand_offset,		/* -r */
	    o->rto_v);					/* -v */

	exit(requested ? 0 : 1);
}

static void process_options(int argc, char **argv)
{
	size_t value;
	int opt;

	raidz_test_opts_t *o = &rto_opts;

	bcopy(&rto_opts_defaults, o, sizeof (*o));

	while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
		value = 0;

		switch (opt) {
		case 'a':
			value = strtoull(optarg, NULL, 0);
			o->rto_ashift = MIN(13, MAX(9, value));
			break;
		case 'e':
			o->rto_expand = 1;
			break;
		case 'r':
			o->rto_expand_offset = strtoull(optarg, NULL, 0);
			break;
		case 'o':
			value = strtoull(optarg, NULL, 0);
			o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
			break;
		case 'd':
			value = strtoull(optarg, NULL, 0);
			o->rto_dcols = MIN(255, MAX(1, value));
			break;
		case 's':
			value = strtoull(optarg, NULL, 0);
			o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT,
			    MAX(SPA_MINBLOCKSHIFT, value));
			break;
		case 't':
			value = strtoull(optarg, NULL, 0);
			o->rto_sweep_timeout = value;
			break;
		case 'v':
			o->rto_v++;
			break;
		case 'S':
			o->rto_sweep = 1;
			break;
		case 'B':
			o->rto_benchmark = 1;
			break;
		case 'D':
			o->rto_gdb = 1;
			break;
		case 'T':
			o->rto_sanity = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}
}
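
/*
 * Example invocations (hypothetical values, chosen to fall within the
 * clamps applied above):
 *
 *	raidz_test -a 12 -d 8 -s 17	single run: ashift 12, 8 data
 *					columns, 1 << 17 byte zio
 *	raidz_test -S -t 600		parameter sweep, stop after 600 s
 *	raidz_test -B			benchmark all implementations
 */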

#define	DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd)
#define	DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size)

#define	CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
#define	CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)

static int
cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
{
	int r, i, ret = 0;

	VERIFY(parity >= 1 && parity <= 3);

	for (r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t * const rr = rm->rm_row[r];
		raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
		for (i = 0; i < parity; i++) {
			if (CODE_COL_SIZE(rrg, i) == 0) {
				VERIFY0(CODE_COL_SIZE(rr, i));
				continue;
			}

			if (abd_cmp(CODE_COL(rr, i),
			    CODE_COL(rrg, i)) != 0) {
				ret++;
				LOG_OPT(D_DEBUG, opts,
				    "\nParity block [%d] different!\n", i);
			}
		}
	}
	return (ret);
}

static int
cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
{
	int r, i, dcols, ret = 0;

	for (r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		raidz_row_t *rrg = opts->rm_golden->rm_row[r];
		dcols = opts->rm_golden->rm_row[0]->rr_cols -
		    raidz_parity(opts->rm_golden);
		for (i = 0; i < dcols; i++) {
			if (DATA_COL_SIZE(rrg, i) == 0) {
				VERIFY0(DATA_COL_SIZE(rr, i));
				continue;
			}

			if (abd_cmp(DATA_COL(rrg, i),
			    DATA_COL(rr, i)) != 0) {
				ret++;

				LOG_OPT(D_DEBUG, opts,
				    "\nData block [%d] different!\n", i);
			}
		}
	}
	return (ret);
}

static int
init_rand(void *data, size_t size, void *private)
{
	int i;
	int *dst = (int *)data;

	for (i = 0; i < size / sizeof (int); i++)
		dst[i] = rand_data[i];

	return (0);
}

static void
corrupt_columns(raidz_map_t *rm, const int *tgts, const int cnt)
{
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		for (int i = 0; i < cnt; i++) {
			raidz_col_t *col = &rr->rr_col[tgts[i]];
			abd_iterate_func(col->rc_abd, 0, col->rc_size,
			    init_rand, NULL);
		}
	}
}

void
init_zio_abd(zio_t *zio)
{
	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
}

static void
fini_raidz_map(zio_t **zio, raidz_map_t **rm)
{
	vdev_raidz_map_free(*rm);
	raidz_free((*zio)->io_abd, (*zio)->io_size);
	umem_free(*zio, sizeof (zio_t));

	*zio = NULL;
	*rm = NULL;
}

static int
init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
{
	int err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;
	const size_t total_ncols = opts->rto_dcols + parity;

	if (opts->rm_golden) {
		fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
	}

	opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
	zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
	opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;

	opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
	zio_test->io_abd = raidz_alloc(opts->rto_dsize);

	init_zio_abd(opts->zio_golden);
	init_zio_abd(zio_test);

	VERIFY0(vdev_raidz_impl_set("original"));

	if (opts->rto_expand) {
		opts->rm_golden =
		    vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
		    opts->zio_golden->io_size, opts->zio_golden->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
		rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
		    zio_test->io_size, zio_test->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
	} else {
		opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
		    opts->rto_ashift, total_ncols, parity);
		rm_test = vdev_raidz_map_alloc(zio_test,
		    opts->rto_ashift, total_ncols, parity);
	}

	VERIFY(opts->zio_golden);
	VERIFY(opts->rm_golden);

	vdev_raidz_generate_parity(opts->rm_golden);
	vdev_raidz_generate_parity(rm_test);

	/* sanity check */
	err |= cmp_data(opts, rm_test);
	err |= cmp_code(opts, rm_test, parity);

	if (err)
		ERR("initializing the golden copy ... [FAIL]!\n");

	/* tear down raidz_map of test zio */
	fini_raidz_map(&zio_test, &rm_test);

	return (err);
}
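
/*
 * Note: the golden map above is always generated with the "original"
 * implementation. Every other implementation is validated by comparing
 * its generated parity (cmp_code()) and reconstructed data (cmp_data())
 * against this golden copy.
 */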

/*
 * If reflow is not in progress, reflow_offset should be UINT64_MAX.
 * For each row, if the row is entirely before reflow_offset, it will
 * come from the new location. Otherwise this row will come from the
 * old location. Therefore, rows that straddle the reflow_offset will
 * come from the old location.
 *
 * NOTE: Until raidz expansion is implemented, this function is only
 * needed by raidz_test.c to test the multi-row raidz_map_t
 * functionality.
 */
raidz_map_t *
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset)
{
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;
	uint64_t q, r, bc, devidx, asize = 0, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);
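
	/*
	 * Worked example (hypothetical geometry, not taken from the test
	 * suite): with logical_cols = 5, nparity = 1 and s = 17 data
	 * sectors, q = 17 / 4 = 4, r = 17 - 4 * 4 = 1, bc = 1 + 1 = 2,
	 * tot = 17 + 1 * (4 + 1) = 22, rows = howmany(22, 5) = 5 and
	 * cols = MIN(22, 5) = 5. The same geometry is reused in the
	 * data-column mapping example further below.
	 */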

	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;

	for (uint64_t row = 0; row < rows; row++) {
		raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
		    rr_col[cols]), KM_SLEEP);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and any part of this
		 * row has not been copied, then use the old location of
		 * this row.
		 */
		int row_phys_cols = physical_cols;
		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
			row_phys_cols--;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * We set cols to the entire width of the block, even
		 * if this row is shorter. This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width". Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_cols = cols;
		rr->rr_bigcols = bc;
		rr->rr_missingdata = 0;
		rr->rr_missingparity = 0;
		rr->rr_firstdatacol = nparity;
		rr->rr_abd_empty = NULL;
		rr->rr_nempty = 0;

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			rr->rr_col[c].rc_devidx = child_id;
			rr->rr_col[c].rc_offset = child_offset;
			rr->rr_col[c].rc_orig_data = NULL;
			rr->rr_col[c].rc_error = 0;
			rr->rr_col[c].rc_tried = 0;
			rr->rr_col[c].rc_skipped = 0;
			rr->rr_col[c].rc_need_orig_restore = B_FALSE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd =
				    abd_alloc_linear(rr->rr_col[c].rc_size,
				    B_TRUE);
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block; this column
				 * exists only for parity generation.
				 */
				rr->rr_col[c].rc_size = 0;
				rr->rr_col[c].rc_abd = NULL;
			} else {
				/*
				 * "data column" (col excluding parity)
				 * Add an ASCII art diagram here
				 */
				uint64_t off;

				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd = abd_get_offset_struct(
				    &rr->rr_col[c].rc_abdstruct,
				    abd, off << ashift, 1 << ashift);
			}

			asize += rr->rr_col[c].rc_size;
		}
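
		/*
		 * Worked example of the data-column mapping above, reusing
		 * the hypothetical geometry from the earlier comment
		 * (rows = 5, r = 1, nparity = 1, bc = 2): big data column
		 * dc = 0 maps to abd sectors 0 .. 4 (off = dc * rows + row),
		 * while short column dc = 2 maps to sectors 9 .. 12
		 * (off = r * rows + (dc - r) * (rows - 1) + row).
		 */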

		/*
		 * If all data stored spans all columns, there's a danger that
		 * parity will always be on the same device and, since parity
		 * isn't read during normal operation, that that device's I/O
		 * bandwidth won't be used effectively. We therefore switch
		 * the parity every 1MB.
		 *
		 * ...at least that was, ostensibly, the theory. As a practical
		 * matter unless we juggle the parity between all devices
		 * evenly, we won't see any benefit. Further, occasional writes
		 * that aren't a multiple of the LCM of the number of children
		 * and the minimum stripe width are sufficient to avoid pessimal
		 * behavior. Unfortunately, this decision created an implicit
		 * on-disk format requirement that we need to support for all
		 * eternity, but only for single-parity RAID-Z.
		 *
		 * If we intend to skip a sector in the zeroth column for
		 * padding we must make sure to note this swap. We will never
		 * intend to skip the first column since at least one data and
		 * one parity column must appear in each row.
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
			devidx = rr->rr_col[0].rc_devidx;
			uint64_t o = rr->rr_col[0].rc_offset;
			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[1].rc_devidx = devidx;
			rr->rr_col[1].rc_offset = o;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
	raidz_map_t *rm = NULL;
	const size_t alloc_dsize = opts->rto_dsize;
	const size_t total_ncols = opts->rto_dcols + parity;
	const int ccols[] = { 0, 1, 2 };

	VERIFY(zio);
	VERIFY(parity <= 3 && parity >= 1);

	*zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	(*zio)->io_offset = 0;
	(*zio)->io_size = alloc_dsize;
	(*zio)->io_abd = raidz_alloc(alloc_dsize);
	init_zio_abd(*zio);

	if (opts->rto_expand) {
		rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
		    (*zio)->io_size, (*zio)->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
	} else {
		rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
		    total_ncols, parity);
	}
	VERIFY(rm);

	/* Make sure code columns are destroyed */
	corrupt_columns(rm, ccols, parity);

	return (rm);
}

static int
run_gen_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	int fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing parity generation...\n");

	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (0 != vdev_raidz_impl_set(*impl_name)) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else {
			LOG(D_INFO, "[SUPPORTED]\n");
		}

		for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

			/* Check if should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			/* create suitable raidz_map */
			rm_test = init_raidz_map(opts, &zio_test, fn+1);
			VERIFY(rm_test);

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_gen_name[fn]);

			if (!opts->rto_sanity)
				vdev_raidz_generate_parity(rm_test);

			if (cmp_code(opts, rm_test, fn+1) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;
			} else
				LOG(D_INFO, "[PASS]\n");

			fini_raidz_map(&zio_test, &rm_test);
		}
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}
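
/*
 * The reconstruction tests below share one pattern: the chosen data
 * column(s) are overwritten with random data, any parity columns the
 * method under test does not use are also listed as failed targets, and
 * vdev_raidz_reconstruct() must then recover the data, which is verified
 * against the golden copy with cmp_data(). Cost note: with d data columns
 * the loops try C(d,1), C(d,2) and C(d,3) target combinations for the
 * single-, double- and triple-failure methods respectively.
 */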

static int
run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
{
	int x0, x1, x2;
	int tgtidx[3];
	int err = 0;
	static const int rec_tgts[7][3] = {
		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
		{3, 4, 5}	/* rec_pqr: bad    D[0][1][2]	*/
	};

	memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));

	if (fn < RAIDZ_REC_PQ) {
		/* can reconstruct 1 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;

			/* Check if should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			LOG(D_DEBUG, "[%d] ", x0);

			tgtidx[2] = x0 + raidz_parity(rm);

			corrupt_columns(rm, tgtidx+2, 1);

			if (!opts->rto_sanity)
				vdev_raidz_reconstruct(rm, tgtidx, 3);

			if (cmp_data(opts, rm) != 0) {
				err++;
				LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
			}
		}

	} else if (fn < RAIDZ_REC_PQR) {
		/* can reconstruct 2 failed data disks */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;

				/* Check if should stop */
				if (rto_opts.rto_should_stop)
					return (err);

				LOG(D_DEBUG, "[%d %d] ", x0, x1);

				tgtidx[1] = x0 + raidz_parity(rm);
				tgtidx[2] = x1 + raidz_parity(rm);

				corrupt_columns(rm, tgtidx+1, 2);

				if (!opts->rto_sanity)
					vdev_raidz_reconstruct(rm, tgtidx, 3);

				if (cmp_data(opts, rm) != 0) {
					err++;
					LOG(D_DEBUG, "\nREC D[%d %d]... "
					    "[FAIL]\n", x0, x1);
				}
			}
		}
	} else {
		/* can reconstruct 3 failed data disks */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;
				for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
					if (x2 >= rm->rm_row[0]->rr_cols -
					    raidz_parity(rm))
						continue;

					/* Check if should stop */
					if (rto_opts.rto_should_stop)
						return (err);

					LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);

					tgtidx[0] = x0 + raidz_parity(rm);
					tgtidx[1] = x1 + raidz_parity(rm);
					tgtidx[2] = x2 + raidz_parity(rm);

					corrupt_columns(rm, tgtidx, 3);

					if (!opts->rto_sanity)
						vdev_raidz_reconstruct(rm,
						    tgtidx, 3);

					if (cmp_data(opts, rm) != 0) {
						err++;
						LOG(D_DEBUG,
						    "\nREC D[%d %d %d]... "
						    "[FAIL]\n", x0, x1, x2);
					}
				}
			}
		}
	}
	return (err);
}

static int
run_rec_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	unsigned fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing data reconstruction...\n");

	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (vdev_raidz_impl_set(*impl_name) != 0) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else
			LOG(D_INFO, "[SUPPORTED]\n");

		/* create suitable raidz_map */
		rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
		/* generate parity */
		vdev_raidz_generate_parity(rm_test);

		for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_rec_name[fn]);

			if (run_rec_check_impl(opts, rm_test, fn) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;

			} else
				LOG(D_INFO, "[PASS]\n");

		}
		/* tear down test raidz_map */
		fini_raidz_map(&zio_test, &rm_test);
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}

static int
run_test(raidz_test_opts_t *opts)
{
	int err = 0;

	if (opts == NULL)
		opts = &rto_opts;

	print_opts(opts, B_FALSE);

	err |= run_gen_check(opts);
	err |= run_rec_check(opts);

	return (err);
}

#define	SWEEP_RUNNING	0
#define	SWEEP_FINISHED	1
#define	SWEEP_ERROR	2
#define	SWEEP_TIMEOUT	3

static int sweep_state = SWEEP_RUNNING;
static raidz_test_opts_t failed_opts;

static kmutex_t sem_mtx;
static kcondvar_t sem_cv;
static int max_free_slots;
static int free_slots;

static void
sweep_thread(void *arg)
{
	int err = 0;
	raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
	VERIFY(opts != NULL);

	err = run_test(opts);

	if (rto_opts.rto_sanity) {
		/* 25% chance that a sweep test fails */
		if (rand() < (RAND_MAX/4))
			err = 1;
	}

	if (0 != err) {
		mutex_enter(&sem_mtx);
		memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
		sweep_state = SWEEP_ERROR;
		mutex_exit(&sem_mtx);
	}

	umem_free(opts, sizeof (raidz_test_opts_t));

	/* signal the next thread */
	mutex_enter(&sem_mtx);
	free_slots++;
	cv_signal(&sem_cv);
	mutex_exit(&sem_mtx);

	thread_exit();
}

static int
run_sweep(void)
{
	static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
	static const size_t ashift_v[] = { 9, 12, 14 };
	static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
	    1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };

	(void) setvbuf(stdout, NULL, _IONBF, 0);

	ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
	    ARRAY_SIZE(dcols_v);
	ulong_t tried_comb = 0;
	hrtime_t time_diff, start_time = gethrtime();
	raidz_test_opts_t *opts;
	int a, d, s;

	max_free_slots = free_slots = MAX(2, boot_ncpus);

	mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);

	for (s = 0; s < ARRAY_SIZE(size_v); s++)
	for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
	for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {

		if (size_v[s] < (1 << ashift_v[a])) {
			total_comb--;
			continue;
		}

		if (++tried_comb % 20 == 0)
			LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);

		/* wait for signal to start new thread */
		mutex_enter(&sem_mtx);
		while (cv_timedwait_sig(&sem_cv, &sem_mtx,
		    ddi_get_lbolt() + hz)) {

			/* check if should stop the test (timeout) */
			time_diff = (gethrtime() - start_time) / NANOSEC;
			if (rto_opts.rto_sweep_timeout > 0 &&
			    time_diff >= rto_opts.rto_sweep_timeout) {
				sweep_state = SWEEP_TIMEOUT;
				rto_opts.rto_should_stop = B_TRUE;
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* check if should stop the test (error) */
			if (sweep_state != SWEEP_RUNNING) {
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* exit loop if a slot is available */
			if (free_slots > 0) {
				break;
			}
		}

		free_slots--;
		mutex_exit(&sem_mtx);

		opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
		opts->rto_ashift = ashift_v[a];
		opts->rto_dcols = dcols_v[d];
		opts->rto_offset = (1 << ashift_v[a]) * rand();
		opts->rto_dsize = size_v[s];
		opts->rto_expand = rto_opts.rto_expand;
		opts->rto_expand_offset = rto_opts.rto_expand_offset;
		opts->rto_v = 0; /* be quiet */

		VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
		    0, NULL, TS_RUN, defclsyspri), !=, NULL);
	}

exit:
	LOG(D_ALL, "\nWaiting for test threads to finish...\n");
	mutex_enter(&sem_mtx);
	VERIFY(free_slots <= max_free_slots);
	while (free_slots < max_free_slots) {
		(void) cv_wait(&sem_cv, &sem_mtx);
	}
	mutex_exit(&sem_mtx);

	if (sweep_state == SWEEP_ERROR) {
		ERR("Sweep test failed! Failed option: \n");
		print_opts(&failed_opts, B_TRUE);
	} else {
		if (sweep_state == SWEEP_TIMEOUT)
			LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
			    (ulong_t)rto_opts.rto_sweep_timeout);

		LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
		    (ulong_t)tried_comb);
	}

	cv_destroy(&sem_cv);
	mutex_destroy(&sem_mtx);

	return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
}

int
main(int argc, char **argv)
{
	size_t i;
	struct sigaction action;
	int err = 0;

	/* init gdb pid string early */
	(void) sprintf(pid_s, "%d", getpid());

	action.sa_handler = sig_handler;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;

	if (sigaction(SIGSEGV, &action, NULL) < 0) {
		ERR("raidz_test: cannot catch SIGSEGV: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	} else if (sigaction(SIGABRT, &action, NULL) < 0) {
		ERR("raidz_test: cannot catch SIGABRT: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	}

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);

	process_options(argc, argv);

	kernel_init(SPA_MODE_READ);

	/* setup random data because rand() is not reentrant */
	rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
	srand((unsigned)time(NULL) * getpid());
	for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
		rand_data[i] = rand();

	mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);

	if (rto_opts.rto_benchmark) {
		run_raidz_benchmark();
	} else if (rto_opts.rto_sweep) {
		err = run_sweep();
	} else {
		err = run_test(NULL);
	}

	umem_free(rand_data, SPA_MAXBLOCKSIZE);
	kernel_fini();

	return (err);
}