1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2018 Intel Corporation. 23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. 24 */ 25 26 #include <stdio.h> 27 #include <zlib.h> 28 #include <zfs_fletcher.h> 29 #include <sys/vdev_draid.h> 30 #include <sys/nvpair.h> 31 #include <sys/stat.h> 32 33 /* 34 * The number of rows to generate for new permutation maps. 35 */ 36 #define MAP_ROWS_DEFAULT 256 37 38 /* 39 * Key values for dRAID maps when stored as nvlists. 40 */ 41 #define MAP_SEED "seed" 42 #define MAP_CHECKSUM "checksum" 43 #define MAP_WORST_RATIO "worst_ratio" 44 #define MAP_AVG_RATIO "avg_ratio" 45 #define MAP_CHILDREN "children" 46 #define MAP_NPERMS "nperms" 47 #define MAP_PERMS "perms" 48 49 static void 50 draid_usage(void) 51 { 52 (void) fprintf(stderr, 53 "usage: draid command args ...\n" 54 "Available commands are:\n" 55 "\n" 56 "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" 57 "\tdraid verify [-rv] FILE\n" 58 "\tdraid dump [-v] [-m min] [-n max] FILE\n" 59 "\tdraid table FILE\n" 60 "\tdraid merge FILE SRC SRC...\n"); 61 exit(1); 62 } 63 64 static int 65 read_map(const char *filename, nvlist_t **allcfgs) 66 { 67 int block_size = 131072; 68 int buf_size = 131072; 69 int tmp_size, error; 70 char *tmp_buf; 71 72 struct stat64 stat; 73 if (lstat64(filename, &stat) != 0) 74 return (errno); 75 76 if (stat.st_size == 0 || 77 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { 78 return (EINVAL); 79 } 80 81 gzFile fp = gzopen(filename, "rb"); 82 if (fp == Z_NULL) 83 return (errno); 84 85 char *buf = malloc(buf_size); 86 if (buf == NULL) { 87 (void) gzclose(fp); 88 return (ENOMEM); 89 } 90 91 ssize_t rc, bytes = 0; 92 while (!gzeof(fp)) { 93 rc = gzread(fp, buf + bytes, block_size); 94 if ((rc < 0) || (rc == 0 && !gzeof(fp))) { 95 free(buf); 96 (void) gzerror(fp, &error); 97 (void) gzclose(fp); 98 return (error); 99 } else { 100 bytes += rc; 101 102 if (bytes + block_size >= buf_size) { 103 tmp_size = 2 * buf_size; 104 tmp_buf = malloc(tmp_size); 105 if (tmp_buf == NULL) { 106 free(buf); 107 (void) gzclose(fp); 108 return (ENOMEM); 109 } 110 111 memcpy(tmp_buf, buf, bytes); 112 free(buf); 113 buf = tmp_buf; 114 buf_size = tmp_size; 115 } 116 } 117 } 118 119 (void) gzclose(fp); 120 121 error = nvlist_unpack(buf, bytes, allcfgs, 0); 122 free(buf); 123 124 return (error); 125 } 126 127 /* 128 * Read a map from the specified filename. A file contains multiple maps 129 * which are indexed by the number of children. The caller is responsible 130 * for freeing the configuration returned. 131 */ 132 static int 133 read_map_key(const char *filename, const char *key, nvlist_t **cfg) 134 { 135 nvlist_t *allcfgs, *foundcfg = NULL; 136 int error; 137 138 error = read_map(filename, &allcfgs); 139 if (error != 0) 140 return (error); 141 142 (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg); 143 if (foundcfg != NULL) { 144 nvlist_dup(foundcfg, cfg, KM_SLEEP); 145 error = 0; 146 } else { 147 error = ENOENT; 148 } 149 150 nvlist_free(allcfgs); 151 152 return (error); 153 } 154 155 /* 156 * Write all mappings to the map file. 157 */ 158 static int 159 write_map(const char *filename, nvlist_t *allcfgs) 160 { 161 size_t buflen = 0; 162 int error; 163 164 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); 165 if (error) 166 return (error); 167 168 char *buf = malloc(buflen); 169 if (buf == NULL) 170 return (ENOMEM); 171 172 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); 173 if (error) { 174 free(buf); 175 return (error); 176 } 177 178 /* 179 * Atomically update the file using a temporary file and the 180 * traditional unlink then rename steps. This code provides 181 * no locking, it only guarantees the packed nvlist on disk 182 * is updated atomically and is internally consistent. 183 */ 184 char *tmpname = calloc(1, MAXPATHLEN); 185 if (tmpname == NULL) { 186 free(buf); 187 return (ENOMEM); 188 } 189 190 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); 191 192 int fd = mkstemp(tmpname); 193 if (fd < 0) { 194 error = errno; 195 free(buf); 196 free(tmpname); 197 return (error); 198 } 199 (void) close(fd); 200 201 gzFile fp = gzopen(tmpname, "w9b"); 202 if (fp == Z_NULL) { 203 error = errno; 204 free(buf); 205 free(tmpname); 206 return (errno); 207 } 208 209 ssize_t rc, bytes = 0; 210 while (bytes < buflen) { 211 size_t size = MIN(buflen - bytes, 131072); 212 rc = gzwrite(fp, buf + bytes, size); 213 if (rc < 0) { 214 free(buf); 215 (void) gzerror(fp, &error); 216 (void) gzclose(fp); 217 (void) unlink(tmpname); 218 free(tmpname); 219 return (error); 220 } else if (rc == 0) { 221 break; 222 } else { 223 bytes += rc; 224 } 225 } 226 227 free(buf); 228 (void) gzclose(fp); 229 230 if (bytes != buflen) { 231 (void) unlink(tmpname); 232 free(tmpname); 233 return (EIO); 234 } 235 236 /* 237 * Unlink the previous config file and replace it with the updated 238 * version. If we're able to unlink the file then directory is 239 * writable by us and the subsequent rename should never fail. 240 */ 241 error = unlink(filename); 242 if (error != 0 && errno != ENOENT) { 243 error = errno; 244 (void) unlink(tmpname); 245 free(tmpname); 246 return (error); 247 } 248 249 error = rename(tmpname, filename); 250 if (error != 0) { 251 error = errno; 252 (void) unlink(tmpname); 253 free(tmpname); 254 return (error); 255 } 256 257 free(tmpname); 258 259 return (0); 260 } 261 262 /* 263 * Add the dRAID map to the file and write it out. 264 */ 265 static int 266 write_map_key(const char *filename, char *key, draid_map_t *map, 267 double worst_ratio, double avg_ratio) 268 { 269 nvlist_t *nv_cfg, *allcfgs; 270 int error; 271 272 /* 273 * Add the configuration to an existing or new file. The new 274 * configuration will replace an existing configuration with the 275 * same key if it has a lower ratio and is therefore better. 276 */ 277 error = read_map(filename, &allcfgs); 278 if (error == ENOENT) { 279 allcfgs = fnvlist_alloc(); 280 } else if (error != 0) { 281 return (error); 282 } 283 284 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); 285 if (error == 0) { 286 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, 287 MAP_WORST_RATIO); 288 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; 289 290 if (worst_ratio < nv_worst_ratio) { 291 /* Replace old map with the more balanced new map. */ 292 fnvlist_remove(allcfgs, key); 293 } else { 294 /* The old map is preferable, keep it. */ 295 nvlist_free(allcfgs); 296 return (EEXIST); 297 } 298 } 299 300 nvlist_t *cfg = fnvlist_alloc(); 301 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); 302 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); 303 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); 304 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); 305 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, 306 map->dm_children * map->dm_nperms * sizeof (uint8_t)); 307 308 fnvlist_add_uint64(cfg, MAP_WORST_RATIO, 309 (uint64_t)(worst_ratio * 1000.0)); 310 fnvlist_add_uint64(cfg, MAP_AVG_RATIO, 311 (uint64_t)(avg_ratio * 1000.0)); 312 313 error = nvlist_add_nvlist(allcfgs, key, cfg); 314 if (error == 0) 315 error = write_map(filename, allcfgs); 316 317 nvlist_free(cfg); 318 nvlist_free(allcfgs); 319 return (error); 320 } 321 322 static void 323 dump_map(draid_map_t *map, const char *key, double worst_ratio, 324 double avg_ratio, int verbose) 325 { 326 if (verbose == 0) { 327 return; 328 } else if (verbose == 1) { 329 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " 330 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, 331 worst_ratio, avg_ratio); 332 return; 333 } else { 334 printf(" \"%s\":\n" 335 " seed: 0x%016llx\n" 336 " checksum: 0x%016llx\n" 337 " worst_ratio: %2.03f\n" 338 " avg_ratio: %2.03f\n" 339 " children: %llu\n" 340 " nperms: %llu\n", 341 key, (u_longlong_t)map->dm_seed, 342 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, 343 (u_longlong_t)map->dm_children, 344 (u_longlong_t)map->dm_nperms); 345 346 if (verbose > 2) { 347 printf(" perms = {\n"); 348 for (int i = 0; i < map->dm_nperms; i++) { 349 printf(" { "); 350 for (int j = 0; j < map->dm_children; j++) { 351 printf("%3d%s ", map->dm_perms[ 352 i * map->dm_children + j], 353 j < map->dm_children - 1 ? 354 "," : ""); 355 } 356 printf(" },\n"); 357 } 358 printf(" }\n"); 359 } else if (verbose == 2) { 360 printf(" draid_perms = <omitted>\n"); 361 } 362 } 363 } 364 365 static void 366 dump_map_nv(const char *key, nvlist_t *cfg, int verbose) 367 { 368 draid_map_t map; 369 uint_t c; 370 371 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); 372 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); 373 374 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 375 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 376 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 377 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 378 map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c); 379 380 dump_map(&map, key, (double)worst_ratio / 1000.0, 381 avg_ratio / 1000.0, verbose); 382 } 383 384 /* 385 * Print a summary of the mapping. 386 */ 387 static int 388 dump_map_key(const char *filename, const char *key, int verbose) 389 { 390 nvlist_t *cfg; 391 int error; 392 393 error = read_map_key(filename, key, &cfg); 394 if (error != 0) 395 return (error); 396 397 dump_map_nv(key, cfg, verbose); 398 399 return (0); 400 } 401 402 /* 403 * Allocate a new permutation map for evaluation. 404 */ 405 static int 406 alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, 407 draid_map_t **mapp) 408 { 409 draid_map_t *map; 410 int error; 411 412 map = malloc(sizeof (draid_map_t)); 413 if (map == NULL) 414 return (ENOMEM); 415 416 map->dm_children = children; 417 map->dm_nperms = nperms; 418 map->dm_seed = seed; 419 map->dm_checksum = 0; 420 421 error = vdev_draid_generate_perms(map, &map->dm_perms); 422 if (error) { 423 free(map); 424 return (error); 425 } 426 427 *mapp = map; 428 429 return (0); 430 } 431 432 /* 433 * Allocate the fixed permutation map for N children. 434 */ 435 static int 436 alloc_fixed_map(uint64_t children, draid_map_t **mapp) 437 { 438 const draid_map_t *fixed_map; 439 draid_map_t *map; 440 int error; 441 442 error = vdev_draid_lookup_map(children, &fixed_map); 443 if (error) 444 return (error); 445 446 map = malloc(sizeof (draid_map_t)); 447 if (map == NULL) 448 return (ENOMEM); 449 450 memcpy(map, fixed_map, sizeof (draid_map_t)); 451 VERIFY3U(map->dm_checksum, !=, 0); 452 453 error = vdev_draid_generate_perms(map, &map->dm_perms); 454 if (error) { 455 free(map); 456 return (error); 457 } 458 459 *mapp = map; 460 461 return (0); 462 } 463 464 /* 465 * Free a permutation map. 466 */ 467 static void 468 free_map(draid_map_t *map) 469 { 470 free(map->dm_perms); 471 free(map); 472 } 473 474 /* 475 * Check if dev is in the provided list of faulted devices. 476 */ 477 static inline boolean_t 478 is_faulted(int *faulted_devs, int nfaulted, int dev) 479 { 480 for (int i = 0; i < nfaulted; i++) 481 if (faulted_devs[i] == dev) 482 return (B_TRUE); 483 484 return (B_FALSE); 485 } 486 487 /* 488 * Evaluate how resilvering I/O will be distributed given a list of faulted 489 * vdevs. As a simplification we assume one IO is sufficient to repair each 490 * damaged device in a group. 491 */ 492 static double 493 eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, 494 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) 495 { 496 uint64_t children = map->dm_children; 497 uint64_t ngroups = 1; 498 uint64_t ndisks = children - nspares; 499 500 /* 501 * Calculate the minimum number of groups required to fill a slice. 502 */ 503 while (ngroups * (groupwidth) % (children - nspares) != 0) 504 ngroups++; 505 506 int *ios = calloc(map->dm_children, sizeof (uint64_t)); 507 508 ASSERT3P(ios, !=, NULL); 509 510 /* Resilver all rows */ 511 for (int i = 0; i < map->dm_nperms; i++) { 512 uint8_t *row = &map->dm_perms[i * map->dm_children]; 513 514 /* Resilver all groups with faulted drives */ 515 for (int j = 0; j < ngroups; j++) { 516 uint64_t spareidx = map->dm_children - nspares; 517 boolean_t repair_needed = B_FALSE; 518 519 /* See if any devices in this group are faulted */ 520 uint64_t groupstart = (j * groupwidth) % ndisks; 521 522 for (int k = 0; k < groupwidth; k++) { 523 uint64_t groupidx = (groupstart + k) % ndisks; 524 525 repair_needed = is_faulted(faulted_devs, 526 nfaulted, row[groupidx]); 527 if (repair_needed) 528 break; 529 } 530 531 if (repair_needed == B_FALSE) 532 continue; 533 534 /* 535 * This group is degraded. Calculate the number of 536 * reads the non-faulted drives require and the number 537 * of writes to the distributed hot spare for this row. 538 */ 539 for (int k = 0; k < groupwidth; k++) { 540 uint64_t groupidx = (groupstart + k) % ndisks; 541 542 if (!is_faulted(faulted_devs, nfaulted, 543 row[groupidx])) { 544 ios[row[groupidx]]++; 545 } else if (nspares > 0) { 546 while (is_faulted(faulted_devs, 547 nfaulted, row[spareidx])) { 548 spareidx++; 549 } 550 551 ASSERT3U(spareidx, <, map->dm_children); 552 ios[row[spareidx]]++; 553 spareidx++; 554 } 555 } 556 } 557 } 558 559 *min_child_ios = INT_MAX; 560 *max_child_ios = 0; 561 562 /* 563 * Find the drives with fewest and most required I/O. These values 564 * are used to calculate the imbalance ratio. To avoid returning an 565 * infinite value for permutations which have children that perform 566 * no IO a floor of 1 IO per child is set. This ensures a meaningful 567 * ratio is returned for comparison and it is not an uncommon when 568 * there are a large number of children. 569 */ 570 for (int i = 0; i < map->dm_children; i++) { 571 572 if (is_faulted(faulted_devs, nfaulted, i)) { 573 ASSERT0(ios[i]); 574 continue; 575 } 576 577 if (ios[i] == 0) 578 ios[i] = 1; 579 580 if (ios[i] < *min_child_ios) 581 *min_child_ios = ios[i]; 582 583 if (ios[i] > *max_child_ios) 584 *max_child_ios = ios[i]; 585 } 586 587 ASSERT3S(*min_child_ios, !=, INT_MAX); 588 ASSERT3S(*max_child_ios, !=, 0); 589 590 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); 591 592 free(ios); 593 594 return (ratio); 595 } 596 597 /* 598 * Evaluate the quality of the permutation mapping by considering possible 599 * device failures. Returns the imbalance ratio for the worst mapping which 600 * is defined to be the largest number of child IOs over the fewest number 601 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and 602 * all children perform an equal amount of work during reconstruction. 603 */ 604 static void 605 eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) 606 { 607 uint64_t children = map->dm_children; 608 double worst_ratio = 1.0; 609 double sum = 0; 610 int worst_min_ios = 0, worst_max_ios = 0; 611 int n = 0; 612 613 /* 614 * When there are only 2 children there can be no distributed 615 * spare and no resilver to evaluate. Default to a ratio of 1.0 616 * for this degenerate case. 617 */ 618 if (children == VDEV_DRAID_MIN_CHILDREN) { 619 *worst_ratiop = 1.0; 620 *avg_ratiop = 1.0; 621 return; 622 } 623 624 /* 625 * Score the mapping as if it had either 1 or 2 distributed spares. 626 */ 627 for (int nspares = 1; nspares <= 2; nspares++) { 628 uint64_t faults = nspares; 629 630 /* 631 * Score groupwidths up to 19. This value was chosen as the 632 * largest reasonable width (16d+3p). dRAID pools may be still 633 * be created with wider stripes but they are not considered in 634 * this analysis in order to optimize for the most common cases. 635 */ 636 for (uint64_t groupwidth = 2; 637 groupwidth <= MIN(children - nspares, 19); 638 groupwidth++) { 639 int faulted_devs[2]; 640 int min_ios, max_ios; 641 642 /* 643 * Score possible devices faults. This is limited 644 * to exactly one fault per distributed spare for 645 * the purposes of this similation. 646 */ 647 for (int f1 = 0; f1 < children; f1++) { 648 faulted_devs[0] = f1; 649 double ratio; 650 651 if (faults == 1) { 652 ratio = eval_resilver(map, groupwidth, 653 nspares, faulted_devs, faults, 654 &min_ios, &max_ios); 655 656 if (ratio > worst_ratio) { 657 worst_ratio = ratio; 658 worst_min_ios = min_ios; 659 worst_max_ios = max_ios; 660 } 661 662 sum += ratio; 663 n++; 664 } else if (faults == 2) { 665 for (int f2 = f1 + 1; f2 < children; 666 f2++) { 667 faulted_devs[1] = f2; 668 669 ratio = eval_resilver(map, 670 groupwidth, nspares, 671 faulted_devs, faults, 672 &min_ios, &max_ios); 673 674 if (ratio > worst_ratio) { 675 worst_ratio = ratio; 676 worst_min_ios = min_ios; 677 worst_max_ios = max_ios; 678 } 679 680 sum += ratio; 681 n++; 682 } 683 } 684 } 685 } 686 } 687 688 *worst_ratiop = worst_ratio; 689 *avg_ratiop = sum / n; 690 691 /* 692 * Log the min/max io values for particularly unbalanced maps. 693 * Since the maps are generated entirely randomly these are possible 694 * be exceedingly unlikely. We log it for possible investigation. 695 */ 696 if (worst_ratio > 100.0) { 697 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); 698 printf("worst_min_ios=%d worst_max_ios=%d\n", 699 worst_min_ios, worst_max_ios); 700 } 701 } 702 703 static int 704 eval_maps(uint64_t children, int passes, uint64_t *map_seed, 705 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) 706 { 707 draid_map_t *best_map = NULL; 708 double best_worst_ratio = 1000.0; 709 double best_avg_ratio = 1000.0; 710 711 /* 712 * Perform the requested number of passes evaluating randomly 713 * generated permutation maps. Only the best version is kept. 714 */ 715 for (int i = 0; i < passes; i++) { 716 double worst_ratio, avg_ratio; 717 draid_map_t *map; 718 int error; 719 720 /* 721 * Calculate the next seed and generate a new candidate map. 722 */ 723 error = alloc_new_map(children, MAP_ROWS_DEFAULT, 724 vdev_draid_rand(map_seed), &map); 725 if (error) { 726 if (best_map != NULL) 727 free_map(best_map); 728 return (error); 729 } 730 731 /* 732 * Consider maps with a lower worst_ratio to be of higher 733 * quality. Some maps may have a lower avg_ratio but they 734 * are discarded since they might include some particularly 735 * imbalanced permutations. The average is tracked to in 736 * order to get a sense of the average permutation quality. 737 */ 738 eval_decluster(map, &worst_ratio, &avg_ratio); 739 740 if (best_map == NULL || worst_ratio < best_worst_ratio) { 741 742 if (best_map != NULL) 743 free_map(best_map); 744 745 best_map = map; 746 best_worst_ratio = worst_ratio; 747 best_avg_ratio = avg_ratio; 748 } else { 749 free_map(map); 750 } 751 } 752 753 /* 754 * After determining the best map generate a checksum over the full 755 * permutation array. This checksum is verified when opening a dRAID 756 * pool to ensure the generated in memory permutations are correct. 757 */ 758 zio_cksum_t cksum; 759 fletcher_4_native_varsize(best_map->dm_perms, 760 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, 761 &cksum); 762 best_map->dm_checksum = cksum.zc_word[0]; 763 764 *best_mapp = best_map; 765 *best_ratiop = best_worst_ratio; 766 *avg_ratiop = best_avg_ratio; 767 768 return (0); 769 } 770 771 static int 772 draid_generate(int argc, char *argv[]) 773 { 774 char filename[MAXPATHLEN] = {0}; 775 uint64_t map_seed[2]; 776 int c, fd, error, verbose = 0, passes = 1, continuous = 0; 777 int min_children = VDEV_DRAID_MIN_CHILDREN; 778 int max_children = VDEV_DRAID_MAX_CHILDREN; 779 int restarts = 0; 780 781 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { 782 switch (c) { 783 case 'c': 784 continuous++; 785 break; 786 case 'm': 787 min_children = (int)strtol(optarg, NULL, 0); 788 if (min_children < VDEV_DRAID_MIN_CHILDREN) { 789 (void) fprintf(stderr, "A minimum of 2 " 790 "children are required.\n"); 791 return (1); 792 } 793 794 break; 795 case 'n': 796 max_children = (int)strtol(optarg, NULL, 0); 797 if (max_children > VDEV_DRAID_MAX_CHILDREN) { 798 (void) fprintf(stderr, "A maximum of %d " 799 "children are allowed.\n", 800 VDEV_DRAID_MAX_CHILDREN); 801 return (1); 802 } 803 break; 804 case 'p': 805 passes = (int)strtol(optarg, NULL, 0); 806 break; 807 case 'v': 808 /* 809 * 0 - Only log when a better map is added to the file. 810 * 1 - Log the current best map for each child count. 811 * Minimal output on a single summary line. 812 * 2 - Log the current best map for each child count. 813 * More verbose includes most map fields. 814 * 3 - Log the current best map for each child count. 815 * Very verbose all fields including the full map. 816 */ 817 verbose++; 818 break; 819 case ':': 820 (void) fprintf(stderr, 821 "missing argument for '%c' option\n", optopt); 822 draid_usage(); 823 break; 824 case '?': 825 (void) fprintf(stderr, "invalid option '%c'\n", 826 optopt); 827 draid_usage(); 828 break; 829 } 830 } 831 832 if (argc > optind) 833 strlcpy(filename, argv[optind], sizeof (filename)); 834 else { 835 (void) fprintf(stderr, "A FILE must be specified.\n"); 836 return (1); 837 } 838 839 restart: 840 /* 841 * Start with a fresh seed from /dev/urandom. 842 */ 843 fd = open("/dev/urandom", O_RDONLY); 844 if (fd < 0) { 845 printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); 846 return (1); 847 } else { 848 ssize_t bytes = sizeof (map_seed); 849 ssize_t bytes_read = 0; 850 851 while (bytes_read < bytes) { 852 ssize_t rc = read(fd, ((char *)map_seed) + bytes_read, 853 bytes - bytes_read); 854 if (rc < 0) { 855 printf("Unable to read /dev/urandom: %s\n:", 856 strerror(errno)); 857 close(fd); 858 return (1); 859 } 860 bytes_read += rc; 861 } 862 863 (void) close(fd); 864 } 865 866 if (restarts == 0) 867 printf("Writing generated mappings to '%s':\n", filename); 868 869 /* 870 * Generate maps for all requested child counts. The best map for 871 * each child count is written out to the specified file. If the file 872 * already contains a better mapping this map will not be added. 873 */ 874 for (uint64_t children = min_children; 875 children <= max_children; children++) { 876 char key[8] = { 0 }; 877 draid_map_t *map; 878 double worst_ratio = 1000.0; 879 double avg_ratio = 1000.0; 880 881 error = eval_maps(children, passes, map_seed, &map, 882 &worst_ratio, &avg_ratio); 883 if (error) { 884 printf("Error eval_maps(): %s\n", strerror(error)); 885 return (1); 886 } 887 888 if (worst_ratio < 1.0 || avg_ratio < 1.0) { 889 printf("Error ratio < 1.0: worst_ratio = %2.03f " 890 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); 891 return (1); 892 } 893 894 snprintf(key, 7, "%llu", (u_longlong_t)children); 895 error = write_map_key(filename, key, map, worst_ratio, 896 avg_ratio); 897 if (error == 0) { 898 /* The new map was added to the file. */ 899 dump_map(map, key, worst_ratio, avg_ratio, 900 MAX(verbose, 1)); 901 } else if (error == EEXIST) { 902 /* The existing map was preferable and kept. */ 903 if (verbose > 0) 904 dump_map_key(filename, key, verbose); 905 } else { 906 printf("Error write_map_key(): %s\n", strerror(error)); 907 return (1); 908 } 909 910 free_map(map); 911 } 912 913 /* 914 * When the continuous option is set restart at the minimum number of 915 * children instead of exiting. This option is useful as a mechanism 916 * to continuous try and refine the discovered permutations. 917 */ 918 if (continuous) { 919 restarts++; 920 printf("Restarting by request (-c): %d\n", restarts); 921 goto restart; 922 } 923 924 return (0); 925 } 926 927 /* 928 * Verify each map in the file by generating its in-memory permutation array 929 * and comfirming its checksum is correct. 930 */ 931 static int 932 draid_verify(int argc, char *argv[]) 933 { 934 char filename[MAXPATHLEN] = {0}; 935 int n = 0, c, error, verbose = 1; 936 int check_ratios = 0; 937 938 while ((c = getopt(argc, argv, ":rv")) != -1) { 939 switch (c) { 940 case 'r': 941 check_ratios++; 942 break; 943 case 'v': 944 verbose++; 945 break; 946 case ':': 947 (void) fprintf(stderr, 948 "missing argument for '%c' option\n", optopt); 949 draid_usage(); 950 break; 951 case '?': 952 (void) fprintf(stderr, "invalid option '%c'\n", 953 optopt); 954 draid_usage(); 955 break; 956 } 957 } 958 959 if (argc > optind) { 960 char *abspath = malloc(MAXPATHLEN); 961 if (abspath == NULL) 962 return (ENOMEM); 963 964 if (realpath(argv[optind], abspath) != NULL) 965 strlcpy(filename, abspath, sizeof (filename)); 966 else 967 strlcpy(filename, argv[optind], sizeof (filename)); 968 969 free(abspath); 970 } else { 971 (void) fprintf(stderr, "A FILE must be specified.\n"); 972 return (1); 973 } 974 975 printf("Verifying permutation maps: '%s'\n", filename); 976 977 /* 978 * Lookup hardcoded permutation map for each valid number of children 979 * and verify a generated map has the correct checksum. Then compare 980 * the generated map values with the nvlist map values read from the 981 * reference file to cross-check the permutation. 982 */ 983 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; 984 children <= VDEV_DRAID_MAX_CHILDREN; 985 children++) { 986 draid_map_t *map; 987 char key[8] = {0}; 988 989 snprintf(key, 8, "%llu", (u_longlong_t)children); 990 991 error = alloc_fixed_map(children, &map); 992 if (error) { 993 printf("Error alloc_fixed_map() failed: %s\n", 994 error == ECKSUM ? "Invalid checksum" : 995 strerror(error)); 996 return (1); 997 } 998 999 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms; 1000 uint8_t *nv_perms; 1001 nvlist_t *cfg; 1002 uint_t c; 1003 1004 error = read_map_key(filename, key, &cfg); 1005 if (error != 0) { 1006 printf("Error read_map_key() failed: %s\n", 1007 strerror(error)); 1008 free_map(map); 1009 return (1); 1010 } 1011 1012 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 1013 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 1014 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 1015 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 1016 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c); 1017 1018 /* 1019 * Compare draid_map_t and nvlist reference values. 1020 */ 1021 if (map->dm_seed != nv_seed) { 1022 printf("Error different seeds: 0x%016llx != " 1023 "0x%016llx\n", (u_longlong_t)map->dm_seed, 1024 (u_longlong_t)nv_seed); 1025 error = EINVAL; 1026 } 1027 1028 if (map->dm_checksum != nv_checksum) { 1029 printf("Error different checksums: 0x%016llx " 1030 "!= 0x%016llx\n", 1031 (u_longlong_t)map->dm_checksum, 1032 (u_longlong_t)nv_checksum); 1033 error = EINVAL; 1034 } 1035 1036 if (map->dm_children != nv_children) { 1037 printf("Error different children: %llu " 1038 "!= %llu\n", (u_longlong_t)map->dm_children, 1039 (u_longlong_t)nv_children); 1040 error = EINVAL; 1041 } 1042 1043 if (map->dm_nperms != nv_nperms) { 1044 printf("Error different nperms: %llu " 1045 "!= %llu\n", (u_longlong_t)map->dm_nperms, 1046 (u_longlong_t)nv_nperms); 1047 error = EINVAL; 1048 } 1049 1050 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) { 1051 if (map->dm_perms[i] != nv_perms[i]) { 1052 printf("Error different perms[%llu]: " 1053 "%d != %d\n", (u_longlong_t)i, 1054 (int)map->dm_perms[i], 1055 (int)nv_perms[i]); 1056 error = EINVAL; 1057 break; 1058 } 1059 } 1060 1061 /* 1062 * For good measure recalculate the worst and average 1063 * ratios and confirm they match the nvlist values. 1064 */ 1065 if (check_ratios) { 1066 uint64_t nv_worst_ratio, nv_avg_ratio; 1067 double worst_ratio, avg_ratio; 1068 1069 eval_decluster(map, &worst_ratio, &avg_ratio); 1070 1071 nv_worst_ratio = fnvlist_lookup_uint64(cfg, 1072 MAP_WORST_RATIO); 1073 nv_avg_ratio = fnvlist_lookup_uint64(cfg, 1074 MAP_AVG_RATIO); 1075 1076 if (worst_ratio < 1.0 || avg_ratio < 1.0) { 1077 printf("Error ratio out of range %2.03f, " 1078 "%2.03f\n", worst_ratio, avg_ratio); 1079 error = EINVAL; 1080 } 1081 1082 if ((uint64_t)(worst_ratio * 1000.0) != 1083 nv_worst_ratio) { 1084 printf("Error different worst_ratio %2.03f " 1085 "!= %2.03f\n", (double)nv_worst_ratio / 1086 1000.0, worst_ratio); 1087 error = EINVAL; 1088 } 1089 1090 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) { 1091 printf("Error different average_ratio %2.03f " 1092 "!= %2.03f\n", (double)nv_avg_ratio / 1093 1000.0, avg_ratio); 1094 error = EINVAL; 1095 } 1096 } 1097 1098 if (error) { 1099 free_map(map); 1100 nvlist_free(cfg); 1101 return (1); 1102 } 1103 1104 if (verbose > 0) { 1105 printf("- %llu children: good\n", 1106 (u_longlong_t)children); 1107 } 1108 n++; 1109 1110 free_map(map); 1111 nvlist_free(cfg); 1112 } 1113 1114 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) { 1115 printf("Error permutation maps missing: %d / %d checked\n", 1116 n, VDEV_DRAID_MAX_CHILDREN - 1); 1117 return (1); 1118 } 1119 1120 printf("Successfully verified %d / %d permutation maps\n", 1121 n, VDEV_DRAID_MAX_CHILDREN - 1); 1122 1123 return (0); 1124 } 1125 1126 /* 1127 * Dump the contents of the specified mapping(s) for inspection. 1128 */ 1129 static int 1130 draid_dump(int argc, char *argv[]) 1131 { 1132 char filename[MAXPATHLEN] = {0}; 1133 int c, error, verbose = 1; 1134 int min_children = VDEV_DRAID_MIN_CHILDREN; 1135 int max_children = VDEV_DRAID_MAX_CHILDREN; 1136 1137 while ((c = getopt(argc, argv, ":vm:n:")) != -1) { 1138 switch (c) { 1139 case 'm': 1140 min_children = (int)strtol(optarg, NULL, 0); 1141 if (min_children < 2) { 1142 (void) fprintf(stderr, "A minimum of 2 " 1143 "children are required.\n"); 1144 return (1); 1145 } 1146 1147 break; 1148 case 'n': 1149 max_children = (int)strtol(optarg, NULL, 0); 1150 if (max_children > VDEV_DRAID_MAX_CHILDREN) { 1151 (void) fprintf(stderr, "A maximum of %d " 1152 "children are allowed.\n", 1153 VDEV_DRAID_MAX_CHILDREN); 1154 return (1); 1155 } 1156 break; 1157 case 'v': 1158 verbose++; 1159 break; 1160 case ':': 1161 (void) fprintf(stderr, 1162 "missing argument for '%c' option\n", optopt); 1163 draid_usage(); 1164 break; 1165 case '?': 1166 (void) fprintf(stderr, "invalid option '%c'\n", 1167 optopt); 1168 draid_usage(); 1169 break; 1170 } 1171 } 1172 1173 if (argc > optind) 1174 strlcpy(filename, argv[optind], sizeof (filename)); 1175 else { 1176 (void) fprintf(stderr, "A FILE must be specified.\n"); 1177 return (1); 1178 } 1179 1180 /* 1181 * Dump maps for the requested child counts. 1182 */ 1183 for (uint64_t children = min_children; 1184 children <= max_children; children++) { 1185 char key[8] = { 0 }; 1186 1187 snprintf(key, 7, "%llu", (u_longlong_t)children); 1188 error = dump_map_key(filename, key, verbose); 1189 if (error) { 1190 printf("Error dump_map_key(): %s\n", strerror(error)); 1191 return (1); 1192 } 1193 } 1194 1195 return (0); 1196 } 1197 1198 /* 1199 * Print all of the mappings as a C formatted draid_map_t array. This table 1200 * is found in the module/zcommon/zfs_draid.c file and is the definitive 1201 * source for all mapping used by dRAID. It cannot be updated without 1202 * changing the dRAID on disk format. 1203 */ 1204 static int 1205 draid_table(int argc, char *argv[]) 1206 { 1207 char filename[MAXPATHLEN] = {0}; 1208 int error; 1209 1210 if (argc > optind) 1211 strlcpy(filename, argv[optind], sizeof (filename)); 1212 else { 1213 (void) fprintf(stderr, "A FILE must be specified.\n"); 1214 return (1); 1215 } 1216 1217 printf("static const draid_map_t " 1218 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n"); 1219 1220 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; 1221 children <= VDEV_DRAID_MAX_CHILDREN; 1222 children++) { 1223 uint64_t seed, checksum, nperms, avg_ratio; 1224 nvlist_t *cfg; 1225 char key[8] = {0}; 1226 1227 snprintf(key, 8, "%llu", (u_longlong_t)children); 1228 1229 error = read_map_key(filename, key, &cfg); 1230 if (error != 0) { 1231 printf("Error read_map_key() failed: %s\n", 1232 strerror(error)); 1233 return (1); 1234 } 1235 1236 seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 1237 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 1238 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 1239 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 1240 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); 1241 1242 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t" 1243 "/* %2.03f */\n", (u_longlong_t)children, 1244 (u_longlong_t)nperms, (u_longlong_t)seed, 1245 (u_longlong_t)checksum, (double)avg_ratio / 1000.0); 1246 1247 nvlist_free(cfg); 1248 } 1249 1250 printf("};\n"); 1251 1252 return (0); 1253 } 1254 1255 static int 1256 draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp) 1257 { 1258 nvlist_t *srccfgs; 1259 nvpair_t *elem = NULL; 1260 int error, merged = 0; 1261 1262 error = read_map(srcfilename, &srccfgs); 1263 if (error != 0) 1264 return (error); 1265 1266 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) { 1267 uint64_t nv_worst_ratio; 1268 uint64_t allcfg_worst_ratio; 1269 nvlist_t *cfg, *allcfg; 1270 const char *key; 1271 1272 switch (nvpair_type(elem)) { 1273 case DATA_TYPE_NVLIST: 1274 1275 (void) nvpair_value_nvlist(elem, &cfg); 1276 key = nvpair_name(elem); 1277 1278 nv_worst_ratio = fnvlist_lookup_uint64(cfg, 1279 MAP_WORST_RATIO); 1280 1281 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg); 1282 if (error == 0) { 1283 allcfg_worst_ratio = fnvlist_lookup_uint64( 1284 allcfg, MAP_WORST_RATIO); 1285 1286 if (nv_worst_ratio < allcfg_worst_ratio) { 1287 fnvlist_remove(allcfgs, key); 1288 fnvlist_add_nvlist(allcfgs, key, cfg); 1289 merged++; 1290 } 1291 } else if (error == ENOENT) { 1292 fnvlist_add_nvlist(allcfgs, key, cfg); 1293 merged++; 1294 } else { 1295 return (error); 1296 } 1297 1298 break; 1299 default: 1300 continue; 1301 } 1302 } 1303 1304 nvlist_free(srccfgs); 1305 1306 *mergedp = merged; 1307 1308 return (0); 1309 } 1310 1311 /* 1312 * Merge the best map for each child count found in the listed files into 1313 * a new file. This allows 'draid generate' to be run in parallel and for 1314 * the results maps to be combined. 1315 */ 1316 static int 1317 draid_merge(int argc, char *argv[]) 1318 { 1319 char filename[MAXPATHLEN] = {0}; 1320 int c, error, total_merged = 0; 1321 nvlist_t *allcfgs; 1322 1323 while ((c = getopt(argc, argv, ":")) != -1) { 1324 switch (c) { 1325 case ':': 1326 (void) fprintf(stderr, 1327 "missing argument for '%c' option\n", optopt); 1328 draid_usage(); 1329 break; 1330 case '?': 1331 (void) fprintf(stderr, "invalid option '%c'\n", 1332 optopt); 1333 draid_usage(); 1334 break; 1335 } 1336 } 1337 1338 if (argc < 4) { 1339 (void) fprintf(stderr, 1340 "A FILE and multiple SRCs must be specified.\n"); 1341 return (1); 1342 } 1343 1344 strlcpy(filename, argv[optind], sizeof (filename)); 1345 optind++; 1346 1347 error = read_map(filename, &allcfgs); 1348 if (error == ENOENT) { 1349 allcfgs = fnvlist_alloc(); 1350 } else if (error != 0) { 1351 printf("Error read_map(): %s\n", strerror(error)); 1352 return (error); 1353 } 1354 1355 while (optind < argc) { 1356 char srcfilename[MAXPATHLEN] = {0}; 1357 int merged = 0; 1358 1359 strlcpy(srcfilename, argv[optind], sizeof (srcfilename)); 1360 1361 error = draid_merge_impl(allcfgs, srcfilename, &merged); 1362 if (error) { 1363 printf("Error draid_merge_impl(): %s\n", 1364 strerror(error)); 1365 nvlist_free(allcfgs); 1366 return (1); 1367 } 1368 1369 total_merged += merged; 1370 printf("Merged %d key(s) from '%s' into '%s'\n", merged, 1371 srcfilename, filename); 1372 1373 optind++; 1374 } 1375 1376 if (total_merged > 0) 1377 write_map(filename, allcfgs); 1378 1379 printf("Merged a total of %d key(s) into '%s'\n", total_merged, 1380 filename); 1381 1382 nvlist_free(allcfgs); 1383 1384 return (0); 1385 } 1386 1387 int 1388 main(int argc, char *argv[]) 1389 { 1390 if (argc < 2) 1391 draid_usage(); 1392 1393 char *subcommand = argv[1]; 1394 1395 if (strcmp(subcommand, "generate") == 0) { 1396 return (draid_generate(argc - 1, argv + 1)); 1397 } else if (strcmp(subcommand, "verify") == 0) { 1398 return (draid_verify(argc - 1, argv + 1)); 1399 } else if (strcmp(subcommand, "dump") == 0) { 1400 return (draid_dump(argc - 1, argv + 1)); 1401 } else if (strcmp(subcommand, "table") == 0) { 1402 return (draid_table(argc - 1, argv + 1)); 1403 } else if (strcmp(subcommand, "merge") == 0) { 1404 return (draid_merge(argc - 1, argv + 1)); 1405 } else { 1406 draid_usage(); 1407 } 1408 } 1409