1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2018 Intel Corporation. 24 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. 25 */ 26 27 #include <stdio.h> 28 #include <zlib.h> 29 #include <zfs_fletcher.h> 30 #include <sys/vdev_draid.h> 31 #include <sys/nvpair.h> 32 #include <sys/stat.h> 33 34 /* 35 * The number of rows to generate for new permutation maps. 36 */ 37 #define MAP_ROWS_DEFAULT 256 38 39 /* 40 * Key values for dRAID maps when stored as nvlists. 41 */ 42 #define MAP_SEED "seed" 43 #define MAP_CHECKSUM "checksum" 44 #define MAP_WORST_RATIO "worst_ratio" 45 #define MAP_AVG_RATIO "avg_ratio" 46 #define MAP_CHILDREN "children" 47 #define MAP_NPERMS "nperms" 48 #define MAP_PERMS "perms" 49 50 static void 51 draid_usage(void) 52 { 53 (void) fprintf(stderr, 54 "usage: draid command args ...\n" 55 "Available commands are:\n" 56 "\n" 57 "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" 58 "\tdraid verify [-rv] FILE\n" 59 "\tdraid dump [-v] [-m min] [-n max] FILE\n" 60 "\tdraid table FILE\n" 61 "\tdraid merge FILE SRC SRC...\n"); 62 exit(1); 63 } 64 65 static int 66 read_map(const char *filename, nvlist_t **allcfgs) 67 { 68 int block_size = 131072; 69 int buf_size = 131072; 70 int tmp_size, error; 71 char *tmp_buf; 72 73 struct stat64 stat; 74 if (lstat64(filename, &stat) != 0) 75 return (errno); 76 77 if (stat.st_size == 0 || 78 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { 79 return (EINVAL); 80 } 81 82 gzFile fp = gzopen(filename, "rb"); 83 if (fp == Z_NULL) 84 return (errno); 85 86 char *buf = malloc(buf_size); 87 if (buf == NULL) { 88 (void) gzclose(fp); 89 return (ENOMEM); 90 } 91 92 ssize_t rc, bytes = 0; 93 while (!gzeof(fp)) { 94 rc = gzread(fp, buf + bytes, block_size); 95 if ((rc < 0) || (rc == 0 && !gzeof(fp))) { 96 free(buf); 97 (void) gzerror(fp, &error); 98 (void) gzclose(fp); 99 return (error); 100 } else { 101 bytes += rc; 102 103 if (bytes + block_size >= buf_size) { 104 tmp_size = 2 * buf_size; 105 tmp_buf = malloc(tmp_size); 106 if (tmp_buf == NULL) { 107 free(buf); 108 (void) gzclose(fp); 109 return (ENOMEM); 110 } 111 112 memcpy(tmp_buf, buf, bytes); 113 free(buf); 114 buf = tmp_buf; 115 buf_size = tmp_size; 116 } 117 } 118 } 119 120 (void) gzclose(fp); 121 122 error = nvlist_unpack(buf, bytes, allcfgs, 0); 123 free(buf); 124 125 return (error); 126 } 127 128 /* 129 * Read a map from the specified filename. A file contains multiple maps 130 * which are indexed by the number of children. The caller is responsible 131 * for freeing the configuration returned. 132 */ 133 static int 134 read_map_key(const char *filename, const char *key, nvlist_t **cfg) 135 { 136 nvlist_t *allcfgs, *foundcfg = NULL; 137 int error; 138 139 error = read_map(filename, &allcfgs); 140 if (error != 0) 141 return (error); 142 143 (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg); 144 if (foundcfg != NULL) { 145 nvlist_dup(foundcfg, cfg, KM_SLEEP); 146 error = 0; 147 } else { 148 error = ENOENT; 149 } 150 151 nvlist_free(allcfgs); 152 153 return (error); 154 } 155 156 /* 157 * Write all mappings to the map file. 158 */ 159 static int 160 write_map(const char *filename, nvlist_t *allcfgs) 161 { 162 size_t buflen = 0; 163 int error; 164 165 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); 166 if (error) 167 return (error); 168 169 char *buf = malloc(buflen); 170 if (buf == NULL) 171 return (ENOMEM); 172 173 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); 174 if (error) { 175 free(buf); 176 return (error); 177 } 178 179 /* 180 * Atomically update the file using a temporary file and the 181 * traditional unlink then rename steps. This code provides 182 * no locking, it only guarantees the packed nvlist on disk 183 * is updated atomically and is internally consistent. 184 */ 185 char *tmpname = calloc(1, MAXPATHLEN); 186 if (tmpname == NULL) { 187 free(buf); 188 return (ENOMEM); 189 } 190 191 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); 192 193 int fd = mkstemp(tmpname); 194 if (fd < 0) { 195 error = errno; 196 free(buf); 197 free(tmpname); 198 return (error); 199 } 200 (void) close(fd); 201 202 gzFile fp = gzopen(tmpname, "w9b"); 203 if (fp == Z_NULL) { 204 error = errno; 205 free(buf); 206 free(tmpname); 207 return (error); 208 } 209 210 ssize_t rc, bytes = 0; 211 while (bytes < buflen) { 212 size_t size = MIN(buflen - bytes, 131072); 213 rc = gzwrite(fp, buf + bytes, size); 214 if (rc < 0) { 215 free(buf); 216 (void) gzerror(fp, &error); 217 (void) gzclose(fp); 218 (void) unlink(tmpname); 219 free(tmpname); 220 return (error); 221 } else if (rc == 0) { 222 break; 223 } else { 224 bytes += rc; 225 } 226 } 227 228 free(buf); 229 (void) gzclose(fp); 230 231 if (bytes != buflen) { 232 (void) unlink(tmpname); 233 free(tmpname); 234 return (EIO); 235 } 236 237 /* 238 * Unlink the previous config file and replace it with the updated 239 * version. If we're able to unlink the file then directory is 240 * writable by us and the subsequent rename should never fail. 241 */ 242 error = unlink(filename); 243 if (error != 0 && errno != ENOENT) { 244 error = errno; 245 (void) unlink(tmpname); 246 free(tmpname); 247 return (error); 248 } 249 250 error = rename(tmpname, filename); 251 if (error != 0) { 252 error = errno; 253 (void) unlink(tmpname); 254 free(tmpname); 255 return (error); 256 } 257 258 free(tmpname); 259 260 return (0); 261 } 262 263 /* 264 * Add the dRAID map to the file and write it out. 265 */ 266 static int 267 write_map_key(const char *filename, char *key, draid_map_t *map, 268 double worst_ratio, double avg_ratio) 269 { 270 nvlist_t *nv_cfg, *allcfgs; 271 int error; 272 273 /* 274 * Add the configuration to an existing or new file. The new 275 * configuration will replace an existing configuration with the 276 * same key if it has a lower ratio and is therefore better. 277 */ 278 error = read_map(filename, &allcfgs); 279 if (error == ENOENT) { 280 allcfgs = fnvlist_alloc(); 281 } else if (error != 0) { 282 return (error); 283 } 284 285 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); 286 if (error == 0) { 287 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, 288 MAP_WORST_RATIO); 289 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; 290 291 if (worst_ratio < nv_worst_ratio) { 292 /* Replace old map with the more balanced new map. */ 293 fnvlist_remove(allcfgs, key); 294 } else { 295 /* The old map is preferable, keep it. */ 296 nvlist_free(allcfgs); 297 return (EEXIST); 298 } 299 } 300 301 nvlist_t *cfg = fnvlist_alloc(); 302 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); 303 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); 304 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); 305 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); 306 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, 307 map->dm_children * map->dm_nperms * sizeof (uint8_t)); 308 309 fnvlist_add_uint64(cfg, MAP_WORST_RATIO, 310 (uint64_t)(worst_ratio * 1000.0)); 311 fnvlist_add_uint64(cfg, MAP_AVG_RATIO, 312 (uint64_t)(avg_ratio * 1000.0)); 313 314 error = nvlist_add_nvlist(allcfgs, key, cfg); 315 if (error == 0) 316 error = write_map(filename, allcfgs); 317 318 nvlist_free(cfg); 319 nvlist_free(allcfgs); 320 return (error); 321 } 322 323 static void 324 dump_map(draid_map_t *map, const char *key, double worst_ratio, 325 double avg_ratio, int verbose) 326 { 327 if (verbose == 0) { 328 return; 329 } else if (verbose == 1) { 330 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " 331 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, 332 worst_ratio, avg_ratio); 333 return; 334 } else { 335 printf(" \"%s\":\n" 336 " seed: 0x%016llx\n" 337 " checksum: 0x%016llx\n" 338 " worst_ratio: %2.03f\n" 339 " avg_ratio: %2.03f\n" 340 " children: %llu\n" 341 " nperms: %llu\n", 342 key, (u_longlong_t)map->dm_seed, 343 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, 344 (u_longlong_t)map->dm_children, 345 (u_longlong_t)map->dm_nperms); 346 347 if (verbose > 2) { 348 printf(" perms = {\n"); 349 for (int i = 0; i < map->dm_nperms; i++) { 350 printf(" { "); 351 for (int j = 0; j < map->dm_children; j++) { 352 printf("%3d%s ", map->dm_perms[ 353 i * map->dm_children + j], 354 j < map->dm_children - 1 ? 355 "," : ""); 356 } 357 printf(" },\n"); 358 } 359 printf(" }\n"); 360 } else if (verbose == 2) { 361 printf(" draid_perms = <omitted>\n"); 362 } 363 } 364 } 365 366 static void 367 dump_map_nv(const char *key, nvlist_t *cfg, int verbose) 368 { 369 draid_map_t map; 370 uint_t c; 371 372 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); 373 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); 374 375 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 376 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 377 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 378 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 379 map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c); 380 381 dump_map(&map, key, (double)worst_ratio / 1000.0, 382 avg_ratio / 1000.0, verbose); 383 } 384 385 /* 386 * Print a summary of the mapping. 387 */ 388 static int 389 dump_map_key(const char *filename, const char *key, int verbose) 390 { 391 nvlist_t *cfg; 392 int error; 393 394 error = read_map_key(filename, key, &cfg); 395 if (error != 0) 396 return (error); 397 398 dump_map_nv(key, cfg, verbose); 399 400 return (0); 401 } 402 403 /* 404 * Allocate a new permutation map for evaluation. 405 */ 406 static int 407 alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, 408 draid_map_t **mapp) 409 { 410 draid_map_t *map; 411 int error; 412 413 map = malloc(sizeof (draid_map_t)); 414 if (map == NULL) 415 return (ENOMEM); 416 417 map->dm_children = children; 418 map->dm_nperms = nperms; 419 map->dm_seed = seed; 420 map->dm_checksum = 0; 421 422 error = vdev_draid_generate_perms(map, &map->dm_perms); 423 if (error) { 424 free(map); 425 return (error); 426 } 427 428 *mapp = map; 429 430 return (0); 431 } 432 433 /* 434 * Allocate the fixed permutation map for N children. 435 */ 436 static int 437 alloc_fixed_map(uint64_t children, draid_map_t **mapp) 438 { 439 const draid_map_t *fixed_map; 440 draid_map_t *map; 441 int error; 442 443 error = vdev_draid_lookup_map(children, &fixed_map); 444 if (error) 445 return (error); 446 447 map = malloc(sizeof (draid_map_t)); 448 if (map == NULL) 449 return (ENOMEM); 450 451 memcpy(map, fixed_map, sizeof (draid_map_t)); 452 VERIFY3U(map->dm_checksum, !=, 0); 453 454 error = vdev_draid_generate_perms(map, &map->dm_perms); 455 if (error) { 456 free(map); 457 return (error); 458 } 459 460 *mapp = map; 461 462 return (0); 463 } 464 465 /* 466 * Free a permutation map. 467 */ 468 static void 469 free_map(draid_map_t *map) 470 { 471 free(map->dm_perms); 472 free(map); 473 } 474 475 /* 476 * Check if dev is in the provided list of faulted devices. 477 */ 478 static inline boolean_t 479 is_faulted(int *faulted_devs, int nfaulted, int dev) 480 { 481 for (int i = 0; i < nfaulted; i++) 482 if (faulted_devs[i] == dev) 483 return (B_TRUE); 484 485 return (B_FALSE); 486 } 487 488 /* 489 * Evaluate how resilvering I/O will be distributed given a list of faulted 490 * vdevs. As a simplification we assume one IO is sufficient to repair each 491 * damaged device in a group. 492 */ 493 static double 494 eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, 495 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) 496 { 497 uint64_t children = map->dm_children; 498 uint64_t ngroups = 1; 499 uint64_t ndisks = children - nspares; 500 501 /* 502 * Calculate the minimum number of groups required to fill a slice. 503 */ 504 while (ngroups * (groupwidth) % (children - nspares) != 0) 505 ngroups++; 506 507 int *ios = calloc(map->dm_children, sizeof (uint64_t)); 508 509 ASSERT3P(ios, !=, NULL); 510 511 /* Resilver all rows */ 512 for (int i = 0; i < map->dm_nperms; i++) { 513 uint8_t *row = &map->dm_perms[i * map->dm_children]; 514 515 /* Resilver all groups with faulted drives */ 516 for (int j = 0; j < ngroups; j++) { 517 uint64_t spareidx = map->dm_children - nspares; 518 boolean_t repair_needed = B_FALSE; 519 520 /* See if any devices in this group are faulted */ 521 uint64_t groupstart = (j * groupwidth) % ndisks; 522 523 for (int k = 0; k < groupwidth; k++) { 524 uint64_t groupidx = (groupstart + k) % ndisks; 525 526 repair_needed = is_faulted(faulted_devs, 527 nfaulted, row[groupidx]); 528 if (repair_needed) 529 break; 530 } 531 532 if (repair_needed == B_FALSE) 533 continue; 534 535 /* 536 * This group is degraded. Calculate the number of 537 * reads the non-faulted drives require and the number 538 * of writes to the distributed hot spare for this row. 539 */ 540 for (int k = 0; k < groupwidth; k++) { 541 uint64_t groupidx = (groupstart + k) % ndisks; 542 543 if (!is_faulted(faulted_devs, nfaulted, 544 row[groupidx])) { 545 ios[row[groupidx]]++; 546 } else if (nspares > 0) { 547 while (is_faulted(faulted_devs, 548 nfaulted, row[spareidx])) { 549 spareidx++; 550 } 551 552 ASSERT3U(spareidx, <, map->dm_children); 553 ios[row[spareidx]]++; 554 spareidx++; 555 } 556 } 557 } 558 } 559 560 *min_child_ios = INT_MAX; 561 *max_child_ios = 0; 562 563 /* 564 * Find the drives with fewest and most required I/O. These values 565 * are used to calculate the imbalance ratio. To avoid returning an 566 * infinite value for permutations which have children that perform 567 * no IO a floor of 1 IO per child is set. This ensures a meaningful 568 * ratio is returned for comparison and it is not an uncommon when 569 * there are a large number of children. 570 */ 571 for (int i = 0; i < map->dm_children; i++) { 572 573 if (is_faulted(faulted_devs, nfaulted, i)) { 574 ASSERT0(ios[i]); 575 continue; 576 } 577 578 if (ios[i] == 0) 579 ios[i] = 1; 580 581 if (ios[i] < *min_child_ios) 582 *min_child_ios = ios[i]; 583 584 if (ios[i] > *max_child_ios) 585 *max_child_ios = ios[i]; 586 } 587 588 ASSERT3S(*min_child_ios, !=, INT_MAX); 589 ASSERT3S(*max_child_ios, !=, 0); 590 591 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); 592 593 free(ios); 594 595 return (ratio); 596 } 597 598 /* 599 * Evaluate the quality of the permutation mapping by considering possible 600 * device failures. Returns the imbalance ratio for the worst mapping which 601 * is defined to be the largest number of child IOs over the fewest number 602 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and 603 * all children perform an equal amount of work during reconstruction. 604 */ 605 static void 606 eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) 607 { 608 uint64_t children = map->dm_children; 609 double worst_ratio = 1.0; 610 double sum = 0; 611 int worst_min_ios = 0, worst_max_ios = 0; 612 int n = 0; 613 614 /* 615 * When there are only 2 children there can be no distributed 616 * spare and no resilver to evaluate. Default to a ratio of 1.0 617 * for this degenerate case. 618 */ 619 if (children == VDEV_DRAID_MIN_CHILDREN) { 620 *worst_ratiop = 1.0; 621 *avg_ratiop = 1.0; 622 return; 623 } 624 625 /* 626 * Score the mapping as if it had either 1 or 2 distributed spares. 627 */ 628 for (int nspares = 1; nspares <= 2; nspares++) { 629 uint64_t faults = nspares; 630 631 /* 632 * Score groupwidths up to 19. This value was chosen as the 633 * largest reasonable width (16d+3p). dRAID pools may be still 634 * be created with wider stripes but they are not considered in 635 * this analysis in order to optimize for the most common cases. 636 */ 637 for (uint64_t groupwidth = 2; 638 groupwidth <= MIN(children - nspares, 19); 639 groupwidth++) { 640 int faulted_devs[2]; 641 int min_ios, max_ios; 642 643 /* 644 * Score possible devices faults. This is limited 645 * to exactly one fault per distributed spare for 646 * the purposes of this similation. 647 */ 648 for (int f1 = 0; f1 < children; f1++) { 649 faulted_devs[0] = f1; 650 double ratio; 651 652 if (faults == 1) { 653 ratio = eval_resilver(map, groupwidth, 654 nspares, faulted_devs, faults, 655 &min_ios, &max_ios); 656 657 if (ratio > worst_ratio) { 658 worst_ratio = ratio; 659 worst_min_ios = min_ios; 660 worst_max_ios = max_ios; 661 } 662 663 sum += ratio; 664 n++; 665 } else if (faults == 2) { 666 for (int f2 = f1 + 1; f2 < children; 667 f2++) { 668 faulted_devs[1] = f2; 669 670 ratio = eval_resilver(map, 671 groupwidth, nspares, 672 faulted_devs, faults, 673 &min_ios, &max_ios); 674 675 if (ratio > worst_ratio) { 676 worst_ratio = ratio; 677 worst_min_ios = min_ios; 678 worst_max_ios = max_ios; 679 } 680 681 sum += ratio; 682 n++; 683 } 684 } 685 } 686 } 687 } 688 689 *worst_ratiop = worst_ratio; 690 *avg_ratiop = sum / n; 691 692 /* 693 * Log the min/max io values for particularly unbalanced maps. 694 * Since the maps are generated entirely randomly these are possible 695 * be exceedingly unlikely. We log it for possible investigation. 696 */ 697 if (worst_ratio > 100.0) { 698 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); 699 printf("worst_min_ios=%d worst_max_ios=%d\n", 700 worst_min_ios, worst_max_ios); 701 } 702 } 703 704 static int 705 eval_maps(uint64_t children, int passes, uint64_t *map_seed, 706 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) 707 { 708 draid_map_t *best_map = NULL; 709 double best_worst_ratio = 1000.0; 710 double best_avg_ratio = 1000.0; 711 712 /* 713 * Perform the requested number of passes evaluating randomly 714 * generated permutation maps. Only the best version is kept. 715 */ 716 for (int i = 0; i < passes; i++) { 717 double worst_ratio, avg_ratio; 718 draid_map_t *map; 719 int error; 720 721 /* 722 * Calculate the next seed and generate a new candidate map. 723 */ 724 error = alloc_new_map(children, MAP_ROWS_DEFAULT, 725 vdev_draid_rand(map_seed), &map); 726 if (error) { 727 if (best_map != NULL) 728 free_map(best_map); 729 return (error); 730 } 731 732 /* 733 * Consider maps with a lower worst_ratio to be of higher 734 * quality. Some maps may have a lower avg_ratio but they 735 * are discarded since they might include some particularly 736 * imbalanced permutations. The average is tracked to in 737 * order to get a sense of the average permutation quality. 738 */ 739 eval_decluster(map, &worst_ratio, &avg_ratio); 740 741 if (best_map == NULL || worst_ratio < best_worst_ratio) { 742 743 if (best_map != NULL) 744 free_map(best_map); 745 746 best_map = map; 747 best_worst_ratio = worst_ratio; 748 best_avg_ratio = avg_ratio; 749 } else { 750 free_map(map); 751 } 752 } 753 754 /* 755 * After determining the best map generate a checksum over the full 756 * permutation array. This checksum is verified when opening a dRAID 757 * pool to ensure the generated in memory permutations are correct. 758 */ 759 zio_cksum_t cksum; 760 fletcher_4_native_varsize(best_map->dm_perms, 761 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, 762 &cksum); 763 best_map->dm_checksum = cksum.zc_word[0]; 764 765 *best_mapp = best_map; 766 *best_ratiop = best_worst_ratio; 767 *avg_ratiop = best_avg_ratio; 768 769 return (0); 770 } 771 772 static int 773 draid_generate(int argc, char *argv[]) 774 { 775 char filename[MAXPATHLEN] = {0}; 776 uint64_t map_seed[2]; 777 int c, fd, error, verbose = 0, passes = 1, continuous = 0; 778 int min_children = VDEV_DRAID_MIN_CHILDREN; 779 int max_children = VDEV_DRAID_MAX_CHILDREN; 780 int restarts = 0; 781 782 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { 783 switch (c) { 784 case 'c': 785 continuous++; 786 break; 787 case 'm': 788 min_children = (int)strtol(optarg, NULL, 0); 789 if (min_children < VDEV_DRAID_MIN_CHILDREN) { 790 (void) fprintf(stderr, "A minimum of 2 " 791 "children are required.\n"); 792 return (1); 793 } 794 795 break; 796 case 'n': 797 max_children = (int)strtol(optarg, NULL, 0); 798 if (max_children > VDEV_DRAID_MAX_CHILDREN) { 799 (void) fprintf(stderr, "A maximum of %d " 800 "children are allowed.\n", 801 VDEV_DRAID_MAX_CHILDREN); 802 return (1); 803 } 804 break; 805 case 'p': 806 passes = (int)strtol(optarg, NULL, 0); 807 break; 808 case 'v': 809 /* 810 * 0 - Only log when a better map is added to the file. 811 * 1 - Log the current best map for each child count. 812 * Minimal output on a single summary line. 813 * 2 - Log the current best map for each child count. 814 * More verbose includes most map fields. 815 * 3 - Log the current best map for each child count. 816 * Very verbose all fields including the full map. 817 */ 818 verbose++; 819 break; 820 case ':': 821 (void) fprintf(stderr, 822 "missing argument for '%c' option\n", optopt); 823 draid_usage(); 824 break; 825 case '?': 826 (void) fprintf(stderr, "invalid option '%c'\n", 827 optopt); 828 draid_usage(); 829 break; 830 } 831 } 832 833 if (argc > optind) 834 strlcpy(filename, argv[optind], sizeof (filename)); 835 else { 836 (void) fprintf(stderr, "A FILE must be specified.\n"); 837 return (1); 838 } 839 840 restart: 841 /* 842 * Start with a fresh seed from /dev/urandom. 843 */ 844 fd = open("/dev/urandom", O_RDONLY); 845 if (fd < 0) { 846 printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); 847 return (1); 848 } else { 849 ssize_t bytes = sizeof (map_seed); 850 ssize_t bytes_read = 0; 851 852 while (bytes_read < bytes) { 853 ssize_t rc = read(fd, ((char *)map_seed) + bytes_read, 854 bytes - bytes_read); 855 if (rc < 0) { 856 printf("Unable to read /dev/urandom: %s\n:", 857 strerror(errno)); 858 close(fd); 859 return (1); 860 } 861 bytes_read += rc; 862 } 863 864 (void) close(fd); 865 } 866 867 if (restarts == 0) 868 printf("Writing generated mappings to '%s':\n", filename); 869 870 /* 871 * Generate maps for all requested child counts. The best map for 872 * each child count is written out to the specified file. If the file 873 * already contains a better mapping this map will not be added. 874 */ 875 for (uint64_t children = min_children; 876 children <= max_children; children++) { 877 char key[8] = { 0 }; 878 draid_map_t *map; 879 double worst_ratio = 1000.0; 880 double avg_ratio = 1000.0; 881 882 error = eval_maps(children, passes, map_seed, &map, 883 &worst_ratio, &avg_ratio); 884 if (error) { 885 printf("Error eval_maps(): %s\n", strerror(error)); 886 return (1); 887 } 888 889 if (worst_ratio < 1.0 || avg_ratio < 1.0) { 890 printf("Error ratio < 1.0: worst_ratio = %2.03f " 891 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); 892 return (1); 893 } 894 895 snprintf(key, 7, "%llu", (u_longlong_t)children); 896 error = write_map_key(filename, key, map, worst_ratio, 897 avg_ratio); 898 if (error == 0) { 899 /* The new map was added to the file. */ 900 dump_map(map, key, worst_ratio, avg_ratio, 901 MAX(verbose, 1)); 902 } else if (error == EEXIST) { 903 /* The existing map was preferable and kept. */ 904 if (verbose > 0) 905 dump_map_key(filename, key, verbose); 906 } else { 907 printf("Error write_map_key(): %s\n", strerror(error)); 908 return (1); 909 } 910 911 free_map(map); 912 } 913 914 /* 915 * When the continuous option is set restart at the minimum number of 916 * children instead of exiting. This option is useful as a mechanism 917 * to continuous try and refine the discovered permutations. 918 */ 919 if (continuous) { 920 restarts++; 921 printf("Restarting by request (-c): %d\n", restarts); 922 goto restart; 923 } 924 925 return (0); 926 } 927 928 /* 929 * Verify each map in the file by generating its in-memory permutation array 930 * and comfirming its checksum is correct. 931 */ 932 static int 933 draid_verify(int argc, char *argv[]) 934 { 935 char filename[MAXPATHLEN] = {0}; 936 int n = 0, c, error, verbose = 1; 937 int check_ratios = 0; 938 939 while ((c = getopt(argc, argv, ":rv")) != -1) { 940 switch (c) { 941 case 'r': 942 check_ratios++; 943 break; 944 case 'v': 945 verbose++; 946 break; 947 case ':': 948 (void) fprintf(stderr, 949 "missing argument for '%c' option\n", optopt); 950 draid_usage(); 951 break; 952 case '?': 953 (void) fprintf(stderr, "invalid option '%c'\n", 954 optopt); 955 draid_usage(); 956 break; 957 } 958 } 959 960 if (argc > optind) { 961 char *abspath = malloc(MAXPATHLEN); 962 if (abspath == NULL) 963 return (ENOMEM); 964 965 if (realpath(argv[optind], abspath) != NULL) 966 strlcpy(filename, abspath, sizeof (filename)); 967 else 968 strlcpy(filename, argv[optind], sizeof (filename)); 969 970 free(abspath); 971 } else { 972 (void) fprintf(stderr, "A FILE must be specified.\n"); 973 return (1); 974 } 975 976 printf("Verifying permutation maps: '%s'\n", filename); 977 978 /* 979 * Lookup hardcoded permutation map for each valid number of children 980 * and verify a generated map has the correct checksum. Then compare 981 * the generated map values with the nvlist map values read from the 982 * reference file to cross-check the permutation. 983 */ 984 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; 985 children <= VDEV_DRAID_MAX_CHILDREN; 986 children++) { 987 draid_map_t *map; 988 char key[8] = {0}; 989 990 snprintf(key, 8, "%llu", (u_longlong_t)children); 991 992 error = alloc_fixed_map(children, &map); 993 if (error) { 994 printf("Error alloc_fixed_map() failed: %s\n", 995 error == ECKSUM ? "Invalid checksum" : 996 strerror(error)); 997 return (1); 998 } 999 1000 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms; 1001 uint8_t *nv_perms; 1002 nvlist_t *cfg; 1003 uint_t c; 1004 1005 error = read_map_key(filename, key, &cfg); 1006 if (error != 0) { 1007 printf("Error read_map_key() failed: %s\n", 1008 strerror(error)); 1009 free_map(map); 1010 return (1); 1011 } 1012 1013 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 1014 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 1015 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 1016 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 1017 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c); 1018 1019 /* 1020 * Compare draid_map_t and nvlist reference values. 1021 */ 1022 if (map->dm_seed != nv_seed) { 1023 printf("Error different seeds: 0x%016llx != " 1024 "0x%016llx\n", (u_longlong_t)map->dm_seed, 1025 (u_longlong_t)nv_seed); 1026 error = EINVAL; 1027 } 1028 1029 if (map->dm_checksum != nv_checksum) { 1030 printf("Error different checksums: 0x%016llx " 1031 "!= 0x%016llx\n", 1032 (u_longlong_t)map->dm_checksum, 1033 (u_longlong_t)nv_checksum); 1034 error = EINVAL; 1035 } 1036 1037 if (map->dm_children != nv_children) { 1038 printf("Error different children: %llu " 1039 "!= %llu\n", (u_longlong_t)map->dm_children, 1040 (u_longlong_t)nv_children); 1041 error = EINVAL; 1042 } 1043 1044 if (map->dm_nperms != nv_nperms) { 1045 printf("Error different nperms: %llu " 1046 "!= %llu\n", (u_longlong_t)map->dm_nperms, 1047 (u_longlong_t)nv_nperms); 1048 error = EINVAL; 1049 } 1050 1051 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) { 1052 if (map->dm_perms[i] != nv_perms[i]) { 1053 printf("Error different perms[%llu]: " 1054 "%d != %d\n", (u_longlong_t)i, 1055 (int)map->dm_perms[i], 1056 (int)nv_perms[i]); 1057 error = EINVAL; 1058 break; 1059 } 1060 } 1061 1062 /* 1063 * For good measure recalculate the worst and average 1064 * ratios and confirm they match the nvlist values. 1065 */ 1066 if (check_ratios) { 1067 uint64_t nv_worst_ratio, nv_avg_ratio; 1068 double worst_ratio, avg_ratio; 1069 1070 eval_decluster(map, &worst_ratio, &avg_ratio); 1071 1072 nv_worst_ratio = fnvlist_lookup_uint64(cfg, 1073 MAP_WORST_RATIO); 1074 nv_avg_ratio = fnvlist_lookup_uint64(cfg, 1075 MAP_AVG_RATIO); 1076 1077 if (worst_ratio < 1.0 || avg_ratio < 1.0) { 1078 printf("Error ratio out of range %2.03f, " 1079 "%2.03f\n", worst_ratio, avg_ratio); 1080 error = EINVAL; 1081 } 1082 1083 if ((uint64_t)(worst_ratio * 1000.0) != 1084 nv_worst_ratio) { 1085 printf("Error different worst_ratio %2.03f " 1086 "!= %2.03f\n", (double)nv_worst_ratio / 1087 1000.0, worst_ratio); 1088 error = EINVAL; 1089 } 1090 1091 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) { 1092 printf("Error different average_ratio %2.03f " 1093 "!= %2.03f\n", (double)nv_avg_ratio / 1094 1000.0, avg_ratio); 1095 error = EINVAL; 1096 } 1097 } 1098 1099 if (error) { 1100 free_map(map); 1101 nvlist_free(cfg); 1102 return (1); 1103 } 1104 1105 if (verbose > 0) { 1106 printf("- %llu children: good\n", 1107 (u_longlong_t)children); 1108 } 1109 n++; 1110 1111 free_map(map); 1112 nvlist_free(cfg); 1113 } 1114 1115 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) { 1116 printf("Error permutation maps missing: %d / %d checked\n", 1117 n, VDEV_DRAID_MAX_CHILDREN - 1); 1118 return (1); 1119 } 1120 1121 printf("Successfully verified %d / %d permutation maps\n", 1122 n, VDEV_DRAID_MAX_CHILDREN - 1); 1123 1124 return (0); 1125 } 1126 1127 /* 1128 * Dump the contents of the specified mapping(s) for inspection. 1129 */ 1130 static int 1131 draid_dump(int argc, char *argv[]) 1132 { 1133 char filename[MAXPATHLEN] = {0}; 1134 int c, error, verbose = 1; 1135 int min_children = VDEV_DRAID_MIN_CHILDREN; 1136 int max_children = VDEV_DRAID_MAX_CHILDREN; 1137 1138 while ((c = getopt(argc, argv, ":vm:n:")) != -1) { 1139 switch (c) { 1140 case 'm': 1141 min_children = (int)strtol(optarg, NULL, 0); 1142 if (min_children < 2) { 1143 (void) fprintf(stderr, "A minimum of 2 " 1144 "children are required.\n"); 1145 return (1); 1146 } 1147 1148 break; 1149 case 'n': 1150 max_children = (int)strtol(optarg, NULL, 0); 1151 if (max_children > VDEV_DRAID_MAX_CHILDREN) { 1152 (void) fprintf(stderr, "A maximum of %d " 1153 "children are allowed.\n", 1154 VDEV_DRAID_MAX_CHILDREN); 1155 return (1); 1156 } 1157 break; 1158 case 'v': 1159 verbose++; 1160 break; 1161 case ':': 1162 (void) fprintf(stderr, 1163 "missing argument for '%c' option\n", optopt); 1164 draid_usage(); 1165 break; 1166 case '?': 1167 (void) fprintf(stderr, "invalid option '%c'\n", 1168 optopt); 1169 draid_usage(); 1170 break; 1171 } 1172 } 1173 1174 if (argc > optind) 1175 strlcpy(filename, argv[optind], sizeof (filename)); 1176 else { 1177 (void) fprintf(stderr, "A FILE must be specified.\n"); 1178 return (1); 1179 } 1180 1181 /* 1182 * Dump maps for the requested child counts. 1183 */ 1184 for (uint64_t children = min_children; 1185 children <= max_children; children++) { 1186 char key[8] = { 0 }; 1187 1188 snprintf(key, 7, "%llu", (u_longlong_t)children); 1189 error = dump_map_key(filename, key, verbose); 1190 if (error) { 1191 printf("Error dump_map_key(): %s\n", strerror(error)); 1192 return (1); 1193 } 1194 } 1195 1196 return (0); 1197 } 1198 1199 /* 1200 * Print all of the mappings as a C formatted draid_map_t array. This table 1201 * is found in the module/zcommon/zfs_draid.c file and is the definitive 1202 * source for all mapping used by dRAID. It cannot be updated without 1203 * changing the dRAID on disk format. 1204 */ 1205 static int 1206 draid_table(int argc, char *argv[]) 1207 { 1208 char filename[MAXPATHLEN] = {0}; 1209 int error; 1210 1211 if (argc > optind) 1212 strlcpy(filename, argv[optind], sizeof (filename)); 1213 else { 1214 (void) fprintf(stderr, "A FILE must be specified.\n"); 1215 return (1); 1216 } 1217 1218 printf("static const draid_map_t " 1219 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n"); 1220 1221 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; 1222 children <= VDEV_DRAID_MAX_CHILDREN; 1223 children++) { 1224 uint64_t seed, checksum, nperms, avg_ratio; 1225 nvlist_t *cfg; 1226 char key[8] = {0}; 1227 1228 snprintf(key, 8, "%llu", (u_longlong_t)children); 1229 1230 error = read_map_key(filename, key, &cfg); 1231 if (error != 0) { 1232 printf("Error read_map_key() failed: %s\n", 1233 strerror(error)); 1234 return (1); 1235 } 1236 1237 seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 1238 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 1239 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 1240 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 1241 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); 1242 1243 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t" 1244 "/* %2.03f */\n", (u_longlong_t)children, 1245 (u_longlong_t)nperms, (u_longlong_t)seed, 1246 (u_longlong_t)checksum, (double)avg_ratio / 1000.0); 1247 1248 nvlist_free(cfg); 1249 } 1250 1251 printf("};\n"); 1252 1253 return (0); 1254 } 1255 1256 static int 1257 draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp) 1258 { 1259 nvlist_t *srccfgs; 1260 nvpair_t *elem = NULL; 1261 int error, merged = 0; 1262 1263 error = read_map(srcfilename, &srccfgs); 1264 if (error != 0) 1265 return (error); 1266 1267 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) { 1268 uint64_t nv_worst_ratio; 1269 uint64_t allcfg_worst_ratio; 1270 nvlist_t *cfg, *allcfg; 1271 const char *key; 1272 1273 switch (nvpair_type(elem)) { 1274 case DATA_TYPE_NVLIST: 1275 1276 (void) nvpair_value_nvlist(elem, &cfg); 1277 key = nvpair_name(elem); 1278 1279 nv_worst_ratio = fnvlist_lookup_uint64(cfg, 1280 MAP_WORST_RATIO); 1281 1282 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg); 1283 if (error == 0) { 1284 allcfg_worst_ratio = fnvlist_lookup_uint64( 1285 allcfg, MAP_WORST_RATIO); 1286 1287 if (nv_worst_ratio < allcfg_worst_ratio) { 1288 fnvlist_remove(allcfgs, key); 1289 fnvlist_add_nvlist(allcfgs, key, cfg); 1290 merged++; 1291 } 1292 } else if (error == ENOENT) { 1293 fnvlist_add_nvlist(allcfgs, key, cfg); 1294 merged++; 1295 } else { 1296 return (error); 1297 } 1298 1299 break; 1300 default: 1301 continue; 1302 } 1303 } 1304 1305 nvlist_free(srccfgs); 1306 1307 *mergedp = merged; 1308 1309 return (0); 1310 } 1311 1312 /* 1313 * Merge the best map for each child count found in the listed files into 1314 * a new file. This allows 'draid generate' to be run in parallel and for 1315 * the results maps to be combined. 1316 */ 1317 static int 1318 draid_merge(int argc, char *argv[]) 1319 { 1320 char filename[MAXPATHLEN] = {0}; 1321 int c, error, total_merged = 0; 1322 nvlist_t *allcfgs; 1323 1324 while ((c = getopt(argc, argv, ":")) != -1) { 1325 switch (c) { 1326 case ':': 1327 (void) fprintf(stderr, 1328 "missing argument for '%c' option\n", optopt); 1329 draid_usage(); 1330 break; 1331 case '?': 1332 (void) fprintf(stderr, "invalid option '%c'\n", 1333 optopt); 1334 draid_usage(); 1335 break; 1336 } 1337 } 1338 1339 if (argc < 4) { 1340 (void) fprintf(stderr, 1341 "A FILE and multiple SRCs must be specified.\n"); 1342 return (1); 1343 } 1344 1345 strlcpy(filename, argv[optind], sizeof (filename)); 1346 optind++; 1347 1348 error = read_map(filename, &allcfgs); 1349 if (error == ENOENT) { 1350 allcfgs = fnvlist_alloc(); 1351 } else if (error != 0) { 1352 printf("Error read_map(): %s\n", strerror(error)); 1353 return (error); 1354 } 1355 1356 while (optind < argc) { 1357 char srcfilename[MAXPATHLEN] = {0}; 1358 int merged = 0; 1359 1360 strlcpy(srcfilename, argv[optind], sizeof (srcfilename)); 1361 1362 error = draid_merge_impl(allcfgs, srcfilename, &merged); 1363 if (error) { 1364 printf("Error draid_merge_impl(): %s\n", 1365 strerror(error)); 1366 nvlist_free(allcfgs); 1367 return (1); 1368 } 1369 1370 total_merged += merged; 1371 printf("Merged %d key(s) from '%s' into '%s'\n", merged, 1372 srcfilename, filename); 1373 1374 optind++; 1375 } 1376 1377 if (total_merged > 0) 1378 write_map(filename, allcfgs); 1379 1380 printf("Merged a total of %d key(s) into '%s'\n", total_merged, 1381 filename); 1382 1383 nvlist_free(allcfgs); 1384 1385 return (0); 1386 } 1387 1388 int 1389 main(int argc, char *argv[]) 1390 { 1391 if (argc < 2) 1392 draid_usage(); 1393 1394 char *subcommand = argv[1]; 1395 1396 if (strcmp(subcommand, "generate") == 0) { 1397 return (draid_generate(argc - 1, argv + 1)); 1398 } else if (strcmp(subcommand, "verify") == 0) { 1399 return (draid_verify(argc - 1, argv + 1)); 1400 } else if (strcmp(subcommand, "dump") == 0) { 1401 return (draid_dump(argc - 1, argv + 1)); 1402 } else if (strcmp(subcommand, "table") == 0) { 1403 return (draid_table(argc - 1, argv + 1)); 1404 } else if (strcmp(subcommand, "merge") == 0) { 1405 return (draid_merge(argc - 1, argv + 1)); 1406 } else { 1407 draid_usage(); 1408 } 1409 } 1410