1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2018 Intel Corporation. 23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. 24 */ 25 26 #include <stdio.h> 27 #include <zlib.h> 28 #include <zfs_fletcher.h> 29 #include <sys/vdev_draid.h> 30 #include <sys/nvpair.h> 31 #include <sys/stat.h> 32 33 /* 34 * The number of rows to generate for new permutation maps. 35 */ 36 #define MAP_ROWS_DEFAULT 256 37 38 /* 39 * Key values for dRAID maps when stored as nvlists. 40 */ 41 #define MAP_SEED "seed" 42 #define MAP_CHECKSUM "checksum" 43 #define MAP_WORST_RATIO "worst_ratio" 44 #define MAP_AVG_RATIO "avg_ratio" 45 #define MAP_CHILDREN "children" 46 #define MAP_NPERMS "nperms" 47 #define MAP_PERMS "perms" 48 49 static void 50 draid_usage(void) 51 { 52 (void) fprintf(stderr, 53 "usage: draid command args ...\n" 54 "Available commands are:\n" 55 "\n" 56 "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" 57 "\tdraid verify [-rv] FILE\n" 58 "\tdraid dump [-v] [-m min] [-n max] FILE\n" 59 "\tdraid table FILE\n" 60 "\tdraid merge FILE SRC SRC...\n"); 61 exit(1); 62 } 63 64 static int 65 read_map(const char *filename, nvlist_t **allcfgs) 66 { 67 int block_size = 131072; 68 int buf_size = 131072; 69 int tmp_size, error; 70 char *tmp_buf; 71 72 struct stat64 stat; 73 if (lstat64(filename, &stat) != 0) 74 return (errno); 75 76 if (stat.st_size == 0 || 77 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { 78 return (EINVAL); 79 } 80 81 gzFile fp = gzopen(filename, "rb"); 82 if (fp == Z_NULL) 83 return (errno); 84 85 char *buf = malloc(buf_size); 86 if (buf == NULL) { 87 (void) gzclose(fp); 88 return (ENOMEM); 89 } 90 91 ssize_t rc, bytes = 0; 92 while (!gzeof(fp)) { 93 rc = gzread(fp, buf + bytes, block_size); 94 if ((rc < 0) || (rc == 0 && !gzeof(fp))) { 95 free(buf); 96 (void) gzclose(fp); 97 (void) gzerror(fp, &error); 98 return (error); 99 } else { 100 bytes += rc; 101 102 if (bytes + block_size >= buf_size) { 103 tmp_size = 2 * buf_size; 104 tmp_buf = malloc(tmp_size); 105 if (tmp_buf == NULL) { 106 free(buf); 107 (void) gzclose(fp); 108 return (ENOMEM); 109 } 110 111 memcpy(tmp_buf, buf, bytes); 112 free(buf); 113 buf = tmp_buf; 114 buf_size = tmp_size; 115 } 116 } 117 } 118 119 (void) gzclose(fp); 120 121 error = nvlist_unpack(buf, bytes, allcfgs, 0); 122 free(buf); 123 124 return (error); 125 } 126 127 /* 128 * Read a map from the specified filename. A file contains multiple maps 129 * which are indexed by the number of children. The caller is responsible 130 * for freeing the configuration returned. 131 */ 132 static int 133 read_map_key(const char *filename, const char *key, nvlist_t **cfg) 134 { 135 nvlist_t *allcfgs, *foundcfg = NULL; 136 int error; 137 138 error = read_map(filename, &allcfgs); 139 if (error != 0) 140 return (error); 141 142 nvlist_lookup_nvlist(allcfgs, key, &foundcfg); 143 if (foundcfg != NULL) { 144 nvlist_dup(foundcfg, cfg, KM_SLEEP); 145 error = 0; 146 } else { 147 error = ENOENT; 148 } 149 150 nvlist_free(allcfgs); 151 152 return (error); 153 } 154 155 /* 156 * Write all mappings to the map file. 157 */ 158 static int 159 write_map(const char *filename, nvlist_t *allcfgs) 160 { 161 size_t buflen = 0; 162 int error; 163 164 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); 165 if (error) 166 return (error); 167 168 char *buf = malloc(buflen); 169 if (buf == NULL) 170 return (ENOMEM); 171 172 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); 173 if (error) { 174 free(buf); 175 return (error); 176 } 177 178 /* 179 * Atomically update the file using a temporary file and the 180 * traditional unlink then rename steps. This code provides 181 * no locking, it only guarantees the packed nvlist on disk 182 * is updated atomically and is internally consistent. 183 */ 184 char *tmpname = calloc(1, MAXPATHLEN); 185 if (tmpname == NULL) { 186 free(buf); 187 return (ENOMEM); 188 } 189 190 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); 191 192 int fd = mkstemp(tmpname); 193 if (fd < 0) { 194 error = errno; 195 free(buf); 196 free(tmpname); 197 return (error); 198 } 199 (void) close(fd); 200 201 gzFile fp = gzopen(tmpname, "w9b"); 202 if (fp == Z_NULL) { 203 error = errno; 204 free(buf); 205 free(tmpname); 206 return (errno); 207 } 208 209 ssize_t rc, bytes = 0; 210 while (bytes < buflen) { 211 size_t size = MIN(buflen - bytes, 131072); 212 rc = gzwrite(fp, buf + bytes, size); 213 if (rc < 0) { 214 free(buf); 215 (void) gzerror(fp, &error); 216 (void) gzclose(fp); 217 (void) unlink(tmpname); 218 free(tmpname); 219 return (error); 220 } else if (rc == 0) { 221 break; 222 } else { 223 bytes += rc; 224 } 225 } 226 227 free(buf); 228 (void) gzclose(fp); 229 230 if (bytes != buflen) { 231 (void) unlink(tmpname); 232 free(tmpname); 233 return (EIO); 234 } 235 236 /* 237 * Unlink the previous config file and replace it with the updated 238 * version. If we're able to unlink the file then directory is 239 * writable by us and the subsequent rename should never fail. 240 */ 241 error = unlink(filename); 242 if (error != 0 && errno != ENOENT) { 243 error = errno; 244 (void) unlink(tmpname); 245 free(tmpname); 246 return (error); 247 } 248 249 error = rename(tmpname, filename); 250 if (error != 0) { 251 error = errno; 252 (void) unlink(tmpname); 253 free(tmpname); 254 return (error); 255 } 256 257 free(tmpname); 258 259 return (0); 260 } 261 262 /* 263 * Add the dRAID map to the file and write it out. 264 */ 265 static int 266 write_map_key(const char *filename, char *key, draid_map_t *map, 267 double worst_ratio, double avg_ratio) 268 { 269 nvlist_t *nv_cfg, *allcfgs; 270 int error; 271 272 /* 273 * Add the configuration to an existing or new file. The new 274 * configuration will replace an existing configuration with the 275 * same key if it has a lower ratio and is therefore better. 276 */ 277 error = read_map(filename, &allcfgs); 278 if (error == ENOENT) { 279 allcfgs = fnvlist_alloc(); 280 } else if (error != 0) { 281 return (error); 282 } 283 284 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); 285 if (error == 0) { 286 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, 287 MAP_WORST_RATIO); 288 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; 289 290 if (worst_ratio < nv_worst_ratio) { 291 /* Replace old map with the more balanced new map. */ 292 fnvlist_remove(allcfgs, key); 293 } else { 294 /* The old map is preferable, keep it. */ 295 nvlist_free(allcfgs); 296 return (EEXIST); 297 } 298 } 299 300 nvlist_t *cfg = fnvlist_alloc(); 301 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); 302 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); 303 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); 304 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); 305 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, 306 map->dm_children * map->dm_nperms * sizeof (uint8_t)); 307 308 fnvlist_add_uint64(cfg, MAP_WORST_RATIO, 309 (uint64_t)(worst_ratio * 1000.0)); 310 fnvlist_add_uint64(cfg, MAP_AVG_RATIO, 311 (uint64_t)(avg_ratio * 1000.0)); 312 313 error = nvlist_add_nvlist(allcfgs, key, cfg); 314 if (error == 0) 315 error = write_map(filename, allcfgs); 316 317 nvlist_free(cfg); 318 nvlist_free(allcfgs); 319 return (error); 320 } 321 322 static void 323 dump_map(draid_map_t *map, const char *key, double worst_ratio, 324 double avg_ratio, int verbose) 325 { 326 if (verbose == 0) { 327 return; 328 } else if (verbose == 1) { 329 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " 330 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, 331 worst_ratio, avg_ratio); 332 return; 333 } else { 334 printf(" \"%s\":\n" 335 " seed: 0x%016llx\n" 336 " checksum: 0x%016llx\n" 337 " worst_ratio: %2.03f\n" 338 " avg_ratio: %2.03f\n" 339 " children: %llu\n" 340 " nperms: %llu\n", 341 key, (u_longlong_t)map->dm_seed, 342 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, 343 (u_longlong_t)map->dm_children, 344 (u_longlong_t)map->dm_nperms); 345 346 if (verbose > 2) { 347 printf(" perms = {\n"); 348 for (int i = 0; i < map->dm_nperms; i++) { 349 printf(" { "); 350 for (int j = 0; j < map->dm_children; j++) { 351 printf("%3d%s ", map->dm_perms[ 352 i * map->dm_children + j], 353 j < map->dm_children - 1 ? 354 "," : ""); 355 } 356 printf(" },\n"); 357 } 358 printf(" }\n"); 359 } else if (verbose == 2) { 360 printf(" draid_perms = <omitted>\n"); 361 } 362 } 363 } 364 365 static void 366 dump_map_nv(const char *key, nvlist_t *cfg, int verbose) 367 { 368 draid_map_t map; 369 uint_t c; 370 371 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); 372 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); 373 374 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 375 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 376 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 377 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 378 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &map.dm_perms, &c); 379 380 dump_map(&map, key, (double)worst_ratio / 1000.0, 381 avg_ratio / 1000.0, verbose); 382 } 383 384 /* 385 * Print a summary of the mapping. 386 */ 387 static int 388 dump_map_key(const char *filename, const char *key, int verbose) 389 { 390 nvlist_t *cfg; 391 int error; 392 393 error = read_map_key(filename, key, &cfg); 394 if (error != 0) 395 return (error); 396 397 dump_map_nv(key, cfg, verbose); 398 399 return (0); 400 } 401 402 /* 403 * Allocate a new permutation map for evaluation. 404 */ 405 static int 406 alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, 407 draid_map_t **mapp) 408 { 409 draid_map_t *map; 410 int error; 411 412 map = malloc(sizeof (draid_map_t)); 413 if (map == NULL) 414 return (ENOMEM); 415 416 map->dm_children = children; 417 map->dm_nperms = nperms; 418 map->dm_seed = seed; 419 map->dm_checksum = 0; 420 421 error = vdev_draid_generate_perms(map, &map->dm_perms); 422 if (error) { 423 free(map); 424 return (error); 425 } 426 427 *mapp = map; 428 429 return (0); 430 } 431 432 /* 433 * Allocate the fixed permutation map for N children. 434 */ 435 static int 436 alloc_fixed_map(uint64_t children, draid_map_t **mapp) 437 { 438 const draid_map_t *fixed_map; 439 draid_map_t *map; 440 int error; 441 442 error = vdev_draid_lookup_map(children, &fixed_map); 443 if (error) 444 return (error); 445 446 map = malloc(sizeof (draid_map_t)); 447 if (map == NULL) 448 return (ENOMEM); 449 450 memcpy(map, fixed_map, sizeof (draid_map_t)); 451 VERIFY3U(map->dm_checksum, !=, 0); 452 453 error = vdev_draid_generate_perms(map, &map->dm_perms); 454 if (error) { 455 free(map); 456 return (error); 457 } 458 459 *mapp = map; 460 461 return (0); 462 } 463 464 /* 465 * Free a permutation map. 466 */ 467 static void 468 free_map(draid_map_t *map) 469 { 470 free(map->dm_perms); 471 free(map); 472 } 473 474 /* 475 * Check if dev is in the provided list of faulted devices. 476 */ 477 static inline boolean_t 478 is_faulted(int *faulted_devs, int nfaulted, int dev) 479 { 480 for (int i = 0; i < nfaulted; i++) 481 if (faulted_devs[i] == dev) 482 return (B_TRUE); 483 484 return (B_FALSE); 485 } 486 487 /* 488 * Evaluate how resilvering I/O will be distributed given a list of faulted 489 * vdevs. As a simplification we assume one IO is sufficient to repair each 490 * damaged device in a group. 491 */ 492 static double 493 eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, 494 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) 495 { 496 uint64_t children = map->dm_children; 497 uint64_t ngroups = 1; 498 uint64_t ndisks = children - nspares; 499 500 /* 501 * Calculate the minimum number of groups required to fill a slice. 502 */ 503 while (ngroups * (groupwidth) % (children - nspares) != 0) 504 ngroups++; 505 506 int *ios = calloc(map->dm_children, sizeof (uint64_t)); 507 508 /* Resilver all rows */ 509 for (int i = 0; i < map->dm_nperms; i++) { 510 uint8_t *row = &map->dm_perms[i * map->dm_children]; 511 512 /* Resilver all groups with faulted drives */ 513 for (int j = 0; j < ngroups; j++) { 514 uint64_t spareidx = map->dm_children - nspares; 515 boolean_t repair_needed = B_FALSE; 516 517 /* See if any devices in this group are faulted */ 518 uint64_t groupstart = (j * groupwidth) % ndisks; 519 520 for (int k = 0; k < groupwidth; k++) { 521 uint64_t groupidx = (groupstart + k) % ndisks; 522 523 repair_needed = is_faulted(faulted_devs, 524 nfaulted, row[groupidx]); 525 if (repair_needed) 526 break; 527 } 528 529 if (repair_needed == B_FALSE) 530 continue; 531 532 /* 533 * This group is degraded. Calculate the number of 534 * reads the non-faulted drives require and the number 535 * of writes to the distributed hot spare for this row. 536 */ 537 for (int k = 0; k < groupwidth; k++) { 538 uint64_t groupidx = (groupstart + k) % ndisks; 539 540 if (!is_faulted(faulted_devs, nfaulted, 541 row[groupidx])) { 542 ios[row[groupidx]]++; 543 } else if (nspares > 0) { 544 while (is_faulted(faulted_devs, 545 nfaulted, row[spareidx])) { 546 spareidx++; 547 } 548 549 ASSERT3U(spareidx, <, map->dm_children); 550 ios[row[spareidx]]++; 551 spareidx++; 552 } 553 } 554 } 555 } 556 557 *min_child_ios = INT_MAX; 558 *max_child_ios = 0; 559 560 /* 561 * Find the drives with fewest and most required I/O. These values 562 * are used to calculate the imbalance ratio. To avoid returning an 563 * infinite value for permutations which have children that perform 564 * no IO a floor of 1 IO per child is set. This ensures a meaningful 565 * ratio is returned for comparison and it is not an uncommon when 566 * there are a large number of children. 567 */ 568 for (int i = 0; i < map->dm_children; i++) { 569 570 if (is_faulted(faulted_devs, nfaulted, i)) { 571 ASSERT0(ios[i]); 572 continue; 573 } 574 575 if (ios[i] == 0) 576 ios[i] = 1; 577 578 if (ios[i] < *min_child_ios) 579 *min_child_ios = ios[i]; 580 581 if (ios[i] > *max_child_ios) 582 *max_child_ios = ios[i]; 583 } 584 585 ASSERT3S(*min_child_ios, !=, INT_MAX); 586 ASSERT3S(*max_child_ios, !=, 0); 587 588 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); 589 590 free(ios); 591 592 return (ratio); 593 } 594 595 /* 596 * Evaluate the quality of the permutation mapping by considering possible 597 * device failures. Returns the imbalance ratio for the worst mapping which 598 * is defined to be the largest number of child IOs over the fewest number 599 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and 600 * all children perform an equal amount of work during reconstruction. 601 */ 602 static void 603 eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) 604 { 605 uint64_t children = map->dm_children; 606 double worst_ratio = 1.0; 607 double sum = 0; 608 int worst_min_ios = 0, worst_max_ios = 0; 609 int n = 0; 610 611 /* 612 * When there are only 2 children there can be no distributed 613 * spare and no resilver to evaluate. Default to a ratio of 1.0 614 * for this degenerate case. 615 */ 616 if (children == VDEV_DRAID_MIN_CHILDREN) { 617 *worst_ratiop = 1.0; 618 *avg_ratiop = 1.0; 619 return; 620 } 621 622 /* 623 * Score the mapping as if it had either 1 or 2 distributed spares. 624 */ 625 for (int nspares = 1; nspares <= 2; nspares++) { 626 uint64_t faults = nspares; 627 628 /* 629 * Score groupwidths up to 19. This value was chosen as the 630 * largest reasonable width (16d+3p). dRAID pools may be still 631 * be created with wider stripes but they are not considered in 632 * this analysis in order to optimize for the most common cases. 633 */ 634 for (uint64_t groupwidth = 2; 635 groupwidth <= MIN(children - nspares, 19); 636 groupwidth++) { 637 int faulted_devs[2]; 638 int min_ios, max_ios; 639 640 /* 641 * Score possible devices faults. This is limited 642 * to exactly one fault per distributed spare for 643 * the purposes of this similation. 644 */ 645 for (int f1 = 0; f1 < children; f1++) { 646 faulted_devs[0] = f1; 647 double ratio; 648 649 if (faults == 1) { 650 ratio = eval_resilver(map, groupwidth, 651 nspares, faulted_devs, faults, 652 &min_ios, &max_ios); 653 654 if (ratio > worst_ratio) { 655 worst_ratio = ratio; 656 worst_min_ios = min_ios; 657 worst_max_ios = max_ios; 658 } 659 660 sum += ratio; 661 n++; 662 } else if (faults == 2) { 663 for (int f2 = f1 + 1; f2 < children; 664 f2++) { 665 faulted_devs[1] = f2; 666 667 ratio = eval_resilver(map, 668 groupwidth, nspares, 669 faulted_devs, faults, 670 &min_ios, &max_ios); 671 672 if (ratio > worst_ratio) { 673 worst_ratio = ratio; 674 worst_min_ios = min_ios; 675 worst_max_ios = max_ios; 676 } 677 678 sum += ratio; 679 n++; 680 } 681 } 682 } 683 } 684 } 685 686 *worst_ratiop = worst_ratio; 687 *avg_ratiop = sum / n; 688 689 /* 690 * Log the min/max io values for particularly unbalanced maps. 691 * Since the maps are generated entirely randomly these are possible 692 * be exceedingly unlikely. We log it for possible investigation. 693 */ 694 if (worst_ratio > 100.0) { 695 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); 696 printf("worst_min_ios=%d worst_max_ios=%d\n", 697 worst_min_ios, worst_max_ios); 698 } 699 } 700 701 static int 702 eval_maps(uint64_t children, int passes, uint64_t *map_seed, 703 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) 704 { 705 draid_map_t *best_map = NULL; 706 double best_worst_ratio = 1000.0; 707 double best_avg_ratio = 1000.0; 708 709 /* 710 * Perform the requested number of passes evaluating randomly 711 * generated permutation maps. Only the best version is kept. 712 */ 713 for (int i = 0; i < passes; i++) { 714 double worst_ratio, avg_ratio; 715 draid_map_t *map; 716 int error; 717 718 /* 719 * Calculate the next seed and generate a new candidate map. 720 */ 721 error = alloc_new_map(children, MAP_ROWS_DEFAULT, 722 vdev_draid_rand(map_seed), &map); 723 if (error) 724 return (error); 725 726 /* 727 * Consider maps with a lower worst_ratio to be of higher 728 * quality. Some maps may have a lower avg_ratio but they 729 * are discarded since they might include some particularly 730 * imbalanced permutations. The average is tracked to in 731 * order to get a sense of the average permutation quality. 732 */ 733 eval_decluster(map, &worst_ratio, &avg_ratio); 734 735 if (best_map == NULL || worst_ratio < best_worst_ratio) { 736 737 if (best_map != NULL) 738 free_map(best_map); 739 740 best_map = map; 741 best_worst_ratio = worst_ratio; 742 best_avg_ratio = avg_ratio; 743 } else { 744 free_map(map); 745 } 746 } 747 748 /* 749 * After determining the best map generate a checksum over the full 750 * permutation array. This checksum is verified when opening a dRAID 751 * pool to ensure the generated in memory permutations are correct. 752 */ 753 zio_cksum_t cksum; 754 fletcher_4_native_varsize(best_map->dm_perms, 755 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, 756 &cksum); 757 best_map->dm_checksum = cksum.zc_word[0]; 758 759 *best_mapp = best_map; 760 *best_ratiop = best_worst_ratio; 761 *avg_ratiop = best_avg_ratio; 762 763 return (0); 764 } 765 766 static int 767 draid_generate(int argc, char *argv[]) 768 { 769 char filename[MAXPATHLEN] = {0}; 770 uint64_t map_seed; 771 int c, fd, error, verbose = 0, passes = 1, continuous = 0; 772 int min_children = VDEV_DRAID_MIN_CHILDREN; 773 int max_children = VDEV_DRAID_MAX_CHILDREN; 774 int restarts = 0; 775 776 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { 777 switch (c) { 778 case 'c': 779 continuous++; 780 break; 781 case 'm': 782 min_children = (int)strtol(optarg, NULL, 0); 783 if (min_children < VDEV_DRAID_MIN_CHILDREN) { 784 (void) fprintf(stderr, "A minimum of 2 " 785 "children are required.\n"); 786 return (1); 787 } 788 789 break; 790 case 'n': 791 max_children = (int)strtol(optarg, NULL, 0); 792 if (max_children > VDEV_DRAID_MAX_CHILDREN) { 793 (void) fprintf(stderr, "A maximum of %d " 794 "children are allowed.\n", 795 VDEV_DRAID_MAX_CHILDREN); 796 return (1); 797 } 798 break; 799 case 'p': 800 passes = (int)strtol(optarg, NULL, 0); 801 break; 802 case 'v': 803 /* 804 * 0 - Only log when a better map is added to the file. 805 * 1 - Log the current best map for each child count. 806 * Minimal output on a single summary line. 807 * 2 - Log the current best map for each child count. 808 * More verbose includes most map fields. 809 * 3 - Log the current best map for each child count. 810 * Very verbose all fields including the full map. 811 */ 812 verbose++; 813 break; 814 case ':': 815 (void) fprintf(stderr, 816 "missing argument for '%c' option\n", optopt); 817 draid_usage(); 818 break; 819 case '?': 820 (void) fprintf(stderr, "invalid option '%c'\n", 821 optopt); 822 draid_usage(); 823 break; 824 } 825 } 826 827 if (argc > optind) 828 strncpy(filename, argv[optind], MAXPATHLEN - 1); 829 else { 830 (void) fprintf(stderr, "A FILE must be specified.\n"); 831 return (1); 832 } 833 834 restart: 835 /* 836 * Start with a fresh seed from /dev/urandom. 837 */ 838 fd = open("/dev/urandom", O_RDONLY); 839 if (fd < 0) { 840 printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); 841 return (1); 842 } else { 843 ssize_t bytes = sizeof (map_seed); 844 ssize_t bytes_read = 0; 845 846 while (bytes_read < bytes) { 847 ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read, 848 bytes - bytes_read); 849 if (rc < 0) { 850 printf("Unable to read /dev/urandom: %s\n:", 851 strerror(errno)); 852 return (1); 853 } 854 bytes_read += rc; 855 } 856 857 (void) close(fd); 858 } 859 860 if (restarts == 0) 861 printf("Writing generated mappings to '%s':\n", filename); 862 863 /* 864 * Generate maps for all requested child counts. The best map for 865 * each child count is written out to the specified file. If the file 866 * already contains a better mapping this map will not be added. 867 */ 868 for (uint64_t children = min_children; 869 children <= max_children; children++) { 870 char key[8] = { 0 }; 871 draid_map_t *map; 872 double worst_ratio = 1000.0; 873 double avg_ratio = 1000.0; 874 875 error = eval_maps(children, passes, &map_seed, &map, 876 &worst_ratio, &avg_ratio); 877 if (error) { 878 printf("Error eval_maps(): %s\n", strerror(error)); 879 return (1); 880 } 881 882 if (worst_ratio < 1.0 || avg_ratio < 1.0) { 883 printf("Error ratio < 1.0: worst_ratio = %2.03f " 884 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); 885 return (1); 886 } 887 888 snprintf(key, 7, "%llu", (u_longlong_t)children); 889 error = write_map_key(filename, key, map, worst_ratio, 890 avg_ratio); 891 if (error == 0) { 892 /* The new map was added to the file. */ 893 dump_map(map, key, worst_ratio, avg_ratio, 894 MAX(verbose, 1)); 895 } else if (error == EEXIST) { 896 /* The existing map was preferable and kept. */ 897 if (verbose > 0) 898 dump_map_key(filename, key, verbose); 899 } else { 900 printf("Error write_map_key(): %s\n", strerror(error)); 901 return (1); 902 } 903 904 free_map(map); 905 } 906 907 /* 908 * When the continuous option is set restart at the minimum number of 909 * children instead of exiting. This option is useful as a mechanism 910 * to continuous try and refine the discovered permutations. 911 */ 912 if (continuous) { 913 restarts++; 914 printf("Restarting by request (-c): %d\n", restarts); 915 goto restart; 916 } 917 918 return (0); 919 } 920 921 /* 922 * Verify each map in the file by generating its in-memory permutation array 923 * and comfirming its checksum is correct. 924 */ 925 static int 926 draid_verify(int argc, char *argv[]) 927 { 928 char filename[MAXPATHLEN] = {0}; 929 int n = 0, c, error, verbose = 1; 930 int check_ratios = 0; 931 932 while ((c = getopt(argc, argv, ":rv")) != -1) { 933 switch (c) { 934 case 'r': 935 check_ratios++; 936 break; 937 case 'v': 938 verbose++; 939 break; 940 case ':': 941 (void) fprintf(stderr, 942 "missing argument for '%c' option\n", optopt); 943 draid_usage(); 944 break; 945 case '?': 946 (void) fprintf(stderr, "invalid option '%c'\n", 947 optopt); 948 draid_usage(); 949 break; 950 } 951 } 952 953 if (argc > optind) { 954 char *abspath = malloc(MAXPATHLEN); 955 if (abspath == NULL) 956 return (ENOMEM); 957 958 if (realpath(argv[optind], abspath) != NULL) 959 strncpy(filename, abspath, MAXPATHLEN - 1); 960 else 961 strncpy(filename, argv[optind], MAXPATHLEN - 1); 962 963 free(abspath); 964 } else { 965 (void) fprintf(stderr, "A FILE must be specified.\n"); 966 return (1); 967 } 968 969 printf("Verifying permutation maps: '%s'\n", filename); 970 971 /* 972 * Lookup hardcoded permutation map for each valid number of children 973 * and verify a generated map has the correct checksum. Then compare 974 * the generated map values with the nvlist map values read from the 975 * reference file to cross-check the permutation. 976 */ 977 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; 978 children <= VDEV_DRAID_MAX_CHILDREN; 979 children++) { 980 draid_map_t *map; 981 char key[8] = {0}; 982 983 snprintf(key, 8, "%llu", (u_longlong_t)children); 984 985 error = alloc_fixed_map(children, &map); 986 if (error) { 987 printf("Error alloc_fixed_map() failed: %s\n", 988 error == ECKSUM ? "Invalid checksum" : 989 strerror(error)); 990 return (1); 991 } 992 993 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms; 994 uint8_t *nv_perms; 995 nvlist_t *cfg; 996 uint_t c; 997 998 error = read_map_key(filename, key, &cfg); 999 if (error != 0) { 1000 printf("Error read_map_key() failed: %s\n", 1001 strerror(error)); 1002 free_map(map); 1003 return (1); 1004 } 1005 1006 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 1007 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 1008 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 1009 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 1010 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c); 1011 1012 /* 1013 * Compare draid_map_t and nvlist reference values. 1014 */ 1015 if (map->dm_seed != nv_seed) { 1016 printf("Error different seeds: 0x%016llx != " 1017 "0x%016llx\n", (u_longlong_t)map->dm_seed, 1018 (u_longlong_t)nv_seed); 1019 error = EINVAL; 1020 } 1021 1022 if (map->dm_checksum != nv_checksum) { 1023 printf("Error different checksums: 0x%016llx " 1024 "!= 0x%016llx\n", 1025 (u_longlong_t)map->dm_checksum, 1026 (u_longlong_t)nv_checksum); 1027 error = EINVAL; 1028 } 1029 1030 if (map->dm_children != nv_children) { 1031 printf("Error different children: %llu " 1032 "!= %llu\n", (u_longlong_t)map->dm_children, 1033 (u_longlong_t)nv_children); 1034 error = EINVAL; 1035 } 1036 1037 if (map->dm_nperms != nv_nperms) { 1038 printf("Error different nperms: %llu " 1039 "!= %llu\n", (u_longlong_t)map->dm_nperms, 1040 (u_longlong_t)nv_nperms); 1041 error = EINVAL; 1042 } 1043 1044 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) { 1045 if (map->dm_perms[i] != nv_perms[i]) { 1046 printf("Error different perms[%llu]: " 1047 "%d != %d\n", (u_longlong_t)i, 1048 (int)map->dm_perms[i], 1049 (int)nv_perms[i]); 1050 error = EINVAL; 1051 break; 1052 } 1053 } 1054 1055 /* 1056 * For good measure recalculate the worst and average 1057 * ratios and confirm they match the nvlist values. 1058 */ 1059 if (check_ratios) { 1060 uint64_t nv_worst_ratio, nv_avg_ratio; 1061 double worst_ratio, avg_ratio; 1062 1063 eval_decluster(map, &worst_ratio, &avg_ratio); 1064 1065 nv_worst_ratio = fnvlist_lookup_uint64(cfg, 1066 MAP_WORST_RATIO); 1067 nv_avg_ratio = fnvlist_lookup_uint64(cfg, 1068 MAP_AVG_RATIO); 1069 1070 if (worst_ratio < 1.0 || avg_ratio < 1.0) { 1071 printf("Error ratio out of range %2.03f, " 1072 "%2.03f\n", worst_ratio, avg_ratio); 1073 error = EINVAL; 1074 } 1075 1076 if ((uint64_t)(worst_ratio * 1000.0) != 1077 nv_worst_ratio) { 1078 printf("Error different worst_ratio %2.03f " 1079 "!= %2.03f\n", (double)nv_worst_ratio / 1080 1000.0, worst_ratio); 1081 error = EINVAL; 1082 } 1083 1084 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) { 1085 printf("Error different average_ratio %2.03f " 1086 "!= %2.03f\n", (double)nv_avg_ratio / 1087 1000.0, avg_ratio); 1088 error = EINVAL; 1089 } 1090 } 1091 1092 if (error) { 1093 free_map(map); 1094 nvlist_free(cfg); 1095 return (1); 1096 } 1097 1098 if (verbose > 0) { 1099 printf("- %llu children: good\n", 1100 (u_longlong_t)children); 1101 } 1102 n++; 1103 1104 free_map(map); 1105 nvlist_free(cfg); 1106 } 1107 1108 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) { 1109 printf("Error permutation maps missing: %d / %d checked\n", 1110 n, VDEV_DRAID_MAX_CHILDREN - 1); 1111 return (1); 1112 } 1113 1114 printf("Successfully verified %d / %d permutation maps\n", 1115 n, VDEV_DRAID_MAX_CHILDREN - 1); 1116 1117 return (0); 1118 } 1119 1120 /* 1121 * Dump the contents of the specified mapping(s) for inspection. 1122 */ 1123 static int 1124 draid_dump(int argc, char *argv[]) 1125 { 1126 char filename[MAXPATHLEN] = {0}; 1127 int c, error, verbose = 1; 1128 int min_children = VDEV_DRAID_MIN_CHILDREN; 1129 int max_children = VDEV_DRAID_MAX_CHILDREN; 1130 1131 while ((c = getopt(argc, argv, ":vm:n:")) != -1) { 1132 switch (c) { 1133 case 'm': 1134 min_children = (int)strtol(optarg, NULL, 0); 1135 if (min_children < 2) { 1136 (void) fprintf(stderr, "A minimum of 2 " 1137 "children are required.\n"); 1138 return (1); 1139 } 1140 1141 break; 1142 case 'n': 1143 max_children = (int)strtol(optarg, NULL, 0); 1144 if (max_children > VDEV_DRAID_MAX_CHILDREN) { 1145 (void) fprintf(stderr, "A maximum of %d " 1146 "children are allowed.\n", 1147 VDEV_DRAID_MAX_CHILDREN); 1148 return (1); 1149 } 1150 break; 1151 case 'v': 1152 verbose++; 1153 break; 1154 case ':': 1155 (void) fprintf(stderr, 1156 "missing argument for '%c' option\n", optopt); 1157 draid_usage(); 1158 break; 1159 case '?': 1160 (void) fprintf(stderr, "invalid option '%c'\n", 1161 optopt); 1162 draid_usage(); 1163 break; 1164 } 1165 } 1166 1167 if (argc > optind) 1168 strncpy(filename, argv[optind], MAXPATHLEN - 1); 1169 else { 1170 (void) fprintf(stderr, "A FILE must be specified.\n"); 1171 return (1); 1172 } 1173 1174 /* 1175 * Dump maps for the requested child counts. 1176 */ 1177 for (uint64_t children = min_children; 1178 children <= max_children; children++) { 1179 char key[8] = { 0 }; 1180 1181 snprintf(key, 7, "%llu", (u_longlong_t)children); 1182 error = dump_map_key(filename, key, verbose); 1183 if (error) { 1184 printf("Error dump_map_key(): %s\n", strerror(error)); 1185 return (1); 1186 } 1187 } 1188 1189 return (0); 1190 } 1191 1192 /* 1193 * Print all of the mappings as a C formatted draid_map_t array. This table 1194 * is found in the module/zcommon/zfs_draid.c file and is the definitive 1195 * source for all mapping used by dRAID. It cannot be updated without 1196 * changing the dRAID on disk format. 1197 */ 1198 static int 1199 draid_table(int argc, char *argv[]) 1200 { 1201 char filename[MAXPATHLEN] = {0}; 1202 int error; 1203 1204 if (argc > optind) 1205 strncpy(filename, argv[optind], MAXPATHLEN - 1); 1206 else { 1207 (void) fprintf(stderr, "A FILE must be specified.\n"); 1208 return (1); 1209 } 1210 1211 printf("static const draid_map_t " 1212 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n"); 1213 1214 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; 1215 children <= VDEV_DRAID_MAX_CHILDREN; 1216 children++) { 1217 uint64_t seed, checksum, nperms, avg_ratio; 1218 nvlist_t *cfg; 1219 char key[8] = {0}; 1220 1221 snprintf(key, 8, "%llu", (u_longlong_t)children); 1222 1223 error = read_map_key(filename, key, &cfg); 1224 if (error != 0) { 1225 printf("Error read_map_key() failed: %s\n", 1226 strerror(error)); 1227 return (1); 1228 } 1229 1230 seed = fnvlist_lookup_uint64(cfg, MAP_SEED); 1231 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); 1232 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); 1233 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); 1234 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); 1235 1236 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t" 1237 "/* %2.03f */\n", (u_longlong_t)children, 1238 (u_longlong_t)nperms, (u_longlong_t)seed, 1239 (u_longlong_t)checksum, (double)avg_ratio / 1000.0); 1240 1241 nvlist_free(cfg); 1242 } 1243 1244 printf("};\n"); 1245 1246 return (0); 1247 } 1248 1249 static int 1250 draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp) 1251 { 1252 nvlist_t *srccfgs; 1253 nvpair_t *elem = NULL; 1254 int error, merged = 0; 1255 1256 error = read_map(srcfilename, &srccfgs); 1257 if (error != 0) 1258 return (error); 1259 1260 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) { 1261 uint64_t nv_worst_ratio; 1262 uint64_t allcfg_worst_ratio; 1263 nvlist_t *cfg, *allcfg; 1264 char *key; 1265 1266 switch (nvpair_type(elem)) { 1267 case DATA_TYPE_NVLIST: 1268 1269 (void) nvpair_value_nvlist(elem, &cfg); 1270 key = nvpair_name(elem); 1271 1272 nv_worst_ratio = fnvlist_lookup_uint64(cfg, 1273 MAP_WORST_RATIO); 1274 1275 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg); 1276 if (error == 0) { 1277 allcfg_worst_ratio = fnvlist_lookup_uint64( 1278 allcfg, MAP_WORST_RATIO); 1279 1280 if (nv_worst_ratio < allcfg_worst_ratio) { 1281 fnvlist_remove(allcfgs, key); 1282 error = nvlist_add_nvlist(allcfgs, 1283 key, cfg); 1284 merged++; 1285 } 1286 } else if (error == ENOENT) { 1287 error = nvlist_add_nvlist(allcfgs, key, cfg); 1288 merged++; 1289 } else { 1290 return (error); 1291 } 1292 1293 break; 1294 default: 1295 continue; 1296 } 1297 } 1298 1299 nvlist_free(srccfgs); 1300 1301 *mergedp = merged; 1302 1303 return (0); 1304 } 1305 1306 /* 1307 * Merge the best map for each child count found in the listed files into 1308 * a new file. This allows 'draid generate' to be run in parallel and for 1309 * the results maps to be combined. 1310 */ 1311 static int 1312 draid_merge(int argc, char *argv[]) 1313 { 1314 char filename[MAXPATHLEN] = {0}; 1315 int c, error, total_merged = 0; 1316 nvlist_t *allcfgs; 1317 1318 while ((c = getopt(argc, argv, ":")) != -1) { 1319 switch (c) { 1320 case ':': 1321 (void) fprintf(stderr, 1322 "missing argument for '%c' option\n", optopt); 1323 draid_usage(); 1324 break; 1325 case '?': 1326 (void) fprintf(stderr, "invalid option '%c'\n", 1327 optopt); 1328 draid_usage(); 1329 break; 1330 } 1331 } 1332 1333 if (argc < 4) { 1334 (void) fprintf(stderr, 1335 "A FILE and multiple SRCs must be specified.\n"); 1336 return (1); 1337 } 1338 1339 strncpy(filename, argv[optind], MAXPATHLEN - 1); 1340 optind++; 1341 1342 error = read_map(filename, &allcfgs); 1343 if (error == ENOENT) { 1344 allcfgs = fnvlist_alloc(); 1345 } else if (error != 0) { 1346 printf("Error read_map(): %s\n", strerror(error)); 1347 return (error); 1348 } 1349 1350 while (optind < argc) { 1351 char srcfilename[MAXPATHLEN] = {0}; 1352 int merged = 0; 1353 1354 strncpy(srcfilename, argv[optind], MAXPATHLEN - 1); 1355 1356 error = draid_merge_impl(allcfgs, srcfilename, &merged); 1357 if (error) { 1358 printf("Error draid_merge_impl(): %s\n", 1359 strerror(error)); 1360 nvlist_free(allcfgs); 1361 return (1); 1362 } 1363 1364 total_merged += merged; 1365 printf("Merged %d key(s) from '%s' into '%s'\n", merged, 1366 srcfilename, filename); 1367 1368 optind++; 1369 } 1370 1371 if (total_merged > 0) 1372 write_map(filename, allcfgs); 1373 1374 printf("Merged a total of %d key(s) into '%s'\n", total_merged, 1375 filename); 1376 1377 nvlist_free(allcfgs); 1378 1379 return (0); 1380 } 1381 1382 int 1383 main(int argc, char *argv[]) 1384 { 1385 if (argc < 2) 1386 draid_usage(); 1387 1388 char *subcommand = argv[1]; 1389 1390 if (strcmp(subcommand, "generate") == 0) { 1391 return (draid_generate(argc - 1, argv + 1)); 1392 } else if (strcmp(subcommand, "verify") == 0) { 1393 return (draid_verify(argc - 1, argv + 1)); 1394 } else if (strcmp(subcommand, "dump") == 0) { 1395 return (draid_dump(argc - 1, argv + 1)); 1396 } else if (strcmp(subcommand, "table") == 0) { 1397 return (draid_table(argc - 1, argv + 1)); 1398 } else if (strcmp(subcommand, "merge") == 0) { 1399 return (draid_merge(argc - 1, argv + 1)); 1400 } else { 1401 draid_usage(); 1402 } 1403 } 1404