1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2018 Intel Corporation.
23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
24 */
25
26 #include <stdio.h>
27 #include <zlib.h>
28 #include <zfs_fletcher.h>
29 #include <sys/vdev_draid.h>
30 #include <sys/nvpair.h>
31 #include <sys/stat.h>
32
33 /*
34 * The number of rows to generate for new permutation maps.
35 */
36 #define MAP_ROWS_DEFAULT 256
37
38 /*
39 * Key values for dRAID maps when stored as nvlists.
40 */
41 #define MAP_SEED "seed"
42 #define MAP_CHECKSUM "checksum"
43 #define MAP_WORST_RATIO "worst_ratio"
44 #define MAP_AVG_RATIO "avg_ratio"
45 #define MAP_CHILDREN "children"
46 #define MAP_NPERMS "nperms"
47 #define MAP_PERMS "perms"
48
49 static void
draid_usage(void)50 draid_usage(void)
51 {
52 (void) fprintf(stderr,
53 "usage: draid command args ...\n"
54 "Available commands are:\n"
55 "\n"
56 "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
57 "\tdraid verify [-rv] FILE\n"
58 "\tdraid dump [-v] [-m min] [-n max] FILE\n"
59 "\tdraid table FILE\n"
60 "\tdraid merge FILE SRC SRC...\n");
61 exit(1);
62 }
63
64 static int
read_map(const char * filename,nvlist_t ** allcfgs)65 read_map(const char *filename, nvlist_t **allcfgs)
66 {
67 int block_size = 131072;
68 int buf_size = 131072;
69 int tmp_size, error;
70 char *tmp_buf;
71
72 struct stat64 stat;
73 if (lstat64(filename, &stat) != 0)
74 return (errno);
75
76 if (stat.st_size == 0 ||
77 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
78 return (EINVAL);
79 }
80
81 gzFile fp = gzopen(filename, "rb");
82 if (fp == Z_NULL)
83 return (errno);
84
85 char *buf = malloc(buf_size);
86 if (buf == NULL) {
87 (void) gzclose(fp);
88 return (ENOMEM);
89 }
90
91 ssize_t rc, bytes = 0;
92 while (!gzeof(fp)) {
93 rc = gzread(fp, buf + bytes, block_size);
94 if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
95 free(buf);
96 (void) gzerror(fp, &error);
97 (void) gzclose(fp);
98 return (error);
99 } else {
100 bytes += rc;
101
102 if (bytes + block_size >= buf_size) {
103 tmp_size = 2 * buf_size;
104 tmp_buf = malloc(tmp_size);
105 if (tmp_buf == NULL) {
106 free(buf);
107 (void) gzclose(fp);
108 return (ENOMEM);
109 }
110
111 memcpy(tmp_buf, buf, bytes);
112 free(buf);
113 buf = tmp_buf;
114 buf_size = tmp_size;
115 }
116 }
117 }
118
119 (void) gzclose(fp);
120
121 error = nvlist_unpack(buf, bytes, allcfgs, 0);
122 free(buf);
123
124 return (error);
125 }
126
127 /*
128 * Read a map from the specified filename. A file contains multiple maps
129 * which are indexed by the number of children. The caller is responsible
130 * for freeing the configuration returned.
131 */
132 static int
read_map_key(const char * filename,const char * key,nvlist_t ** cfg)133 read_map_key(const char *filename, const char *key, nvlist_t **cfg)
134 {
135 nvlist_t *allcfgs, *foundcfg = NULL;
136 int error;
137
138 error = read_map(filename, &allcfgs);
139 if (error != 0)
140 return (error);
141
142 (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
143 if (foundcfg != NULL) {
144 nvlist_dup(foundcfg, cfg, KM_SLEEP);
145 error = 0;
146 } else {
147 error = ENOENT;
148 }
149
150 nvlist_free(allcfgs);
151
152 return (error);
153 }
154
155 /*
156 * Write all mappings to the map file.
157 */
158 static int
write_map(const char * filename,nvlist_t * allcfgs)159 write_map(const char *filename, nvlist_t *allcfgs)
160 {
161 size_t buflen = 0;
162 int error;
163
164 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
165 if (error)
166 return (error);
167
168 char *buf = malloc(buflen);
169 if (buf == NULL)
170 return (ENOMEM);
171
172 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
173 if (error) {
174 free(buf);
175 return (error);
176 }
177
178 /*
179 * Atomically update the file using a temporary file and the
180 * traditional unlink then rename steps. This code provides
181 * no locking, it only guarantees the packed nvlist on disk
182 * is updated atomically and is internally consistent.
183 */
184 char *tmpname = calloc(1, MAXPATHLEN);
185 if (tmpname == NULL) {
186 free(buf);
187 return (ENOMEM);
188 }
189
190 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
191
192 int fd = mkstemp(tmpname);
193 if (fd < 0) {
194 error = errno;
195 free(buf);
196 free(tmpname);
197 return (error);
198 }
199 (void) close(fd);
200
201 gzFile fp = gzopen(tmpname, "w9b");
202 if (fp == Z_NULL) {
203 error = errno;
204 free(buf);
205 free(tmpname);
206 return (errno);
207 }
208
209 ssize_t rc, bytes = 0;
210 while (bytes < buflen) {
211 size_t size = MIN(buflen - bytes, 131072);
212 rc = gzwrite(fp, buf + bytes, size);
213 if (rc < 0) {
214 free(buf);
215 (void) gzerror(fp, &error);
216 (void) gzclose(fp);
217 (void) unlink(tmpname);
218 free(tmpname);
219 return (error);
220 } else if (rc == 0) {
221 break;
222 } else {
223 bytes += rc;
224 }
225 }
226
227 free(buf);
228 (void) gzclose(fp);
229
230 if (bytes != buflen) {
231 (void) unlink(tmpname);
232 free(tmpname);
233 return (EIO);
234 }
235
236 /*
237 * Unlink the previous config file and replace it with the updated
238 * version. If we're able to unlink the file then directory is
239 * writable by us and the subsequent rename should never fail.
240 */
241 error = unlink(filename);
242 if (error != 0 && errno != ENOENT) {
243 error = errno;
244 (void) unlink(tmpname);
245 free(tmpname);
246 return (error);
247 }
248
249 error = rename(tmpname, filename);
250 if (error != 0) {
251 error = errno;
252 (void) unlink(tmpname);
253 free(tmpname);
254 return (error);
255 }
256
257 free(tmpname);
258
259 return (0);
260 }
261
262 /*
263 * Add the dRAID map to the file and write it out.
264 */
265 static int
write_map_key(const char * filename,char * key,draid_map_t * map,double worst_ratio,double avg_ratio)266 write_map_key(const char *filename, char *key, draid_map_t *map,
267 double worst_ratio, double avg_ratio)
268 {
269 nvlist_t *nv_cfg, *allcfgs;
270 int error;
271
272 /*
273 * Add the configuration to an existing or new file. The new
274 * configuration will replace an existing configuration with the
275 * same key if it has a lower ratio and is therefore better.
276 */
277 error = read_map(filename, &allcfgs);
278 if (error == ENOENT) {
279 allcfgs = fnvlist_alloc();
280 } else if (error != 0) {
281 return (error);
282 }
283
284 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
285 if (error == 0) {
286 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
287 MAP_WORST_RATIO);
288 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
289
290 if (worst_ratio < nv_worst_ratio) {
291 /* Replace old map with the more balanced new map. */
292 fnvlist_remove(allcfgs, key);
293 } else {
294 /* The old map is preferable, keep it. */
295 nvlist_free(allcfgs);
296 return (EEXIST);
297 }
298 }
299
300 nvlist_t *cfg = fnvlist_alloc();
301 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
302 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
303 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
304 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
305 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
306 map->dm_children * map->dm_nperms * sizeof (uint8_t));
307
308 fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
309 (uint64_t)(worst_ratio * 1000.0));
310 fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
311 (uint64_t)(avg_ratio * 1000.0));
312
313 error = nvlist_add_nvlist(allcfgs, key, cfg);
314 if (error == 0)
315 error = write_map(filename, allcfgs);
316
317 nvlist_free(cfg);
318 nvlist_free(allcfgs);
319 return (error);
320 }
321
322 static void
dump_map(draid_map_t * map,const char * key,double worst_ratio,double avg_ratio,int verbose)323 dump_map(draid_map_t *map, const char *key, double worst_ratio,
324 double avg_ratio, int verbose)
325 {
326 if (verbose == 0) {
327 return;
328 } else if (verbose == 1) {
329 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
330 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
331 worst_ratio, avg_ratio);
332 return;
333 } else {
334 printf(" \"%s\":\n"
335 " seed: 0x%016llx\n"
336 " checksum: 0x%016llx\n"
337 " worst_ratio: %2.03f\n"
338 " avg_ratio: %2.03f\n"
339 " children: %llu\n"
340 " nperms: %llu\n",
341 key, (u_longlong_t)map->dm_seed,
342 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
343 (u_longlong_t)map->dm_children,
344 (u_longlong_t)map->dm_nperms);
345
346 if (verbose > 2) {
347 printf(" perms = {\n");
348 for (int i = 0; i < map->dm_nperms; i++) {
349 printf(" { ");
350 for (int j = 0; j < map->dm_children; j++) {
351 printf("%3d%s ", map->dm_perms[
352 i * map->dm_children + j],
353 j < map->dm_children - 1 ?
354 "," : "");
355 }
356 printf(" },\n");
357 }
358 printf(" }\n");
359 } else if (verbose == 2) {
360 printf(" draid_perms = <omitted>\n");
361 }
362 }
363 }
364
365 static void
dump_map_nv(const char * key,nvlist_t * cfg,int verbose)366 dump_map_nv(const char *key, nvlist_t *cfg, int verbose)
367 {
368 draid_map_t map;
369 uint_t c;
370
371 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
372 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
373
374 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
375 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
376 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
377 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
378 map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c);
379
380 dump_map(&map, key, (double)worst_ratio / 1000.0,
381 avg_ratio / 1000.0, verbose);
382 }
383
384 /*
385 * Print a summary of the mapping.
386 */
387 static int
dump_map_key(const char * filename,const char * key,int verbose)388 dump_map_key(const char *filename, const char *key, int verbose)
389 {
390 nvlist_t *cfg;
391 int error;
392
393 error = read_map_key(filename, key, &cfg);
394 if (error != 0)
395 return (error);
396
397 dump_map_nv(key, cfg, verbose);
398
399 return (0);
400 }
401
402 /*
403 * Allocate a new permutation map for evaluation.
404 */
405 static int
alloc_new_map(uint64_t children,uint64_t nperms,uint64_t seed,draid_map_t ** mapp)406 alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
407 draid_map_t **mapp)
408 {
409 draid_map_t *map;
410 int error;
411
412 map = malloc(sizeof (draid_map_t));
413 if (map == NULL)
414 return (ENOMEM);
415
416 map->dm_children = children;
417 map->dm_nperms = nperms;
418 map->dm_seed = seed;
419 map->dm_checksum = 0;
420
421 error = vdev_draid_generate_perms(map, &map->dm_perms);
422 if (error) {
423 free(map);
424 return (error);
425 }
426
427 *mapp = map;
428
429 return (0);
430 }
431
432 /*
433 * Allocate the fixed permutation map for N children.
434 */
435 static int
alloc_fixed_map(uint64_t children,draid_map_t ** mapp)436 alloc_fixed_map(uint64_t children, draid_map_t **mapp)
437 {
438 const draid_map_t *fixed_map;
439 draid_map_t *map;
440 int error;
441
442 error = vdev_draid_lookup_map(children, &fixed_map);
443 if (error)
444 return (error);
445
446 map = malloc(sizeof (draid_map_t));
447 if (map == NULL)
448 return (ENOMEM);
449
450 memcpy(map, fixed_map, sizeof (draid_map_t));
451 VERIFY3U(map->dm_checksum, !=, 0);
452
453 error = vdev_draid_generate_perms(map, &map->dm_perms);
454 if (error) {
455 free(map);
456 return (error);
457 }
458
459 *mapp = map;
460
461 return (0);
462 }
463
464 /*
465 * Free a permutation map.
466 */
467 static void
free_map(draid_map_t * map)468 free_map(draid_map_t *map)
469 {
470 free(map->dm_perms);
471 free(map);
472 }
473
474 /*
475 * Check if dev is in the provided list of faulted devices.
476 */
477 static inline boolean_t
is_faulted(int * faulted_devs,int nfaulted,int dev)478 is_faulted(int *faulted_devs, int nfaulted, int dev)
479 {
480 for (int i = 0; i < nfaulted; i++)
481 if (faulted_devs[i] == dev)
482 return (B_TRUE);
483
484 return (B_FALSE);
485 }
486
487 /*
488 * Evaluate how resilvering I/O will be distributed given a list of faulted
489 * vdevs. As a simplification we assume one IO is sufficient to repair each
490 * damaged device in a group.
491 */
492 static double
eval_resilver(draid_map_t * map,uint64_t groupwidth,uint64_t nspares,int * faulted_devs,int nfaulted,int * min_child_ios,int * max_child_ios)493 eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
494 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
495 {
496 uint64_t children = map->dm_children;
497 uint64_t ngroups = 1;
498 uint64_t ndisks = children - nspares;
499
500 /*
501 * Calculate the minimum number of groups required to fill a slice.
502 */
503 while (ngroups * (groupwidth) % (children - nspares) != 0)
504 ngroups++;
505
506 int *ios = calloc(map->dm_children, sizeof (uint64_t));
507
508 ASSERT3P(ios, !=, NULL);
509
510 /* Resilver all rows */
511 for (int i = 0; i < map->dm_nperms; i++) {
512 uint8_t *row = &map->dm_perms[i * map->dm_children];
513
514 /* Resilver all groups with faulted drives */
515 for (int j = 0; j < ngroups; j++) {
516 uint64_t spareidx = map->dm_children - nspares;
517 boolean_t repair_needed = B_FALSE;
518
519 /* See if any devices in this group are faulted */
520 uint64_t groupstart = (j * groupwidth) % ndisks;
521
522 for (int k = 0; k < groupwidth; k++) {
523 uint64_t groupidx = (groupstart + k) % ndisks;
524
525 repair_needed = is_faulted(faulted_devs,
526 nfaulted, row[groupidx]);
527 if (repair_needed)
528 break;
529 }
530
531 if (repair_needed == B_FALSE)
532 continue;
533
534 /*
535 * This group is degraded. Calculate the number of
536 * reads the non-faulted drives require and the number
537 * of writes to the distributed hot spare for this row.
538 */
539 for (int k = 0; k < groupwidth; k++) {
540 uint64_t groupidx = (groupstart + k) % ndisks;
541
542 if (!is_faulted(faulted_devs, nfaulted,
543 row[groupidx])) {
544 ios[row[groupidx]]++;
545 } else if (nspares > 0) {
546 while (is_faulted(faulted_devs,
547 nfaulted, row[spareidx])) {
548 spareidx++;
549 }
550
551 ASSERT3U(spareidx, <, map->dm_children);
552 ios[row[spareidx]]++;
553 spareidx++;
554 }
555 }
556 }
557 }
558
559 *min_child_ios = INT_MAX;
560 *max_child_ios = 0;
561
562 /*
563 * Find the drives with fewest and most required I/O. These values
564 * are used to calculate the imbalance ratio. To avoid returning an
565 * infinite value for permutations which have children that perform
566 * no IO a floor of 1 IO per child is set. This ensures a meaningful
567 * ratio is returned for comparison and it is not an uncommon when
568 * there are a large number of children.
569 */
570 for (int i = 0; i < map->dm_children; i++) {
571
572 if (is_faulted(faulted_devs, nfaulted, i)) {
573 ASSERT0(ios[i]);
574 continue;
575 }
576
577 if (ios[i] == 0)
578 ios[i] = 1;
579
580 if (ios[i] < *min_child_ios)
581 *min_child_ios = ios[i];
582
583 if (ios[i] > *max_child_ios)
584 *max_child_ios = ios[i];
585 }
586
587 ASSERT3S(*min_child_ios, !=, INT_MAX);
588 ASSERT3S(*max_child_ios, !=, 0);
589
590 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
591
592 free(ios);
593
594 return (ratio);
595 }
596
597 /*
598 * Evaluate the quality of the permutation mapping by considering possible
599 * device failures. Returns the imbalance ratio for the worst mapping which
600 * is defined to be the largest number of child IOs over the fewest number
601 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and
602 * all children perform an equal amount of work during reconstruction.
603 */
604 static void
eval_decluster(draid_map_t * map,double * worst_ratiop,double * avg_ratiop)605 eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
606 {
607 uint64_t children = map->dm_children;
608 double worst_ratio = 1.0;
609 double sum = 0;
610 int worst_min_ios = 0, worst_max_ios = 0;
611 int n = 0;
612
613 /*
614 * When there are only 2 children there can be no distributed
615 * spare and no resilver to evaluate. Default to a ratio of 1.0
616 * for this degenerate case.
617 */
618 if (children == VDEV_DRAID_MIN_CHILDREN) {
619 *worst_ratiop = 1.0;
620 *avg_ratiop = 1.0;
621 return;
622 }
623
624 /*
625 * Score the mapping as if it had either 1 or 2 distributed spares.
626 */
627 for (int nspares = 1; nspares <= 2; nspares++) {
628 uint64_t faults = nspares;
629
630 /*
631 * Score groupwidths up to 19. This value was chosen as the
632 * largest reasonable width (16d+3p). dRAID pools may be still
633 * be created with wider stripes but they are not considered in
634 * this analysis in order to optimize for the most common cases.
635 */
636 for (uint64_t groupwidth = 2;
637 groupwidth <= MIN(children - nspares, 19);
638 groupwidth++) {
639 int faulted_devs[2];
640 int min_ios, max_ios;
641
642 /*
643 * Score possible devices faults. This is limited
644 * to exactly one fault per distributed spare for
645 * the purposes of this similation.
646 */
647 for (int f1 = 0; f1 < children; f1++) {
648 faulted_devs[0] = f1;
649 double ratio;
650
651 if (faults == 1) {
652 ratio = eval_resilver(map, groupwidth,
653 nspares, faulted_devs, faults,
654 &min_ios, &max_ios);
655
656 if (ratio > worst_ratio) {
657 worst_ratio = ratio;
658 worst_min_ios = min_ios;
659 worst_max_ios = max_ios;
660 }
661
662 sum += ratio;
663 n++;
664 } else if (faults == 2) {
665 for (int f2 = f1 + 1; f2 < children;
666 f2++) {
667 faulted_devs[1] = f2;
668
669 ratio = eval_resilver(map,
670 groupwidth, nspares,
671 faulted_devs, faults,
672 &min_ios, &max_ios);
673
674 if (ratio > worst_ratio) {
675 worst_ratio = ratio;
676 worst_min_ios = min_ios;
677 worst_max_ios = max_ios;
678 }
679
680 sum += ratio;
681 n++;
682 }
683 }
684 }
685 }
686 }
687
688 *worst_ratiop = worst_ratio;
689 *avg_ratiop = sum / n;
690
691 /*
692 * Log the min/max io values for particularly unbalanced maps.
693 * Since the maps are generated entirely randomly these are possible
694 * be exceedingly unlikely. We log it for possible investigation.
695 */
696 if (worst_ratio > 100.0) {
697 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
698 printf("worst_min_ios=%d worst_max_ios=%d\n",
699 worst_min_ios, worst_max_ios);
700 }
701 }
702
703 static int
eval_maps(uint64_t children,int passes,uint64_t * map_seed,draid_map_t ** best_mapp,double * best_ratiop,double * avg_ratiop)704 eval_maps(uint64_t children, int passes, uint64_t *map_seed,
705 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
706 {
707 draid_map_t *best_map = NULL;
708 double best_worst_ratio = 1000.0;
709 double best_avg_ratio = 1000.0;
710
711 /*
712 * Perform the requested number of passes evaluating randomly
713 * generated permutation maps. Only the best version is kept.
714 */
715 for (int i = 0; i < passes; i++) {
716 double worst_ratio, avg_ratio;
717 draid_map_t *map;
718 int error;
719
720 /*
721 * Calculate the next seed and generate a new candidate map.
722 */
723 error = alloc_new_map(children, MAP_ROWS_DEFAULT,
724 vdev_draid_rand(map_seed), &map);
725 if (error) {
726 if (best_map != NULL)
727 free_map(best_map);
728 return (error);
729 }
730
731 /*
732 * Consider maps with a lower worst_ratio to be of higher
733 * quality. Some maps may have a lower avg_ratio but they
734 * are discarded since they might include some particularly
735 * imbalanced permutations. The average is tracked to in
736 * order to get a sense of the average permutation quality.
737 */
738 eval_decluster(map, &worst_ratio, &avg_ratio);
739
740 if (best_map == NULL || worst_ratio < best_worst_ratio) {
741
742 if (best_map != NULL)
743 free_map(best_map);
744
745 best_map = map;
746 best_worst_ratio = worst_ratio;
747 best_avg_ratio = avg_ratio;
748 } else {
749 free_map(map);
750 }
751 }
752
753 /*
754 * After determining the best map generate a checksum over the full
755 * permutation array. This checksum is verified when opening a dRAID
756 * pool to ensure the generated in memory permutations are correct.
757 */
758 zio_cksum_t cksum;
759 fletcher_4_native_varsize(best_map->dm_perms,
760 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
761 &cksum);
762 best_map->dm_checksum = cksum.zc_word[0];
763
764 *best_mapp = best_map;
765 *best_ratiop = best_worst_ratio;
766 *avg_ratiop = best_avg_ratio;
767
768 return (0);
769 }
770
771 static int
draid_generate(int argc,char * argv[])772 draid_generate(int argc, char *argv[])
773 {
774 char filename[MAXPATHLEN] = {0};
775 uint64_t map_seed[2];
776 int c, fd, error, verbose = 0, passes = 1, continuous = 0;
777 int min_children = VDEV_DRAID_MIN_CHILDREN;
778 int max_children = VDEV_DRAID_MAX_CHILDREN;
779 int restarts = 0;
780
781 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
782 switch (c) {
783 case 'c':
784 continuous++;
785 break;
786 case 'm':
787 min_children = (int)strtol(optarg, NULL, 0);
788 if (min_children < VDEV_DRAID_MIN_CHILDREN) {
789 (void) fprintf(stderr, "A minimum of 2 "
790 "children are required.\n");
791 return (1);
792 }
793
794 break;
795 case 'n':
796 max_children = (int)strtol(optarg, NULL, 0);
797 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
798 (void) fprintf(stderr, "A maximum of %d "
799 "children are allowed.\n",
800 VDEV_DRAID_MAX_CHILDREN);
801 return (1);
802 }
803 break;
804 case 'p':
805 passes = (int)strtol(optarg, NULL, 0);
806 break;
807 case 'v':
808 /*
809 * 0 - Only log when a better map is added to the file.
810 * 1 - Log the current best map for each child count.
811 * Minimal output on a single summary line.
812 * 2 - Log the current best map for each child count.
813 * More verbose includes most map fields.
814 * 3 - Log the current best map for each child count.
815 * Very verbose all fields including the full map.
816 */
817 verbose++;
818 break;
819 case ':':
820 (void) fprintf(stderr,
821 "missing argument for '%c' option\n", optopt);
822 draid_usage();
823 break;
824 case '?':
825 (void) fprintf(stderr, "invalid option '%c'\n",
826 optopt);
827 draid_usage();
828 break;
829 }
830 }
831
832 if (argc > optind)
833 strlcpy(filename, argv[optind], sizeof (filename));
834 else {
835 (void) fprintf(stderr, "A FILE must be specified.\n");
836 return (1);
837 }
838
839 restart:
840 /*
841 * Start with a fresh seed from /dev/urandom.
842 */
843 fd = open("/dev/urandom", O_RDONLY);
844 if (fd < 0) {
845 printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
846 return (1);
847 } else {
848 ssize_t bytes = sizeof (map_seed);
849 ssize_t bytes_read = 0;
850
851 while (bytes_read < bytes) {
852 ssize_t rc = read(fd, ((char *)map_seed) + bytes_read,
853 bytes - bytes_read);
854 if (rc < 0) {
855 printf("Unable to read /dev/urandom: %s\n:",
856 strerror(errno));
857 close(fd);
858 return (1);
859 }
860 bytes_read += rc;
861 }
862
863 (void) close(fd);
864 }
865
866 if (restarts == 0)
867 printf("Writing generated mappings to '%s':\n", filename);
868
869 /*
870 * Generate maps for all requested child counts. The best map for
871 * each child count is written out to the specified file. If the file
872 * already contains a better mapping this map will not be added.
873 */
874 for (uint64_t children = min_children;
875 children <= max_children; children++) {
876 char key[8] = { 0 };
877 draid_map_t *map;
878 double worst_ratio = 1000.0;
879 double avg_ratio = 1000.0;
880
881 error = eval_maps(children, passes, map_seed, &map,
882 &worst_ratio, &avg_ratio);
883 if (error) {
884 printf("Error eval_maps(): %s\n", strerror(error));
885 return (1);
886 }
887
888 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
889 printf("Error ratio < 1.0: worst_ratio = %2.03f "
890 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
891 return (1);
892 }
893
894 snprintf(key, 7, "%llu", (u_longlong_t)children);
895 error = write_map_key(filename, key, map, worst_ratio,
896 avg_ratio);
897 if (error == 0) {
898 /* The new map was added to the file. */
899 dump_map(map, key, worst_ratio, avg_ratio,
900 MAX(verbose, 1));
901 } else if (error == EEXIST) {
902 /* The existing map was preferable and kept. */
903 if (verbose > 0)
904 dump_map_key(filename, key, verbose);
905 } else {
906 printf("Error write_map_key(): %s\n", strerror(error));
907 return (1);
908 }
909
910 free_map(map);
911 }
912
913 /*
914 * When the continuous option is set restart at the minimum number of
915 * children instead of exiting. This option is useful as a mechanism
916 * to continuous try and refine the discovered permutations.
917 */
918 if (continuous) {
919 restarts++;
920 printf("Restarting by request (-c): %d\n", restarts);
921 goto restart;
922 }
923
924 return (0);
925 }
926
927 /*
928 * Verify each map in the file by generating its in-memory permutation array
929 * and comfirming its checksum is correct.
930 */
931 static int
draid_verify(int argc,char * argv[])932 draid_verify(int argc, char *argv[])
933 {
934 char filename[MAXPATHLEN] = {0};
935 int n = 0, c, error, verbose = 1;
936 int check_ratios = 0;
937
938 while ((c = getopt(argc, argv, ":rv")) != -1) {
939 switch (c) {
940 case 'r':
941 check_ratios++;
942 break;
943 case 'v':
944 verbose++;
945 break;
946 case ':':
947 (void) fprintf(stderr,
948 "missing argument for '%c' option\n", optopt);
949 draid_usage();
950 break;
951 case '?':
952 (void) fprintf(stderr, "invalid option '%c'\n",
953 optopt);
954 draid_usage();
955 break;
956 }
957 }
958
959 if (argc > optind) {
960 char *abspath = malloc(MAXPATHLEN);
961 if (abspath == NULL)
962 return (ENOMEM);
963
964 if (realpath(argv[optind], abspath) != NULL)
965 strlcpy(filename, abspath, sizeof (filename));
966 else
967 strlcpy(filename, argv[optind], sizeof (filename));
968
969 free(abspath);
970 } else {
971 (void) fprintf(stderr, "A FILE must be specified.\n");
972 return (1);
973 }
974
975 printf("Verifying permutation maps: '%s'\n", filename);
976
977 /*
978 * Lookup hardcoded permutation map for each valid number of children
979 * and verify a generated map has the correct checksum. Then compare
980 * the generated map values with the nvlist map values read from the
981 * reference file to cross-check the permutation.
982 */
983 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
984 children <= VDEV_DRAID_MAX_CHILDREN;
985 children++) {
986 draid_map_t *map;
987 char key[8] = {0};
988
989 snprintf(key, 8, "%llu", (u_longlong_t)children);
990
991 error = alloc_fixed_map(children, &map);
992 if (error) {
993 printf("Error alloc_fixed_map() failed: %s\n",
994 error == ECKSUM ? "Invalid checksum" :
995 strerror(error));
996 return (1);
997 }
998
999 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
1000 uint8_t *nv_perms;
1001 nvlist_t *cfg;
1002 uint_t c;
1003
1004 error = read_map_key(filename, key, &cfg);
1005 if (error != 0) {
1006 printf("Error read_map_key() failed: %s\n",
1007 strerror(error));
1008 free_map(map);
1009 return (1);
1010 }
1011
1012 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1013 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1014 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1015 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1016 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
1017
1018 /*
1019 * Compare draid_map_t and nvlist reference values.
1020 */
1021 if (map->dm_seed != nv_seed) {
1022 printf("Error different seeds: 0x%016llx != "
1023 "0x%016llx\n", (u_longlong_t)map->dm_seed,
1024 (u_longlong_t)nv_seed);
1025 error = EINVAL;
1026 }
1027
1028 if (map->dm_checksum != nv_checksum) {
1029 printf("Error different checksums: 0x%016llx "
1030 "!= 0x%016llx\n",
1031 (u_longlong_t)map->dm_checksum,
1032 (u_longlong_t)nv_checksum);
1033 error = EINVAL;
1034 }
1035
1036 if (map->dm_children != nv_children) {
1037 printf("Error different children: %llu "
1038 "!= %llu\n", (u_longlong_t)map->dm_children,
1039 (u_longlong_t)nv_children);
1040 error = EINVAL;
1041 }
1042
1043 if (map->dm_nperms != nv_nperms) {
1044 printf("Error different nperms: %llu "
1045 "!= %llu\n", (u_longlong_t)map->dm_nperms,
1046 (u_longlong_t)nv_nperms);
1047 error = EINVAL;
1048 }
1049
1050 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
1051 if (map->dm_perms[i] != nv_perms[i]) {
1052 printf("Error different perms[%llu]: "
1053 "%d != %d\n", (u_longlong_t)i,
1054 (int)map->dm_perms[i],
1055 (int)nv_perms[i]);
1056 error = EINVAL;
1057 break;
1058 }
1059 }
1060
1061 /*
1062 * For good measure recalculate the worst and average
1063 * ratios and confirm they match the nvlist values.
1064 */
1065 if (check_ratios) {
1066 uint64_t nv_worst_ratio, nv_avg_ratio;
1067 double worst_ratio, avg_ratio;
1068
1069 eval_decluster(map, &worst_ratio, &avg_ratio);
1070
1071 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1072 MAP_WORST_RATIO);
1073 nv_avg_ratio = fnvlist_lookup_uint64(cfg,
1074 MAP_AVG_RATIO);
1075
1076 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
1077 printf("Error ratio out of range %2.03f, "
1078 "%2.03f\n", worst_ratio, avg_ratio);
1079 error = EINVAL;
1080 }
1081
1082 if ((uint64_t)(worst_ratio * 1000.0) !=
1083 nv_worst_ratio) {
1084 printf("Error different worst_ratio %2.03f "
1085 "!= %2.03f\n", (double)nv_worst_ratio /
1086 1000.0, worst_ratio);
1087 error = EINVAL;
1088 }
1089
1090 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
1091 printf("Error different average_ratio %2.03f "
1092 "!= %2.03f\n", (double)nv_avg_ratio /
1093 1000.0, avg_ratio);
1094 error = EINVAL;
1095 }
1096 }
1097
1098 if (error) {
1099 free_map(map);
1100 nvlist_free(cfg);
1101 return (1);
1102 }
1103
1104 if (verbose > 0) {
1105 printf("- %llu children: good\n",
1106 (u_longlong_t)children);
1107 }
1108 n++;
1109
1110 free_map(map);
1111 nvlist_free(cfg);
1112 }
1113
1114 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
1115 printf("Error permutation maps missing: %d / %d checked\n",
1116 n, VDEV_DRAID_MAX_CHILDREN - 1);
1117 return (1);
1118 }
1119
1120 printf("Successfully verified %d / %d permutation maps\n",
1121 n, VDEV_DRAID_MAX_CHILDREN - 1);
1122
1123 return (0);
1124 }
1125
1126 /*
1127 * Dump the contents of the specified mapping(s) for inspection.
1128 */
1129 static int
draid_dump(int argc,char * argv[])1130 draid_dump(int argc, char *argv[])
1131 {
1132 char filename[MAXPATHLEN] = {0};
1133 int c, error, verbose = 1;
1134 int min_children = VDEV_DRAID_MIN_CHILDREN;
1135 int max_children = VDEV_DRAID_MAX_CHILDREN;
1136
1137 while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
1138 switch (c) {
1139 case 'm':
1140 min_children = (int)strtol(optarg, NULL, 0);
1141 if (min_children < 2) {
1142 (void) fprintf(stderr, "A minimum of 2 "
1143 "children are required.\n");
1144 return (1);
1145 }
1146
1147 break;
1148 case 'n':
1149 max_children = (int)strtol(optarg, NULL, 0);
1150 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
1151 (void) fprintf(stderr, "A maximum of %d "
1152 "children are allowed.\n",
1153 VDEV_DRAID_MAX_CHILDREN);
1154 return (1);
1155 }
1156 break;
1157 case 'v':
1158 verbose++;
1159 break;
1160 case ':':
1161 (void) fprintf(stderr,
1162 "missing argument for '%c' option\n", optopt);
1163 draid_usage();
1164 break;
1165 case '?':
1166 (void) fprintf(stderr, "invalid option '%c'\n",
1167 optopt);
1168 draid_usage();
1169 break;
1170 }
1171 }
1172
1173 if (argc > optind)
1174 strlcpy(filename, argv[optind], sizeof (filename));
1175 else {
1176 (void) fprintf(stderr, "A FILE must be specified.\n");
1177 return (1);
1178 }
1179
1180 /*
1181 * Dump maps for the requested child counts.
1182 */
1183 for (uint64_t children = min_children;
1184 children <= max_children; children++) {
1185 char key[8] = { 0 };
1186
1187 snprintf(key, 7, "%llu", (u_longlong_t)children);
1188 error = dump_map_key(filename, key, verbose);
1189 if (error) {
1190 printf("Error dump_map_key(): %s\n", strerror(error));
1191 return (1);
1192 }
1193 }
1194
1195 return (0);
1196 }
1197
1198 /*
1199 * Print all of the mappings as a C formatted draid_map_t array. This table
1200 * is found in the module/zcommon/zfs_draid.c file and is the definitive
1201 * source for all mapping used by dRAID. It cannot be updated without
1202 * changing the dRAID on disk format.
1203 */
1204 static int
draid_table(int argc,char * argv[])1205 draid_table(int argc, char *argv[])
1206 {
1207 char filename[MAXPATHLEN] = {0};
1208 int error;
1209
1210 if (argc > optind)
1211 strlcpy(filename, argv[optind], sizeof (filename));
1212 else {
1213 (void) fprintf(stderr, "A FILE must be specified.\n");
1214 return (1);
1215 }
1216
1217 printf("static const draid_map_t "
1218 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1219
1220 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
1221 children <= VDEV_DRAID_MAX_CHILDREN;
1222 children++) {
1223 uint64_t seed, checksum, nperms, avg_ratio;
1224 nvlist_t *cfg;
1225 char key[8] = {0};
1226
1227 snprintf(key, 8, "%llu", (u_longlong_t)children);
1228
1229 error = read_map_key(filename, key, &cfg);
1230 if (error != 0) {
1231 printf("Error read_map_key() failed: %s\n",
1232 strerror(error));
1233 return (1);
1234 }
1235
1236 seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1237 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1238 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1239 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1240 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
1241
1242 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1243 "/* %2.03f */\n", (u_longlong_t)children,
1244 (u_longlong_t)nperms, (u_longlong_t)seed,
1245 (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
1246
1247 nvlist_free(cfg);
1248 }
1249
1250 printf("};\n");
1251
1252 return (0);
1253 }
1254
1255 static int
draid_merge_impl(nvlist_t * allcfgs,const char * srcfilename,int * mergedp)1256 draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
1257 {
1258 nvlist_t *srccfgs;
1259 nvpair_t *elem = NULL;
1260 int error, merged = 0;
1261
1262 error = read_map(srcfilename, &srccfgs);
1263 if (error != 0)
1264 return (error);
1265
1266 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
1267 uint64_t nv_worst_ratio;
1268 uint64_t allcfg_worst_ratio;
1269 nvlist_t *cfg, *allcfg;
1270 const char *key;
1271
1272 switch (nvpair_type(elem)) {
1273 case DATA_TYPE_NVLIST:
1274
1275 (void) nvpair_value_nvlist(elem, &cfg);
1276 key = nvpair_name(elem);
1277
1278 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1279 MAP_WORST_RATIO);
1280
1281 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
1282 if (error == 0) {
1283 allcfg_worst_ratio = fnvlist_lookup_uint64(
1284 allcfg, MAP_WORST_RATIO);
1285
1286 if (nv_worst_ratio < allcfg_worst_ratio) {
1287 fnvlist_remove(allcfgs, key);
1288 fnvlist_add_nvlist(allcfgs, key, cfg);
1289 merged++;
1290 }
1291 } else if (error == ENOENT) {
1292 fnvlist_add_nvlist(allcfgs, key, cfg);
1293 merged++;
1294 } else {
1295 return (error);
1296 }
1297
1298 break;
1299 default:
1300 continue;
1301 }
1302 }
1303
1304 nvlist_free(srccfgs);
1305
1306 *mergedp = merged;
1307
1308 return (0);
1309 }
1310
1311 /*
1312 * Merge the best map for each child count found in the listed files into
1313 * a new file. This allows 'draid generate' to be run in parallel and for
1314 * the results maps to be combined.
1315 */
1316 static int
draid_merge(int argc,char * argv[])1317 draid_merge(int argc, char *argv[])
1318 {
1319 char filename[MAXPATHLEN] = {0};
1320 int c, error, total_merged = 0;
1321 nvlist_t *allcfgs;
1322
1323 while ((c = getopt(argc, argv, ":")) != -1) {
1324 switch (c) {
1325 case ':':
1326 (void) fprintf(stderr,
1327 "missing argument for '%c' option\n", optopt);
1328 draid_usage();
1329 break;
1330 case '?':
1331 (void) fprintf(stderr, "invalid option '%c'\n",
1332 optopt);
1333 draid_usage();
1334 break;
1335 }
1336 }
1337
1338 if (argc < 4) {
1339 (void) fprintf(stderr,
1340 "A FILE and multiple SRCs must be specified.\n");
1341 return (1);
1342 }
1343
1344 strlcpy(filename, argv[optind], sizeof (filename));
1345 optind++;
1346
1347 error = read_map(filename, &allcfgs);
1348 if (error == ENOENT) {
1349 allcfgs = fnvlist_alloc();
1350 } else if (error != 0) {
1351 printf("Error read_map(): %s\n", strerror(error));
1352 return (error);
1353 }
1354
1355 while (optind < argc) {
1356 char srcfilename[MAXPATHLEN] = {0};
1357 int merged = 0;
1358
1359 strlcpy(srcfilename, argv[optind], sizeof (srcfilename));
1360
1361 error = draid_merge_impl(allcfgs, srcfilename, &merged);
1362 if (error) {
1363 printf("Error draid_merge_impl(): %s\n",
1364 strerror(error));
1365 nvlist_free(allcfgs);
1366 return (1);
1367 }
1368
1369 total_merged += merged;
1370 printf("Merged %d key(s) from '%s' into '%s'\n", merged,
1371 srcfilename, filename);
1372
1373 optind++;
1374 }
1375
1376 if (total_merged > 0)
1377 write_map(filename, allcfgs);
1378
1379 printf("Merged a total of %d key(s) into '%s'\n", total_merged,
1380 filename);
1381
1382 nvlist_free(allcfgs);
1383
1384 return (0);
1385 }
1386
1387 int
main(int argc,char * argv[])1388 main(int argc, char *argv[])
1389 {
1390 if (argc < 2)
1391 draid_usage();
1392
1393 char *subcommand = argv[1];
1394
1395 if (strcmp(subcommand, "generate") == 0) {
1396 return (draid_generate(argc - 1, argv + 1));
1397 } else if (strcmp(subcommand, "verify") == 0) {
1398 return (draid_verify(argc - 1, argv + 1));
1399 } else if (strcmp(subcommand, "dump") == 0) {
1400 return (draid_dump(argc - 1, argv + 1));
1401 } else if (strcmp(subcommand, "table") == 0) {
1402 return (draid_table(argc - 1, argv + 1));
1403 } else if (strcmp(subcommand, "merge") == 0) {
1404 return (draid_merge(argc - 1, argv + 1));
1405 } else {
1406 draid_usage();
1407 }
1408 }
1409