1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2018 Intel Corporation.
24 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
25 */
26
27 #include <stdio.h>
28 #include <zlib.h>
29 #include <zfs_fletcher.h>
30 #include <sys/vdev_draid.h>
31 #include <sys/nvpair.h>
32 #include <sys/stat.h>
33
34 /*
35 * The number of rows to generate for new permutation maps.
36 */
37 #define MAP_ROWS_DEFAULT 256
38
39 /*
40 * Key values for dRAID maps when stored as nvlists.
41 */
42 #define MAP_SEED "seed"
43 #define MAP_CHECKSUM "checksum"
44 #define MAP_WORST_RATIO "worst_ratio"
45 #define MAP_AVG_RATIO "avg_ratio"
46 #define MAP_CHILDREN "children"
47 #define MAP_NPERMS "nperms"
48 #define MAP_PERMS "perms"
49
50 static void
draid_usage(void)51 draid_usage(void)
52 {
53 (void) fprintf(stderr,
54 "usage: draid command args ...\n"
55 "Available commands are:\n"
56 "\n"
57 "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
58 "\tdraid verify [-rv] FILE\n"
59 "\tdraid dump [-v] [-m min] [-n max] FILE\n"
60 "\tdraid table FILE\n"
61 "\tdraid merge FILE SRC SRC...\n");
62 exit(1);
63 }
64
65 static int
read_map(const char * filename,nvlist_t ** allcfgs)66 read_map(const char *filename, nvlist_t **allcfgs)
67 {
68 int block_size = 131072;
69 int buf_size = 131072;
70 int tmp_size, error;
71 char *tmp_buf;
72
73 struct stat64 stat;
74 if (lstat64(filename, &stat) != 0)
75 return (errno);
76
77 if (stat.st_size == 0 ||
78 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
79 return (EINVAL);
80 }
81
82 gzFile fp = gzopen(filename, "rb");
83 if (fp == Z_NULL)
84 return (errno);
85
86 char *buf = malloc(buf_size);
87 if (buf == NULL) {
88 (void) gzclose(fp);
89 return (ENOMEM);
90 }
91
92 ssize_t rc, bytes = 0;
93 while (!gzeof(fp)) {
94 rc = gzread(fp, buf + bytes, block_size);
95 if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
96 free(buf);
97 (void) gzerror(fp, &error);
98 (void) gzclose(fp);
99 return (error);
100 } else {
101 bytes += rc;
102
103 if (bytes + block_size >= buf_size) {
104 tmp_size = 2 * buf_size;
105 tmp_buf = malloc(tmp_size);
106 if (tmp_buf == NULL) {
107 free(buf);
108 (void) gzclose(fp);
109 return (ENOMEM);
110 }
111
112 memcpy(tmp_buf, buf, bytes);
113 free(buf);
114 buf = tmp_buf;
115 buf_size = tmp_size;
116 }
117 }
118 }
119
120 (void) gzclose(fp);
121
122 error = nvlist_unpack(buf, bytes, allcfgs, 0);
123 free(buf);
124
125 return (error);
126 }
127
128 /*
129 * Read a map from the specified filename. A file contains multiple maps
130 * which are indexed by the number of children. The caller is responsible
131 * for freeing the configuration returned.
132 */
133 static int
read_map_key(const char * filename,const char * key,nvlist_t ** cfg)134 read_map_key(const char *filename, const char *key, nvlist_t **cfg)
135 {
136 nvlist_t *allcfgs, *foundcfg = NULL;
137 int error;
138
139 error = read_map(filename, &allcfgs);
140 if (error != 0)
141 return (error);
142
143 (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
144 if (foundcfg != NULL) {
145 nvlist_dup(foundcfg, cfg, KM_SLEEP);
146 error = 0;
147 } else {
148 error = ENOENT;
149 }
150
151 nvlist_free(allcfgs);
152
153 return (error);
154 }
155
156 /*
157 * Write all mappings to the map file.
158 */
159 static int
write_map(const char * filename,nvlist_t * allcfgs)160 write_map(const char *filename, nvlist_t *allcfgs)
161 {
162 size_t buflen = 0;
163 int error;
164
165 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
166 if (error)
167 return (error);
168
169 char *buf = malloc(buflen);
170 if (buf == NULL)
171 return (ENOMEM);
172
173 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
174 if (error) {
175 free(buf);
176 return (error);
177 }
178
179 /*
180 * Atomically update the file using a temporary file and the
181 * traditional unlink then rename steps. This code provides
182 * no locking, it only guarantees the packed nvlist on disk
183 * is updated atomically and is internally consistent.
184 */
185 char *tmpname = calloc(1, MAXPATHLEN);
186 if (tmpname == NULL) {
187 free(buf);
188 return (ENOMEM);
189 }
190
191 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
192
193 int fd = mkstemp(tmpname);
194 if (fd < 0) {
195 error = errno;
196 free(buf);
197 free(tmpname);
198 return (error);
199 }
200 (void) close(fd);
201
202 gzFile fp = gzopen(tmpname, "w9b");
203 if (fp == Z_NULL) {
204 error = errno;
205 free(buf);
206 free(tmpname);
207 return (errno);
208 }
209
210 ssize_t rc, bytes = 0;
211 while (bytes < buflen) {
212 size_t size = MIN(buflen - bytes, 131072);
213 rc = gzwrite(fp, buf + bytes, size);
214 if (rc < 0) {
215 free(buf);
216 (void) gzerror(fp, &error);
217 (void) gzclose(fp);
218 (void) unlink(tmpname);
219 free(tmpname);
220 return (error);
221 } else if (rc == 0) {
222 break;
223 } else {
224 bytes += rc;
225 }
226 }
227
228 free(buf);
229 (void) gzclose(fp);
230
231 if (bytes != buflen) {
232 (void) unlink(tmpname);
233 free(tmpname);
234 return (EIO);
235 }
236
237 /*
238 * Unlink the previous config file and replace it with the updated
239 * version. If we're able to unlink the file then directory is
240 * writable by us and the subsequent rename should never fail.
241 */
242 error = unlink(filename);
243 if (error != 0 && errno != ENOENT) {
244 error = errno;
245 (void) unlink(tmpname);
246 free(tmpname);
247 return (error);
248 }
249
250 error = rename(tmpname, filename);
251 if (error != 0) {
252 error = errno;
253 (void) unlink(tmpname);
254 free(tmpname);
255 return (error);
256 }
257
258 free(tmpname);
259
260 return (0);
261 }
262
263 /*
264 * Add the dRAID map to the file and write it out.
265 */
266 static int
write_map_key(const char * filename,char * key,draid_map_t * map,double worst_ratio,double avg_ratio)267 write_map_key(const char *filename, char *key, draid_map_t *map,
268 double worst_ratio, double avg_ratio)
269 {
270 nvlist_t *nv_cfg, *allcfgs;
271 int error;
272
273 /*
274 * Add the configuration to an existing or new file. The new
275 * configuration will replace an existing configuration with the
276 * same key if it has a lower ratio and is therefore better.
277 */
278 error = read_map(filename, &allcfgs);
279 if (error == ENOENT) {
280 allcfgs = fnvlist_alloc();
281 } else if (error != 0) {
282 return (error);
283 }
284
285 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
286 if (error == 0) {
287 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
288 MAP_WORST_RATIO);
289 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
290
291 if (worst_ratio < nv_worst_ratio) {
292 /* Replace old map with the more balanced new map. */
293 fnvlist_remove(allcfgs, key);
294 } else {
295 /* The old map is preferable, keep it. */
296 nvlist_free(allcfgs);
297 return (EEXIST);
298 }
299 }
300
301 nvlist_t *cfg = fnvlist_alloc();
302 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
303 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
304 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
305 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
306 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
307 map->dm_children * map->dm_nperms * sizeof (uint8_t));
308
309 fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
310 (uint64_t)(worst_ratio * 1000.0));
311 fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
312 (uint64_t)(avg_ratio * 1000.0));
313
314 error = nvlist_add_nvlist(allcfgs, key, cfg);
315 if (error == 0)
316 error = write_map(filename, allcfgs);
317
318 nvlist_free(cfg);
319 nvlist_free(allcfgs);
320 return (error);
321 }
322
323 static void
dump_map(draid_map_t * map,const char * key,double worst_ratio,double avg_ratio,int verbose)324 dump_map(draid_map_t *map, const char *key, double worst_ratio,
325 double avg_ratio, int verbose)
326 {
327 if (verbose == 0) {
328 return;
329 } else if (verbose == 1) {
330 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
331 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
332 worst_ratio, avg_ratio);
333 return;
334 } else {
335 printf(" \"%s\":\n"
336 " seed: 0x%016llx\n"
337 " checksum: 0x%016llx\n"
338 " worst_ratio: %2.03f\n"
339 " avg_ratio: %2.03f\n"
340 " children: %llu\n"
341 " nperms: %llu\n",
342 key, (u_longlong_t)map->dm_seed,
343 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
344 (u_longlong_t)map->dm_children,
345 (u_longlong_t)map->dm_nperms);
346
347 if (verbose > 2) {
348 printf(" perms = {\n");
349 for (int i = 0; i < map->dm_nperms; i++) {
350 printf(" { ");
351 for (int j = 0; j < map->dm_children; j++) {
352 printf("%3d%s ", map->dm_perms[
353 i * map->dm_children + j],
354 j < map->dm_children - 1 ?
355 "," : "");
356 }
357 printf(" },\n");
358 }
359 printf(" }\n");
360 } else if (verbose == 2) {
361 printf(" draid_perms = <omitted>\n");
362 }
363 }
364 }
365
366 static void
dump_map_nv(const char * key,nvlist_t * cfg,int verbose)367 dump_map_nv(const char *key, nvlist_t *cfg, int verbose)
368 {
369 draid_map_t map;
370 uint_t c;
371
372 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
373 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
374
375 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
376 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
377 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
378 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
379 map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c);
380
381 dump_map(&map, key, (double)worst_ratio / 1000.0,
382 avg_ratio / 1000.0, verbose);
383 }
384
385 /*
386 * Print a summary of the mapping.
387 */
388 static int
dump_map_key(const char * filename,const char * key,int verbose)389 dump_map_key(const char *filename, const char *key, int verbose)
390 {
391 nvlist_t *cfg;
392 int error;
393
394 error = read_map_key(filename, key, &cfg);
395 if (error != 0)
396 return (error);
397
398 dump_map_nv(key, cfg, verbose);
399
400 return (0);
401 }
402
403 /*
404 * Allocate a new permutation map for evaluation.
405 */
406 static int
alloc_new_map(uint64_t children,uint64_t nperms,uint64_t seed,draid_map_t ** mapp)407 alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
408 draid_map_t **mapp)
409 {
410 draid_map_t *map;
411 int error;
412
413 map = malloc(sizeof (draid_map_t));
414 if (map == NULL)
415 return (ENOMEM);
416
417 map->dm_children = children;
418 map->dm_nperms = nperms;
419 map->dm_seed = seed;
420 map->dm_checksum = 0;
421
422 error = vdev_draid_generate_perms(map, &map->dm_perms);
423 if (error) {
424 free(map);
425 return (error);
426 }
427
428 *mapp = map;
429
430 return (0);
431 }
432
433 /*
434 * Allocate the fixed permutation map for N children.
435 */
436 static int
alloc_fixed_map(uint64_t children,draid_map_t ** mapp)437 alloc_fixed_map(uint64_t children, draid_map_t **mapp)
438 {
439 const draid_map_t *fixed_map;
440 draid_map_t *map;
441 int error;
442
443 error = vdev_draid_lookup_map(children, &fixed_map);
444 if (error)
445 return (error);
446
447 map = malloc(sizeof (draid_map_t));
448 if (map == NULL)
449 return (ENOMEM);
450
451 memcpy(map, fixed_map, sizeof (draid_map_t));
452 VERIFY3U(map->dm_checksum, !=, 0);
453
454 error = vdev_draid_generate_perms(map, &map->dm_perms);
455 if (error) {
456 free(map);
457 return (error);
458 }
459
460 *mapp = map;
461
462 return (0);
463 }
464
465 /*
466 * Free a permutation map.
467 */
468 static void
free_map(draid_map_t * map)469 free_map(draid_map_t *map)
470 {
471 free(map->dm_perms);
472 free(map);
473 }
474
475 /*
476 * Check if dev is in the provided list of faulted devices.
477 */
478 static inline boolean_t
is_faulted(int * faulted_devs,int nfaulted,int dev)479 is_faulted(int *faulted_devs, int nfaulted, int dev)
480 {
481 for (int i = 0; i < nfaulted; i++)
482 if (faulted_devs[i] == dev)
483 return (B_TRUE);
484
485 return (B_FALSE);
486 }
487
488 /*
489 * Evaluate how resilvering I/O will be distributed given a list of faulted
490 * vdevs. As a simplification we assume one IO is sufficient to repair each
491 * damaged device in a group.
492 */
493 static double
eval_resilver(draid_map_t * map,uint64_t groupwidth,uint64_t nspares,int * faulted_devs,int nfaulted,int * min_child_ios,int * max_child_ios)494 eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
495 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
496 {
497 uint64_t children = map->dm_children;
498 uint64_t ngroups = 1;
499 uint64_t ndisks = children - nspares;
500
501 /*
502 * Calculate the minimum number of groups required to fill a slice.
503 */
504 while (ngroups * (groupwidth) % (children - nspares) != 0)
505 ngroups++;
506
507 int *ios = calloc(map->dm_children, sizeof (uint64_t));
508
509 ASSERT3P(ios, !=, NULL);
510
511 /* Resilver all rows */
512 for (int i = 0; i < map->dm_nperms; i++) {
513 uint8_t *row = &map->dm_perms[i * map->dm_children];
514
515 /* Resilver all groups with faulted drives */
516 for (int j = 0; j < ngroups; j++) {
517 uint64_t spareidx = map->dm_children - nspares;
518 boolean_t repair_needed = B_FALSE;
519
520 /* See if any devices in this group are faulted */
521 uint64_t groupstart = (j * groupwidth) % ndisks;
522
523 for (int k = 0; k < groupwidth; k++) {
524 uint64_t groupidx = (groupstart + k) % ndisks;
525
526 repair_needed = is_faulted(faulted_devs,
527 nfaulted, row[groupidx]);
528 if (repair_needed)
529 break;
530 }
531
532 if (repair_needed == B_FALSE)
533 continue;
534
535 /*
536 * This group is degraded. Calculate the number of
537 * reads the non-faulted drives require and the number
538 * of writes to the distributed hot spare for this row.
539 */
540 for (int k = 0; k < groupwidth; k++) {
541 uint64_t groupidx = (groupstart + k) % ndisks;
542
543 if (!is_faulted(faulted_devs, nfaulted,
544 row[groupidx])) {
545 ios[row[groupidx]]++;
546 } else if (nspares > 0) {
547 while (is_faulted(faulted_devs,
548 nfaulted, row[spareidx])) {
549 spareidx++;
550 }
551
552 ASSERT3U(spareidx, <, map->dm_children);
553 ios[row[spareidx]]++;
554 spareidx++;
555 }
556 }
557 }
558 }
559
560 *min_child_ios = INT_MAX;
561 *max_child_ios = 0;
562
563 /*
564 * Find the drives with fewest and most required I/O. These values
565 * are used to calculate the imbalance ratio. To avoid returning an
566 * infinite value for permutations which have children that perform
567 * no IO a floor of 1 IO per child is set. This ensures a meaningful
568 * ratio is returned for comparison and it is not an uncommon when
569 * there are a large number of children.
570 */
571 for (int i = 0; i < map->dm_children; i++) {
572
573 if (is_faulted(faulted_devs, nfaulted, i)) {
574 ASSERT0(ios[i]);
575 continue;
576 }
577
578 if (ios[i] == 0)
579 ios[i] = 1;
580
581 if (ios[i] < *min_child_ios)
582 *min_child_ios = ios[i];
583
584 if (ios[i] > *max_child_ios)
585 *max_child_ios = ios[i];
586 }
587
588 ASSERT3S(*min_child_ios, !=, INT_MAX);
589 ASSERT3S(*max_child_ios, !=, 0);
590
591 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
592
593 free(ios);
594
595 return (ratio);
596 }
597
598 /*
599 * Evaluate the quality of the permutation mapping by considering possible
600 * device failures. Returns the imbalance ratio for the worst mapping which
601 * is defined to be the largest number of child IOs over the fewest number
602 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and
603 * all children perform an equal amount of work during reconstruction.
604 */
605 static void
eval_decluster(draid_map_t * map,double * worst_ratiop,double * avg_ratiop)606 eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
607 {
608 uint64_t children = map->dm_children;
609 double worst_ratio = 1.0;
610 double sum = 0;
611 int worst_min_ios = 0, worst_max_ios = 0;
612 int n = 0;
613
614 /*
615 * When there are only 2 children there can be no distributed
616 * spare and no resilver to evaluate. Default to a ratio of 1.0
617 * for this degenerate case.
618 */
619 if (children == VDEV_DRAID_MIN_CHILDREN) {
620 *worst_ratiop = 1.0;
621 *avg_ratiop = 1.0;
622 return;
623 }
624
625 /*
626 * Score the mapping as if it had either 1 or 2 distributed spares.
627 */
628 for (int nspares = 1; nspares <= 2; nspares++) {
629 uint64_t faults = nspares;
630
631 /*
632 * Score groupwidths up to 19. This value was chosen as the
633 * largest reasonable width (16d+3p). dRAID pools may be still
634 * be created with wider stripes but they are not considered in
635 * this analysis in order to optimize for the most common cases.
636 */
637 for (uint64_t groupwidth = 2;
638 groupwidth <= MIN(children - nspares, 19);
639 groupwidth++) {
640 int faulted_devs[2];
641 int min_ios, max_ios;
642
643 /*
644 * Score possible devices faults. This is limited
645 * to exactly one fault per distributed spare for
646 * the purposes of this similation.
647 */
648 for (int f1 = 0; f1 < children; f1++) {
649 faulted_devs[0] = f1;
650 double ratio;
651
652 if (faults == 1) {
653 ratio = eval_resilver(map, groupwidth,
654 nspares, faulted_devs, faults,
655 &min_ios, &max_ios);
656
657 if (ratio > worst_ratio) {
658 worst_ratio = ratio;
659 worst_min_ios = min_ios;
660 worst_max_ios = max_ios;
661 }
662
663 sum += ratio;
664 n++;
665 } else if (faults == 2) {
666 for (int f2 = f1 + 1; f2 < children;
667 f2++) {
668 faulted_devs[1] = f2;
669
670 ratio = eval_resilver(map,
671 groupwidth, nspares,
672 faulted_devs, faults,
673 &min_ios, &max_ios);
674
675 if (ratio > worst_ratio) {
676 worst_ratio = ratio;
677 worst_min_ios = min_ios;
678 worst_max_ios = max_ios;
679 }
680
681 sum += ratio;
682 n++;
683 }
684 }
685 }
686 }
687 }
688
689 *worst_ratiop = worst_ratio;
690 *avg_ratiop = sum / n;
691
692 /*
693 * Log the min/max io values for particularly unbalanced maps.
694 * Since the maps are generated entirely randomly these are possible
695 * be exceedingly unlikely. We log it for possible investigation.
696 */
697 if (worst_ratio > 100.0) {
698 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
699 printf("worst_min_ios=%d worst_max_ios=%d\n",
700 worst_min_ios, worst_max_ios);
701 }
702 }
703
704 static int
eval_maps(uint64_t children,int passes,uint64_t * map_seed,draid_map_t ** best_mapp,double * best_ratiop,double * avg_ratiop)705 eval_maps(uint64_t children, int passes, uint64_t *map_seed,
706 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
707 {
708 draid_map_t *best_map = NULL;
709 double best_worst_ratio = 1000.0;
710 double best_avg_ratio = 1000.0;
711
712 /*
713 * Perform the requested number of passes evaluating randomly
714 * generated permutation maps. Only the best version is kept.
715 */
716 for (int i = 0; i < passes; i++) {
717 double worst_ratio, avg_ratio;
718 draid_map_t *map;
719 int error;
720
721 /*
722 * Calculate the next seed and generate a new candidate map.
723 */
724 error = alloc_new_map(children, MAP_ROWS_DEFAULT,
725 vdev_draid_rand(map_seed), &map);
726 if (error) {
727 if (best_map != NULL)
728 free_map(best_map);
729 return (error);
730 }
731
732 /*
733 * Consider maps with a lower worst_ratio to be of higher
734 * quality. Some maps may have a lower avg_ratio but they
735 * are discarded since they might include some particularly
736 * imbalanced permutations. The average is tracked to in
737 * order to get a sense of the average permutation quality.
738 */
739 eval_decluster(map, &worst_ratio, &avg_ratio);
740
741 if (best_map == NULL || worst_ratio < best_worst_ratio) {
742
743 if (best_map != NULL)
744 free_map(best_map);
745
746 best_map = map;
747 best_worst_ratio = worst_ratio;
748 best_avg_ratio = avg_ratio;
749 } else {
750 free_map(map);
751 }
752 }
753
754 /*
755 * After determining the best map generate a checksum over the full
756 * permutation array. This checksum is verified when opening a dRAID
757 * pool to ensure the generated in memory permutations are correct.
758 */
759 zio_cksum_t cksum;
760 fletcher_4_native_varsize(best_map->dm_perms,
761 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
762 &cksum);
763 best_map->dm_checksum = cksum.zc_word[0];
764
765 *best_mapp = best_map;
766 *best_ratiop = best_worst_ratio;
767 *avg_ratiop = best_avg_ratio;
768
769 return (0);
770 }
771
772 static int
draid_generate(int argc,char * argv[])773 draid_generate(int argc, char *argv[])
774 {
775 char filename[MAXPATHLEN] = {0};
776 uint64_t map_seed[2];
777 int c, fd, error, verbose = 0, passes = 1, continuous = 0;
778 int min_children = VDEV_DRAID_MIN_CHILDREN;
779 int max_children = VDEV_DRAID_MAX_CHILDREN;
780 int restarts = 0;
781
782 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
783 switch (c) {
784 case 'c':
785 continuous++;
786 break;
787 case 'm':
788 min_children = (int)strtol(optarg, NULL, 0);
789 if (min_children < VDEV_DRAID_MIN_CHILDREN) {
790 (void) fprintf(stderr, "A minimum of 2 "
791 "children are required.\n");
792 return (1);
793 }
794
795 break;
796 case 'n':
797 max_children = (int)strtol(optarg, NULL, 0);
798 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
799 (void) fprintf(stderr, "A maximum of %d "
800 "children are allowed.\n",
801 VDEV_DRAID_MAX_CHILDREN);
802 return (1);
803 }
804 break;
805 case 'p':
806 passes = (int)strtol(optarg, NULL, 0);
807 break;
808 case 'v':
809 /*
810 * 0 - Only log when a better map is added to the file.
811 * 1 - Log the current best map for each child count.
812 * Minimal output on a single summary line.
813 * 2 - Log the current best map for each child count.
814 * More verbose includes most map fields.
815 * 3 - Log the current best map for each child count.
816 * Very verbose all fields including the full map.
817 */
818 verbose++;
819 break;
820 case ':':
821 (void) fprintf(stderr,
822 "missing argument for '%c' option\n", optopt);
823 draid_usage();
824 break;
825 case '?':
826 (void) fprintf(stderr, "invalid option '%c'\n",
827 optopt);
828 draid_usage();
829 break;
830 }
831 }
832
833 if (argc > optind)
834 strlcpy(filename, argv[optind], sizeof (filename));
835 else {
836 (void) fprintf(stderr, "A FILE must be specified.\n");
837 return (1);
838 }
839
840 restart:
841 /*
842 * Start with a fresh seed from /dev/urandom.
843 */
844 fd = open("/dev/urandom", O_RDONLY);
845 if (fd < 0) {
846 printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
847 return (1);
848 } else {
849 ssize_t bytes = sizeof (map_seed);
850 ssize_t bytes_read = 0;
851
852 while (bytes_read < bytes) {
853 ssize_t rc = read(fd, ((char *)map_seed) + bytes_read,
854 bytes - bytes_read);
855 if (rc < 0) {
856 printf("Unable to read /dev/urandom: %s\n:",
857 strerror(errno));
858 close(fd);
859 return (1);
860 }
861 bytes_read += rc;
862 }
863
864 (void) close(fd);
865 }
866
867 if (restarts == 0)
868 printf("Writing generated mappings to '%s':\n", filename);
869
870 /*
871 * Generate maps for all requested child counts. The best map for
872 * each child count is written out to the specified file. If the file
873 * already contains a better mapping this map will not be added.
874 */
875 for (uint64_t children = min_children;
876 children <= max_children; children++) {
877 char key[8] = { 0 };
878 draid_map_t *map;
879 double worst_ratio = 1000.0;
880 double avg_ratio = 1000.0;
881
882 error = eval_maps(children, passes, map_seed, &map,
883 &worst_ratio, &avg_ratio);
884 if (error) {
885 printf("Error eval_maps(): %s\n", strerror(error));
886 return (1);
887 }
888
889 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
890 printf("Error ratio < 1.0: worst_ratio = %2.03f "
891 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
892 return (1);
893 }
894
895 snprintf(key, 7, "%llu", (u_longlong_t)children);
896 error = write_map_key(filename, key, map, worst_ratio,
897 avg_ratio);
898 if (error == 0) {
899 /* The new map was added to the file. */
900 dump_map(map, key, worst_ratio, avg_ratio,
901 MAX(verbose, 1));
902 } else if (error == EEXIST) {
903 /* The existing map was preferable and kept. */
904 if (verbose > 0)
905 dump_map_key(filename, key, verbose);
906 } else {
907 printf("Error write_map_key(): %s\n", strerror(error));
908 return (1);
909 }
910
911 free_map(map);
912 }
913
914 /*
915 * When the continuous option is set restart at the minimum number of
916 * children instead of exiting. This option is useful as a mechanism
917 * to continuous try and refine the discovered permutations.
918 */
919 if (continuous) {
920 restarts++;
921 printf("Restarting by request (-c): %d\n", restarts);
922 goto restart;
923 }
924
925 return (0);
926 }
927
928 /*
929 * Verify each map in the file by generating its in-memory permutation array
930 * and comfirming its checksum is correct.
931 */
932 static int
draid_verify(int argc,char * argv[])933 draid_verify(int argc, char *argv[])
934 {
935 char filename[MAXPATHLEN] = {0};
936 int n = 0, c, error, verbose = 1;
937 int check_ratios = 0;
938
939 while ((c = getopt(argc, argv, ":rv")) != -1) {
940 switch (c) {
941 case 'r':
942 check_ratios++;
943 break;
944 case 'v':
945 verbose++;
946 break;
947 case ':':
948 (void) fprintf(stderr,
949 "missing argument for '%c' option\n", optopt);
950 draid_usage();
951 break;
952 case '?':
953 (void) fprintf(stderr, "invalid option '%c'\n",
954 optopt);
955 draid_usage();
956 break;
957 }
958 }
959
960 if (argc > optind) {
961 char *abspath = malloc(MAXPATHLEN);
962 if (abspath == NULL)
963 return (ENOMEM);
964
965 if (realpath(argv[optind], abspath) != NULL)
966 strlcpy(filename, abspath, sizeof (filename));
967 else
968 strlcpy(filename, argv[optind], sizeof (filename));
969
970 free(abspath);
971 } else {
972 (void) fprintf(stderr, "A FILE must be specified.\n");
973 return (1);
974 }
975
976 printf("Verifying permutation maps: '%s'\n", filename);
977
978 /*
979 * Lookup hardcoded permutation map for each valid number of children
980 * and verify a generated map has the correct checksum. Then compare
981 * the generated map values with the nvlist map values read from the
982 * reference file to cross-check the permutation.
983 */
984 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
985 children <= VDEV_DRAID_MAX_CHILDREN;
986 children++) {
987 draid_map_t *map;
988 char key[8] = {0};
989
990 snprintf(key, 8, "%llu", (u_longlong_t)children);
991
992 error = alloc_fixed_map(children, &map);
993 if (error) {
994 printf("Error alloc_fixed_map() failed: %s\n",
995 error == ECKSUM ? "Invalid checksum" :
996 strerror(error));
997 return (1);
998 }
999
1000 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
1001 uint8_t *nv_perms;
1002 nvlist_t *cfg;
1003 uint_t c;
1004
1005 error = read_map_key(filename, key, &cfg);
1006 if (error != 0) {
1007 printf("Error read_map_key() failed: %s\n",
1008 strerror(error));
1009 free_map(map);
1010 return (1);
1011 }
1012
1013 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1014 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1015 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1016 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1017 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
1018
1019 /*
1020 * Compare draid_map_t and nvlist reference values.
1021 */
1022 if (map->dm_seed != nv_seed) {
1023 printf("Error different seeds: 0x%016llx != "
1024 "0x%016llx\n", (u_longlong_t)map->dm_seed,
1025 (u_longlong_t)nv_seed);
1026 error = EINVAL;
1027 }
1028
1029 if (map->dm_checksum != nv_checksum) {
1030 printf("Error different checksums: 0x%016llx "
1031 "!= 0x%016llx\n",
1032 (u_longlong_t)map->dm_checksum,
1033 (u_longlong_t)nv_checksum);
1034 error = EINVAL;
1035 }
1036
1037 if (map->dm_children != nv_children) {
1038 printf("Error different children: %llu "
1039 "!= %llu\n", (u_longlong_t)map->dm_children,
1040 (u_longlong_t)nv_children);
1041 error = EINVAL;
1042 }
1043
1044 if (map->dm_nperms != nv_nperms) {
1045 printf("Error different nperms: %llu "
1046 "!= %llu\n", (u_longlong_t)map->dm_nperms,
1047 (u_longlong_t)nv_nperms);
1048 error = EINVAL;
1049 }
1050
1051 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
1052 if (map->dm_perms[i] != nv_perms[i]) {
1053 printf("Error different perms[%llu]: "
1054 "%d != %d\n", (u_longlong_t)i,
1055 (int)map->dm_perms[i],
1056 (int)nv_perms[i]);
1057 error = EINVAL;
1058 break;
1059 }
1060 }
1061
1062 /*
1063 * For good measure recalculate the worst and average
1064 * ratios and confirm they match the nvlist values.
1065 */
1066 if (check_ratios) {
1067 uint64_t nv_worst_ratio, nv_avg_ratio;
1068 double worst_ratio, avg_ratio;
1069
1070 eval_decluster(map, &worst_ratio, &avg_ratio);
1071
1072 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1073 MAP_WORST_RATIO);
1074 nv_avg_ratio = fnvlist_lookup_uint64(cfg,
1075 MAP_AVG_RATIO);
1076
1077 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
1078 printf("Error ratio out of range %2.03f, "
1079 "%2.03f\n", worst_ratio, avg_ratio);
1080 error = EINVAL;
1081 }
1082
1083 if ((uint64_t)(worst_ratio * 1000.0) !=
1084 nv_worst_ratio) {
1085 printf("Error different worst_ratio %2.03f "
1086 "!= %2.03f\n", (double)nv_worst_ratio /
1087 1000.0, worst_ratio);
1088 error = EINVAL;
1089 }
1090
1091 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
1092 printf("Error different average_ratio %2.03f "
1093 "!= %2.03f\n", (double)nv_avg_ratio /
1094 1000.0, avg_ratio);
1095 error = EINVAL;
1096 }
1097 }
1098
1099 if (error) {
1100 free_map(map);
1101 nvlist_free(cfg);
1102 return (1);
1103 }
1104
1105 if (verbose > 0) {
1106 printf("- %llu children: good\n",
1107 (u_longlong_t)children);
1108 }
1109 n++;
1110
1111 free_map(map);
1112 nvlist_free(cfg);
1113 }
1114
1115 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
1116 printf("Error permutation maps missing: %d / %d checked\n",
1117 n, VDEV_DRAID_MAX_CHILDREN - 1);
1118 return (1);
1119 }
1120
1121 printf("Successfully verified %d / %d permutation maps\n",
1122 n, VDEV_DRAID_MAX_CHILDREN - 1);
1123
1124 return (0);
1125 }
1126
1127 /*
1128 * Dump the contents of the specified mapping(s) for inspection.
1129 */
1130 static int
draid_dump(int argc,char * argv[])1131 draid_dump(int argc, char *argv[])
1132 {
1133 char filename[MAXPATHLEN] = {0};
1134 int c, error, verbose = 1;
1135 int min_children = VDEV_DRAID_MIN_CHILDREN;
1136 int max_children = VDEV_DRAID_MAX_CHILDREN;
1137
1138 while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
1139 switch (c) {
1140 case 'm':
1141 min_children = (int)strtol(optarg, NULL, 0);
1142 if (min_children < 2) {
1143 (void) fprintf(stderr, "A minimum of 2 "
1144 "children are required.\n");
1145 return (1);
1146 }
1147
1148 break;
1149 case 'n':
1150 max_children = (int)strtol(optarg, NULL, 0);
1151 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
1152 (void) fprintf(stderr, "A maximum of %d "
1153 "children are allowed.\n",
1154 VDEV_DRAID_MAX_CHILDREN);
1155 return (1);
1156 }
1157 break;
1158 case 'v':
1159 verbose++;
1160 break;
1161 case ':':
1162 (void) fprintf(stderr,
1163 "missing argument for '%c' option\n", optopt);
1164 draid_usage();
1165 break;
1166 case '?':
1167 (void) fprintf(stderr, "invalid option '%c'\n",
1168 optopt);
1169 draid_usage();
1170 break;
1171 }
1172 }
1173
1174 if (argc > optind)
1175 strlcpy(filename, argv[optind], sizeof (filename));
1176 else {
1177 (void) fprintf(stderr, "A FILE must be specified.\n");
1178 return (1);
1179 }
1180
1181 /*
1182 * Dump maps for the requested child counts.
1183 */
1184 for (uint64_t children = min_children;
1185 children <= max_children; children++) {
1186 char key[8] = { 0 };
1187
1188 snprintf(key, 7, "%llu", (u_longlong_t)children);
1189 error = dump_map_key(filename, key, verbose);
1190 if (error) {
1191 printf("Error dump_map_key(): %s\n", strerror(error));
1192 return (1);
1193 }
1194 }
1195
1196 return (0);
1197 }
1198
1199 /*
1200 * Print all of the mappings as a C formatted draid_map_t array. This table
1201 * is found in the module/zcommon/zfs_draid.c file and is the definitive
1202 * source for all mapping used by dRAID. It cannot be updated without
1203 * changing the dRAID on disk format.
1204 */
1205 static int
draid_table(int argc,char * argv[])1206 draid_table(int argc, char *argv[])
1207 {
1208 char filename[MAXPATHLEN] = {0};
1209 int error;
1210
1211 if (argc > optind)
1212 strlcpy(filename, argv[optind], sizeof (filename));
1213 else {
1214 (void) fprintf(stderr, "A FILE must be specified.\n");
1215 return (1);
1216 }
1217
1218 printf("static const draid_map_t "
1219 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1220
1221 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
1222 children <= VDEV_DRAID_MAX_CHILDREN;
1223 children++) {
1224 uint64_t seed, checksum, nperms, avg_ratio;
1225 nvlist_t *cfg;
1226 char key[8] = {0};
1227
1228 snprintf(key, 8, "%llu", (u_longlong_t)children);
1229
1230 error = read_map_key(filename, key, &cfg);
1231 if (error != 0) {
1232 printf("Error read_map_key() failed: %s\n",
1233 strerror(error));
1234 return (1);
1235 }
1236
1237 seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1238 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1239 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1240 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1241 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
1242
1243 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1244 "/* %2.03f */\n", (u_longlong_t)children,
1245 (u_longlong_t)nperms, (u_longlong_t)seed,
1246 (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
1247
1248 nvlist_free(cfg);
1249 }
1250
1251 printf("};\n");
1252
1253 return (0);
1254 }
1255
1256 static int
draid_merge_impl(nvlist_t * allcfgs,const char * srcfilename,int * mergedp)1257 draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
1258 {
1259 nvlist_t *srccfgs;
1260 nvpair_t *elem = NULL;
1261 int error, merged = 0;
1262
1263 error = read_map(srcfilename, &srccfgs);
1264 if (error != 0)
1265 return (error);
1266
1267 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
1268 uint64_t nv_worst_ratio;
1269 uint64_t allcfg_worst_ratio;
1270 nvlist_t *cfg, *allcfg;
1271 const char *key;
1272
1273 switch (nvpair_type(elem)) {
1274 case DATA_TYPE_NVLIST:
1275
1276 (void) nvpair_value_nvlist(elem, &cfg);
1277 key = nvpair_name(elem);
1278
1279 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1280 MAP_WORST_RATIO);
1281
1282 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
1283 if (error == 0) {
1284 allcfg_worst_ratio = fnvlist_lookup_uint64(
1285 allcfg, MAP_WORST_RATIO);
1286
1287 if (nv_worst_ratio < allcfg_worst_ratio) {
1288 fnvlist_remove(allcfgs, key);
1289 fnvlist_add_nvlist(allcfgs, key, cfg);
1290 merged++;
1291 }
1292 } else if (error == ENOENT) {
1293 fnvlist_add_nvlist(allcfgs, key, cfg);
1294 merged++;
1295 } else {
1296 return (error);
1297 }
1298
1299 break;
1300 default:
1301 continue;
1302 }
1303 }
1304
1305 nvlist_free(srccfgs);
1306
1307 *mergedp = merged;
1308
1309 return (0);
1310 }
1311
1312 /*
1313 * Merge the best map for each child count found in the listed files into
1314 * a new file. This allows 'draid generate' to be run in parallel and for
1315 * the results maps to be combined.
1316 */
1317 static int
draid_merge(int argc,char * argv[])1318 draid_merge(int argc, char *argv[])
1319 {
1320 char filename[MAXPATHLEN] = {0};
1321 int c, error, total_merged = 0;
1322 nvlist_t *allcfgs;
1323
1324 while ((c = getopt(argc, argv, ":")) != -1) {
1325 switch (c) {
1326 case ':':
1327 (void) fprintf(stderr,
1328 "missing argument for '%c' option\n", optopt);
1329 draid_usage();
1330 break;
1331 case '?':
1332 (void) fprintf(stderr, "invalid option '%c'\n",
1333 optopt);
1334 draid_usage();
1335 break;
1336 }
1337 }
1338
1339 if (argc < 4) {
1340 (void) fprintf(stderr,
1341 "A FILE and multiple SRCs must be specified.\n");
1342 return (1);
1343 }
1344
1345 strlcpy(filename, argv[optind], sizeof (filename));
1346 optind++;
1347
1348 error = read_map(filename, &allcfgs);
1349 if (error == ENOENT) {
1350 allcfgs = fnvlist_alloc();
1351 } else if (error != 0) {
1352 printf("Error read_map(): %s\n", strerror(error));
1353 return (error);
1354 }
1355
1356 while (optind < argc) {
1357 char srcfilename[MAXPATHLEN] = {0};
1358 int merged = 0;
1359
1360 strlcpy(srcfilename, argv[optind], sizeof (srcfilename));
1361
1362 error = draid_merge_impl(allcfgs, srcfilename, &merged);
1363 if (error) {
1364 printf("Error draid_merge_impl(): %s\n",
1365 strerror(error));
1366 nvlist_free(allcfgs);
1367 return (1);
1368 }
1369
1370 total_merged += merged;
1371 printf("Merged %d key(s) from '%s' into '%s'\n", merged,
1372 srcfilename, filename);
1373
1374 optind++;
1375 }
1376
1377 if (total_merged > 0)
1378 write_map(filename, allcfgs);
1379
1380 printf("Merged a total of %d key(s) into '%s'\n", total_merged,
1381 filename);
1382
1383 nvlist_free(allcfgs);
1384
1385 return (0);
1386 }
1387
1388 int
main(int argc,char * argv[])1389 main(int argc, char *argv[])
1390 {
1391 if (argc < 2)
1392 draid_usage();
1393
1394 char *subcommand = argv[1];
1395
1396 if (strcmp(subcommand, "generate") == 0) {
1397 return (draid_generate(argc - 1, argv + 1));
1398 } else if (strcmp(subcommand, "verify") == 0) {
1399 return (draid_verify(argc - 1, argv + 1));
1400 } else if (strcmp(subcommand, "dump") == 0) {
1401 return (draid_dump(argc - 1, argv + 1));
1402 } else if (strcmp(subcommand, "table") == 0) {
1403 return (draid_table(argc - 1, argv + 1));
1404 } else if (strcmp(subcommand, "merge") == 0) {
1405 return (draid_merge(argc - 1, argv + 1));
1406 } else {
1407 draid_usage();
1408 }
1409 }
1410