1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Functions to convert between a list of vdevs and an nvlist representing the 30 * configuration. Each entry in the list can be one of: 31 * 32 * Device vdevs 33 * disk=(path=..., devid=...) 34 * file=(path=...) 35 * 36 * Group vdevs 37 * raidz=(...) 38 * mirror=(...) 39 * 40 * While the underlying implementation supports it, group vdevs cannot contain 41 * other group vdevs. All userland verification of devices is contained within 42 * this file. If successful, the nvlist returned can be passed directly to the 43 * kernel; we've done as much verification as possible in userland. 44 * 45 * The only function exported by this file is 'get_vdev_spec'. The function 46 * performs several passes: 47 * 48 * 1. Construct the vdev specification. Performs syntax validation and 49 * makes sure each device is valid. 50 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 51 * devices are also in use. Some can be overridden using the 'force' 52 * flag, others cannot. 53 * 3. Check for replication errors if the 'force' flag is not specified. 54 * validates that the replication level is consistent across the 55 * entire pool. 56 * 4. Label any whole disks with an EFI label. 57 */ 58 59 #include <assert.h> 60 #include <devid.h> 61 #include <errno.h> 62 #include <fcntl.h> 63 #include <libdiskmgt.h> 64 #include <libintl.h> 65 #include <libnvpair.h> 66 #include <stdio.h> 67 #include <string.h> 68 #include <unistd.h> 69 #include <sys/efi_partition.h> 70 #include <sys/stat.h> 71 #include <sys/vtoc.h> 72 #include <sys/mntent.h> 73 74 #include <libzfs.h> 75 76 #include "zpool_util.h" 77 78 #define DISK_ROOT "/dev/dsk" 79 #define RDISK_ROOT "/dev/rdsk" 80 #define BACKUP_SLICE "s2" 81 82 /* 83 * For any given vdev specification, we can have multiple errors. The 84 * vdev_error() function keeps track of whether we have seen an error yet, and 85 * prints out a header if its the first error we've seen. 86 */ 87 int error_seen; 88 int is_force; 89 90 void 91 vdev_error(const char *fmt, ...) 92 { 93 va_list ap; 94 95 if (!error_seen) { 96 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 97 if (!is_force) 98 (void) fprintf(stderr, gettext("use '-f' to override " 99 "the following errors:\n")); 100 else 101 (void) fprintf(stderr, gettext("the following errors " 102 "must be manually repaired:\n")); 103 error_seen = TRUE; 104 } 105 106 va_start(ap, fmt); 107 (void) vfprintf(stderr, fmt, ap); 108 va_end(ap); 109 } 110 111 static void 112 libdiskmgt_error(int error) 113 { 114 /* 115 * ENXIO is a valid error message if the device doesn't live in 116 * /dev/dsk. Don't bother printing an error message in this case. 117 */ 118 if (error == ENXIO) 119 return; 120 121 (void) fprintf(stderr, gettext("warning: device in use checking " 122 "failed: %s\n"), strerror(error)); 123 } 124 125 /* 126 * Validate a device, passing the bulk of the work off to libdiskmgt. 127 */ 128 int 129 check_slice(const char *path, int force, int wholedisk) 130 { 131 char *msg; 132 int error = 0; 133 int ret = 0; 134 135 if (dm_inuse((char *)path, &msg, 136 force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL, &error) || error) { 137 if (error != 0) { 138 libdiskmgt_error(error); 139 return (0); 140 } else { 141 vdev_error("%s", msg); 142 free(msg); 143 } 144 145 ret = -1; 146 } 147 148 /* 149 * If we're given a whole disk, ignore overlapping slices since we're 150 * about to label it anyway. 151 */ 152 error = 0; 153 if (!wholedisk && !force && 154 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 155 if (error != 0) { 156 libdiskmgt_error(error); 157 return (0); 158 } else { 159 vdev_error("%s overlaps with %s\n", path, msg); 160 free(msg); 161 } 162 163 ret = -1; 164 } 165 166 return (ret); 167 } 168 169 /* 170 * Validate a whole disk. Iterate over all slices on the disk and make sure 171 * that none is in use by calling check_slice(). 172 */ 173 /* ARGSUSED */ 174 int 175 check_disk(const char *name, dm_descriptor_t disk, int force) 176 { 177 dm_descriptor_t *drive, *media, *slice; 178 int err = 0; 179 int i; 180 int ret; 181 182 /* 183 * Get the drive associated with this disk. This should never fail, 184 * because we already have an alias handle open for the device. 185 */ 186 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 187 &err)) == NULL || *drive == NULL) { 188 if (err) 189 libdiskmgt_error(err); 190 return (0); 191 } 192 193 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 194 &err)) == NULL) { 195 dm_free_descriptors(drive); 196 if (err) 197 libdiskmgt_error(err); 198 return (0); 199 } 200 201 dm_free_descriptors(drive); 202 203 /* 204 * It is possible that the user has specified a removable media drive, 205 * and the media is not present. 206 */ 207 if (*media == NULL) { 208 dm_free_descriptors(media); 209 vdev_error(gettext("'%s' has no media in drive\n"), name); 210 return (-1); 211 } 212 213 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 214 &err)) == NULL) { 215 dm_free_descriptors(media); 216 if (err) 217 libdiskmgt_error(err); 218 return (0); 219 } 220 221 dm_free_descriptors(media); 222 223 ret = 0; 224 225 /* 226 * Iterate over all slices and report any errors. We don't care about 227 * overlapping slices because we are using the whole disk. 228 */ 229 for (i = 0; slice[i] != NULL; i++) { 230 if (check_slice(dm_get_name(slice[i], &err), force, TRUE) != 0) 231 ret = -1; 232 } 233 234 dm_free_descriptors(slice); 235 return (ret); 236 } 237 238 /* 239 * Validate a device. 240 */ 241 int 242 check_device(const char *path, int force) 243 { 244 dm_descriptor_t desc; 245 int err; 246 char *dev; 247 248 /* 249 * For whole disks, libdiskmgt does not include the leading dev path. 250 */ 251 dev = strrchr(path, '/'); 252 assert(dev != NULL); 253 dev++; 254 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 255 err = check_disk(path, desc, force); 256 dm_free_descriptor(desc); 257 return (err); 258 } 259 260 return (check_slice(path, force, FALSE)); 261 } 262 263 /* 264 * Check that a file is valid. All we can do in this case is check that it's 265 * not in use by another pool. 266 */ 267 int 268 check_file(const char *file, int force) 269 { 270 char *name; 271 int fd; 272 int ret = 0; 273 pool_state_t state; 274 275 if ((fd = open(file, O_RDONLY)) < 0) 276 return (0); 277 278 if (zpool_in_use(fd, &state, &name)) { 279 const char *desc; 280 281 switch (state) { 282 case POOL_STATE_ACTIVE: 283 desc = gettext("active"); 284 break; 285 286 case POOL_STATE_EXPORTED: 287 desc = gettext("exported"); 288 break; 289 290 case POOL_STATE_POTENTIALLY_ACTIVE: 291 desc = gettext("potentially active"); 292 break; 293 294 default: 295 desc = gettext("unknown"); 296 break; 297 } 298 299 if (state == POOL_STATE_ACTIVE || !force) { 300 vdev_error(gettext("%s is part of %s pool '%s'\n"), 301 file, desc, name); 302 ret = -1; 303 } 304 305 free(name); 306 } 307 308 (void) close(fd); 309 return (ret); 310 } 311 312 static int 313 is_whole_disk(const char *arg, struct stat64 *statbuf) 314 { 315 char path[MAXPATHLEN]; 316 317 (void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE); 318 if (stat64(path, statbuf) == 0) 319 return (TRUE); 320 321 return (FALSE); 322 } 323 324 /* 325 * Create a leaf vdev. Determine if this is a file or a device. If it's a 326 * device, fill in the device id to make a complete nvlist. Valid forms for a 327 * leaf vdev are: 328 * 329 * /dev/dsk/xxx Complete disk path 330 * /xxx Full path to file 331 * xxx Shorthand for /dev/dsk/xxx 332 */ 333 nvlist_t * 334 make_leaf_vdev(const char *arg) 335 { 336 char path[MAXPATHLEN]; 337 struct stat64 statbuf; 338 nvlist_t *vdev = NULL; 339 char *type = NULL; 340 int wholedisk = FALSE; 341 342 /* 343 * Determine what type of vdev this is, and put the full path into 344 * 'path'. We detect whether this is a device of file afterwards by 345 * checking the st_mode of the file. 346 */ 347 if (arg[0] == '/') { 348 /* 349 * Complete device or file path. Exact type is determined by 350 * examining the file descriptor afterwards. 351 */ 352 if (is_whole_disk(arg, &statbuf)) { 353 wholedisk = TRUE; 354 } else if (stat64(arg, &statbuf) != 0) { 355 (void) fprintf(stderr, 356 gettext("cannot open '%s': %s\n"), 357 arg, strerror(errno)); 358 return (NULL); 359 } 360 361 (void) strlcpy(path, arg, sizeof (path)); 362 } else { 363 /* 364 * This may be a short path for a device, or it could be total 365 * gibberish. Check to see if it's a known device in 366 * /dev/dsk/. As part of this check, see if we've been given a 367 * an entire disk (minus the slice number). 368 */ 369 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 370 arg); 371 if (is_whole_disk(path, &statbuf)) { 372 wholedisk = TRUE; 373 } else if (stat64(path, &statbuf) != 0) { 374 /* 375 * If we got ENOENT, then the user gave us 376 * gibberish, so try to direct them with a 377 * reasonable error message. Otherwise, 378 * regurgitate strerror() since it's the best we 379 * can do. 380 */ 381 if (errno == ENOENT) { 382 (void) fprintf(stderr, 383 gettext("cannot open '%s': no such " 384 "device in %s\n"), arg, DISK_ROOT); 385 (void) fprintf(stderr, 386 gettext("must be a full path or " 387 "shorthand device name\n")); 388 return (NULL); 389 } else { 390 (void) fprintf(stderr, 391 gettext("cannot open '%s': %s\n"), 392 path, strerror(errno)); 393 return (NULL); 394 } 395 } 396 } 397 398 /* 399 * Determine whether this is a device or a file. 400 */ 401 if (S_ISBLK(statbuf.st_mode)) { 402 type = VDEV_TYPE_DISK; 403 } else if (S_ISREG(statbuf.st_mode)) { 404 type = VDEV_TYPE_FILE; 405 } else { 406 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 407 "block device or regular file\n"), path); 408 return (NULL); 409 } 410 411 /* 412 * Finally, we have the complete device or file, and we know that it is 413 * acceptable to use. Construct the nvlist to describe this vdev. All 414 * vdevs have a 'path' element, and devices also have a 'devid' element. 415 */ 416 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 417 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 418 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 419 if (strcmp(type, VDEV_TYPE_DISK) == 0) 420 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 421 (uint64_t)wholedisk) == 0); 422 423 /* 424 * For a whole disk, defer getting its devid until after labeling it. 425 */ 426 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 427 /* 428 * Get the devid for the device. 429 */ 430 int fd; 431 ddi_devid_t devid; 432 char *minor = NULL, *devid_str = NULL; 433 434 if ((fd = open(path, O_RDONLY)) < 0) { 435 (void) fprintf(stderr, gettext("cannot open '%s': " 436 "%s\n"), path, strerror(errno)); 437 nvlist_free(vdev); 438 return (NULL); 439 } 440 441 if (devid_get(fd, &devid) == 0) { 442 if (devid_get_minor_name(fd, &minor) == 0 && 443 (devid_str = devid_str_encode(devid, minor)) != 444 NULL) { 445 verify(nvlist_add_string(vdev, 446 ZPOOL_CONFIG_DEVID, devid_str) == 0); 447 } 448 if (devid_str != NULL) 449 devid_str_free(devid_str); 450 if (minor != NULL) 451 devid_str_free(minor); 452 devid_free(devid); 453 } 454 455 (void) close(fd); 456 } 457 458 return (vdev); 459 } 460 461 /* 462 * Go through and verify the replication level of the pool is consistent. 463 * Performs the following checks: 464 * 465 * For the new spec, verifies that devices in mirrors and raidz are the 466 * same size. 467 * 468 * If the current configuration already has inconsistent replication 469 * levels, ignore any other potential problems in the new spec. 470 * 471 * Otherwise, make sure that the current spec (if there is one) and the new 472 * spec have consistent replication levels. 473 */ 474 typedef struct replication_level { 475 char *type; 476 int level; 477 } replication_level_t; 478 479 /* 480 * Given a list of toplevel vdevs, return the current replication level. If 481 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 482 * an error message will be displayed for each self-inconsistent vdev. 483 */ 484 replication_level_t * 485 get_replication(nvlist_t *nvroot, int fatal) 486 { 487 nvlist_t **top; 488 uint_t t, toplevels; 489 nvlist_t **child; 490 uint_t c, children; 491 nvlist_t *nv; 492 char *type; 493 replication_level_t lastrep, rep, *ret; 494 int dontreport; 495 496 ret = safe_malloc(sizeof (replication_level_t)); 497 498 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 499 &top, &toplevels) == 0); 500 501 lastrep.type = NULL; 502 for (t = 0; t < toplevels; t++) { 503 nv = top[t]; 504 505 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 506 507 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 508 &child, &children) != 0) { 509 /* 510 * This is a 'file' or 'disk' vdev. 511 */ 512 rep.type = type; 513 rep.level = 1; 514 } else { 515 uint64_t vdev_size; 516 517 /* 518 * This is a mirror or RAID-Z vdev. Go through and make 519 * sure the contents are all the same (files vs. disks), 520 * keeping track of the number of elements in the 521 * process. 522 * 523 * We also check that the size of each vdev (if it can 524 * be determined) is the same. 525 */ 526 rep.type = type; 527 rep.level = 0; 528 529 /* 530 * The 'dontreport' variable indicatest that we've 531 * already reported an error for this spec, so don't 532 * bother doing it again. 533 */ 534 type = NULL; 535 dontreport = 0; 536 vdev_size = -1ULL; 537 for (c = 0; c < children; c++) { 538 nvlist_t *cnv = child[c]; 539 char *path; 540 struct stat64 statbuf; 541 uint64_t size = -1ULL; 542 char *childtype; 543 int fd, err; 544 545 rep.level++; 546 547 verify(nvlist_lookup_string(cnv, 548 ZPOOL_CONFIG_TYPE, &childtype) == 0); 549 verify(nvlist_lookup_string(cnv, 550 ZPOOL_CONFIG_PATH, &path) == 0); 551 552 /* 553 * If we have a raidz/mirror that combines disks 554 * with files, report it as an error. 555 */ 556 if (!dontreport && type != NULL && 557 strcmp(type, childtype) != 0) { 558 if (ret != NULL) 559 free(ret); 560 ret = NULL; 561 if (fatal) 562 vdev_error(gettext( 563 "mismatched replication " 564 "level: %s contains both " 565 "files and devices\n"), 566 rep.type); 567 else 568 return (NULL); 569 dontreport = TRUE; 570 } 571 572 /* 573 * According to stat(2), the value of 'st_size' 574 * is undefined for block devices and character 575 * devices. But there is no effective way to 576 * determine the real size in userland. 577 * 578 * Instead, we'll take advantage of an 579 * implementation detail of spec_size(). If the 580 * device is currently open, then we (should) 581 * return a valid size. 582 * 583 * If we still don't get a valid size (indicated 584 * by a size of 0 or MAXOFFSET_T), then ignore 585 * this device altogether. 586 */ 587 if ((fd = open(path, O_RDONLY)) >= 0) { 588 err = fstat64(fd, &statbuf); 589 (void) close(fd); 590 } else { 591 err = stat64(path, &statbuf); 592 } 593 594 if (err != 0 || 595 statbuf.st_size == 0 || 596 statbuf.st_size == MAXOFFSET_T) 597 continue; 598 599 size = statbuf.st_size; 600 601 /* 602 * Also check the size of each device. If they 603 * differ, then report an error. 604 */ 605 if (!dontreport && vdev_size != -1ULL && 606 size != vdev_size) { 607 if (ret != NULL) 608 free(ret); 609 ret = NULL; 610 if (fatal) 611 vdev_error(gettext( 612 "%s contains devices of " 613 "different sizes\n"), 614 rep.type); 615 else 616 return (NULL); 617 dontreport = TRUE; 618 } 619 620 type = childtype; 621 vdev_size = size; 622 } 623 } 624 625 /* 626 * At this point, we have the replication of the last toplevel 627 * vdev in 'rep'. Compare it to 'lastrep' to see if its 628 * different. 629 */ 630 if (lastrep.type != NULL) { 631 if (strcmp(lastrep.type, rep.type) != 0) { 632 if (ret != NULL) 633 free(ret); 634 ret = NULL; 635 if (fatal) 636 vdev_error(gettext( 637 "mismatched replication " 638 "level: both %s and %s vdevs are " 639 "present\n"), 640 lastrep.type, rep.type); 641 else 642 return (NULL); 643 } else if (lastrep.level != rep.level) { 644 if (ret) 645 free(ret); 646 ret = NULL; 647 if (fatal) 648 vdev_error(gettext( 649 "mismatched replication " 650 "level: %d-way %s and %d-way %s " 651 "vdevs are present\n"), 652 lastrep.level, lastrep.type, 653 rep.level, rep.type); 654 else 655 return (NULL); 656 } 657 } 658 lastrep = rep; 659 } 660 661 if (ret != NULL) { 662 ret->type = rep.type; 663 ret->level = rep.level; 664 } 665 666 return (ret); 667 } 668 669 /* 670 * Check the replication level of the vdev spec against the current pool. Calls 671 * get_replication() to make sure the new spec is self-consistent. If the pool 672 * has a consistent replication level, then we ignore any errors. Otherwise, 673 * report any difference between the two. 674 */ 675 int 676 check_replication(nvlist_t *config, nvlist_t *newroot) 677 { 678 replication_level_t *current = NULL, *new; 679 int ret; 680 681 /* 682 * If we have a current pool configuration, check to see if it's 683 * self-consistent. If not, simply return success. 684 */ 685 if (config != NULL) { 686 nvlist_t *nvroot; 687 688 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 689 &nvroot) == 0); 690 if ((current = get_replication(nvroot, FALSE)) == NULL) 691 return (0); 692 } 693 694 /* 695 * Get the replication level of the new vdev spec, reporting any 696 * inconsistencies found. 697 */ 698 if ((new = get_replication(newroot, TRUE)) == NULL) { 699 free(current); 700 return (-1); 701 } 702 703 /* 704 * Check to see if the new vdev spec matches the replication level of 705 * the current pool. 706 */ 707 ret = 0; 708 if (current != NULL) { 709 if (strcmp(current->type, new->type) != 0 || 710 current->level != new->level) { 711 vdev_error(gettext( 712 "mismatched replication level: pool uses %d-way %s " 713 "and new vdev uses %d-way %s\n"), 714 current->level, current->type, new->level, 715 new->type); 716 ret = -1; 717 } 718 } 719 720 free(new); 721 if (current != NULL) 722 free(current); 723 724 return (ret); 725 } 726 727 /* 728 * Label an individual disk. The name provided is the short name, stripped of 729 * any leading /dev path. 730 */ 731 int 732 label_disk(char *name) 733 { 734 char path[MAXPATHLEN]; 735 struct dk_gpt *vtoc; 736 int fd; 737 size_t resv = 16384; 738 739 (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, 740 BACKUP_SLICE); 741 742 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { 743 /* 744 * This shouldn't happen. We've long since verified that this 745 * is a valid device. 746 */ 747 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 748 path, strerror(errno)); 749 return (-1); 750 } 751 752 753 if (efi_alloc_and_init(fd, 9, &vtoc) != 0) { 754 /* 755 * The only way this can fail is if we run out of memory, or we 756 * were unable to read the disk geometry. 757 */ 758 if (errno == ENOMEM) 759 no_memory(); 760 761 (void) fprintf(stderr, gettext("cannot label '%s': unable to " 762 "read disk geometry\n"), name); 763 (void) close(fd); 764 return (-1); 765 } 766 767 vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba; 768 vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 - 769 vtoc->efi_first_u_lba - resv; 770 771 /* 772 * Why we use V_USR: V_BACKUP confuses users, and is considered 773 * disposable by some EFI utilities (since EFI doesn't have a backup 774 * slice). V_UNASSIGNED is supposed to be used only for zero size 775 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, 776 * etc. were all pretty specific. V_USR is as close to reality as we 777 * can get, in the absence of V_OTHER. 778 */ 779 vtoc->efi_parts[0].p_tag = V_USR; 780 (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); 781 782 vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv; 783 vtoc->efi_parts[8].p_size = resv; 784 vtoc->efi_parts[8].p_tag = V_RESERVED; 785 786 if (efi_write(fd, vtoc) != 0) { 787 /* 788 * Currently, EFI labels are not supported for IDE disks, and it 789 * is likely that they will not be supported on other drives for 790 * some time. Print out a helpful error message directing the 791 * user to manually label the disk and give a specific slice. 792 */ 793 (void) fprintf(stderr, gettext("cannot label '%s': failed to " 794 "write EFI label\n"), name); 795 (void) fprintf(stderr, gettext("use fdisk(1M) to partition " 796 "the disk, and provide a specific slice\n")); 797 (void) close(fd); 798 return (-1); 799 } 800 801 (void) close(fd); 802 return (0); 803 } 804 805 /* 806 * Go through and find any whole disks in the vdev specification, labelling them 807 * as appropriate. When constructing the vdev spec, we were unable to open this 808 * device in order to provide a devid. Now that we have labelled the disk and 809 * know that slice 0 is valid, we can construct the devid now. 810 * 811 * If the disk was already labelled with an EFI label, we will have gotten the 812 * devid already (because we were able to open the whole disk). Otherwise, we 813 * need to get the devid after we label the disk. 814 */ 815 int 816 make_disks(nvlist_t *nv) 817 { 818 nvlist_t **child; 819 uint_t c, children; 820 char *type, *path, *diskname; 821 char buf[MAXPATHLEN]; 822 uint64_t wholedisk; 823 int fd; 824 int ret; 825 ddi_devid_t devid; 826 char *minor = NULL, *devid_str = NULL; 827 828 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 829 830 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 831 &child, &children) != 0) { 832 833 if (strcmp(type, VDEV_TYPE_DISK) != 0) 834 return (0); 835 836 /* 837 * We have a disk device. Get the path to the device 838 * and see if its a whole disk by appending the backup 839 * slice and stat()ing the device. 840 */ 841 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 842 843 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 844 &wholedisk) != 0 || !wholedisk) 845 return (0); 846 847 diskname = strrchr(path, '/'); 848 assert(diskname != NULL); 849 diskname++; 850 if (label_disk(diskname) != 0) 851 return (-1); 852 853 /* 854 * Fill in the devid, now that we've labeled the disk. 855 */ 856 (void) snprintf(buf, sizeof (buf), "%ss0", path); 857 if ((fd = open(buf, O_RDONLY)) < 0) { 858 (void) fprintf(stderr, 859 gettext("cannot open '%s': %s\n"), 860 buf, strerror(errno)); 861 return (-1); 862 } 863 864 if (devid_get(fd, &devid) == 0) { 865 if (devid_get_minor_name(fd, &minor) == 0 && 866 (devid_str = devid_str_encode(devid, minor)) != 867 NULL) { 868 verify(nvlist_add_string(nv, 869 ZPOOL_CONFIG_DEVID, devid_str) == 0); 870 } 871 if (devid_str != NULL) 872 devid_str_free(devid_str); 873 if (minor != NULL) 874 devid_str_free(minor); 875 devid_free(devid); 876 } 877 878 /* 879 * Update the path to refer to the 's0' slice. The presence of 880 * the 'whole_disk' field indicates to the CLI that we should 881 * chop off the slice number when displaying the device in 882 * future output. 883 */ 884 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); 885 886 (void) close(fd); 887 888 return (0); 889 } 890 891 for (c = 0; c < children; c++) 892 if ((ret = make_disks(child[c])) != 0) 893 return (ret); 894 895 return (0); 896 } 897 898 /* 899 * Go through and find any devices that are in use. We rely on libdiskmgt for 900 * the majority of this task. 901 */ 902 int 903 check_in_use(nvlist_t *nv, int force) 904 { 905 nvlist_t **child; 906 uint_t c, children; 907 char *type, *path; 908 int ret; 909 910 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 911 912 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 913 &child, &children) != 0) { 914 915 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 916 917 if (strcmp(type, VDEV_TYPE_DISK) == 0) 918 ret = check_device(path, force); 919 920 if (strcmp(type, VDEV_TYPE_FILE) == 0) 921 ret = check_file(path, force); 922 923 return (ret); 924 } 925 926 for (c = 0; c < children; c++) 927 if ((ret = check_in_use(child[c], force)) != 0) 928 return (ret); 929 930 return (0); 931 } 932 933 /* 934 * Construct a syntactically valid vdev specification, 935 * and ensure that all devices and files exist and can be opened. 936 * Note: we don't bother freeing anything in the error paths 937 * because the program is just going to exit anyway. 938 */ 939 nvlist_t * 940 construct_spec(int argc, char **argv) 941 { 942 nvlist_t *nvroot, *nv, **top; 943 int t, toplevels; 944 945 top = NULL; 946 toplevels = 0; 947 948 while (argc > 0) { 949 nv = NULL; 950 951 /* 952 * If it's a mirror or raidz, the subsequent arguments are 953 * its leaves -- until we encounter the next mirror or raidz. 954 */ 955 if (strcmp(argv[0], VDEV_TYPE_MIRROR) == 0 || 956 strcmp(argv[0], VDEV_TYPE_RAIDZ) == 0) { 957 958 char *type = argv[0]; 959 nvlist_t **child = NULL; 960 int children = 0; 961 int c; 962 963 for (c = 1; c < argc; c++) { 964 if (strcmp(argv[c], VDEV_TYPE_MIRROR) == 0 || 965 strcmp(argv[c], VDEV_TYPE_RAIDZ) == 0) 966 break; 967 children++; 968 child = realloc(child, 969 children * sizeof (nvlist_t *)); 970 if (child == NULL) 971 no_memory(); 972 if ((nv = make_leaf_vdev(argv[c])) == NULL) 973 return (NULL); 974 child[children - 1] = nv; 975 } 976 977 argc -= c; 978 argv += c; 979 980 /* 981 * Mirrors and RAID-Z devices require at least 982 * two components. 983 */ 984 if (children < 2) { 985 (void) fprintf(stderr, 986 gettext("invalid vdev specification: " 987 "%s requires at least 2 devices\n"), type); 988 return (NULL); 989 } 990 991 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); 992 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 993 type) == 0); 994 verify(nvlist_add_nvlist_array(nv, 995 ZPOOL_CONFIG_CHILDREN, child, children) == 0); 996 997 for (c = 0; c < children; c++) 998 nvlist_free(child[c]); 999 free(child); 1000 } else { 1001 /* 1002 * We have a device. Pass off to make_leaf_vdev() to 1003 * construct the appropriate nvlist describing the vdev. 1004 */ 1005 if ((nv = make_leaf_vdev(argv[0])) == NULL) 1006 return (NULL); 1007 argc--; 1008 argv++; 1009 } 1010 1011 toplevels++; 1012 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1013 if (top == NULL) 1014 no_memory(); 1015 top[toplevels - 1] = nv; 1016 } 1017 1018 /* 1019 * Finally, create nvroot and add all top-level vdevs to it. 1020 */ 1021 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1022 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1023 VDEV_TYPE_ROOT) == 0); 1024 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1025 top, toplevels) == 0); 1026 1027 for (t = 0; t < toplevels; t++) 1028 nvlist_free(top[t]); 1029 free(top); 1030 1031 return (nvroot); 1032 } 1033 1034 /* 1035 * Get and validate the contents of the given vdev specification. This ensures 1036 * that the nvlist returned is well-formed, that all the devices exist, and that 1037 * they are not currently in use by any other known consumer. The 'poolconfig' 1038 * parameter is the current configuration of the pool when adding devices 1039 * existing pool, and is used to perform additional checks, such as changing the 1040 * replication level of the pool. It can be 'NULL' to indicate that this is a 1041 * new pool. The 'force' flag controls whether devices should be forcefully 1042 * added, even if they appear in use. 1043 */ 1044 nvlist_t * 1045 make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, 1046 int argc, char **argv) 1047 { 1048 nvlist_t *newroot; 1049 1050 is_force = force; 1051 1052 /* 1053 * Construct the vdev specification. If this is successful, we know 1054 * that we have a valid specification, and that all devices can be 1055 * opened. 1056 */ 1057 if ((newroot = construct_spec(argc, argv)) == NULL) 1058 return (NULL); 1059 1060 /* 1061 * Validate each device to make sure that its not shared with another 1062 * subsystem. We do this even if 'force' is set, because there are some 1063 * uses (such as a dedicated dump device) that even '-f' cannot 1064 * override. 1065 */ 1066 if (check_in_use(newroot, force) != 0) { 1067 nvlist_free(newroot); 1068 return (NULL); 1069 } 1070 1071 /* 1072 * Check the replication level of the given vdevs and report any errors 1073 * found. We include the existing pool spec, if any, as we need to 1074 * catch changes against the existing replication level. 1075 */ 1076 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1077 nvlist_free(newroot); 1078 return (NULL); 1079 } 1080 1081 /* 1082 * Run through the vdev specification and label any whole disks found. 1083 */ 1084 if (make_disks(newroot) != 0) { 1085 nvlist_free(newroot); 1086 return (NULL); 1087 } 1088 1089 return (newroot); 1090 } 1091