1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz=(...) 39 * mirror=(...) 40 * 41 * While the underlying implementation supports it, group vdevs cannot contain 42 * other group vdevs. All userland verification of devices is contained within 43 * this file. If successful, the nvlist returned can be passed directly to the 44 * kernel; we've done as much verification as possible in userland. 45 * 46 * The only function exported by this file is 'get_vdev_spec'. The function 47 * performs several passes: 48 * 49 * 1. Construct the vdev specification. Performs syntax validation and 50 * makes sure each device is valid. 51 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 52 * devices are also in use. Some can be overridden using the 'force' 53 * flag, others cannot. 54 * 3. Check for replication errors if the 'force' flag is not specified. 55 * validates that the replication level is consistent across the 56 * entire pool. 57 * 4. Label any whole disks with an EFI label. 58 */ 59 60 #include <assert.h> 61 #include <devid.h> 62 #include <errno.h> 63 #include <fcntl.h> 64 #include <libdiskmgt.h> 65 #include <libintl.h> 66 #include <libnvpair.h> 67 #include <stdio.h> 68 #include <string.h> 69 #include <unistd.h> 70 #include <sys/efi_partition.h> 71 #include <sys/stat.h> 72 #include <sys/vtoc.h> 73 #include <sys/mntent.h> 74 75 #include <libzfs.h> 76 77 #include "zpool_util.h" 78 79 #define DISK_ROOT "/dev/dsk" 80 #define RDISK_ROOT "/dev/rdsk" 81 #define BACKUP_SLICE "s2" 82 83 /* 84 * For any given vdev specification, we can have multiple errors. The 85 * vdev_error() function keeps track of whether we have seen an error yet, and 86 * prints out a header if its the first error we've seen. 87 */ 88 int error_seen; 89 int is_force; 90 91 void 92 vdev_error(const char *fmt, ...) 93 { 94 va_list ap; 95 96 if (!error_seen) { 97 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 98 if (!is_force) 99 (void) fprintf(stderr, gettext("use '-f' to override " 100 "the following errors:\n")); 101 else 102 (void) fprintf(stderr, gettext("the following errors " 103 "must be manually repaired:\n")); 104 error_seen = TRUE; 105 } 106 107 va_start(ap, fmt); 108 (void) vfprintf(stderr, fmt, ap); 109 va_end(ap); 110 } 111 112 static void 113 libdiskmgt_error(int error) 114 { 115 (void) fprintf(stderr, gettext("warning: device in use checking " 116 "failed: %s\n"), strerror(error)); 117 } 118 119 /* 120 * Validate a device, passing the bulk of the work off to libdiskmgt. 121 */ 122 int 123 check_slice(const char *path, int force, int wholedisk) 124 { 125 char *msg; 126 int error = 0; 127 int ret = 0; 128 129 if (dm_inuse((char *)path, &msg, 130 force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL, &error) || error) { 131 if (error != 0) { 132 libdiskmgt_error(error); 133 return (0); 134 } else { 135 vdev_error("%s", msg); 136 free(msg); 137 } 138 139 ret = -1; 140 } 141 142 /* 143 * If we're given a whole disk, ignore overlapping slices since we're 144 * about to label it anyway. 145 */ 146 error = 0; 147 if (!wholedisk && !force && 148 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 149 if (error != 0) { 150 libdiskmgt_error(error); 151 return (0); 152 } else { 153 vdev_error("%s overlaps with %s\n", path, msg); 154 free(msg); 155 } 156 157 ret = -1; 158 } 159 160 return (ret); 161 } 162 163 /* 164 * Validate a whole disk. Iterate over all slices on the disk and make sure 165 * that none is in use by calling check_slice(). 166 */ 167 /* ARGSUSED */ 168 int 169 check_disk(const char *name, dm_descriptor_t disk, int force) 170 { 171 dm_descriptor_t *drive, *media, *slice; 172 int err = 0; 173 int i; 174 int ret; 175 176 /* 177 * Get the drive associated with this disk. This should never fail, 178 * because we already have an alias handle open for the device. 179 */ 180 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 181 &err)) == NULL || *drive == NULL) { 182 if (err) 183 libdiskmgt_error(err); 184 return (0); 185 } 186 187 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 188 &err)) == NULL) { 189 dm_free_descriptors(drive); 190 if (err) 191 libdiskmgt_error(err); 192 return (0); 193 } 194 195 dm_free_descriptors(drive); 196 197 /* 198 * It is possible that the user has specified a removable media drive, 199 * and the media is not present. 200 */ 201 if (*media == NULL) { 202 dm_free_descriptors(media); 203 vdev_error(gettext("'%s' has no media in drive\n"), name); 204 return (-1); 205 } 206 207 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 208 &err)) == NULL) { 209 dm_free_descriptors(media); 210 if (err) 211 libdiskmgt_error(err); 212 return (0); 213 } 214 215 dm_free_descriptors(media); 216 217 ret = 0; 218 219 /* 220 * Iterate over all slices and report any errors. We don't care about 221 * overlapping slices because we are using the whole disk. 222 */ 223 for (i = 0; slice[i] != NULL; i++) { 224 if (check_slice(dm_get_name(slice[i], &err), force, TRUE) != 0) 225 ret = -1; 226 } 227 228 dm_free_descriptors(slice); 229 return (ret); 230 } 231 232 /* 233 * Validate a device. 234 */ 235 int 236 check_device(const char *path, int force) 237 { 238 dm_descriptor_t desc; 239 int err; 240 char *dev; 241 242 /* 243 * For whole disks, libdiskmgt does not include the leading dev path. 244 */ 245 dev = strrchr(path, '/'); 246 assert(dev != NULL); 247 dev++; 248 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 249 err = check_disk(path, desc, force); 250 dm_free_descriptor(desc); 251 return (err); 252 } 253 254 return (check_slice(path, force, FALSE)); 255 } 256 257 /* 258 * Check that a file is valid. All we can do in this case is check that it's 259 * not in use by another pool. 260 */ 261 int 262 check_file(const char *file, int force) 263 { 264 char *name; 265 int fd; 266 int ret = 0; 267 pool_state_t state; 268 269 if ((fd = open(file, O_RDONLY)) < 0) 270 return (0); 271 272 if (zpool_in_use(fd, &state, &name)) { 273 const char *desc; 274 275 switch (state) { 276 case POOL_STATE_ACTIVE: 277 desc = gettext("active"); 278 break; 279 280 case POOL_STATE_EXPORTED: 281 desc = gettext("exported"); 282 break; 283 284 case POOL_STATE_POTENTIALLY_ACTIVE: 285 desc = gettext("potentially active"); 286 break; 287 288 default: 289 desc = gettext("unknown"); 290 break; 291 } 292 293 if (state == POOL_STATE_ACTIVE || !force) { 294 vdev_error(gettext("%s is part of %s pool '%s'\n"), 295 file, desc, name); 296 ret = -1; 297 } 298 299 free(name); 300 } 301 302 (void) close(fd); 303 return (ret); 304 } 305 306 static int 307 is_whole_disk(const char *arg, struct stat64 *statbuf) 308 { 309 char path[MAXPATHLEN]; 310 311 (void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE); 312 if (stat64(path, statbuf) == 0) 313 return (TRUE); 314 315 return (FALSE); 316 } 317 318 /* 319 * Create a leaf vdev. Determine if this is a file or a device. If it's a 320 * device, fill in the device id to make a complete nvlist. Valid forms for a 321 * leaf vdev are: 322 * 323 * /dev/dsk/xxx Complete disk path 324 * /xxx Full path to file 325 * xxx Shorthand for /dev/dsk/xxx 326 */ 327 nvlist_t * 328 make_leaf_vdev(const char *arg) 329 { 330 char path[MAXPATHLEN]; 331 struct stat64 statbuf; 332 nvlist_t *vdev = NULL; 333 char *type = NULL; 334 int wholedisk = FALSE; 335 336 /* 337 * Determine what type of vdev this is, and put the full path into 338 * 'path'. We detect whether this is a device of file afterwards by 339 * checking the st_mode of the file. 340 */ 341 if (arg[0] == '/') { 342 /* 343 * Complete device or file path. Exact type is determined by 344 * examining the file descriptor afterwards. 345 */ 346 if (is_whole_disk(arg, &statbuf)) { 347 wholedisk = TRUE; 348 } else if (stat64(arg, &statbuf) != 0) { 349 (void) fprintf(stderr, 350 gettext("cannot open '%s': %s\n"), 351 arg, strerror(errno)); 352 return (NULL); 353 } 354 355 (void) strlcpy(path, arg, sizeof (path)); 356 } else { 357 /* 358 * This may be a short path for a device, or it could be total 359 * gibberish. Check to see if it's a known device in 360 * /dev/dsk/. As part of this check, see if we've been given a 361 * an entire disk (minus the slice number). 362 */ 363 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 364 arg); 365 if (is_whole_disk(path, &statbuf)) { 366 wholedisk = TRUE; 367 } else if (stat64(path, &statbuf) != 0) { 368 /* 369 * If we got ENOENT, then the user gave us 370 * gibberish, so try to direct them with a 371 * reasonable error message. Otherwise, 372 * regurgitate strerror() since it's the best we 373 * can do. 374 */ 375 if (errno == ENOENT) { 376 (void) fprintf(stderr, 377 gettext("cannot open '%s': no such " 378 "device in %s\n"), arg, DISK_ROOT); 379 (void) fprintf(stderr, 380 gettext("must be a full path or " 381 "shorthand device name\n")); 382 return (NULL); 383 } else { 384 (void) fprintf(stderr, 385 gettext("cannot open '%s': %s\n"), 386 path, strerror(errno)); 387 return (NULL); 388 } 389 } 390 } 391 392 /* 393 * Determine whether this is a device or a file. 394 */ 395 if (S_ISBLK(statbuf.st_mode)) { 396 type = VDEV_TYPE_DISK; 397 } else if (S_ISREG(statbuf.st_mode)) { 398 type = VDEV_TYPE_FILE; 399 } else { 400 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 401 "block device or regular file\n"), path); 402 return (NULL); 403 } 404 405 /* 406 * Finally, we have the complete device or file, and we know that it is 407 * acceptable to use. Construct the nvlist to describe this vdev. All 408 * vdevs have a 'path' element, and devices also have a 'devid' element. 409 */ 410 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 411 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 412 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 413 if (strcmp(type, VDEV_TYPE_DISK) == 0) 414 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 415 (uint64_t)wholedisk) == 0); 416 417 /* 418 * For a whole disk, defer getting its devid until after labeling it. 419 */ 420 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 421 /* 422 * Get the devid for the device. 423 */ 424 int fd; 425 ddi_devid_t devid; 426 char *minor = NULL, *devid_str = NULL; 427 428 if ((fd = open(path, O_RDONLY)) < 0) { 429 (void) fprintf(stderr, gettext("cannot open '%s': " 430 "%s\n"), path, strerror(errno)); 431 nvlist_free(vdev); 432 return (NULL); 433 } 434 435 if (devid_get(fd, &devid) == 0) { 436 if (devid_get_minor_name(fd, &minor) == 0 && 437 (devid_str = devid_str_encode(devid, minor)) != 438 NULL) { 439 verify(nvlist_add_string(vdev, 440 ZPOOL_CONFIG_DEVID, devid_str) == 0); 441 } 442 if (devid_str != NULL) 443 devid_str_free(devid_str); 444 if (minor != NULL) 445 devid_str_free(minor); 446 devid_free(devid); 447 } 448 449 (void) close(fd); 450 } 451 452 return (vdev); 453 } 454 455 /* 456 * Go through and verify the replication level of the pool is consistent. 457 * Performs the following checks: 458 * 459 * For the new spec, verifies that devices in mirrors and raidz are the 460 * same size. 461 * 462 * If the current configuration already has inconsistent replication 463 * levels, ignore any other potential problems in the new spec. 464 * 465 * Otherwise, make sure that the current spec (if there is one) and the new 466 * spec have consistent replication levels. 467 */ 468 typedef struct replication_level { 469 char *type; 470 int level; 471 } replication_level_t; 472 473 /* 474 * Given a list of toplevel vdevs, return the current replication level. If 475 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 476 * an error message will be displayed for each self-inconsistent vdev. 477 */ 478 replication_level_t * 479 get_replication(nvlist_t *nvroot, int fatal) 480 { 481 nvlist_t **top; 482 uint_t t, toplevels; 483 nvlist_t **child; 484 uint_t c, children; 485 nvlist_t *nv; 486 char *type; 487 replication_level_t lastrep, rep, *ret; 488 int dontreport; 489 490 ret = safe_malloc(sizeof (replication_level_t)); 491 492 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 493 &top, &toplevels) == 0); 494 495 lastrep.type = NULL; 496 for (t = 0; t < toplevels; t++) { 497 nv = top[t]; 498 499 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 500 501 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 502 &child, &children) != 0) { 503 /* 504 * This is a 'file' or 'disk' vdev. 505 */ 506 rep.type = type; 507 rep.level = 1; 508 } else { 509 uint64_t vdev_size; 510 511 /* 512 * This is a mirror or RAID-Z vdev. Go through and make 513 * sure the contents are all the same (files vs. disks), 514 * keeping track of the number of elements in the 515 * process. 516 * 517 * We also check that the size of each vdev (if it can 518 * be determined) is the same. 519 */ 520 rep.type = type; 521 rep.level = 0; 522 523 /* 524 * The 'dontreport' variable indicatest that we've 525 * already reported an error for this spec, so don't 526 * bother doing it again. 527 */ 528 type = NULL; 529 dontreport = 0; 530 vdev_size = -1ULL; 531 for (c = 0; c < children; c++) { 532 nvlist_t *cnv = child[c]; 533 char *path; 534 struct stat64 statbuf; 535 uint64_t size = -1ULL; 536 char *childtype; 537 int fd, err; 538 539 rep.level++; 540 541 verify(nvlist_lookup_string(cnv, 542 ZPOOL_CONFIG_TYPE, &childtype) == 0); 543 verify(nvlist_lookup_string(cnv, 544 ZPOOL_CONFIG_PATH, &path) == 0); 545 546 /* 547 * If we have a raidz/mirror that combines disks 548 * with files, report it as an error. 549 */ 550 if (!dontreport && type != NULL && 551 strcmp(type, childtype) != 0) { 552 if (ret != NULL) 553 free(ret); 554 ret = NULL; 555 if (fatal) 556 vdev_error(gettext( 557 "mismatched replication " 558 "level: %s contains both " 559 "files and devices\n"), 560 rep.type); 561 else 562 return (NULL); 563 dontreport = TRUE; 564 } 565 566 /* 567 * According to stat(2), the value of 'st_size' 568 * is undefined for block devices and character 569 * devices. But there is no effective way to 570 * determine the real size in userland. 571 * 572 * Instead, we'll take advantage of an 573 * implementation detail of spec_size(). If the 574 * device is currently open, then we (should) 575 * return a valid size. 576 * 577 * If we still don't get a valid size (indicated 578 * by a size of 0 or MAXOFFSET_T), then ignore 579 * this device altogether. 580 */ 581 if ((fd = open(path, O_RDONLY)) >= 0) { 582 err = fstat64(fd, &statbuf); 583 (void) close(fd); 584 } else { 585 err = stat64(path, &statbuf); 586 } 587 588 if (err != 0 || 589 statbuf.st_size == 0 || 590 statbuf.st_size == MAXOFFSET_T) 591 continue; 592 593 size = statbuf.st_size; 594 595 /* 596 * Also check the size of each device. If they 597 * differ, then report an error. 598 */ 599 if (!dontreport && vdev_size != -1ULL && 600 size != vdev_size) { 601 if (ret != NULL) 602 free(ret); 603 ret = NULL; 604 if (fatal) 605 vdev_error(gettext( 606 "%s contains devices of " 607 "different sizes\n"), 608 rep.type); 609 else 610 return (NULL); 611 dontreport = TRUE; 612 } 613 614 type = childtype; 615 vdev_size = size; 616 } 617 } 618 619 /* 620 * At this point, we have the replication of the last toplevel 621 * vdev in 'rep'. Compare it to 'lastrep' to see if its 622 * different. 623 */ 624 if (lastrep.type != NULL) { 625 if (strcmp(lastrep.type, rep.type) != 0) { 626 if (ret != NULL) 627 free(ret); 628 ret = NULL; 629 if (fatal) 630 vdev_error(gettext( 631 "mismatched replication " 632 "level: both %s and %s vdevs are " 633 "present\n"), 634 lastrep.type, rep.type); 635 else 636 return (NULL); 637 } else if (lastrep.level != rep.level) { 638 if (ret) 639 free(ret); 640 ret = NULL; 641 if (fatal) 642 vdev_error(gettext( 643 "mismatched replication " 644 "level: %d-way %s and %d-way %s " 645 "vdevs are present\n"), 646 lastrep.level, lastrep.type, 647 rep.level, rep.type); 648 else 649 return (NULL); 650 } 651 } 652 lastrep = rep; 653 } 654 655 if (ret != NULL) { 656 ret->type = rep.type; 657 ret->level = rep.level; 658 } 659 660 return (ret); 661 } 662 663 /* 664 * Check the replication level of the vdev spec against the current pool. Calls 665 * get_replication() to make sure the new spec is self-consistent. If the pool 666 * has a consistent replication level, then we ignore any errors. Otherwise, 667 * report any difference between the two. 668 */ 669 int 670 check_replication(nvlist_t *config, nvlist_t *newroot) 671 { 672 replication_level_t *current = NULL, *new; 673 int ret; 674 675 /* 676 * If we have a current pool configuration, check to see if it's 677 * self-consistent. If not, simply return success. 678 */ 679 if (config != NULL) { 680 nvlist_t *nvroot; 681 682 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 683 &nvroot) == 0); 684 if ((current = get_replication(nvroot, FALSE)) == NULL) 685 return (0); 686 } 687 688 /* 689 * Get the replication level of the new vdev spec, reporting any 690 * inconsistencies found. 691 */ 692 if ((new = get_replication(newroot, TRUE)) == NULL) { 693 free(current); 694 return (-1); 695 } 696 697 /* 698 * Check to see if the new vdev spec matches the replication level of 699 * the current pool. 700 */ 701 ret = 0; 702 if (current != NULL) { 703 if (strcmp(current->type, new->type) != 0 || 704 current->level != new->level) { 705 vdev_error(gettext( 706 "mismatched replication level: pool uses %d-way %s " 707 "and new vdev uses %d-way %s\n"), 708 current->level, current->type, new->level, 709 new->type); 710 ret = -1; 711 } 712 } 713 714 free(new); 715 if (current != NULL) 716 free(current); 717 718 return (ret); 719 } 720 721 /* 722 * Label an individual disk. The name provided is the short name, stripped of 723 * any leading /dev path. 724 */ 725 int 726 label_disk(char *name) 727 { 728 char path[MAXPATHLEN]; 729 struct dk_gpt *vtoc; 730 int fd; 731 size_t resv = 16384; 732 733 (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, 734 BACKUP_SLICE); 735 736 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { 737 /* 738 * This shouldn't happen. We've long since verified that this 739 * is a valid device. 740 */ 741 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 742 path, strerror(errno)); 743 return (-1); 744 } 745 746 747 if (efi_alloc_and_init(fd, 9, &vtoc) != 0) { 748 /* 749 * The only way this can fail is if we run out of memory, or we 750 * were unable to read the disk geometry. 751 */ 752 if (errno == ENOMEM) 753 no_memory(); 754 755 (void) fprintf(stderr, gettext("cannot label '%s': unable to " 756 "read disk geometry\n"), name); 757 (void) close(fd); 758 return (-1); 759 } 760 761 vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba; 762 vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 - 763 vtoc->efi_first_u_lba - resv; 764 765 /* 766 * Why we use V_USR: V_BACKUP confuses users, and is considered 767 * disposable by some EFI utilities (since EFI doesn't have a backup 768 * slice). V_UNASSIGNED is supposed to be used only for zero size 769 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, 770 * etc. were all pretty specific. V_USR is as close to reality as we 771 * can get, in the absence of V_OTHER. 772 */ 773 vtoc->efi_parts[0].p_tag = V_USR; 774 (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); 775 776 vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv; 777 vtoc->efi_parts[8].p_size = resv; 778 vtoc->efi_parts[8].p_tag = V_RESERVED; 779 780 if (efi_write(fd, vtoc) != 0) { 781 /* 782 * Currently, EFI labels are not supported for IDE disks, and it 783 * is likely that they will not be supported on other drives for 784 * some time. Print out a helpful error message directing the 785 * user to manually label the disk and give a specific slice. 786 */ 787 (void) fprintf(stderr, gettext("cannot label '%s': failed to " 788 "write EFI label\n"), name); 789 (void) fprintf(stderr, gettext("use fdisk(1M) to partition " 790 "the disk, and provide a specific slice\n")); 791 (void) close(fd); 792 return (-1); 793 } 794 795 (void) close(fd); 796 return (0); 797 } 798 799 /* 800 * Go through and find any whole disks in the vdev specification, labelling them 801 * as appropriate. When constructing the vdev spec, we were unable to open this 802 * device in order to provide a devid. Now that we have labelled the disk and 803 * know that slice 0 is valid, we can construct the devid now. 804 * 805 * If the disk was already labelled with an EFI label, we will have gotten the 806 * devid already (because we were able to open the whole disk). Otherwise, we 807 * need to get the devid after we label the disk. 808 */ 809 int 810 make_disks(nvlist_t *nv) 811 { 812 nvlist_t **child; 813 uint_t c, children; 814 char *type, *path, *diskname; 815 char buf[MAXPATHLEN]; 816 uint64_t wholedisk; 817 int fd; 818 int ret; 819 ddi_devid_t devid; 820 char *minor = NULL, *devid_str = NULL; 821 822 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 823 824 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 825 &child, &children) != 0) { 826 827 if (strcmp(type, VDEV_TYPE_DISK) != 0) 828 return (0); 829 830 /* 831 * We have a disk device. Get the path to the device 832 * and see if its a whole disk by appending the backup 833 * slice and stat()ing the device. 834 */ 835 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 836 837 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 838 &wholedisk) != 0 || !wholedisk) 839 return (0); 840 841 diskname = strrchr(path, '/'); 842 assert(diskname != NULL); 843 diskname++; 844 if (label_disk(diskname) != 0) 845 return (-1); 846 847 /* 848 * Fill in the devid, now that we've labeled the disk. 849 */ 850 (void) snprintf(buf, sizeof (buf), "%ss0", path); 851 if ((fd = open(buf, O_RDONLY)) < 0) { 852 (void) fprintf(stderr, 853 gettext("cannot open '%s': %s\n"), 854 buf, strerror(errno)); 855 return (-1); 856 } 857 858 if (devid_get(fd, &devid) == 0) { 859 if (devid_get_minor_name(fd, &minor) == 0 && 860 (devid_str = devid_str_encode(devid, minor)) != 861 NULL) { 862 verify(nvlist_add_string(nv, 863 ZPOOL_CONFIG_DEVID, devid_str) == 0); 864 } 865 if (devid_str != NULL) 866 devid_str_free(devid_str); 867 if (minor != NULL) 868 devid_str_free(minor); 869 devid_free(devid); 870 } 871 872 /* 873 * Update the path to refer to the 's0' slice. The presence of 874 * the 'whole_disk' field indicates to the CLI that we should 875 * chop off the slice number when displaying the device in 876 * future output. 877 */ 878 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); 879 880 (void) close(fd); 881 882 return (0); 883 } 884 885 for (c = 0; c < children; c++) 886 if ((ret = make_disks(child[c])) != 0) 887 return (ret); 888 889 return (0); 890 } 891 892 /* 893 * Go through and find any devices that are in use. We rely on libdiskmgt for 894 * the majority of this task. 895 */ 896 int 897 check_in_use(nvlist_t *nv, int force) 898 { 899 nvlist_t **child; 900 uint_t c, children; 901 char *type, *path; 902 int ret; 903 904 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 905 906 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 907 &child, &children) != 0) { 908 909 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 910 911 if (strcmp(type, VDEV_TYPE_DISK) == 0) 912 ret = check_device(path, force); 913 914 if (strcmp(type, VDEV_TYPE_FILE) == 0) 915 ret = check_file(path, force); 916 917 return (ret); 918 } 919 920 for (c = 0; c < children; c++) 921 if ((ret = check_in_use(child[c], force)) != 0) 922 return (ret); 923 924 return (0); 925 } 926 927 /* 928 * Construct a syntactically valid vdev specification, 929 * and ensure that all devices and files exist and can be opened. 930 * Note: we don't bother freeing anything in the error paths 931 * because the program is just going to exit anyway. 932 */ 933 nvlist_t * 934 construct_spec(int argc, char **argv) 935 { 936 nvlist_t *nvroot, *nv, **top; 937 int t, toplevels; 938 939 top = NULL; 940 toplevels = 0; 941 942 while (argc > 0) { 943 nv = NULL; 944 945 /* 946 * If it's a mirror or raidz, the subsequent arguments are 947 * its leaves -- until we encounter the next mirror or raidz. 948 */ 949 if (strcmp(argv[0], VDEV_TYPE_MIRROR) == 0 || 950 strcmp(argv[0], VDEV_TYPE_RAIDZ) == 0) { 951 952 char *type = argv[0]; 953 nvlist_t **child = NULL; 954 int children = 0; 955 int c; 956 957 for (c = 1; c < argc; c++) { 958 if (strcmp(argv[c], VDEV_TYPE_MIRROR) == 0 || 959 strcmp(argv[c], VDEV_TYPE_RAIDZ) == 0) 960 break; 961 children++; 962 child = realloc(child, 963 children * sizeof (nvlist_t *)); 964 if (child == NULL) 965 no_memory(); 966 if ((nv = make_leaf_vdev(argv[c])) == NULL) 967 return (NULL); 968 child[children - 1] = nv; 969 } 970 971 argc -= c; 972 argv += c; 973 974 /* 975 * Mirrors and RAID-Z devices require at least 976 * two components. 977 */ 978 if (children < 2) { 979 (void) fprintf(stderr, 980 gettext("invalid vdev specification: " 981 "%s requires at least 2 devices\n"), type); 982 return (NULL); 983 } 984 985 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); 986 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 987 type) == 0); 988 verify(nvlist_add_nvlist_array(nv, 989 ZPOOL_CONFIG_CHILDREN, child, children) == 0); 990 991 for (c = 0; c < children; c++) 992 nvlist_free(child[c]); 993 free(child); 994 } else { 995 /* 996 * We have a device. Pass off to make_leaf_vdev() to 997 * construct the appropriate nvlist describing the vdev. 998 */ 999 if ((nv = make_leaf_vdev(argv[0])) == NULL) 1000 return (NULL); 1001 argc--; 1002 argv++; 1003 } 1004 1005 toplevels++; 1006 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1007 if (top == NULL) 1008 no_memory(); 1009 top[toplevels - 1] = nv; 1010 } 1011 1012 /* 1013 * Finally, create nvroot and add all top-level vdevs to it. 1014 */ 1015 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1016 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1017 VDEV_TYPE_ROOT) == 0); 1018 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1019 top, toplevels) == 0); 1020 1021 for (t = 0; t < toplevels; t++) 1022 nvlist_free(top[t]); 1023 free(top); 1024 1025 return (nvroot); 1026 } 1027 1028 /* 1029 * Get and validate the contents of the given vdev specification. This ensures 1030 * that the nvlist returned is well-formed, that all the devices exist, and that 1031 * they are not currently in use by any other known consumer. The 'poolconfig' 1032 * parameter is the current configuration of the pool when adding devices 1033 * existing pool, and is used to perform additional checks, such as changing the 1034 * replication level of the pool. It can be 'NULL' to indicate that this is a 1035 * new pool. The 'force' flag controls whether devices should be forcefully 1036 * added, even if they appear in use. 1037 */ 1038 nvlist_t * 1039 make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, 1040 int argc, char **argv) 1041 { 1042 nvlist_t *newroot; 1043 1044 is_force = force; 1045 1046 /* 1047 * Construct the vdev specification. If this is successful, we know 1048 * that we have a valid specification, and that all devices can be 1049 * opened. 1050 */ 1051 if ((newroot = construct_spec(argc, argv)) == NULL) 1052 return (NULL); 1053 1054 /* 1055 * Validate each device to make sure that its not shared with another 1056 * subsystem. We do this even if 'force' is set, because there are some 1057 * uses (such as a dedicated dump device) that even '-f' cannot 1058 * override. 1059 */ 1060 if (check_in_use(newroot, force) != 0) { 1061 nvlist_free(newroot); 1062 return (NULL); 1063 } 1064 1065 /* 1066 * Check the replication level of the given vdevs and report any errors 1067 * found. We include the existing pool spec, if any, as we need to 1068 * catch changes against the existing replication level. 1069 */ 1070 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1071 nvlist_free(newroot); 1072 return (NULL); 1073 } 1074 1075 /* 1076 * Run through the vdev specification and label any whole disks found. 1077 */ 1078 if (make_disks(newroot) != 0) { 1079 nvlist_free(newroot); 1080 return (NULL); 1081 } 1082 1083 return (newroot); 1084 } 1085