/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Functions to convert between a list of vdevs and an nvlist representing the
 * configuration.  Each entry in the list can be one of:
 *
 *	Device vdevs
 *		disk=(path=..., devid=...)
 *		file=(path=...)
 *
 *	Group vdevs
 *		raidz[1|2]=(...)
 *		mirror=(...)
 *
 *	Hot spares
 *
 * While the underlying implementation supports it, group vdevs cannot contain
 * other group vdevs.  All userland verification of devices is contained within
 * this file.  If successful, the nvlist returned can be passed directly to the
 * kernel; we've done as much verification as possible in userland.
 *
 * Hot spares are a special case, and passed down as an array of disk vdevs, at
 * the same level as the root of the vdev tree.
 *
 * The main entry point of this file is 'make_root_vdev'.  The function
 * performs several passes:
 *
 *	1. Construct the vdev specification.
Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <stdio.h> 73 #include <string.h> 74 #include <unistd.h> 75 #include <sys/efi_partition.h> 76 #include <sys/stat.h> 77 #include <sys/vtoc.h> 78 #include <sys/mntent.h> 79 80 #include <libzfs.h> 81 82 #include "zpool_util.h" 83 84 #define DISK_ROOT "/dev/dsk" 85 #define RDISK_ROOT "/dev/rdsk" 86 #define BACKUP_SLICE "s2" 87 88 /* 89 * For any given vdev specification, we can have multiple errors. The 90 * vdev_error() function keeps track of whether we have seen an error yet, and 91 * prints out a header if its the first error we've seen. 92 */ 93 boolean_t error_seen; 94 boolean_t is_force; 95 96 /*PRINTFLIKE1*/ 97 static void 98 vdev_error(const char *fmt, ...) 99 { 100 va_list ap; 101 102 if (!error_seen) { 103 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 104 if (!is_force) 105 (void) fprintf(stderr, gettext("use '-f' to override " 106 "the following errors:\n")); 107 else 108 (void) fprintf(stderr, gettext("the following errors " 109 "must be manually repaired:\n")); 110 error_seen = B_TRUE; 111 } 112 113 va_start(ap, fmt); 114 (void) vfprintf(stderr, fmt, ap); 115 va_end(ap); 116 } 117 118 static void 119 libdiskmgt_error(int error) 120 { 121 /* 122 * ENXIO/ENODEV is a valid error message if the device doesn't live in 123 * /dev/dsk. Don't bother printing an error message in this case. 
124 */ 125 if (error == ENXIO || error == ENODEV) 126 return; 127 128 (void) fprintf(stderr, gettext("warning: device in use checking " 129 "failed: %s\n"), strerror(error)); 130 } 131 132 /* 133 * Validate a device, passing the bulk of the work off to libdiskmgt. 134 */ 135 int 136 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 137 { 138 char *msg; 139 int error = 0; 140 int ret = 0; 141 142 if (dm_inuse((char *)path, &msg, isspare ? DM_WHO_ZPOOL_SPARE : 143 (force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL), &error) || error) { 144 if (error != 0) { 145 libdiskmgt_error(error); 146 return (0); 147 } else { 148 vdev_error("%s", msg); 149 free(msg); 150 ret = -1; 151 } 152 153 } 154 155 /* 156 * If we're given a whole disk, ignore overlapping slices since we're 157 * about to label it anyway. 158 */ 159 error = 0; 160 if (!wholedisk && !force && 161 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 162 if (error != 0) { 163 libdiskmgt_error(error); 164 return (0); 165 } else { 166 vdev_error("%s overlaps with %s\n", path, msg); 167 free(msg); 168 } 169 170 ret = -1; 171 } 172 173 return (ret); 174 } 175 176 /* 177 * Validate a whole disk. Iterate over all slices on the disk and make sure 178 * that none is in use by calling check_slice(). 179 */ 180 /* ARGSUSED */ 181 int 182 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 183 { 184 dm_descriptor_t *drive, *media, *slice; 185 int err = 0; 186 int i; 187 int ret; 188 189 /* 190 * Get the drive associated with this disk. This should never fail, 191 * because we already have an alias handle open for the device. 
192 */ 193 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 194 &err)) == NULL || *drive == NULL) { 195 if (err) 196 libdiskmgt_error(err); 197 return (0); 198 } 199 200 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 201 &err)) == NULL) { 202 dm_free_descriptors(drive); 203 if (err) 204 libdiskmgt_error(err); 205 return (0); 206 } 207 208 dm_free_descriptors(drive); 209 210 /* 211 * It is possible that the user has specified a removable media drive, 212 * and the media is not present. 213 */ 214 if (*media == NULL) { 215 dm_free_descriptors(media); 216 vdev_error(gettext("'%s' has no media in drive\n"), name); 217 return (-1); 218 } 219 220 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 221 &err)) == NULL) { 222 dm_free_descriptors(media); 223 if (err) 224 libdiskmgt_error(err); 225 return (0); 226 } 227 228 dm_free_descriptors(media); 229 230 ret = 0; 231 232 /* 233 * Iterate over all slices and report any errors. We don't care about 234 * overlapping slices because we are using the whole disk. 235 */ 236 for (i = 0; slice[i] != NULL; i++) { 237 char *name = dm_get_name(slice[i], &err); 238 239 if (check_slice(name, force, B_TRUE, isspare) != 0) 240 ret = -1; 241 242 dm_free_name(name); 243 } 244 245 dm_free_descriptors(slice); 246 return (ret); 247 } 248 249 /* 250 * Validate a device. 251 */ 252 int 253 check_device(const char *path, boolean_t force, boolean_t isspare) 254 { 255 dm_descriptor_t desc; 256 int err; 257 char *dev; 258 259 /* 260 * For whole disks, libdiskmgt does not include the leading dev path. 261 */ 262 dev = strrchr(path, '/'); 263 assert(dev != NULL); 264 dev++; 265 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 266 err = check_disk(path, desc, force, isspare); 267 dm_free_descriptor(desc); 268 return (err); 269 } 270 271 return (check_slice(path, force, B_FALSE, isspare)); 272 } 273 274 /* 275 * Check that a file is valid. 
All we can do in this case is check that it's 276 * not in use by another pool. 277 */ 278 int 279 check_file(const char *file, boolean_t force, boolean_t isspare) 280 { 281 char *name; 282 int fd; 283 int ret = 0; 284 pool_state_t state; 285 boolean_t inuse; 286 287 if ((fd = open(file, O_RDONLY)) < 0) 288 return (0); 289 290 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 291 const char *desc; 292 293 switch (state) { 294 case POOL_STATE_ACTIVE: 295 desc = gettext("active"); 296 break; 297 298 case POOL_STATE_EXPORTED: 299 desc = gettext("exported"); 300 break; 301 302 case POOL_STATE_POTENTIALLY_ACTIVE: 303 desc = gettext("potentially active"); 304 break; 305 306 default: 307 desc = gettext("unknown"); 308 break; 309 } 310 311 /* 312 * Allow hot spares to be shared between pools. 313 */ 314 if (state == POOL_STATE_SPARE && isspare) 315 return (0); 316 317 if (state == POOL_STATE_ACTIVE || 318 state == POOL_STATE_SPARE || !force) { 319 switch (state) { 320 case POOL_STATE_SPARE: 321 vdev_error(gettext("%s is reserved as a hot " 322 "spare for pool %s\n"), file, name); 323 break; 324 default: 325 vdev_error(gettext("%s is part of %s pool " 326 "'%s'\n"), file, desc, name); 327 break; 328 } 329 ret = -1; 330 } 331 332 free(name); 333 } 334 335 (void) close(fd); 336 return (ret); 337 } 338 339 static boolean_t 340 is_whole_disk(const char *arg, struct stat64 *statbuf) 341 { 342 char path[MAXPATHLEN]; 343 344 (void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE); 345 if (stat64(path, statbuf) == 0) 346 return (B_TRUE); 347 348 return (B_FALSE); 349 } 350 351 /* 352 * Create a leaf vdev. Determine if this is a file or a device. If it's a 353 * device, fill in the device id to make a complete nvlist. 
Valid forms for a 354 * leaf vdev are: 355 * 356 * /dev/dsk/xxx Complete disk path 357 * /xxx Full path to file 358 * xxx Shorthand for /dev/dsk/xxx 359 */ 360 nvlist_t * 361 make_leaf_vdev(const char *arg) 362 { 363 char path[MAXPATHLEN]; 364 struct stat64 statbuf; 365 nvlist_t *vdev = NULL; 366 char *type = NULL; 367 boolean_t wholedisk = B_FALSE; 368 369 /* 370 * Determine what type of vdev this is, and put the full path into 371 * 'path'. We detect whether this is a device of file afterwards by 372 * checking the st_mode of the file. 373 */ 374 if (arg[0] == '/') { 375 /* 376 * Complete device or file path. Exact type is determined by 377 * examining the file descriptor afterwards. 378 */ 379 if (is_whole_disk(arg, &statbuf)) { 380 wholedisk = B_TRUE; 381 } else if (stat64(arg, &statbuf) != 0) { 382 (void) fprintf(stderr, 383 gettext("cannot open '%s': %s\n"), 384 arg, strerror(errno)); 385 return (NULL); 386 } 387 388 (void) strlcpy(path, arg, sizeof (path)); 389 } else { 390 /* 391 * This may be a short path for a device, or it could be total 392 * gibberish. Check to see if it's a known device in 393 * /dev/dsk/. As part of this check, see if we've been given a 394 * an entire disk (minus the slice number). 395 */ 396 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 397 arg); 398 if (is_whole_disk(path, &statbuf)) { 399 wholedisk = B_TRUE; 400 } else if (stat64(path, &statbuf) != 0) { 401 /* 402 * If we got ENOENT, then the user gave us 403 * gibberish, so try to direct them with a 404 * reasonable error message. Otherwise, 405 * regurgitate strerror() since it's the best we 406 * can do. 
407 */ 408 if (errno == ENOENT) { 409 (void) fprintf(stderr, 410 gettext("cannot open '%s': no such " 411 "device in %s\n"), arg, DISK_ROOT); 412 (void) fprintf(stderr, 413 gettext("must be a full path or " 414 "shorthand device name\n")); 415 return (NULL); 416 } else { 417 (void) fprintf(stderr, 418 gettext("cannot open '%s': %s\n"), 419 path, strerror(errno)); 420 return (NULL); 421 } 422 } 423 } 424 425 /* 426 * Determine whether this is a device or a file. 427 */ 428 if (S_ISBLK(statbuf.st_mode)) { 429 type = VDEV_TYPE_DISK; 430 } else if (S_ISREG(statbuf.st_mode)) { 431 type = VDEV_TYPE_FILE; 432 } else { 433 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 434 "block device or regular file\n"), path); 435 return (NULL); 436 } 437 438 /* 439 * Finally, we have the complete device or file, and we know that it is 440 * acceptable to use. Construct the nvlist to describe this vdev. All 441 * vdevs have a 'path' element, and devices also have a 'devid' element. 442 */ 443 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 444 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 445 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 446 if (strcmp(type, VDEV_TYPE_DISK) == 0) 447 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 448 (uint64_t)wholedisk) == 0); 449 450 /* 451 * For a whole disk, defer getting its devid until after labeling it. 452 */ 453 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 454 /* 455 * Get the devid for the device. 
456 */ 457 int fd; 458 ddi_devid_t devid; 459 char *minor = NULL, *devid_str = NULL; 460 461 if ((fd = open(path, O_RDONLY)) < 0) { 462 (void) fprintf(stderr, gettext("cannot open '%s': " 463 "%s\n"), path, strerror(errno)); 464 nvlist_free(vdev); 465 return (NULL); 466 } 467 468 if (devid_get(fd, &devid) == 0) { 469 if (devid_get_minor_name(fd, &minor) == 0 && 470 (devid_str = devid_str_encode(devid, minor)) != 471 NULL) { 472 verify(nvlist_add_string(vdev, 473 ZPOOL_CONFIG_DEVID, devid_str) == 0); 474 } 475 if (devid_str != NULL) 476 devid_str_free(devid_str); 477 if (minor != NULL) 478 devid_str_free(minor); 479 devid_free(devid); 480 } 481 482 (void) close(fd); 483 } 484 485 return (vdev); 486 } 487 488 /* 489 * Go through and verify the replication level of the pool is consistent. 490 * Performs the following checks: 491 * 492 * For the new spec, verifies that devices in mirrors and raidz are the 493 * same size. 494 * 495 * If the current configuration already has inconsistent replication 496 * levels, ignore any other potential problems in the new spec. 497 * 498 * Otherwise, make sure that the current spec (if there is one) and the new 499 * spec have consistent replication levels. 500 */ 501 typedef struct replication_level { 502 char *zprl_type; 503 uint64_t zprl_children; 504 uint64_t zprl_parity; 505 } replication_level_t; 506 507 /* 508 * Given a list of toplevel vdevs, return the current replication level. If 509 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 510 * an error message will be displayed for each self-inconsistent vdev. 
511 */ 512 replication_level_t * 513 get_replication(nvlist_t *nvroot, boolean_t fatal) 514 { 515 nvlist_t **top; 516 uint_t t, toplevels; 517 nvlist_t **child; 518 uint_t c, children; 519 nvlist_t *nv; 520 char *type; 521 replication_level_t lastrep, rep, *ret; 522 boolean_t dontreport; 523 524 ret = safe_malloc(sizeof (replication_level_t)); 525 526 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 527 &top, &toplevels) == 0); 528 529 lastrep.zprl_type = NULL; 530 for (t = 0; t < toplevels; t++) { 531 nv = top[t]; 532 533 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 534 535 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 536 &child, &children) != 0) { 537 /* 538 * This is a 'file' or 'disk' vdev. 539 */ 540 rep.zprl_type = type; 541 rep.zprl_children = 1; 542 rep.zprl_parity = 0; 543 } else { 544 uint64_t vdev_size; 545 546 /* 547 * This is a mirror or RAID-Z vdev. Go through and make 548 * sure the contents are all the same (files vs. disks), 549 * keeping track of the number of elements in the 550 * process. 551 * 552 * We also check that the size of each vdev (if it can 553 * be determined) is the same. 554 */ 555 rep.zprl_type = type; 556 rep.zprl_children = 0; 557 558 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 559 verify(nvlist_lookup_uint64(nv, 560 ZPOOL_CONFIG_NPARITY, 561 &rep.zprl_parity) == 0); 562 assert(rep.zprl_parity != 0); 563 } else { 564 rep.zprl_parity = 0; 565 } 566 567 /* 568 * The 'dontreport' variable indicatest that we've 569 * already reported an error for this spec, so don't 570 * bother doing it again. 
571 */ 572 type = NULL; 573 dontreport = 0; 574 vdev_size = -1ULL; 575 for (c = 0; c < children; c++) { 576 nvlist_t *cnv = child[c]; 577 char *path; 578 struct stat64 statbuf; 579 uint64_t size = -1ULL; 580 char *childtype; 581 int fd, err; 582 583 rep.zprl_children++; 584 585 verify(nvlist_lookup_string(cnv, 586 ZPOOL_CONFIG_TYPE, &childtype) == 0); 587 588 /* 589 * If this is a a replacing or spare vdev, then 590 * get the real first child of the vdev. 591 */ 592 if (strcmp(childtype, 593 VDEV_TYPE_REPLACING) == 0 || 594 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 595 nvlist_t **rchild; 596 uint_t rchildren; 597 598 verify(nvlist_lookup_nvlist_array(cnv, 599 ZPOOL_CONFIG_CHILDREN, &rchild, 600 &rchildren) == 0); 601 assert(rchildren == 2); 602 cnv = rchild[0]; 603 604 verify(nvlist_lookup_string(cnv, 605 ZPOOL_CONFIG_TYPE, 606 &childtype) == 0); 607 } 608 609 verify(nvlist_lookup_string(cnv, 610 ZPOOL_CONFIG_PATH, &path) == 0); 611 612 /* 613 * If we have a raidz/mirror that combines disks 614 * with files, report it as an error. 615 */ 616 if (!dontreport && type != NULL && 617 strcmp(type, childtype) != 0) { 618 if (ret != NULL) 619 free(ret); 620 ret = NULL; 621 if (fatal) 622 vdev_error(gettext( 623 "mismatched replication " 624 "level: %s contains both " 625 "files and devices\n"), 626 rep.zprl_type); 627 else 628 return (NULL); 629 dontreport = B_TRUE; 630 } 631 632 /* 633 * According to stat(2), the value of 'st_size' 634 * is undefined for block devices and character 635 * devices. But there is no effective way to 636 * determine the real size in userland. 637 * 638 * Instead, we'll take advantage of an 639 * implementation detail of spec_size(). If the 640 * device is currently open, then we (should) 641 * return a valid size. 642 * 643 * If we still don't get a valid size (indicated 644 * by a size of 0 or MAXOFFSET_T), then ignore 645 * this device altogether. 
646 */ 647 if ((fd = open(path, O_RDONLY)) >= 0) { 648 err = fstat64(fd, &statbuf); 649 (void) close(fd); 650 } else { 651 err = stat64(path, &statbuf); 652 } 653 654 if (err != 0 || 655 statbuf.st_size == 0 || 656 statbuf.st_size == MAXOFFSET_T) 657 continue; 658 659 size = statbuf.st_size; 660 661 /* 662 * Also check the size of each device. If they 663 * differ, then report an error. 664 */ 665 if (!dontreport && vdev_size != -1ULL && 666 size != vdev_size) { 667 if (ret != NULL) 668 free(ret); 669 ret = NULL; 670 if (fatal) 671 vdev_error(gettext( 672 "%s contains devices of " 673 "different sizes\n"), 674 rep.zprl_type); 675 else 676 return (NULL); 677 dontreport = B_TRUE; 678 } 679 680 type = childtype; 681 vdev_size = size; 682 } 683 } 684 685 /* 686 * At this point, we have the replication of the last toplevel 687 * vdev in 'rep'. Compare it to 'lastrep' to see if its 688 * different. 689 */ 690 if (lastrep.zprl_type != NULL) { 691 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 692 if (ret != NULL) 693 free(ret); 694 ret = NULL; 695 if (fatal) 696 vdev_error(gettext( 697 "mismatched replication level: " 698 "both %s and %s vdevs are " 699 "present\n"), 700 lastrep.zprl_type, rep.zprl_type); 701 else 702 return (NULL); 703 } else if (lastrep.zprl_parity != rep.zprl_parity) { 704 if (ret) 705 free(ret); 706 ret = NULL; 707 if (fatal) 708 vdev_error(gettext( 709 "mismatched replication level: " 710 "both %llu and %llu device parity " 711 "%s vdevs are present\n"), 712 lastrep.zprl_parity, 713 rep.zprl_parity, 714 rep.zprl_type); 715 else 716 return (NULL); 717 } else if (lastrep.zprl_children != rep.zprl_children) { 718 if (ret) 719 free(ret); 720 ret = NULL; 721 if (fatal) 722 vdev_error(gettext( 723 "mismatched replication level: " 724 "both %llu-way and %llu-way %s " 725 "vdevs are present\n"), 726 lastrep.zprl_children, 727 rep.zprl_children, 728 rep.zprl_type); 729 else 730 return (NULL); 731 } 732 } 733 lastrep = rep; 734 } 735 736 if (ret != NULL) 
737 *ret = rep; 738 739 return (ret); 740 } 741 742 /* 743 * Check the replication level of the vdev spec against the current pool. Calls 744 * get_replication() to make sure the new spec is self-consistent. If the pool 745 * has a consistent replication level, then we ignore any errors. Otherwise, 746 * report any difference between the two. 747 */ 748 int 749 check_replication(nvlist_t *config, nvlist_t *newroot) 750 { 751 replication_level_t *current = NULL, *new; 752 int ret; 753 754 /* 755 * If we have a current pool configuration, check to see if it's 756 * self-consistent. If not, simply return success. 757 */ 758 if (config != NULL) { 759 nvlist_t *nvroot; 760 761 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 762 &nvroot) == 0); 763 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 764 return (0); 765 } 766 767 /* 768 * Get the replication level of the new vdev spec, reporting any 769 * inconsistencies found. 770 */ 771 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 772 free(current); 773 return (-1); 774 } 775 776 /* 777 * Check to see if the new vdev spec matches the replication level of 778 * the current pool. 
779 */ 780 ret = 0; 781 if (current != NULL) { 782 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 783 vdev_error(gettext( 784 "mismatched replication level: pool uses %s " 785 "and new vdev is %s\n"), 786 current->zprl_type, new->zprl_type); 787 ret = -1; 788 } else if (current->zprl_parity != new->zprl_parity) { 789 vdev_error(gettext( 790 "mismatched replication level: pool uses %llu " 791 "device parity and new vdev uses %llu\n"), 792 current->zprl_parity, new->zprl_parity); 793 ret = -1; 794 } else if (current->zprl_children != new->zprl_children) { 795 vdev_error(gettext( 796 "mismatched replication level: pool uses %llu-way " 797 "%s and new vdev uses %llu-way %s\n"), 798 current->zprl_children, current->zprl_type, 799 new->zprl_children, new->zprl_type); 800 ret = -1; 801 } 802 } 803 804 free(new); 805 if (current != NULL) 806 free(current); 807 808 return (ret); 809 } 810 811 /* 812 * Label an individual disk. The name provided is the short name, stripped of 813 * any leading /dev path. 814 */ 815 int 816 label_disk(char *name) 817 { 818 char path[MAXPATHLEN]; 819 struct dk_gpt *vtoc; 820 int fd; 821 size_t resv = 16384; 822 823 (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, 824 BACKUP_SLICE); 825 826 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { 827 /* 828 * This shouldn't happen. We've long since verified that this 829 * is a valid device. 830 */ 831 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 832 path, strerror(errno)); 833 return (-1); 834 } 835 836 837 if (efi_alloc_and_init(fd, 9, &vtoc) != 0) { 838 /* 839 * The only way this can fail is if we run out of memory, or we 840 * were unable to read the disk geometry. 
841 */ 842 if (errno == ENOMEM) 843 zpool_no_memory(); 844 845 (void) fprintf(stderr, gettext("cannot label '%s': unable to " 846 "read disk geometry\n"), name); 847 (void) close(fd); 848 return (-1); 849 } 850 851 vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba; 852 vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 - 853 vtoc->efi_first_u_lba - resv; 854 855 /* 856 * Why we use V_USR: V_BACKUP confuses users, and is considered 857 * disposable by some EFI utilities (since EFI doesn't have a backup 858 * slice). V_UNASSIGNED is supposed to be used only for zero size 859 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, 860 * etc. were all pretty specific. V_USR is as close to reality as we 861 * can get, in the absence of V_OTHER. 862 */ 863 vtoc->efi_parts[0].p_tag = V_USR; 864 (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); 865 866 vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv; 867 vtoc->efi_parts[8].p_size = resv; 868 vtoc->efi_parts[8].p_tag = V_RESERVED; 869 870 if (efi_write(fd, vtoc) != 0) { 871 /* 872 * Currently, EFI labels are not supported for IDE disks, and it 873 * is likely that they will not be supported on other drives for 874 * some time. Print out a helpful error message directing the 875 * user to manually label the disk and give a specific slice. 876 */ 877 (void) fprintf(stderr, gettext("cannot label '%s': failed to " 878 "write EFI label\n"), name); 879 (void) fprintf(stderr, gettext("use fdisk(1M) to partition " 880 "the disk, and provide a specific slice\n")); 881 (void) close(fd); 882 efi_free(vtoc); 883 return (-1); 884 } 885 886 (void) close(fd); 887 efi_free(vtoc); 888 return (0); 889 } 890 891 /* 892 * Go through and find any whole disks in the vdev specification, labelling them 893 * as appropriate. When constructing the vdev spec, we were unable to open this 894 * device in order to provide a devid. 
Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labelled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Returns 0 on success and -1 as soon as any disk fails to be labeled or
 * reopened.
 */
int
make_disks(nvlist_t *nv)
{
	nvlist_t **kids;
	uint_t i, nkids;
	char *type, *path, *slash;
	char slice0[MAXPATHLEN];
	uint64_t wholedisk;
	int fd;
	int err;
	ddi_devid_t devid;
	char *minor = NULL, *devid_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/*
	 * Interior vdev: recurse into the regular children first, and then
	 * into any hot spares hanging off this node.
	 */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &kids, &nkids) == 0) {
		for (i = 0; i < nkids; i++) {
			err = make_disks(kids[i]);
			if (err != 0)
				return (err);
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
		    &kids, &nkids) == 0) {
			for (i = 0; i < nkids; i++) {
				err = make_disks(kids[i]);
				if (err != 0)
					return (err);
			}
		}

		return (0);
	}

	/*
	 * Leaf vdev.  Only disks that were given as a whole disk need any
	 * work here.
	 */
	if (strcmp(type, VDEV_TYPE_DISK) != 0)
		return (0);

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &wholedisk) != 0 || !wholedisk)
		return (0);

	/* label_disk() wants the short name, without the directory part. */
	slash = strrchr(path, '/');
	assert(slash != NULL);
	if (label_disk(slash + 1) != 0)
		return (-1);

	/*
	 * Fill in the devid, now that we've labeled the disk.
	 */
	(void) snprintf(slice0, sizeof (slice0), "%ss0", path);
	fd = open(slice0, O_RDONLY);
	if (fd < 0) {
		(void) fprintf(stderr,
		    gettext("cannot open '%s': %s\n"),
		    slice0, strerror(errno));
		return (-1);
	}

	if (devid_get(fd, &devid) == 0) {
		if (devid_get_minor_name(fd, &minor) == 0) {
			devid_str = devid_str_encode(devid, minor);
			if (devid_str != NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
				devid_str_free(devid_str);
			}
		}
		if (minor != NULL)
			devid_str_free(minor);
		devid_free(devid);
	}

	/*
	 * Update the path to refer to the 's0' slice.  The presence of
	 * the 'whole_disk' field indicates to the CLI that we should
	 * chop off the slice number when displaying the device in
	 * future output.
	 */
	verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, slice0) == 0);

	(void) close(fd);

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
992 */ 993 static boolean_t 994 is_spare(nvlist_t *config, const char *path) 995 { 996 int fd; 997 pool_state_t state; 998 char *name = NULL; 999 nvlist_t *label; 1000 uint64_t guid, spareguid; 1001 nvlist_t *nvroot; 1002 nvlist_t **spares; 1003 uint_t i, nspares; 1004 boolean_t inuse; 1005 1006 if ((fd = open(path, O_RDONLY)) < 0) 1007 return (B_FALSE); 1008 1009 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 1010 !inuse || 1011 state != POOL_STATE_SPARE || 1012 zpool_read_label(fd, &label) != 0) { 1013 free(name); 1014 (void) close(fd); 1015 return (B_FALSE); 1016 } 1017 free(name); 1018 1019 (void) close(fd); 1020 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1021 nvlist_free(label); 1022 1023 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1024 &nvroot) == 0); 1025 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1026 &spares, &nspares) == 0) { 1027 for (i = 0; i < nspares; i++) { 1028 verify(nvlist_lookup_uint64(spares[i], 1029 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1030 if (spareguid == guid) 1031 return (B_TRUE); 1032 } 1033 } 1034 1035 return (B_FALSE); 1036 } 1037 1038 /* 1039 * Go through and find any devices that are in use. We rely on libdiskmgt for 1040 * the majority of this task. 1041 */ 1042 int 1043 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1044 int isspare) 1045 { 1046 nvlist_t **child; 1047 uint_t c, children; 1048 char *type, *path; 1049 int ret; 1050 char buf[MAXPATHLEN]; 1051 uint64_t wholedisk; 1052 1053 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1054 1055 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1056 &child, &children) != 0) { 1057 1058 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1059 1060 /* 1061 * As a generic check, we look to see if this is a replace of a 1062 * hot spare within the same pool. If so, we allow it 1063 * regardless of what libdiskmgt or zpool_in_use() says. 
1064 */ 1065 if (isreplacing) { 1066 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1067 &wholedisk) == 0 && wholedisk) 1068 (void) snprintf(buf, sizeof (buf), "%ss0", 1069 path); 1070 else 1071 (void) strlcpy(buf, path, sizeof (buf)); 1072 if (is_spare(config, buf)) 1073 return (0); 1074 } 1075 1076 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1077 ret = check_device(path, force, isspare); 1078 1079 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1080 ret = check_file(path, force, isspare); 1081 1082 return (ret); 1083 } 1084 1085 for (c = 0; c < children; c++) 1086 if ((ret = check_in_use(config, child[c], force, 1087 isreplacing, B_FALSE)) != 0) 1088 return (ret); 1089 1090 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1091 &child, &children) == 0) 1092 for (c = 0; c < children; c++) 1093 if ((ret = check_in_use(config, child[c], force, 1094 isreplacing, B_TRUE)) != 0) 1095 return (ret); 1096 1097 return (0); 1098 } 1099 1100 const char * 1101 is_grouping(const char *type, int *mindev) 1102 { 1103 if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1104 if (mindev != NULL) 1105 *mindev = 2; 1106 return (VDEV_TYPE_RAIDZ); 1107 } 1108 1109 if (strcmp(type, "raidz2") == 0) { 1110 if (mindev != NULL) 1111 *mindev = 3; 1112 return (VDEV_TYPE_RAIDZ); 1113 } 1114 1115 if (strcmp(type, "mirror") == 0) { 1116 if (mindev != NULL) 1117 *mindev = 2; 1118 return (VDEV_TYPE_MIRROR); 1119 } 1120 1121 if (strcmp(type, "spare") == 0) { 1122 if (mindev != NULL) 1123 *mindev = 1; 1124 return (VDEV_TYPE_SPARE); 1125 } 1126 1127 return (NULL); 1128 } 1129 1130 /* 1131 * Construct a syntactically valid vdev specification, 1132 * and ensure that all devices and files exist and can be opened. 1133 * Note: we don't bother freeing anything in the error paths 1134 * because the program is just going to exit anyway. 
1135 */ 1136 nvlist_t * 1137 construct_spec(int argc, char **argv) 1138 { 1139 nvlist_t *nvroot, *nv, **top, **spares; 1140 int t, toplevels, mindev, nspares; 1141 const char *type; 1142 1143 top = NULL; 1144 toplevels = 0; 1145 spares = NULL; 1146 nspares = 0; 1147 1148 while (argc > 0) { 1149 nv = NULL; 1150 1151 /* 1152 * If it's a mirror or raidz, the subsequent arguments are 1153 * its leaves -- until we encounter the next mirror or raidz. 1154 */ 1155 if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1156 nvlist_t **child = NULL; 1157 int c, children = 0; 1158 1159 if (strcmp(type, VDEV_TYPE_SPARE) == 0 && 1160 spares != NULL) { 1161 (void) fprintf(stderr, gettext("invalid vdev " 1162 "specification: 'spare' can be " 1163 "specified only once\n")); 1164 return (NULL); 1165 } 1166 1167 for (c = 1; c < argc; c++) { 1168 if (is_grouping(argv[c], NULL) != NULL) 1169 break; 1170 children++; 1171 child = realloc(child, 1172 children * sizeof (nvlist_t *)); 1173 if (child == NULL) 1174 zpool_no_memory(); 1175 if ((nv = make_leaf_vdev(argv[c])) == NULL) 1176 return (NULL); 1177 child[children - 1] = nv; 1178 } 1179 1180 if (children < mindev) { 1181 (void) fprintf(stderr, gettext("invalid vdev " 1182 "specification: %s requires at least %d " 1183 "devices\n"), argv[0], mindev); 1184 return (NULL); 1185 } 1186 1187 argc -= c; 1188 argv += c; 1189 1190 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1191 spares = child; 1192 nspares = children; 1193 continue; 1194 } else { 1195 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1196 0) == 0); 1197 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1198 type) == 0); 1199 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1200 verify(nvlist_add_uint64(nv, 1201 ZPOOL_CONFIG_NPARITY, 1202 mindev - 1) == 0); 1203 } 1204 verify(nvlist_add_nvlist_array(nv, 1205 ZPOOL_CONFIG_CHILDREN, child, 1206 children) == 0); 1207 1208 for (c = 0; c < children; c++) 1209 nvlist_free(child[c]); 1210 free(child); 1211 } 1212 } else { 1213 /* 1214 * We have a 
device. Pass off to make_leaf_vdev() to 1215 * construct the appropriate nvlist describing the vdev. 1216 */ 1217 if ((nv = make_leaf_vdev(argv[0])) == NULL) 1218 return (NULL); 1219 argc--; 1220 argv++; 1221 } 1222 1223 toplevels++; 1224 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1225 if (top == NULL) 1226 zpool_no_memory(); 1227 top[toplevels - 1] = nv; 1228 } 1229 1230 if (toplevels == 0 && nspares == 0) { 1231 (void) fprintf(stderr, gettext("invalid vdev " 1232 "specification: at least one toplevel vdev must be " 1233 "specified\n")); 1234 return (NULL); 1235 } 1236 1237 /* 1238 * Finally, create nvroot and add all top-level vdevs to it. 1239 */ 1240 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1241 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1242 VDEV_TYPE_ROOT) == 0); 1243 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1244 top, toplevels) == 0); 1245 if (nspares != 0) 1246 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1247 spares, nspares) == 0); 1248 1249 for (t = 0; t < toplevels; t++) 1250 nvlist_free(top[t]); 1251 for (t = 0; t < nspares; t++) 1252 nvlist_free(spares[t]); 1253 if (spares) 1254 free(spares); 1255 free(top); 1256 1257 return (nvroot); 1258 } 1259 1260 /* 1261 * Get and validate the contents of the given vdev specification. This ensures 1262 * that the nvlist returned is well-formed, that all the devices exist, and that 1263 * they are not currently in use by any other known consumer. The 'poolconfig' 1264 * parameter is the current configuration of the pool when adding devices 1265 * existing pool, and is used to perform additional checks, such as changing the 1266 * replication level of the pool. It can be 'NULL' to indicate that this is a 1267 * new pool. The 'force' flag controls whether devices should be forcefully 1268 * added, even if they appear in use. 
1269 */ 1270 nvlist_t * 1271 make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, 1272 boolean_t isreplacing, int argc, char **argv) 1273 { 1274 nvlist_t *newroot; 1275 1276 is_force = force; 1277 1278 /* 1279 * Construct the vdev specification. If this is successful, we know 1280 * that we have a valid specification, and that all devices can be 1281 * opened. 1282 */ 1283 if ((newroot = construct_spec(argc, argv)) == NULL) 1284 return (NULL); 1285 1286 /* 1287 * Validate each device to make sure that its not shared with another 1288 * subsystem. We do this even if 'force' is set, because there are some 1289 * uses (such as a dedicated dump device) that even '-f' cannot 1290 * override. 1291 */ 1292 if (check_in_use(poolconfig, newroot, force, isreplacing, 1293 B_FALSE) != 0) { 1294 nvlist_free(newroot); 1295 return (NULL); 1296 } 1297 1298 /* 1299 * Check the replication level of the given vdevs and report any errors 1300 * found. We include the existing pool spec, if any, as we need to 1301 * catch changes against the existing replication level. 1302 */ 1303 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1304 nvlist_free(newroot); 1305 return (NULL); 1306 } 1307 1308 /* 1309 * Run through the vdev specification and label any whole disks found. 1310 */ 1311 if (make_disks(newroot) != 0) { 1312 nvlist_free(newroot); 1313 return (NULL); 1314 } 1315 1316 return (newroot); 1317 } 1318