1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'get_vdev_spec'. The function 52 * performs several passes: 53 * 54 * 1. Construct the vdev specification. Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <stdio.h> 73 #include <string.h> 74 #include <unistd.h> 75 #include <sys/efi_partition.h> 76 #include <sys/stat.h> 77 #include <sys/vtoc.h> 78 #include <sys/mntent.h> 79 80 #include <libzfs.h> 81 82 #include "zpool_util.h" 83 84 #define DISK_ROOT "/dev/dsk" 85 #define RDISK_ROOT "/dev/rdsk" 86 #define BACKUP_SLICE "s2" 87 88 /* 89 * For any given vdev specification, we can have multiple errors. The 90 * vdev_error() function keeps track of whether we have seen an error yet, and 91 * prints out a header if its the first error we've seen. 92 */ 93 boolean_t error_seen; 94 boolean_t is_force; 95 96 /*PRINTFLIKE1*/ 97 static void 98 vdev_error(const char *fmt, ...) 99 { 100 va_list ap; 101 102 if (!error_seen) { 103 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 104 if (!is_force) 105 (void) fprintf(stderr, gettext("use '-f' to override " 106 "the following errors:\n")); 107 else 108 (void) fprintf(stderr, gettext("the following errors " 109 "must be manually repaired:\n")); 110 error_seen = B_TRUE; 111 } 112 113 va_start(ap, fmt); 114 (void) vfprintf(stderr, fmt, ap); 115 va_end(ap); 116 } 117 118 static void 119 libdiskmgt_error(int error) 120 { 121 /* 122 * ENXIO/ENODEV is a valid error message if the device doesn't live in 123 * /dev/dsk. Don't bother printing an error message in this case. 124 */ 125 if (error == ENXIO || error == ENODEV) 126 return; 127 128 (void) fprintf(stderr, gettext("warning: device in use checking " 129 "failed: %s\n"), strerror(error)); 130 } 131 132 /* 133 * Validate a device, passing the bulk of the work off to libdiskmgt. 134 */ 135 int 136 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 137 { 138 char *msg; 139 int error = 0; 140 int ret = 0; 141 142 if (dm_inuse((char *)path, &msg, 143 force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL, &error) || error) { 144 if (error != 0) { 145 libdiskmgt_error(error); 146 return (0); 147 } else if (!isspare || 148 strstr(msg, gettext("hot spare")) == NULL) { 149 /* 150 * The above check is a rather severe hack. It would 151 * probably make more sense to have DM_WHO_ZPOOL_SPARE 152 * instead. 153 */ 154 vdev_error("%s", msg); 155 free(msg); 156 ret = -1; 157 } 158 159 } 160 161 /* 162 * If we're given a whole disk, ignore overlapping slices since we're 163 * about to label it anyway. 164 */ 165 error = 0; 166 if (!wholedisk && !force && 167 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 168 if (error != 0) { 169 libdiskmgt_error(error); 170 return (0); 171 } else { 172 vdev_error("%s overlaps with %s\n", path, msg); 173 free(msg); 174 } 175 176 ret = -1; 177 } 178 179 return (ret); 180 } 181 182 /* 183 * Validate a whole disk. Iterate over all slices on the disk and make sure 184 * that none is in use by calling check_slice(). 185 */ 186 /* ARGSUSED */ 187 int 188 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 189 { 190 dm_descriptor_t *drive, *media, *slice; 191 int err = 0; 192 int i; 193 int ret; 194 195 /* 196 * Get the drive associated with this disk. This should never fail, 197 * because we already have an alias handle open for the device. 198 */ 199 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 200 &err)) == NULL || *drive == NULL) { 201 if (err) 202 libdiskmgt_error(err); 203 return (0); 204 } 205 206 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 207 &err)) == NULL) { 208 dm_free_descriptors(drive); 209 if (err) 210 libdiskmgt_error(err); 211 return (0); 212 } 213 214 dm_free_descriptors(drive); 215 216 /* 217 * It is possible that the user has specified a removable media drive, 218 * and the media is not present. 219 */ 220 if (*media == NULL) { 221 dm_free_descriptors(media); 222 vdev_error(gettext("'%s' has no media in drive\n"), name); 223 return (-1); 224 } 225 226 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 227 &err)) == NULL) { 228 dm_free_descriptors(media); 229 if (err) 230 libdiskmgt_error(err); 231 return (0); 232 } 233 234 dm_free_descriptors(media); 235 236 ret = 0; 237 238 /* 239 * Iterate over all slices and report any errors. We don't care about 240 * overlapping slices because we are using the whole disk. 241 */ 242 for (i = 0; slice[i] != NULL; i++) { 243 char *name = dm_get_name(slice[i], &err); 244 245 if (check_slice(name, force, B_TRUE, isspare) != 0) 246 ret = -1; 247 248 dm_free_name(name); 249 } 250 251 dm_free_descriptors(slice); 252 return (ret); 253 } 254 255 /* 256 * Validate a device. 257 */ 258 int 259 check_device(const char *path, boolean_t force, boolean_t isspare) 260 { 261 dm_descriptor_t desc; 262 int err; 263 char *dev; 264 265 /* 266 * For whole disks, libdiskmgt does not include the leading dev path. 267 */ 268 dev = strrchr(path, '/'); 269 assert(dev != NULL); 270 dev++; 271 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 272 err = check_disk(path, desc, force, isspare); 273 dm_free_descriptor(desc); 274 return (err); 275 } 276 277 return (check_slice(path, force, B_FALSE, isspare)); 278 } 279 280 /* 281 * Check that a file is valid. All we can do in this case is check that it's 282 * not in use by another pool. 283 */ 284 int 285 check_file(const char *file, boolean_t force, boolean_t isspare) 286 { 287 char *name; 288 int fd; 289 int ret = 0; 290 pool_state_t state; 291 boolean_t inuse; 292 293 if ((fd = open(file, O_RDONLY)) < 0) 294 return (0); 295 296 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 297 const char *desc; 298 299 switch (state) { 300 case POOL_STATE_ACTIVE: 301 desc = gettext("active"); 302 break; 303 304 case POOL_STATE_EXPORTED: 305 desc = gettext("exported"); 306 break; 307 308 case POOL_STATE_POTENTIALLY_ACTIVE: 309 desc = gettext("potentially active"); 310 break; 311 312 default: 313 desc = gettext("unknown"); 314 break; 315 } 316 317 /* 318 * Allow hot spares to be shared between pools. 319 */ 320 if (state == POOL_STATE_SPARE && isspare) 321 return (0); 322 323 if (state == POOL_STATE_ACTIVE || 324 state == POOL_STATE_SPARE || !force) { 325 switch (state) { 326 case POOL_STATE_SPARE: 327 vdev_error(gettext("%s is reserved as a hot " 328 "spare for pool %s\n"), file, name); 329 break; 330 default: 331 vdev_error(gettext("%s is part of %s pool " 332 "'%s'\n"), file, desc, name); 333 break; 334 } 335 ret = -1; 336 } 337 338 free(name); 339 } 340 341 (void) close(fd); 342 return (ret); 343 } 344 345 static boolean_t 346 is_whole_disk(const char *arg, struct stat64 *statbuf) 347 { 348 char path[MAXPATHLEN]; 349 350 (void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE); 351 if (stat64(path, statbuf) == 0) 352 return (B_TRUE); 353 354 return (B_FALSE); 355 } 356 357 /* 358 * Create a leaf vdev. Determine if this is a file or a device. If it's a 359 * device, fill in the device id to make a complete nvlist. Valid forms for a 360 * leaf vdev are: 361 * 362 * /dev/dsk/xxx Complete disk path 363 * /xxx Full path to file 364 * xxx Shorthand for /dev/dsk/xxx 365 */ 366 nvlist_t * 367 make_leaf_vdev(const char *arg) 368 { 369 char path[MAXPATHLEN]; 370 struct stat64 statbuf; 371 nvlist_t *vdev = NULL; 372 char *type = NULL; 373 boolean_t wholedisk = B_FALSE; 374 375 /* 376 * Determine what type of vdev this is, and put the full path into 377 * 'path'. We detect whether this is a device of file afterwards by 378 * checking the st_mode of the file. 379 */ 380 if (arg[0] == '/') { 381 /* 382 * Complete device or file path. Exact type is determined by 383 * examining the file descriptor afterwards. 384 */ 385 if (is_whole_disk(arg, &statbuf)) { 386 wholedisk = B_TRUE; 387 } else if (stat64(arg, &statbuf) != 0) { 388 (void) fprintf(stderr, 389 gettext("cannot open '%s': %s\n"), 390 arg, strerror(errno)); 391 return (NULL); 392 } 393 394 (void) strlcpy(path, arg, sizeof (path)); 395 } else { 396 /* 397 * This may be a short path for a device, or it could be total 398 * gibberish. Check to see if it's a known device in 399 * /dev/dsk/. As part of this check, see if we've been given a 400 * an entire disk (minus the slice number). 401 */ 402 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 403 arg); 404 if (is_whole_disk(path, &statbuf)) { 405 wholedisk = B_TRUE; 406 } else if (stat64(path, &statbuf) != 0) { 407 /* 408 * If we got ENOENT, then the user gave us 409 * gibberish, so try to direct them with a 410 * reasonable error message. Otherwise, 411 * regurgitate strerror() since it's the best we 412 * can do. 413 */ 414 if (errno == ENOENT) { 415 (void) fprintf(stderr, 416 gettext("cannot open '%s': no such " 417 "device in %s\n"), arg, DISK_ROOT); 418 (void) fprintf(stderr, 419 gettext("must be a full path or " 420 "shorthand device name\n")); 421 return (NULL); 422 } else { 423 (void) fprintf(stderr, 424 gettext("cannot open '%s': %s\n"), 425 path, strerror(errno)); 426 return (NULL); 427 } 428 } 429 } 430 431 /* 432 * Determine whether this is a device or a file. 433 */ 434 if (S_ISBLK(statbuf.st_mode)) { 435 type = VDEV_TYPE_DISK; 436 } else if (S_ISREG(statbuf.st_mode)) { 437 type = VDEV_TYPE_FILE; 438 } else { 439 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 440 "block device or regular file\n"), path); 441 return (NULL); 442 } 443 444 /* 445 * Finally, we have the complete device or file, and we know that it is 446 * acceptable to use. Construct the nvlist to describe this vdev. All 447 * vdevs have a 'path' element, and devices also have a 'devid' element. 448 */ 449 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 450 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 451 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 452 if (strcmp(type, VDEV_TYPE_DISK) == 0) 453 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 454 (uint64_t)wholedisk) == 0); 455 456 /* 457 * For a whole disk, defer getting its devid until after labeling it. 458 */ 459 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 460 /* 461 * Get the devid for the device. 462 */ 463 int fd; 464 ddi_devid_t devid; 465 char *minor = NULL, *devid_str = NULL; 466 467 if ((fd = open(path, O_RDONLY)) < 0) { 468 (void) fprintf(stderr, gettext("cannot open '%s': " 469 "%s\n"), path, strerror(errno)); 470 nvlist_free(vdev); 471 return (NULL); 472 } 473 474 if (devid_get(fd, &devid) == 0) { 475 if (devid_get_minor_name(fd, &minor) == 0 && 476 (devid_str = devid_str_encode(devid, minor)) != 477 NULL) { 478 verify(nvlist_add_string(vdev, 479 ZPOOL_CONFIG_DEVID, devid_str) == 0); 480 } 481 if (devid_str != NULL) 482 devid_str_free(devid_str); 483 if (minor != NULL) 484 devid_str_free(minor); 485 devid_free(devid); 486 } 487 488 (void) close(fd); 489 } 490 491 return (vdev); 492 } 493 494 /* 495 * Go through and verify the replication level of the pool is consistent. 496 * Performs the following checks: 497 * 498 * For the new spec, verifies that devices in mirrors and raidz are the 499 * same size. 500 * 501 * If the current configuration already has inconsistent replication 502 * levels, ignore any other potential problems in the new spec. 503 * 504 * Otherwise, make sure that the current spec (if there is one) and the new 505 * spec have consistent replication levels. 506 */ 507 typedef struct replication_level { 508 char *zprl_type; 509 uint64_t zprl_children; 510 uint64_t zprl_parity; 511 } replication_level_t; 512 513 /* 514 * Given a list of toplevel vdevs, return the current replication level. If 515 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 516 * an error message will be displayed for each self-inconsistent vdev. 517 */ 518 replication_level_t * 519 get_replication(nvlist_t *nvroot, boolean_t fatal) 520 { 521 nvlist_t **top; 522 uint_t t, toplevels; 523 nvlist_t **child; 524 uint_t c, children; 525 nvlist_t *nv; 526 char *type; 527 replication_level_t lastrep, rep, *ret; 528 boolean_t dontreport; 529 530 ret = safe_malloc(sizeof (replication_level_t)); 531 532 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 533 &top, &toplevels) == 0); 534 535 lastrep.zprl_type = NULL; 536 for (t = 0; t < toplevels; t++) { 537 nv = top[t]; 538 539 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 540 541 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 542 &child, &children) != 0) { 543 /* 544 * This is a 'file' or 'disk' vdev. 545 */ 546 rep.zprl_type = type; 547 rep.zprl_children = 1; 548 rep.zprl_parity = 0; 549 } else { 550 uint64_t vdev_size; 551 552 /* 553 * This is a mirror or RAID-Z vdev. Go through and make 554 * sure the contents are all the same (files vs. disks), 555 * keeping track of the number of elements in the 556 * process. 557 * 558 * We also check that the size of each vdev (if it can 559 * be determined) is the same. 560 */ 561 rep.zprl_type = type; 562 rep.zprl_children = 0; 563 564 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 565 verify(nvlist_lookup_uint64(nv, 566 ZPOOL_CONFIG_NPARITY, 567 &rep.zprl_parity) == 0); 568 assert(rep.zprl_parity != 0); 569 } else { 570 rep.zprl_parity = 0; 571 } 572 573 /* 574 * The 'dontreport' variable indicatest that we've 575 * already reported an error for this spec, so don't 576 * bother doing it again. 577 */ 578 type = NULL; 579 dontreport = 0; 580 vdev_size = -1ULL; 581 for (c = 0; c < children; c++) { 582 nvlist_t *cnv = child[c]; 583 char *path; 584 struct stat64 statbuf; 585 uint64_t size = -1ULL; 586 char *childtype; 587 int fd, err; 588 589 rep.zprl_children++; 590 591 verify(nvlist_lookup_string(cnv, 592 ZPOOL_CONFIG_TYPE, &childtype) == 0); 593 verify(nvlist_lookup_string(cnv, 594 ZPOOL_CONFIG_PATH, &path) == 0); 595 596 /* 597 * If we have a raidz/mirror that combines disks 598 * with files, report it as an error. 599 */ 600 if (!dontreport && type != NULL && 601 strcmp(type, childtype) != 0) { 602 if (ret != NULL) 603 free(ret); 604 ret = NULL; 605 if (fatal) 606 vdev_error(gettext( 607 "mismatched replication " 608 "level: %s contains both " 609 "files and devices\n"), 610 rep.zprl_type); 611 else 612 return (NULL); 613 dontreport = B_TRUE; 614 } 615 616 /* 617 * According to stat(2), the value of 'st_size' 618 * is undefined for block devices and character 619 * devices. But there is no effective way to 620 * determine the real size in userland. 621 * 622 * Instead, we'll take advantage of an 623 * implementation detail of spec_size(). If the 624 * device is currently open, then we (should) 625 * return a valid size. 626 * 627 * If we still don't get a valid size (indicated 628 * by a size of 0 or MAXOFFSET_T), then ignore 629 * this device altogether. 630 */ 631 if ((fd = open(path, O_RDONLY)) >= 0) { 632 err = fstat64(fd, &statbuf); 633 (void) close(fd); 634 } else { 635 err = stat64(path, &statbuf); 636 } 637 638 if (err != 0 || 639 statbuf.st_size == 0 || 640 statbuf.st_size == MAXOFFSET_T) 641 continue; 642 643 size = statbuf.st_size; 644 645 /* 646 * Also check the size of each device. If they 647 * differ, then report an error. 648 */ 649 if (!dontreport && vdev_size != -1ULL && 650 size != vdev_size) { 651 if (ret != NULL) 652 free(ret); 653 ret = NULL; 654 if (fatal) 655 vdev_error(gettext( 656 "%s contains devices of " 657 "different sizes\n"), 658 rep.zprl_type); 659 else 660 return (NULL); 661 dontreport = B_TRUE; 662 } 663 664 type = childtype; 665 vdev_size = size; 666 } 667 } 668 669 /* 670 * At this point, we have the replication of the last toplevel 671 * vdev in 'rep'. Compare it to 'lastrep' to see if its 672 * different. 673 */ 674 if (lastrep.zprl_type != NULL) { 675 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 676 if (ret != NULL) 677 free(ret); 678 ret = NULL; 679 if (fatal) 680 vdev_error(gettext( 681 "mismatched replication level: " 682 "both %s and %s vdevs are " 683 "present\n"), 684 lastrep.zprl_type, rep.zprl_type); 685 else 686 return (NULL); 687 } else if (lastrep.zprl_parity != rep.zprl_parity) { 688 if (ret) 689 free(ret); 690 ret = NULL; 691 if (fatal) 692 vdev_error(gettext( 693 "mismatched replication level: " 694 "both %llu and %llu device parity " 695 "%s vdevs are present\n"), 696 lastrep.zprl_parity, 697 rep.zprl_parity, 698 rep.zprl_type); 699 else 700 return (NULL); 701 } else if (lastrep.zprl_children != rep.zprl_children) { 702 if (ret) 703 free(ret); 704 ret = NULL; 705 if (fatal) 706 vdev_error(gettext( 707 "mismatched replication level: " 708 "both %llu-way and %llu-way %s " 709 "vdevs are present\n"), 710 lastrep.zprl_children, 711 rep.zprl_children, 712 rep.zprl_type); 713 else 714 return (NULL); 715 } 716 } 717 lastrep = rep; 718 } 719 720 if (ret != NULL) 721 *ret = rep; 722 723 return (ret); 724 } 725 726 /* 727 * Check the replication level of the vdev spec against the current pool. Calls 728 * get_replication() to make sure the new spec is self-consistent. If the pool 729 * has a consistent replication level, then we ignore any errors. Otherwise, 730 * report any difference between the two. 731 */ 732 int 733 check_replication(nvlist_t *config, nvlist_t *newroot) 734 { 735 replication_level_t *current = NULL, *new; 736 int ret; 737 738 /* 739 * If we have a current pool configuration, check to see if it's 740 * self-consistent. If not, simply return success. 741 */ 742 if (config != NULL) { 743 nvlist_t *nvroot; 744 745 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 746 &nvroot) == 0); 747 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 748 return (0); 749 } 750 751 /* 752 * Get the replication level of the new vdev spec, reporting any 753 * inconsistencies found. 754 */ 755 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 756 free(current); 757 return (-1); 758 } 759 760 /* 761 * Check to see if the new vdev spec matches the replication level of 762 * the current pool. 763 */ 764 ret = 0; 765 if (current != NULL) { 766 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 767 vdev_error(gettext( 768 "mismatched replication level: pool uses %s " 769 "and new vdev is %s\n"), 770 current->zprl_type, new->zprl_type); 771 ret = -1; 772 } else if (current->zprl_parity != new->zprl_parity) { 773 vdev_error(gettext( 774 "mismatched replication level: pool uses %llu " 775 "device parity and new vdev uses %llu\n"), 776 current->zprl_parity, new->zprl_parity); 777 ret = -1; 778 } else if (current->zprl_children != new->zprl_children) { 779 vdev_error(gettext( 780 "mismatched replication level: pool uses %llu-way " 781 "%s and new vdev uses %llu-way %s\n"), 782 current->zprl_children, current->zprl_type, 783 new->zprl_children, new->zprl_type); 784 ret = -1; 785 } 786 } 787 788 free(new); 789 if (current != NULL) 790 free(current); 791 792 return (ret); 793 } 794 795 /* 796 * Label an individual disk. The name provided is the short name, stripped of 797 * any leading /dev path. 798 */ 799 int 800 label_disk(char *name) 801 { 802 char path[MAXPATHLEN]; 803 struct dk_gpt *vtoc; 804 int fd; 805 size_t resv = 16384; 806 807 (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, 808 BACKUP_SLICE); 809 810 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { 811 /* 812 * This shouldn't happen. We've long since verified that this 813 * is a valid device. 814 */ 815 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 816 path, strerror(errno)); 817 return (-1); 818 } 819 820 821 if (efi_alloc_and_init(fd, 9, &vtoc) != 0) { 822 /* 823 * The only way this can fail is if we run out of memory, or we 824 * were unable to read the disk geometry. 825 */ 826 if (errno == ENOMEM) 827 no_memory(); 828 829 (void) fprintf(stderr, gettext("cannot label '%s': unable to " 830 "read disk geometry\n"), name); 831 (void) close(fd); 832 return (-1); 833 } 834 835 vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba; 836 vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 - 837 vtoc->efi_first_u_lba - resv; 838 839 /* 840 * Why we use V_USR: V_BACKUP confuses users, and is considered 841 * disposable by some EFI utilities (since EFI doesn't have a backup 842 * slice). V_UNASSIGNED is supposed to be used only for zero size 843 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, 844 * etc. were all pretty specific. V_USR is as close to reality as we 845 * can get, in the absence of V_OTHER. 846 */ 847 vtoc->efi_parts[0].p_tag = V_USR; 848 (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); 849 850 vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv; 851 vtoc->efi_parts[8].p_size = resv; 852 vtoc->efi_parts[8].p_tag = V_RESERVED; 853 854 if (efi_write(fd, vtoc) != 0) { 855 /* 856 * Currently, EFI labels are not supported for IDE disks, and it 857 * is likely that they will not be supported on other drives for 858 * some time. Print out a helpful error message directing the 859 * user to manually label the disk and give a specific slice. 860 */ 861 (void) fprintf(stderr, gettext("cannot label '%s': failed to " 862 "write EFI label\n"), name); 863 (void) fprintf(stderr, gettext("use fdisk(1M) to partition " 864 "the disk, and provide a specific slice\n")); 865 (void) close(fd); 866 efi_free(vtoc); 867 return (-1); 868 } 869 870 (void) close(fd); 871 efi_free(vtoc); 872 return (0); 873 } 874 875 /* 876 * Go through and find any whole disks in the vdev specification, labelling them 877 * as appropriate. When constructing the vdev spec, we were unable to open this 878 * device in order to provide a devid. Now that we have labelled the disk and 879 * know that slice 0 is valid, we can construct the devid now. 880 * 881 * If the disk was already labelled with an EFI label, we will have gotten the 882 * devid already (because we were able to open the whole disk). Otherwise, we 883 * need to get the devid after we label the disk. 884 */ 885 int 886 make_disks(nvlist_t *nv) 887 { 888 nvlist_t **child; 889 uint_t c, children; 890 char *type, *path, *diskname; 891 char buf[MAXPATHLEN]; 892 uint64_t wholedisk; 893 int fd; 894 int ret; 895 ddi_devid_t devid; 896 char *minor = NULL, *devid_str = NULL; 897 898 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 899 900 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 901 &child, &children) != 0) { 902 903 if (strcmp(type, VDEV_TYPE_DISK) != 0) 904 return (0); 905 906 /* 907 * We have a disk device. Get the path to the device 908 * and see if its a whole disk by appending the backup 909 * slice and stat()ing the device. 910 */ 911 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 912 913 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 914 &wholedisk) != 0 || !wholedisk) 915 return (0); 916 917 diskname = strrchr(path, '/'); 918 assert(diskname != NULL); 919 diskname++; 920 if (label_disk(diskname) != 0) 921 return (-1); 922 923 /* 924 * Fill in the devid, now that we've labeled the disk. 925 */ 926 (void) snprintf(buf, sizeof (buf), "%ss0", path); 927 if ((fd = open(buf, O_RDONLY)) < 0) { 928 (void) fprintf(stderr, 929 gettext("cannot open '%s': %s\n"), 930 buf, strerror(errno)); 931 return (-1); 932 } 933 934 if (devid_get(fd, &devid) == 0) { 935 if (devid_get_minor_name(fd, &minor) == 0 && 936 (devid_str = devid_str_encode(devid, minor)) != 937 NULL) { 938 verify(nvlist_add_string(nv, 939 ZPOOL_CONFIG_DEVID, devid_str) == 0); 940 } 941 if (devid_str != NULL) 942 devid_str_free(devid_str); 943 if (minor != NULL) 944 devid_str_free(minor); 945 devid_free(devid); 946 } 947 948 /* 949 * Update the path to refer to the 's0' slice. The presence of 950 * the 'whole_disk' field indicates to the CLI that we should 951 * chop off the slice number when displaying the device in 952 * future output. 953 */ 954 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); 955 956 (void) close(fd); 957 958 return (0); 959 } 960 961 for (c = 0; c < children; c++) 962 if ((ret = make_disks(child[c])) != 0) 963 return (ret); 964 965 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 966 &child, &children) == 0) 967 for (c = 0; c < children; c++) 968 if ((ret = make_disks(child[c])) != 0) 969 return (ret); 970 971 return (0); 972 } 973 974 /* 975 * Determine if the given path is a hot spare within the given configuration. 976 */ 977 static boolean_t 978 is_spare(nvlist_t *config, const char *path) 979 { 980 int fd; 981 pool_state_t state; 982 char *name; 983 nvlist_t *label; 984 uint64_t guid, spareguid; 985 nvlist_t *nvroot; 986 nvlist_t **spares; 987 uint_t i, nspares; 988 boolean_t inuse; 989 990 if ((fd = open(path, O_RDONLY)) < 0) 991 return (B_FALSE); 992 993 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 994 !inuse || 995 state != POOL_STATE_SPARE || 996 zpool_read_label(fd, &label) != 0) { 997 (void) close(fd); 998 return (B_FALSE); 999 } 1000 1001 (void) close(fd); 1002 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1003 nvlist_free(label); 1004 1005 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1006 &nvroot) == 0); 1007 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1008 &spares, &nspares) == 0) { 1009 for (i = 0; i < nspares; i++) { 1010 verify(nvlist_lookup_uint64(spares[i], 1011 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1012 if (spareguid == guid) 1013 return (B_TRUE); 1014 } 1015 } 1016 1017 return (B_FALSE); 1018 } 1019 1020 /* 1021 * Go through and find any devices that are in use. We rely on libdiskmgt for 1022 * the majority of this task. 1023 */ 1024 int 1025 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1026 int isspare) 1027 { 1028 nvlist_t **child; 1029 uint_t c, children; 1030 char *type, *path; 1031 int ret; 1032 char buf[MAXPATHLEN]; 1033 uint64_t wholedisk; 1034 1035 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1036 1037 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1038 &child, &children) != 0) { 1039 1040 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1041 1042 /* 1043 * As a generic check, we look to see if this is a replace of a 1044 * hot spare within the same pool. If so, we allow it 1045 * regardless of what libdiskmgt or zpool_in_use() says. 1046 */ 1047 if (isreplacing) { 1048 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1049 &wholedisk) == 0 && wholedisk) 1050 (void) snprintf(buf, sizeof (buf), "%ss0", 1051 path); 1052 else 1053 (void) strlcpy(buf, path, sizeof (buf)); 1054 if (is_spare(config, buf)) 1055 return (0); 1056 } 1057 1058 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1059 ret = check_device(path, force, isspare); 1060 1061 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1062 ret = check_file(path, force, isspare); 1063 1064 return (ret); 1065 } 1066 1067 for (c = 0; c < children; c++) 1068 if ((ret = check_in_use(config, child[c], force, 1069 isreplacing, B_FALSE)) != 0) 1070 return (ret); 1071 1072 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1073 &child, &children) == 0) 1074 for (c = 0; c < children; c++) 1075 if ((ret = check_in_use(config, child[c], force, 1076 isreplacing, B_TRUE)) != 0) 1077 return (ret); 1078 1079 return (0); 1080 } 1081 1082 const char * 1083 is_grouping(const char *type, int *mindev) 1084 { 1085 if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1086 if (mindev != NULL) 1087 *mindev = 2; 1088 return (VDEV_TYPE_RAIDZ); 1089 } 1090 1091 if (strcmp(type, "raidz2") == 0) { 1092 if (mindev != NULL) 1093 *mindev = 3; 1094 return (VDEV_TYPE_RAIDZ); 1095 } 1096 1097 if (strcmp(type, "mirror") == 0) { 1098 if (mindev != NULL) 1099 *mindev = 2; 1100 return (VDEV_TYPE_MIRROR); 1101 } 1102 1103 if (strcmp(type, "spare") == 0) { 1104 if (mindev != NULL) 1105 *mindev = 1; 1106 return (VDEV_TYPE_SPARE); 1107 } 1108 1109 return (NULL); 1110 } 1111 1112 /* 1113 * Construct a syntactically valid vdev specification, 1114 * and ensure that all devices and files exist and can be opened. 1115 * Note: we don't bother freeing anything in the error paths 1116 * because the program is just going to exit anyway. 1117 */ 1118 nvlist_t * 1119 construct_spec(int argc, char **argv) 1120 { 1121 nvlist_t *nvroot, *nv, **top, **spares; 1122 int t, toplevels, mindev, nspares; 1123 const char *type; 1124 1125 top = NULL; 1126 toplevels = 0; 1127 spares = NULL; 1128 nspares = 0; 1129 1130 while (argc > 0) { 1131 nv = NULL; 1132 1133 /* 1134 * If it's a mirror or raidz, the subsequent arguments are 1135 * its leaves -- until we encounter the next mirror or raidz. 1136 */ 1137 if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1138 nvlist_t **child = NULL; 1139 int c, children = 0; 1140 1141 if (strcmp(type, VDEV_TYPE_SPARE) == 0 && 1142 spares != NULL) { 1143 (void) fprintf(stderr, gettext("invalid vdev " 1144 "specification: 'spare' can be " 1145 "specified only once\n")); 1146 return (NULL); 1147 } 1148 1149 for (c = 1; c < argc; c++) { 1150 if (is_grouping(argv[c], NULL) != NULL) 1151 break; 1152 children++; 1153 child = realloc(child, 1154 children * sizeof (nvlist_t *)); 1155 if (child == NULL) 1156 no_memory(); 1157 if ((nv = make_leaf_vdev(argv[c])) == NULL) 1158 return (NULL); 1159 child[children - 1] = nv; 1160 } 1161 1162 if (children < mindev) { 1163 (void) fprintf(stderr, gettext("invalid vdev " 1164 "specification: %s requires at least %d " 1165 "devices\n"), argv[0], mindev); 1166 return (NULL); 1167 } 1168 1169 argc -= c; 1170 argv += c; 1171 1172 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1173 spares = child; 1174 nspares = children; 1175 continue; 1176 } else { 1177 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1178 0) == 0); 1179 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1180 type) == 0); 1181 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1182 verify(nvlist_add_uint64(nv, 1183 ZPOOL_CONFIG_NPARITY, 1184 mindev - 1) == 0); 1185 } 1186 verify(nvlist_add_nvlist_array(nv, 1187 ZPOOL_CONFIG_CHILDREN, child, 1188 children) == 0); 1189 1190 for (c = 0; c < children; c++) 1191 nvlist_free(child[c]); 1192 free(child); 1193 } 1194 } else { 1195 /* 1196 * We have a device. Pass off to make_leaf_vdev() to 1197 * construct the appropriate nvlist describing the vdev. 1198 */ 1199 if ((nv = make_leaf_vdev(argv[0])) == NULL) 1200 return (NULL); 1201 argc--; 1202 argv++; 1203 } 1204 1205 toplevels++; 1206 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1207 if (top == NULL) 1208 no_memory(); 1209 top[toplevels - 1] = nv; 1210 } 1211 1212 if (toplevels == 0 && nspares == 0) { 1213 (void) fprintf(stderr, gettext("invalid vdev " 1214 "specification: at least one toplevel vdev must be " 1215 "specified\n")); 1216 return (NULL); 1217 } 1218 1219 /* 1220 * Finally, create nvroot and add all top-level vdevs to it. 1221 */ 1222 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1223 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1224 VDEV_TYPE_ROOT) == 0); 1225 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1226 top, toplevels) == 0); 1227 if (nspares != 0) 1228 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1229 spares, nspares) == 0); 1230 1231 for (t = 0; t < toplevels; t++) 1232 nvlist_free(top[t]); 1233 for (t = 0; t < nspares; t++) 1234 nvlist_free(spares[t]); 1235 if (spares) 1236 free(spares); 1237 free(top); 1238 1239 return (nvroot); 1240 } 1241 1242 /* 1243 * Get and validate the contents of the given vdev specification. This ensures 1244 * that the nvlist returned is well-formed, that all the devices exist, and that 1245 * they are not currently in use by any other known consumer. The 'poolconfig' 1246 * parameter is the current configuration of the pool when adding devices 1247 * existing pool, and is used to perform additional checks, such as changing the 1248 * replication level of the pool. It can be 'NULL' to indicate that this is a 1249 * new pool. The 'force' flag controls whether devices should be forcefully 1250 * added, even if they appear in use. 1251 */ 1252 nvlist_t * 1253 make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, 1254 boolean_t isreplacing, int argc, char **argv) 1255 { 1256 nvlist_t *newroot; 1257 1258 is_force = force; 1259 1260 /* 1261 * Construct the vdev specification. If this is successful, we know 1262 * that we have a valid specification, and that all devices can be 1263 * opened. 1264 */ 1265 if ((newroot = construct_spec(argc, argv)) == NULL) 1266 return (NULL); 1267 1268 /* 1269 * Validate each device to make sure that its not shared with another 1270 * subsystem. We do this even if 'force' is set, because there are some 1271 * uses (such as a dedicated dump device) that even '-f' cannot 1272 * override. 1273 */ 1274 if (check_in_use(poolconfig, newroot, force, isreplacing, 1275 B_FALSE) != 0) { 1276 nvlist_free(newroot); 1277 return (NULL); 1278 } 1279 1280 /* 1281 * Check the replication level of the given vdevs and report any errors 1282 * found. We include the existing pool spec, if any, as we need to 1283 * catch changes against the existing replication level. 1284 */ 1285 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1286 nvlist_free(newroot); 1287 return (NULL); 1288 } 1289 1290 /* 1291 * Run through the vdev specification and label any whole disks found. 1292 */ 1293 if (make_disks(newroot) != 0) { 1294 nvlist_free(newroot); 1295 return (NULL); 1296 } 1297 1298 return (newroot); 1299 } 1300